Total coverage: 14546 (2%)of 1203497
146 144 6 4 146 142 1 1 116 1 75 11 76 2 2 1 2 1 1 1 2 2 1 2 31 1 111 101 9 5 1 3 100 4 4 1 1 1 1 1 2 1 1 5 3 2 2 2 3 3 1 1 4 4 4 4 4 3 4 2 2 13 2 2 7 1 6 6 3 1 1 1 1 1 1 1 1 1 4 4 2 2 2 2 81 1 10 34 4 6 6 26 169 1 21 112 6 7 7 104 1 1 2 1 1 3 6 8 2 6 2 6 4 2 2 1 1 1 4 4 1 1 1 1 43 1 31 5 6 11 1 4 3 3 6 1 3 1 1 1 1 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 // SPDX-License-Identifier: GPL-2.0-only /* * Copyright (C) 2012,2013 - ARM Ltd * Author: Marc Zyngier <marc.zyngier@arm.com> * * Derived from arch/arm/kvm/guest.c: * Copyright (C) 2012 - Virtual Open Systems and Columbia University * Author: Christoffer Dall <c.dall@virtualopensystems.com> */ #include <linux/bits.h> #include <linux/errno.h> #include <linux/err.h> #include <linux/nospec.h> #include <linux/kvm_host.h> #include <linux/module.h> #include <linux/stddef.h> #include <linux/string.h> #include <linux/vmalloc.h> #include <linux/fs.h> #include <kvm/arm_hypercalls.h> #include <asm/cputype.h> #include <linux/uaccess.h> #include <asm/fpsimd.h> #include <asm/kvm.h> #include <asm/kvm_emulate.h> #include <asm/kvm_nested.h> #include <asm/sigcontext.h> #include "trace.h" const struct _kvm_stats_desc kvm_vm_stats_desc[] = { KVM_GENERIC_VM_STATS() }; const struct kvm_stats_header kvm_vm_stats_header = { .name_size = KVM_STATS_NAME_SIZE, .num_desc = ARRAY_SIZE(kvm_vm_stats_desc), .id_offset = sizeof(struct kvm_stats_header), .desc_offset = sizeof(struct kvm_stats_header) + KVM_STATS_NAME_SIZE, .data_offset = sizeof(struct kvm_stats_header) + KVM_STATS_NAME_SIZE + sizeof(kvm_vm_stats_desc), }; const struct _kvm_stats_desc kvm_vcpu_stats_desc[] = { KVM_GENERIC_VCPU_STATS(), STATS_DESC_COUNTER(VCPU, hvc_exit_stat), STATS_DESC_COUNTER(VCPU, wfe_exit_stat), STATS_DESC_COUNTER(VCPU, wfi_exit_stat), STATS_DESC_COUNTER(VCPU, mmio_exit_user), STATS_DESC_COUNTER(VCPU, mmio_exit_kernel), STATS_DESC_COUNTER(VCPU, signal_exits), STATS_DESC_COUNTER(VCPU, exits) }; const struct kvm_stats_header kvm_vcpu_stats_header = { .name_size = KVM_STATS_NAME_SIZE, .num_desc = ARRAY_SIZE(kvm_vcpu_stats_desc), .id_offset = sizeof(struct kvm_stats_header), .desc_offset = sizeof(struct kvm_stats_header) + KVM_STATS_NAME_SIZE, .data_offset = sizeof(struct kvm_stats_header) + KVM_STATS_NAME_SIZE + sizeof(kvm_vcpu_stats_desc), }; static bool core_reg_offset_is_vreg(u64 off) { return off >= KVM_REG_ARM_CORE_REG(fp_regs.vregs) && off < KVM_REG_ARM_CORE_REG(fp_regs.fpsr); } static u64 core_reg_offset_from_id(u64 id) { return id & ~(KVM_REG_ARCH_MASK | KVM_REG_SIZE_MASK | KVM_REG_ARM_CORE); } static int core_reg_size_from_offset(const struct kvm_vcpu *vcpu, u64 off) { int size; switch (off) { case KVM_REG_ARM_CORE_REG(regs.regs[0]) ... KVM_REG_ARM_CORE_REG(regs.regs[30]): case KVM_REG_ARM_CORE_REG(regs.sp): case KVM_REG_ARM_CORE_REG(regs.pc): case KVM_REG_ARM_CORE_REG(regs.pstate): case KVM_REG_ARM_CORE_REG(sp_el1): case KVM_REG_ARM_CORE_REG(elr_el1): case KVM_REG_ARM_CORE_REG(spsr[0]) ... KVM_REG_ARM_CORE_REG(spsr[KVM_NR_SPSR - 1]): size = sizeof(__u64); break; case KVM_REG_ARM_CORE_REG(fp_regs.vregs[0]) ... KVM_REG_ARM_CORE_REG(fp_regs.vregs[31]): size = sizeof(__uint128_t); break; case KVM_REG_ARM_CORE_REG(fp_regs.fpsr): case KVM_REG_ARM_CORE_REG(fp_regs.fpcr): size = sizeof(__u32); break; default: return -EINVAL; } if (!IS_ALIGNED(off, size / sizeof(__u32))) return -EINVAL; /* * The KVM_REG_ARM64_SVE regs must be used instead of * KVM_REG_ARM_CORE for accessing the FPSIMD V-registers on * SVE-enabled vcpus: */ if (vcpu_has_sve(vcpu) && core_reg_offset_is_vreg(off)) return -EINVAL; return size; } static void *core_reg_addr(struct kvm_vcpu *vcpu, const struct kvm_one_reg *reg) { u64 off = core_reg_offset_from_id(reg->id); int size = core_reg_size_from_offset(vcpu, off); if (size < 0) return NULL; if (KVM_REG_SIZE(reg->id) != size) return NULL; switch (off) { case KVM_REG_ARM_CORE_REG(regs.regs[0]) ... KVM_REG_ARM_CORE_REG(regs.regs[30]): off -= KVM_REG_ARM_CORE_REG(regs.regs[0]); off /= 2; return &vcpu->arch.ctxt.regs.regs[off]; case KVM_REG_ARM_CORE_REG(regs.sp): return &vcpu->arch.ctxt.regs.sp; case KVM_REG_ARM_CORE_REG(regs.pc): return &vcpu->arch.ctxt.regs.pc; case KVM_REG_ARM_CORE_REG(regs.pstate): return &vcpu->arch.ctxt.regs.pstate; case KVM_REG_ARM_CORE_REG(sp_el1): return __ctxt_sys_reg(&vcpu->arch.ctxt, SP_EL1); case KVM_REG_ARM_CORE_REG(elr_el1): return __ctxt_sys_reg(&vcpu->arch.ctxt, ELR_EL1); case KVM_REG_ARM_CORE_REG(spsr[KVM_SPSR_EL1]): return __ctxt_sys_reg(&vcpu->arch.ctxt, SPSR_EL1); case KVM_REG_ARM_CORE_REG(spsr[KVM_SPSR_ABT]): return &vcpu->arch.ctxt.spsr_abt; case KVM_REG_ARM_CORE_REG(spsr[KVM_SPSR_UND]): return &vcpu->arch.ctxt.spsr_und; case KVM_REG_ARM_CORE_REG(spsr[KVM_SPSR_IRQ]): return &vcpu->arch.ctxt.spsr_irq; case KVM_REG_ARM_CORE_REG(spsr[KVM_SPSR_FIQ]): return &vcpu->arch.ctxt.spsr_fiq; case KVM_REG_ARM_CORE_REG(fp_regs.vregs[0]) ... KVM_REG_ARM_CORE_REG(fp_regs.vregs[31]): off -= KVM_REG_ARM_CORE_REG(fp_regs.vregs[0]); off /= 4; return &vcpu->arch.ctxt.fp_regs.vregs[off]; case KVM_REG_ARM_CORE_REG(fp_regs.fpsr): return &vcpu->arch.ctxt.fp_regs.fpsr; case KVM_REG_ARM_CORE_REG(fp_regs.fpcr): return &vcpu->arch.ctxt.fp_regs.fpcr; default: return NULL; } } static int get_core_reg(struct kvm_vcpu *vcpu, const struct kvm_one_reg *reg) { /* * Because the kvm_regs structure is a mix of 32, 64 and * 128bit fields, we index it as if it was a 32bit * array. Hence below, nr_regs is the number of entries, and * off the index in the "array". */ __u32 __user *uaddr = (__u32 __user *)(unsigned long)reg->addr; int nr_regs = sizeof(struct kvm_regs) / sizeof(__u32); void *addr; u32 off; /* Our ID is an index into the kvm_regs struct. */ off = core_reg_offset_from_id(reg->id); if (off >= nr_regs || (off + (KVM_REG_SIZE(reg->id) / sizeof(__u32))) >= nr_regs) return -ENOENT; addr = core_reg_addr(vcpu, reg); if (!addr) return -EINVAL; if (copy_to_user(uaddr, addr, KVM_REG_SIZE(reg->id))) return -EFAULT; return 0; } static int set_core_reg(struct kvm_vcpu *vcpu, const struct kvm_one_reg *reg) { __u32 __user *uaddr = (__u32 __user *)(unsigned long)reg->addr; int nr_regs = sizeof(struct kvm_regs) / sizeof(__u32); __uint128_t tmp; void *valp = &tmp, *addr; u64 off; int err = 0; /* Our ID is an index into the kvm_regs struct. */ off = core_reg_offset_from_id(reg->id); if (off >= nr_regs || (off + (KVM_REG_SIZE(reg->id) / sizeof(__u32))) >= nr_regs) return -ENOENT; addr = core_reg_addr(vcpu, reg); if (!addr) return -EINVAL; if (KVM_REG_SIZE(reg->id) > sizeof(tmp)) return -EINVAL; if (copy_from_user(valp, uaddr, KVM_REG_SIZE(reg->id))) { err = -EFAULT; goto out; } if (off == KVM_REG_ARM_CORE_REG(regs.pstate)) { u64 mode = (*(u64 *)valp) & PSR_AA32_MODE_MASK; switch (mode) { case PSR_AA32_MODE_USR: if (!kvm_supports_32bit_el0()) return -EINVAL; break; case PSR_AA32_MODE_FIQ: case PSR_AA32_MODE_IRQ: case PSR_AA32_MODE_SVC: case PSR_AA32_MODE_ABT: case PSR_AA32_MODE_UND: case PSR_AA32_MODE_SYS: if (!vcpu_el1_is_32bit(vcpu)) return -EINVAL; break; case PSR_MODE_EL2h: case PSR_MODE_EL2t: if (!vcpu_has_nv(vcpu)) return -EINVAL; fallthrough; case PSR_MODE_EL0t: case PSR_MODE_EL1t: case PSR_MODE_EL1h: if (vcpu_el1_is_32bit(vcpu)) return -EINVAL; break; default: err = -EINVAL; goto out; } } memcpy(addr, valp, KVM_REG_SIZE(reg->id)); if (*vcpu_cpsr(vcpu) & PSR_MODE32_BIT) { int i, nr_reg; switch (*vcpu_cpsr(vcpu) & PSR_AA32_MODE_MASK) { /* * Either we are dealing with user mode, and only the * first 15 registers (+ PC) must be narrowed to 32bit. * AArch32 r0-r14 conveniently map to AArch64 x0-x14. */ case PSR_AA32_MODE_USR: case PSR_AA32_MODE_SYS: nr_reg = 15; break; /* * Otherwise, this is a privileged mode, and *all* the * registers must be narrowed to 32bit. */ default: nr_reg = 31; break; } for (i = 0; i < nr_reg; i++) vcpu_set_reg(vcpu, i, (u32)vcpu_get_reg(vcpu, i)); *vcpu_pc(vcpu) = (u32)*vcpu_pc(vcpu); } out: return err; } #define vq_word(vq) (((vq) - SVE_VQ_MIN) / 64) #define vq_mask(vq) ((u64)1 << ((vq) - SVE_VQ_MIN) % 64) #define vq_present(vqs, vq) (!!((vqs)[vq_word(vq)] & vq_mask(vq))) static int get_sve_vls(struct kvm_vcpu *vcpu, const struct kvm_one_reg *reg) { unsigned int max_vq, vq; u64 vqs[KVM_ARM64_SVE_VLS_WORDS]; if (!vcpu_has_sve(vcpu)) return -ENOENT; if (WARN_ON(!sve_vl_valid(vcpu->arch.sve_max_vl))) return -EINVAL; memset(vqs, 0, sizeof(vqs)); max_vq = vcpu_sve_max_vq(vcpu); for (vq = SVE_VQ_MIN; vq <= max_vq; ++vq) if (sve_vq_available(vq)) vqs[vq_word(vq)] |= vq_mask(vq); if (copy_to_user((void __user *)reg->addr, vqs, sizeof(vqs))) return -EFAULT; return 0; } static int set_sve_vls(struct kvm_vcpu *vcpu, const struct kvm_one_reg *reg) { unsigned int max_vq, vq; u64 vqs[KVM_ARM64_SVE_VLS_WORDS]; if (!vcpu_has_sve(vcpu)) return -ENOENT; if (kvm_arm_vcpu_sve_finalized(vcpu)) return -EPERM; /* too late! */ if (WARN_ON(vcpu->arch.sve_state)) return -EINVAL; if (copy_from_user(vqs, (const void __user *)reg->addr, sizeof(vqs))) return -EFAULT; max_vq = 0; for (vq = SVE_VQ_MIN; vq <= SVE_VQ_MAX; ++vq) if (vq_present(vqs, vq)) max_vq = vq; if (max_vq > sve_vq_from_vl(kvm_sve_max_vl)) return -EINVAL; /* * Vector lengths supported by the host can't currently be * hidden from the guest individually: instead we can only set a * maximum via ZCR_EL2.LEN. So, make sure the available vector * lengths match the set requested exactly up to the requested * maximum: */ for (vq = SVE_VQ_MIN; vq <= max_vq; ++vq) if (vq_present(vqs, vq) != sve_vq_available(vq)) return -EINVAL; /* Can't run with no vector lengths at all: */ if (max_vq < SVE_VQ_MIN) return -EINVAL; /* vcpu->arch.sve_state will be alloc'd by kvm_vcpu_finalize_sve() */ vcpu->arch.sve_max_vl = sve_vl_from_vq(max_vq); return 0; } #define SVE_REG_SLICE_SHIFT 0 #define SVE_REG_SLICE_BITS 5 #define SVE_REG_ID_SHIFT (SVE_REG_SLICE_SHIFT + SVE_REG_SLICE_BITS) #define SVE_REG_ID_BITS 5 #define SVE_REG_SLICE_MASK \ GENMASK(SVE_REG_SLICE_SHIFT + SVE_REG_SLICE_BITS - 1, \ SVE_REG_SLICE_SHIFT) #define SVE_REG_ID_MASK \ GENMASK(SVE_REG_ID_SHIFT + SVE_REG_ID_BITS - 1, SVE_REG_ID_SHIFT) #define SVE_NUM_SLICES (1 << SVE_REG_SLICE_BITS) #define KVM_SVE_ZREG_SIZE KVM_REG_SIZE(KVM_REG_ARM64_SVE_ZREG(0, 0)) #define KVM_SVE_PREG_SIZE KVM_REG_SIZE(KVM_REG_ARM64_SVE_PREG(0, 0)) /* * Number of register slices required to cover each whole SVE register. * NOTE: Only the first slice every exists, for now. * If you are tempted to modify this, you must also rework sve_reg_to_region() * to match: */ #define vcpu_sve_slices(vcpu) 1 /* Bounds of a single SVE register slice within vcpu->arch.sve_state */ struct sve_state_reg_region { unsigned int koffset; /* offset into sve_state in kernel memory */ unsigned int klen; /* length in kernel memory */ unsigned int upad; /* extra trailing padding in user memory */ }; /* * Validate SVE register ID and get sanitised bounds for user/kernel SVE * register copy */ static int sve_reg_to_region(struct sve_state_reg_region *region, struct kvm_vcpu *vcpu, const struct kvm_one_reg *reg) { /* reg ID ranges for Z- registers */ const u64 zreg_id_min = KVM_REG_ARM64_SVE_ZREG(0, 0); const u64 zreg_id_max = KVM_REG_ARM64_SVE_ZREG(SVE_NUM_ZREGS - 1, SVE_NUM_SLICES - 1); /* reg ID ranges for P- registers and FFR (which are contiguous) */ const u64 preg_id_min = KVM_REG_ARM64_SVE_PREG(0, 0); const u64 preg_id_max = KVM_REG_ARM64_SVE_FFR(SVE_NUM_SLICES - 1); unsigned int vq; unsigned int reg_num; unsigned int reqoffset, reqlen; /* User-requested offset and length */ unsigned int maxlen; /* Maximum permitted length */ size_t sve_state_size; const u64 last_preg_id = KVM_REG_ARM64_SVE_PREG(SVE_NUM_PREGS - 1, SVE_NUM_SLICES - 1); /* Verify that the P-regs and FFR really do have contiguous IDs: */ BUILD_BUG_ON(KVM_REG_ARM64_SVE_FFR(0) != last_preg_id + 1); /* Verify that we match the UAPI header: */ BUILD_BUG_ON(SVE_NUM_SLICES != KVM_ARM64_SVE_MAX_SLICES); reg_num = (reg->id & SVE_REG_ID_MASK) >> SVE_REG_ID_SHIFT; if (reg->id >= zreg_id_min && reg->id <= zreg_id_max) { if (!vcpu_has_sve(vcpu) || (reg->id & SVE_REG_SLICE_MASK) > 0) return -ENOENT; vq = vcpu_sve_max_vq(vcpu); reqoffset = SVE_SIG_ZREG_OFFSET(vq, reg_num) - SVE_SIG_REGS_OFFSET; reqlen = KVM_SVE_ZREG_SIZE; maxlen = SVE_SIG_ZREG_SIZE(vq); } else if (reg->id >= preg_id_min && reg->id <= preg_id_max) { if (!vcpu_has_sve(vcpu) || (reg->id & SVE_REG_SLICE_MASK) > 0) return -ENOENT; vq = vcpu_sve_max_vq(vcpu); reqoffset = SVE_SIG_PREG_OFFSET(vq, reg_num) - SVE_SIG_REGS_OFFSET; reqlen = KVM_SVE_PREG_SIZE; maxlen = SVE_SIG_PREG_SIZE(vq); } else { return -EINVAL; } sve_state_size = vcpu_sve_state_size(vcpu); if (WARN_ON(!sve_state_size)) return -EINVAL; region->koffset = array_index_nospec(reqoffset, sve_state_size); region->klen = min(maxlen, reqlen); region->upad = reqlen - region->klen; return 0; } static int get_sve_reg(struct kvm_vcpu *vcpu, const struct kvm_one_reg *reg) { int ret; struct sve_state_reg_region region; char __user *uptr = (char __user *)reg->addr; /* Handle the KVM_REG_ARM64_SVE_VLS pseudo-reg as a special case: */ if (reg->id == KVM_REG_ARM64_SVE_VLS) return get_sve_vls(vcpu, reg); /* Try to interpret reg ID as an architectural SVE register... */ ret = sve_reg_to_region(&region, vcpu, reg); if (ret) return ret; if (!kvm_arm_vcpu_sve_finalized(vcpu)) return -EPERM; if (copy_to_user(uptr, vcpu->arch.sve_state + region.koffset, region.klen) || clear_user(uptr + region.klen, region.upad)) return -EFAULT; return 0; } static int set_sve_reg(struct kvm_vcpu *vcpu, const struct kvm_one_reg *reg) { int ret; struct sve_state_reg_region region; const char __user *uptr = (const char __user *)reg->addr; /* Handle the KVM_REG_ARM64_SVE_VLS pseudo-reg as a special case: */ if (reg->id == KVM_REG_ARM64_SVE_VLS) return set_sve_vls(vcpu, reg); /* Try to interpret reg ID as an architectural SVE register... */ ret = sve_reg_to_region(&region, vcpu, reg); if (ret) return ret; if (!kvm_arm_vcpu_sve_finalized(vcpu)) return -EPERM; if (copy_from_user(vcpu->arch.sve_state + region.koffset, uptr, region.klen)) return -EFAULT; return 0; } int kvm_arch_vcpu_ioctl_get_regs(struct kvm_vcpu *vcpu, struct kvm_regs *regs) { return -EINVAL; } int kvm_arch_vcpu_ioctl_set_regs(struct kvm_vcpu *vcpu, struct kvm_regs *regs) { return -EINVAL; } static int copy_core_reg_indices(const struct kvm_vcpu *vcpu, u64 __user *uindices) { unsigned int i; int n = 0; for (i = 0; i < sizeof(struct kvm_regs) / sizeof(__u32); i++) { u64 reg = KVM_REG_ARM64 | KVM_REG_ARM_CORE | i; int size = core_reg_size_from_offset(vcpu, i); if (size < 0) continue; switch (size) { case sizeof(__u32): reg |= KVM_REG_SIZE_U32; break; case sizeof(__u64): reg |= KVM_REG_SIZE_U64; break; case sizeof(__uint128_t): reg |= KVM_REG_SIZE_U128; break; default: WARN_ON(1); continue; } if (uindices) { if (put_user(reg, uindices)) return -EFAULT; uindices++; } n++; } return n; } static unsigned long num_core_regs(const struct kvm_vcpu *vcpu) { return copy_core_reg_indices(vcpu, NULL); } static const u64 timer_reg_list[] = { KVM_REG_ARM_TIMER_CTL, KVM_REG_ARM_TIMER_CNT, KVM_REG_ARM_TIMER_CVAL, KVM_REG_ARM_PTIMER_CTL, KVM_REG_ARM_PTIMER_CNT, KVM_REG_ARM_PTIMER_CVAL, }; #define NUM_TIMER_REGS ARRAY_SIZE(timer_reg_list) static bool is_timer_reg(u64 index) { switch (index) { case KVM_REG_ARM_TIMER_CTL: case KVM_REG_ARM_TIMER_CNT: case KVM_REG_ARM_TIMER_CVAL: case KVM_REG_ARM_PTIMER_CTL: case KVM_REG_ARM_PTIMER_CNT: case KVM_REG_ARM_PTIMER_CVAL: return true; } return false; } static int copy_timer_indices(struct kvm_vcpu *vcpu, u64 __user *uindices) { for (int i = 0; i < NUM_TIMER_REGS; i++) { if (put_user(timer_reg_list[i], uindices)) return -EFAULT; uindices++; } return 0; } static int set_timer_reg(struct kvm_vcpu *vcpu, const struct kvm_one_reg *reg) { void __user *uaddr = (void __user *)(long)reg->addr; u64 val; int ret; ret = copy_from_user(&val, uaddr, KVM_REG_SIZE(reg->id)); if (ret != 0) return -EFAULT; return kvm_arm_timer_set_reg(vcpu, reg->id, val); } static int get_timer_reg(struct kvm_vcpu *vcpu, const struct kvm_one_reg *reg) { void __user *uaddr = (void __user *)(long)reg->addr; u64 val; val = kvm_arm_timer_get_reg(vcpu, reg->id); return copy_to_user(uaddr, &val, KVM_REG_SIZE(reg->id)) ? -EFAULT : 0; } static unsigned long num_sve_regs(const struct kvm_vcpu *vcpu) { const unsigned int slices = vcpu_sve_slices(vcpu); if (!vcpu_has_sve(vcpu)) return 0; /* Policed by KVM_GET_REG_LIST: */ WARN_ON(!kvm_arm_vcpu_sve_finalized(vcpu)); return slices * (SVE_NUM_PREGS + SVE_NUM_ZREGS + 1 /* FFR */) + 1; /* KVM_REG_ARM64_SVE_VLS */ } static int copy_sve_reg_indices(const struct kvm_vcpu *vcpu, u64 __user *uindices) { const unsigned int slices = vcpu_sve_slices(vcpu); u64 reg; unsigned int i, n; int num_regs = 0; if (!vcpu_has_sve(vcpu)) return 0; /* Policed by KVM_GET_REG_LIST: */ WARN_ON(!kvm_arm_vcpu_sve_finalized(vcpu)); /* * Enumerate this first, so that userspace can save/restore in * the order reported by KVM_GET_REG_LIST: */ reg = KVM_REG_ARM64_SVE_VLS; if (put_user(reg, uindices++)) return -EFAULT; ++num_regs; for (i = 0; i < slices; i++) { for (n = 0; n < SVE_NUM_ZREGS; n++) { reg = KVM_REG_ARM64_SVE_ZREG(n, i); if (put_user(reg, uindices++)) return -EFAULT; num_regs++; } for (n = 0; n < SVE_NUM_PREGS; n++) { reg = KVM_REG_ARM64_SVE_PREG(n, i); if (put_user(reg, uindices++)) return -EFAULT; num_regs++; } reg = KVM_REG_ARM64_SVE_FFR(i); if (put_user(reg, uindices++)) return -EFAULT; num_regs++; } return num_regs; } /** * kvm_arm_num_regs - how many registers do we present via KVM_GET_ONE_REG * @vcpu: the vCPU pointer * * This is for all registers. */ unsigned long kvm_arm_num_regs(struct kvm_vcpu *vcpu) { unsigned long res = 0; res += num_core_regs(vcpu); res += num_sve_regs(vcpu); res += kvm_arm_num_sys_reg_descs(vcpu); res += kvm_arm_get_fw_num_regs(vcpu); res += NUM_TIMER_REGS; return res; } /** * kvm_arm_copy_reg_indices - get indices of all registers. * @vcpu: the vCPU pointer * @uindices: register list to copy * * We do core registers right here, then we append system regs. */ int kvm_arm_copy_reg_indices(struct kvm_vcpu *vcpu, u64 __user *uindices) { int ret; ret = copy_core_reg_indices(vcpu, uindices); if (ret < 0) return ret; uindices += ret; ret = copy_sve_reg_indices(vcpu, uindices); if (ret < 0) return ret; uindices += ret; ret = kvm_arm_copy_fw_reg_indices(vcpu, uindices); if (ret < 0) return ret; uindices += kvm_arm_get_fw_num_regs(vcpu); ret = copy_timer_indices(vcpu, uindices); if (ret < 0) return ret; uindices += NUM_TIMER_REGS; return kvm_arm_copy_sys_reg_indices(vcpu, uindices); } int kvm_arm_get_reg(struct kvm_vcpu *vcpu, const struct kvm_one_reg *reg) { /* We currently use nothing arch-specific in upper 32 bits */ if ((reg->id & ~KVM_REG_SIZE_MASK) >> 32 != KVM_REG_ARM64 >> 32) return -EINVAL; switch (reg->id & KVM_REG_ARM_COPROC_MASK) { case KVM_REG_ARM_CORE: return get_core_reg(vcpu, reg); case KVM_REG_ARM_FW: case KVM_REG_ARM_FW_FEAT_BMAP: return kvm_arm_get_fw_reg(vcpu, reg); case KVM_REG_ARM64_SVE: return get_sve_reg(vcpu, reg); } if (is_timer_reg(reg->id)) return get_timer_reg(vcpu, reg); return kvm_arm_sys_reg_get_reg(vcpu, reg); } int kvm_arm_set_reg(struct kvm_vcpu *vcpu, const struct kvm_one_reg *reg) { /* We currently use nothing arch-specific in upper 32 bits */ if ((reg->id & ~KVM_REG_SIZE_MASK) >> 32 != KVM_REG_ARM64 >> 32) return -EINVAL; switch (reg->id & KVM_REG_ARM_COPROC_MASK) { case KVM_REG_ARM_CORE: return set_core_reg(vcpu, reg); case KVM_REG_ARM_FW: case KVM_REG_ARM_FW_FEAT_BMAP: return kvm_arm_set_fw_reg(vcpu, reg); case KVM_REG_ARM64_SVE: return set_sve_reg(vcpu, reg); } if (is_timer_reg(reg->id)) return set_timer_reg(vcpu, reg); return kvm_arm_sys_reg_set_reg(vcpu, reg); } int kvm_arch_vcpu_ioctl_get_sregs(struct kvm_vcpu *vcpu, struct kvm_sregs *sregs) { return -EINVAL; } int kvm_arch_vcpu_ioctl_set_sregs(struct kvm_vcpu *vcpu, struct kvm_sregs *sregs) { return -EINVAL; } int __kvm_arm_vcpu_get_events(struct kvm_vcpu *vcpu, struct kvm_vcpu_events *events) { events->exception.serror_has_esr = cpus_have_final_cap(ARM64_HAS_RAS_EXTN); events->exception.serror_pending = (vcpu->arch.hcr_el2 & HCR_VSE) || vcpu_get_flag(vcpu, NESTED_SERROR_PENDING); if (events->exception.serror_pending && events->exception.serror_has_esr) events->exception.serror_esr = vcpu_get_vsesr(vcpu); /* * We never return a pending ext_dabt here because we deliver it to * the virtual CPU directly when setting the event and it's no longer * 'pending' at this point. */ return 0; } static void commit_pending_events(struct kvm_vcpu *vcpu) { if (!vcpu_get_flag(vcpu, PENDING_EXCEPTION)) return; /* * Reset the MMIO emulation state to avoid stepping PC after emulating * the exception entry. */ vcpu->mmio_needed = false; kvm_call_hyp(__kvm_adjust_pc, vcpu); } int __kvm_arm_vcpu_set_events(struct kvm_vcpu *vcpu, struct kvm_vcpu_events *events) { bool serror_pending = events->exception.serror_pending; bool has_esr = events->exception.serror_has_esr; bool ext_dabt_pending = events->exception.ext_dabt_pending; u64 esr = events->exception.serror_esr; int ret = 0; /* * Immediately commit the pending SEA to the vCPU's architectural * state which is necessary since we do not return a pending SEA * to userspace via KVM_GET_VCPU_EVENTS. */ if (ext_dabt_pending) { ret = kvm_inject_sea_dabt(vcpu, kvm_vcpu_get_hfar(vcpu)); commit_pending_events(vcpu); } if (ret < 0) return ret; if (!serror_pending) return 0; if (!cpus_have_final_cap(ARM64_HAS_RAS_EXTN) && has_esr) return -EINVAL; if (has_esr && (esr & ~ESR_ELx_ISS_MASK)) return -EINVAL; if (has_esr) ret = kvm_inject_serror_esr(vcpu, esr); else ret = kvm_inject_serror(vcpu); /* * We could've decided that the SError is due for immediate software * injection; commit the exception in case userspace decides it wants * to inject more exceptions for some strange reason. */ commit_pending_events(vcpu); return (ret < 0) ? ret : 0; } u32 __attribute_const__ kvm_target_cpu(void) { unsigned long implementor = read_cpuid_implementor(); unsigned long part_number = read_cpuid_part_number(); switch (implementor) { case ARM_CPU_IMP_ARM: switch (part_number) { case ARM_CPU_PART_AEM_V8: return KVM_ARM_TARGET_AEM_V8; case ARM_CPU_PART_FOUNDATION: return KVM_ARM_TARGET_FOUNDATION_V8; case ARM_CPU_PART_CORTEX_A53: return KVM_ARM_TARGET_CORTEX_A53; case ARM_CPU_PART_CORTEX_A57: return KVM_ARM_TARGET_CORTEX_A57; } break; case ARM_CPU_IMP_APM: switch (part_number) { case APM_CPU_PART_XGENE: return KVM_ARM_TARGET_XGENE_POTENZA; } break; } /* Return a default generic target */ return KVM_ARM_TARGET_GENERIC_V8; } int kvm_arch_vcpu_ioctl_get_fpu(struct kvm_vcpu *vcpu, struct kvm_fpu *fpu) { return -EINVAL; } int kvm_arch_vcpu_ioctl_set_fpu(struct kvm_vcpu *vcpu, struct kvm_fpu *fpu) { return -EINVAL; } int kvm_arch_vcpu_ioctl_translate(struct kvm_vcpu *vcpu, struct kvm_translation *tr) { return -EINVAL; } /** * kvm_arch_vcpu_ioctl_set_guest_debug - set up guest debugging * @vcpu: the vCPU pointer * @dbg: the ioctl data buffer * * This sets up and enables the VM for guest debugging. Userspace * passes in a control flag to enable different debug types and * potentially other architecture specific information in the rest of * the structure. */ int kvm_arch_vcpu_ioctl_set_guest_debug(struct kvm_vcpu *vcpu, struct kvm_guest_debug *dbg) { trace_kvm_set_guest_debug(vcpu, dbg->control); if (dbg->control & ~KVM_GUESTDBG_VALID_MASK) return -EINVAL; if (!(dbg->control & KVM_GUESTDBG_ENABLE)) { vcpu->guest_debug = 0; vcpu_clear_flag(vcpu, HOST_SS_ACTIVE_PENDING); return 0; } vcpu->guest_debug = dbg->control; /* Hardware assisted Break and Watch points */ if (vcpu->guest_debug & KVM_GUESTDBG_USE_HW) vcpu->arch.external_debug_state = dbg->arch; return 0; } int kvm_arm_vcpu_arch_set_attr(struct kvm_vcpu *vcpu, struct kvm_device_attr *attr) { int ret; switch (attr->group) { case KVM_ARM_VCPU_PMU_V3_CTRL: mutex_lock(&vcpu->kvm->arch.config_lock); ret = kvm_arm_pmu_v3_set_attr(vcpu, attr); mutex_unlock(&vcpu->kvm->arch.config_lock); break; case KVM_ARM_VCPU_TIMER_CTRL: ret = kvm_arm_timer_set_attr(vcpu, attr); break; case KVM_ARM_VCPU_PVTIME_CTRL: ret = kvm_arm_pvtime_set_attr(vcpu, attr); break; default: ret = -ENXIO; break; } return ret; } int kvm_arm_vcpu_arch_get_attr(struct kvm_vcpu *vcpu, struct kvm_device_attr *attr) { int ret; switch (attr->group) { case KVM_ARM_VCPU_PMU_V3_CTRL: ret = kvm_arm_pmu_v3_get_attr(vcpu, attr); break; case KVM_ARM_VCPU_TIMER_CTRL: ret = kvm_arm_timer_get_attr(vcpu, attr); break; case KVM_ARM_VCPU_PVTIME_CTRL: ret = kvm_arm_pvtime_get_attr(vcpu, attr); break; default: ret = -ENXIO; break; } return ret; } int kvm_arm_vcpu_arch_has_attr(struct kvm_vcpu *vcpu, struct kvm_device_attr *attr) { int ret; switch (attr->group) { case KVM_ARM_VCPU_PMU_V3_CTRL: ret = kvm_arm_pmu_v3_has_attr(vcpu, attr); break; case KVM_ARM_VCPU_TIMER_CTRL: ret = kvm_arm_timer_has_attr(vcpu, attr); break; case KVM_ARM_VCPU_PVTIME_CTRL: ret = kvm_arm_pvtime_has_attr(vcpu, attr); break; default: ret = -ENXIO; break; } return ret; } int kvm_vm_ioctl_mte_copy_tags(struct kvm *kvm, struct kvm_arm_copy_mte_tags *copy_tags) { gpa_t guest_ipa = copy_tags->guest_ipa; size_t length = copy_tags->length; void __user *tags = copy_tags->addr; gpa_t gfn; bool write = !(copy_tags->flags & KVM_ARM_TAGS_FROM_GUEST); int ret = 0; if (!kvm_has_mte(kvm)) return -EINVAL; if (copy_tags->reserved[0] || copy_tags->reserved[1]) return -EINVAL; if (copy_tags->flags & ~KVM_ARM_TAGS_FROM_GUEST) return -EINVAL; if (length & ~PAGE_MASK || guest_ipa & ~PAGE_MASK) return -EINVAL; /* Lengths above INT_MAX cannot be represented in the return value */ if (length > INT_MAX) return -EINVAL; gfn = gpa_to_gfn(guest_ipa); mutex_lock(&kvm->slots_lock); if (write && atomic_read(&kvm->nr_memslots_dirty_logging)) { ret = -EBUSY; goto out; } while (length > 0) { struct page *page = __gfn_to_page(kvm, gfn, write); void *maddr; unsigned long num_tags; struct folio *folio; if (!page) { ret = -EFAULT; goto out; } if (!pfn_to_online_page(page_to_pfn(page))) { /* Reject ZONE_DEVICE memory */ kvm_release_page_unused(page); ret = -EFAULT; goto out; } folio = page_folio(page); maddr = page_address(page); if (!write) { if ((folio_test_hugetlb(folio) && folio_test_hugetlb_mte_tagged(folio)) || page_mte_tagged(page)) num_tags = mte_copy_tags_to_user(tags, maddr, MTE_GRANULES_PER_PAGE); else /* No tags in memory, so write zeros */ num_tags = MTE_GRANULES_PER_PAGE - clear_user(tags, MTE_GRANULES_PER_PAGE); kvm_release_page_clean(page); } else { /* * Only locking to serialise with a concurrent * __set_ptes() in the VMM but still overriding the * tags, hence ignoring the return value. */ if (folio_test_hugetlb(folio)) folio_try_hugetlb_mte_tagging(folio); else try_page_mte_tagging(page); num_tags = mte_copy_tags_from_user(maddr, tags, MTE_GRANULES_PER_PAGE); /* uaccess failed, don't leave stale tags */ if (num_tags != MTE_GRANULES_PER_PAGE) mte_clear_page_tags(maddr); if (folio_test_hugetlb(folio)) folio_set_hugetlb_mte_tagged(folio); else set_page_mte_tagged(page); kvm_release_page_dirty(page); } if (num_tags != MTE_GRANULES_PER_PAGE) { ret = -EFAULT; goto out; } gfn++; tags += num_tags; length -= PAGE_SIZE; } out: mutex_unlock(&kvm->slots_lock); /* If some data has been copied report the number of bytes copied */ if (length != copy_tags->length) return copy_tags->length - length; return ret; }
14 3 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 /* SPDX-License-Identifier: GPL-2.0 */ #ifndef _LINUX_SCHED_TASK_H #define _LINUX_SCHED_TASK_H /* * Interface between the scheduler and various task lifetime (fork()/exit()) * functionality: */ #include <linux/rcupdate.h> #include <linux/refcount.h> #include <linux/sched.h> #include <linux/uaccess.h> struct task_struct; struct rusage; union thread_union; struct css_set; /* All the bits taken by the old clone syscall. */ #define CLONE_LEGACY_FLAGS 0xffffffffULL struct kernel_clone_args { u64 flags; int __user *pidfd; int __user *child_tid; int __user *parent_tid; const char *name; int exit_signal; u32 kthread:1; u32 io_thread:1; u32 user_worker:1; u32 no_files:1; unsigned long stack; unsigned long stack_size; unsigned long tls; pid_t *set_tid; /* Number of elements in *set_tid */ size_t set_tid_size; int cgroup; int idle; int (*fn)(void *); void *fn_arg; struct cgroup *cgrp; struct css_set *cset; unsigned int kill_seq; }; /* * This serializes "schedule()" and also protects * the run-queue from deletions/modifications (but * _adding_ to the beginning of the run-queue has * a separate lock). */ extern rwlock_t tasklist_lock; extern spinlock_t mmlist_lock; extern union thread_union init_thread_union; extern struct task_struct init_task; extern int lockdep_tasklist_lock_is_held(void); extern asmlinkage void schedule_tail(struct task_struct *prev); extern void init_idle(struct task_struct *idle, int cpu); extern int sched_fork(unsigned long clone_flags, struct task_struct *p); extern int sched_cgroup_fork(struct task_struct *p, struct kernel_clone_args *kargs); extern void sched_cancel_fork(struct task_struct *p); extern void sched_post_fork(struct task_struct *p); extern void sched_dead(struct task_struct *p); void __noreturn do_task_dead(void); void __noreturn make_task_dead(int signr); extern void mm_cache_init(void); extern void proc_caches_init(void); extern void fork_init(void); extern void release_task(struct task_struct * p); extern int copy_thread(struct task_struct *, const struct kernel_clone_args *); extern void flush_thread(void); #ifdef CONFIG_HAVE_EXIT_THREAD extern void exit_thread(struct task_struct *tsk); #else static inline void exit_thread(struct task_struct *tsk) { } #endif extern __noreturn void do_group_exit(int); extern void exit_files(struct task_struct *); extern void exit_itimers(struct task_struct *); extern pid_t kernel_clone(struct kernel_clone_args *kargs); struct task_struct *copy_process(struct pid *pid, int trace, int node, struct kernel_clone_args *args); struct task_struct *create_io_thread(int (*fn)(void *), void *arg, int node); struct task_struct *fork_idle(int); extern pid_t kernel_thread(int (*fn)(void *), void *arg, const char *name, unsigned long flags); extern pid_t user_mode_thread(int (*fn)(void *), void *arg, unsigned long flags); extern long kernel_wait4(pid_t, int __user *, int, struct rusage *); int kernel_wait(pid_t pid, int *stat); extern void free_task(struct task_struct *tsk); /* sched_exec is called by processes performing an exec */ extern void sched_exec(void); static inline struct task_struct *get_task_struct(struct task_struct *t) { refcount_inc(&t->usage); return t; } static inline struct task_struct *tryget_task_struct(struct task_struct *t) { return refcount_inc_not_zero(&t->usage) ? t : NULL; } extern void __put_task_struct(struct task_struct *t); extern void __put_task_struct_rcu_cb(struct rcu_head *rhp); static inline void put_task_struct(struct task_struct *t) { if (!refcount_dec_and_test(&t->usage)) return; /* * Under PREEMPT_RT, we can't call __put_task_struct * in atomic context because it will indirectly * acquire sleeping locks. The same is true if the * current process has a mutex enqueued (blocked on * a PI chain). * * In !RT, it is always safe to call __put_task_struct(). * Though, in order to simplify the code, resort to the * deferred call too. * * call_rcu() will schedule __put_task_struct_rcu_cb() * to be called in process context. * * __put_task_struct() is called when * refcount_dec_and_test(&t->usage) succeeds. * * This means that it can't "conflict" with * put_task_struct_rcu_user() which abuses ->rcu the same * way; rcu_users has a reference so task->usage can't be * zero after rcu_users 1 -> 0 transition. * * delayed_free_task() also uses ->rcu, but it is only called * when it fails to fork a process. Therefore, there is no * way it can conflict with __put_task_struct(). */ call_rcu(&t->rcu, __put_task_struct_rcu_cb); } DEFINE_FREE(put_task, struct task_struct *, if (_T) put_task_struct(_T)) static inline void put_task_struct_many(struct task_struct *t, int nr) { if (refcount_sub_and_test(nr, &t->usage)) __put_task_struct(t); } void put_task_struct_rcu_user(struct task_struct *task); /* Free all architecture-specific resources held by a thread. */ void release_thread(struct task_struct *dead_task); #ifdef CONFIG_ARCH_WANTS_DYNAMIC_TASK_STRUCT extern int arch_task_struct_size __read_mostly; #else # define arch_task_struct_size (sizeof(struct task_struct)) #endif #ifndef CONFIG_HAVE_ARCH_THREAD_STRUCT_WHITELIST /* * If an architecture has not declared a thread_struct whitelist we * must assume something there may need to be copied to userspace. */ static inline void arch_thread_struct_whitelist(unsigned long *offset, unsigned long *size) { *offset = 0; /* Handle dynamically sized thread_struct. */ *size = arch_task_struct_size - offsetof(struct task_struct, thread); } #endif #ifdef CONFIG_VMAP_STACK static inline struct vm_struct *task_stack_vm_area(const struct task_struct *t) { return t->stack_vm_area; } #else static inline struct vm_struct *task_stack_vm_area(const struct task_struct *t) { return NULL; } #endif /* * Protects ->fs, ->files, ->mm, ->group_info, ->comm, keyring * subscriptions and synchronises with wait4(). Also used in procfs. Also * pins the final release of task.io_context. Also protects ->cpuset and * ->cgroup.subsys[]. And ->vfork_done. And ->sysvshm.shm_clist. * * Nests both inside and outside of read_lock(&tasklist_lock). * It must not be nested with write_lock_irq(&tasklist_lock), * neither inside nor outside. */ static inline void task_lock(struct task_struct *p) { spin_lock(&p->alloc_lock); } static inline void task_unlock(struct task_struct *p) { spin_unlock(&p->alloc_lock); } DEFINE_GUARD(task_lock, struct task_struct *, task_lock(_T), task_unlock(_T)) #endif /* _LINUX_SCHED_TASK_H */
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 /* SPDX-License-Identifier: GPL-2.0-only */ #ifndef _NET_ETHTOOL_NETLINK_H #define _NET_ETHTOOL_NETLINK_H #include <linux/ethtool_netlink.h> #include <linux/netdevice.h> #include <net/genetlink.h> #include <net/sock.h> struct ethnl_req_info; u32 ethnl_bcast_seq_next(void); int ethnl_parse_header_dev_get(struct ethnl_req_info *req_info, const struct nlattr *nest, struct net *net, struct netlink_ext_ack *extack, bool require_dev); int ethnl_fill_reply_header(struct sk_buff *skb, struct net_device *dev, u16 attrtype); struct sk_buff *ethnl_reply_init(size_t payload, struct net_device *dev, u8 cmd, u16 hdr_attrtype, struct genl_info *info, void **ehdrp); void *ethnl_dump_put(struct sk_buff *skb, struct netlink_callback *cb, u8 cmd); void *ethnl_bcastmsg_put(struct sk_buff *skb, u8 cmd); void *ethnl_unicast_put(struct sk_buff *skb, u32 portid, u32 seq, u8 cmd); int ethnl_multicast(struct sk_buff *skb, struct net_device *dev); void ethnl_notify(struct net_device *dev, unsigned int cmd, const struct ethnl_req_info *req_info); /** * ethnl_strz_size() - calculate attribute length for fixed size string * @s: ETH_GSTRING_LEN sized string (may not be null terminated) * * Return: total length of an attribute with null terminated string from @s */ static inline int ethnl_strz_size(const char *s) { return nla_total_size(strnlen(s, ETH_GSTRING_LEN) + 1); } /** * ethnl_put_strz() - put string attribute with fixed size string * @skb: skb with the message * @attrtype: attribute type * @s: ETH_GSTRING_LEN sized string (may not be null terminated) * * Puts an attribute with null terminated string from @s into the message. * * Return: 0 on success, negative error code on failure */ static inline int ethnl_put_strz(struct sk_buff *skb, u16 attrtype, const char *s) { unsigned int len = strnlen(s, ETH_GSTRING_LEN); struct nlattr *attr; attr = nla_reserve(skb, attrtype, len + 1); if (!attr) return -EMSGSIZE; memcpy(nla_data(attr), s, len); ((char *)nla_data(attr))[len] = '\0'; return 0; } /** * ethnl_update_u32() - update u32 value from NLA_U32 attribute * @dst: value to update * @attr: netlink attribute with new value or null * @mod: pointer to bool for modification tracking * * Copy the u32 value from NLA_U32 netlink attribute @attr into variable * pointed to by @dst; do nothing if @attr is null. Bool pointed to by @mod * is set to true if this function changed the value of *dst, otherwise it * is left as is. */ static inline void ethnl_update_u32(u32 *dst, const struct nlattr *attr, bool *mod) { u32 val; if (!attr) return; val = nla_get_u32(attr); if (*dst == val) return; *dst = val; *mod = true; } /** * ethnl_update_u8() - update u8 value from NLA_U8 attribute * @dst: value to update * @attr: netlink attribute with new value or null * @mod: pointer to bool for modification tracking * * Copy the u8 value from NLA_U8 netlink attribute @attr into variable * pointed to by @dst; do nothing if @attr is null. Bool pointed to by @mod * is set to true if this function changed the value of *dst, otherwise it * is left as is. */ static inline void ethnl_update_u8(u8 *dst, const struct nlattr *attr, bool *mod) { u8 val; if (!attr) return; val = nla_get_u8(attr); if (*dst == val) return; *dst = val; *mod = true; } /** * ethnl_update_bool32() - update u32 used as bool from NLA_U8 attribute * @dst: value to update * @attr: netlink attribute with new value or null * @mod: pointer to bool for modification tracking * * Use the u8 value from NLA_U8 netlink attribute @attr to set u32 variable * pointed to by @dst to 0 (if zero) or 1 (if not); do nothing if @attr is * null. Bool pointed to by @mod is set to true if this function changed the * logical value of *dst, otherwise it is left as is. */ static inline void ethnl_update_bool32(u32 *dst, const struct nlattr *attr, bool *mod) { u8 val; if (!attr) return; val = !!nla_get_u8(attr); if (!!*dst == val) return; *dst = val; *mod = true; } /** * ethnl_update_bool() - updateb bool used as bool from NLA_U8 attribute * @dst: value to update * @attr: netlink attribute with new value or null * @mod: pointer to bool for modification tracking * * Use the bool value from NLA_U8 netlink attribute @attr to set bool variable * pointed to by @dst to 0 (if zero) or 1 (if not); do nothing if @attr is * null. Bool pointed to by @mod is set to true if this function changed the * logical value of *dst, otherwise it is left as is. */ static inline void ethnl_update_bool(bool *dst, const struct nlattr *attr, bool *mod) { u8 val; if (!attr) return; val = !!nla_get_u8(attr); if (!!*dst == val) return; *dst = val; *mod = true; } /** * ethnl_update_binary() - update binary data from NLA_BINARY attribute * @dst: value to update * @len: destination buffer length * @attr: netlink attribute with new value or null * @mod: pointer to bool for modification tracking * * Use the u8 value from NLA_U8 netlink attribute @attr to rewrite data block * of length @len at @dst by attribute payload; do nothing if @attr is null. * Bool pointed to by @mod is set to true if this function changed the logical * value of *dst, otherwise it is left as is. */ static inline void ethnl_update_binary(void *dst, unsigned int len, const struct nlattr *attr, bool *mod) { if (!attr) return; if (nla_len(attr) < len) len = nla_len(attr); if (!memcmp(dst, nla_data(attr), len)) return; memcpy(dst, nla_data(attr), len); *mod = true; } /** * ethnl_update_bitfield32() - update u32 value from NLA_BITFIELD32 attribute * @dst: value to update * @attr: netlink attribute with new value or null * @mod: pointer to bool for modification tracking * * Update bits in u32 value which are set in attribute's mask to values from * attribute's value. Do nothing if @attr is null or the value wouldn't change; * otherwise, set bool pointed to by @mod to true. */ static inline void ethnl_update_bitfield32(u32 *dst, const struct nlattr *attr, bool *mod) { struct nla_bitfield32 change; u32 newval; if (!attr) return; change = nla_get_bitfield32(attr); newval = (*dst & ~change.selector) | (change.value & change.selector); if (*dst == newval) return; *dst = newval; *mod = true; } /** * ethnl_reply_header_size() - total size of reply header * * This is an upper estimate so that we do not need to hold RTNL lock longer * than necessary (to prevent rename between size estimate and composing the * message). Accounts only for device ifindex and name as those are the only * attributes ethnl_fill_reply_header() puts into the reply header. */ static inline unsigned int ethnl_reply_header_size(void) { return nla_total_size(nla_total_size(sizeof(u32)) + nla_total_size(IFNAMSIZ)); } /* GET request handling */ /* Unified processing of GET requests uses two data structures: request info * and reply data. Request info holds information parsed from client request * and its stays constant through all request processing. Reply data holds data * retrieved from ethtool_ops callbacks or other internal sources which is used * to compose the reply. When processing a dump request, request info is filled * only once (when the request message is parsed) but reply data is filled for * each reply message. * * Both structures consist of part common for all request types (struct * ethnl_req_info and struct ethnl_reply_data defined below) and optional * parts specific for each request type. Common part always starts at offset 0. */ /** * struct ethnl_req_info - base type of request information for GET requests * @dev: network device the request is for (may be null) * @dev_tracker: refcount tracker for @dev reference * @flags: request flags common for all request types * @phy_index: phy_device index connected to @dev this request is for. Can be * 0 if the request doesn't target a phy, or if the @dev's attached * phy is targeted. * * This is a common base for request specific structures holding data from * parsed userspace request. These always embed struct ethnl_req_info at * zero offset. */ struct ethnl_req_info { struct net_device *dev; netdevice_tracker dev_tracker; u32 flags; u32 phy_index; }; static inline void ethnl_parse_header_dev_put(struct ethnl_req_info *req_info) { netdev_put(req_info->dev, &req_info->dev_tracker); } /** * ethnl_req_get_phydev() - Gets the phy_device targeted by this request, * if any. Must be called under rntl_lock(). * @req_info: The ethnl request to get the phy from. * @tb: The netlink attributes array, for error reporting. * @header: The netlink header index, used for error reporting. * @extack: The netlink extended ACK, for error reporting. * * The caller must hold RTNL, until it's done interacting with the returned * phy_device. * * Return: A phy_device pointer corresponding either to the passed phy_index * if one is provided. If not, the phy_device attached to the * net_device targeted by this request is returned. If there's no * targeted net_device, or no phy_device is attached, NULL is * returned. If the provided phy_index is invalid, an error pointer * is returned. */ struct phy_device *ethnl_req_get_phydev(const struct ethnl_req_info *req_info, struct nlattr **tb, unsigned int header, struct netlink_ext_ack *extack); /** * struct ethnl_reply_data - base type of reply data for GET requests * @dev: device for current reply message; in single shot requests it is * equal to &ethnl_req_info.dev; in dumps it's different for each * reply message * * This is a common base for request specific structures holding data for * kernel reply message. These always embed struct ethnl_reply_data at zero * offset. */ struct ethnl_reply_data { struct net_device *dev; }; int ethnl_ops_begin(struct net_device *dev); void ethnl_ops_complete(struct net_device *dev); enum ethnl_sock_type { ETHTOOL_SOCK_TYPE_MODULE_FW_FLASH, }; struct ethnl_sock_priv { struct net_device *dev; u32 portid; enum ethnl_sock_type type; }; int ethnl_sock_priv_set(struct sk_buff *skb, struct net_device *dev, u32 portid, enum ethnl_sock_type type); /** * struct ethnl_request_ops - unified handling of GET and SET requests * @request_cmd: command id for request (GET) * @reply_cmd: command id for reply (GET_REPLY) * @hdr_attr: attribute type for request header * @req_info_size: size of request info * @reply_data_size: size of reply data * @allow_nodev_do: allow non-dump request with no device identification * @set_ntf_cmd: notification to generate on changes (SET) * @parse_request: * Parse request except common header (struct ethnl_req_info). Common * header is already filled on entry, the rest up to @repdata_offset * is zero initialized. This callback should only modify type specific * request info by parsed attributes from request message. * Called for both GET and SET. Information parsed for SET will * be conveyed to the req_info used during NTF generation. * @prepare_data: * Retrieve and prepare data needed to compose a reply message. Calls to * ethtool_ops handlers are limited to this callback. Common reply data * (struct ethnl_reply_data) is filled on entry, type specific part after * it is zero initialized. This callback should only modify the type * specific part of reply data. Device identification from struct * ethnl_reply_data is to be used as for dump requests, it iterates * through network devices while dev member of struct ethnl_req_info * points to the device from client request. * @reply_size: * Estimate reply message size. Returned value must be sufficient for * message payload without common reply header. The callback may returned * estimate higher than actual message size if exact calculation would * not be worth the saved memory space. * @fill_reply: * Fill reply message payload (except for common header) from reply data. * The callback must not generate more payload than previously called * ->reply_size() estimated. * @cleanup_data: * Optional cleanup called when reply data is no longer needed. Can be * used e.g. to free any additional data structures outside the main * structure which were allocated by ->prepare_data(). When processing * dump requests, ->cleanup() is called for each message. * @set_validate: * Check if set operation is supported for a given device, and perform * extra input checks. Expected return values: * - 0 if the operation is a noop for the device (rare) * - 1 if operation should proceed to calling @set * - negative errno on errors * Called without any locks, just a reference on the netdev. * @set: * Execute the set operation. The implementation should return * - 0 if no configuration has changed * - 1 if configuration changed and notification should be generated * - negative errno on errors * * Description of variable parts of GET request handling when using the * unified infrastructure. When used, a pointer to an instance of this * structure is to be added to &ethnl_default_requests array and generic * handlers ethnl_default_doit(), ethnl_default_dumpit(), * ethnl_default_start() and ethnl_default_done() used in @ethtool_genl_ops; * ethnl_default_notify() can be used in @ethnl_notify_handlers to send * notifications of the corresponding type. */ struct ethnl_request_ops { u8 request_cmd; u8 reply_cmd; u16 hdr_attr; unsigned int req_info_size; unsigned int reply_data_size; bool allow_nodev_do; u8 set_ntf_cmd; int (*parse_request)(struct ethnl_req_info *req_info, struct nlattr **tb, struct netlink_ext_ack *extack); int (*prepare_data)(const struct ethnl_req_info *req_info, struct ethnl_reply_data *reply_data, const struct genl_info *info); int (*reply_size)(const struct ethnl_req_info *req_info, const struct ethnl_reply_data *reply_data); int (*fill_reply)(struct sk_buff *skb, const struct ethnl_req_info *req_info, const struct ethnl_reply_data *reply_data); void (*cleanup_data)(struct ethnl_reply_data *reply_data); int (*set_validate)(struct ethnl_req_info *req_info, struct genl_info *info); int (*set)(struct ethnl_req_info *req_info, struct genl_info *info); }; /* request handlers */ extern const struct ethnl_request_ops ethnl_strset_request_ops; extern const struct ethnl_request_ops ethnl_linkinfo_request_ops; extern const struct ethnl_request_ops ethnl_linkmodes_request_ops; extern const struct ethnl_request_ops ethnl_linkstate_request_ops; extern const struct ethnl_request_ops ethnl_debug_request_ops; extern const struct ethnl_request_ops ethnl_wol_request_ops; extern const struct ethnl_request_ops ethnl_features_request_ops; extern const struct ethnl_request_ops ethnl_privflags_request_ops; extern const struct ethnl_request_ops ethnl_rings_request_ops; extern const struct ethnl_request_ops ethnl_channels_request_ops; extern const struct ethnl_request_ops ethnl_coalesce_request_ops; extern const struct ethnl_request_ops ethnl_pause_request_ops; extern const struct ethnl_request_ops ethnl_eee_request_ops; extern const struct ethnl_request_ops ethnl_tsinfo_request_ops; extern const struct ethnl_request_ops ethnl_fec_request_ops; extern const struct ethnl_request_ops ethnl_module_eeprom_request_ops; extern const struct ethnl_request_ops ethnl_stats_request_ops; extern const struct ethnl_request_ops ethnl_phc_vclocks_request_ops; extern const struct ethnl_request_ops ethnl_module_request_ops; extern const struct ethnl_request_ops ethnl_pse_request_ops; extern const struct ethnl_request_ops ethnl_rss_request_ops; extern const struct ethnl_request_ops ethnl_plca_cfg_request_ops; extern const struct ethnl_request_ops ethnl_plca_status_request_ops; extern const struct ethnl_request_ops ethnl_mm_request_ops; extern const struct ethnl_request_ops ethnl_phy_request_ops; extern const struct ethnl_request_ops ethnl_tsconfig_request_ops; extern const struct nla_policy ethnl_header_policy[ETHTOOL_A_HEADER_FLAGS + 1]; extern const struct nla_policy ethnl_header_policy_stats[ETHTOOL_A_HEADER_FLAGS + 1]; extern const struct nla_policy ethnl_header_policy_phy[ETHTOOL_A_HEADER_PHY_INDEX + 1]; extern const struct nla_policy ethnl_header_policy_phy_stats[ETHTOOL_A_HEADER_PHY_INDEX + 1]; extern const struct nla_policy ethnl_strset_get_policy[ETHTOOL_A_STRSET_COUNTS_ONLY + 1]; extern const struct nla_policy ethnl_linkinfo_get_policy[ETHTOOL_A_LINKINFO_HEADER + 1]; extern const struct nla_policy ethnl_linkinfo_set_policy[ETHTOOL_A_LINKINFO_TP_MDIX_CTRL + 1]; extern const struct nla_policy ethnl_linkmodes_get_policy[ETHTOOL_A_LINKMODES_HEADER + 1]; extern const struct nla_policy ethnl_linkmodes_set_policy[ETHTOOL_A_LINKMODES_LANES + 1]; extern const struct nla_policy ethnl_linkstate_get_policy[ETHTOOL_A_LINKSTATE_HEADER + 1]; extern const struct nla_policy ethnl_debug_get_policy[ETHTOOL_A_DEBUG_HEADER + 1]; extern const struct nla_policy ethnl_debug_set_policy[ETHTOOL_A_DEBUG_MSGMASK + 1]; extern const struct nla_policy ethnl_wol_get_policy[ETHTOOL_A_WOL_HEADER + 1]; extern const struct nla_policy ethnl_wol_set_policy[ETHTOOL_A_WOL_SOPASS + 1]; extern const struct nla_policy ethnl_features_get_policy[ETHTOOL_A_FEATURES_HEADER + 1]; extern const struct nla_policy ethnl_features_set_policy[ETHTOOL_A_FEATURES_WANTED + 1]; extern const struct nla_policy ethnl_privflags_get_policy[ETHTOOL_A_PRIVFLAGS_HEADER + 1]; extern const struct nla_policy ethnl_privflags_set_policy[ETHTOOL_A_PRIVFLAGS_FLAGS + 1]; extern const struct nla_policy ethnl_rings_get_policy[ETHTOOL_A_RINGS_HEADER + 1]; extern const struct nla_policy ethnl_rings_set_policy[ETHTOOL_A_RINGS_HDS_THRESH_MAX + 1]; extern const struct nla_policy ethnl_channels_get_policy[ETHTOOL_A_CHANNELS_HEADER + 1]; extern const struct nla_policy ethnl_channels_set_policy[ETHTOOL_A_CHANNELS_COMBINED_COUNT + 1]; extern const struct nla_policy ethnl_coalesce_get_policy[ETHTOOL_A_COALESCE_HEADER + 1]; extern const struct nla_policy ethnl_coalesce_set_policy[ETHTOOL_A_COALESCE_MAX + 1]; extern const struct nla_policy ethnl_pause_get_policy[ETHTOOL_A_PAUSE_STATS_SRC + 1]; extern const struct nla_policy ethnl_pause_set_policy[ETHTOOL_A_PAUSE_STATS_SRC + 1]; extern const struct nla_policy ethnl_eee_get_policy[ETHTOOL_A_EEE_HEADER + 1]; extern const struct nla_policy ethnl_eee_set_policy[ETHTOOL_A_EEE_TX_LPI_TIMER + 1]; extern const struct nla_policy ethnl_tsinfo_get_policy[ETHTOOL_A_TSINFO_MAX + 1]; extern const struct nla_policy ethnl_cable_test_act_policy[ETHTOOL_A_CABLE_TEST_HEADER + 1]; extern const struct nla_policy ethnl_cable_test_tdr_act_policy[ETHTOOL_A_CABLE_TEST_TDR_CFG + 1]; extern const struct nla_policy ethnl_tunnel_info_get_policy[ETHTOOL_A_TUNNEL_INFO_HEADER + 1]; extern const struct nla_policy ethnl_fec_get_policy[ETHTOOL_A_FEC_HEADER + 1]; extern const struct nla_policy ethnl_fec_set_policy[ETHTOOL_A_FEC_AUTO + 1]; extern const struct nla_policy ethnl_module_eeprom_get_policy[ETHTOOL_A_MODULE_EEPROM_I2C_ADDRESS + 1]; extern const struct nla_policy ethnl_stats_get_policy[ETHTOOL_A_STATS_SRC + 1]; extern const struct nla_policy ethnl_phc_vclocks_get_policy[ETHTOOL_A_PHC_VCLOCKS_HEADER + 1]; extern const struct nla_policy ethnl_module_get_policy[ETHTOOL_A_MODULE_HEADER + 1]; extern const struct nla_policy ethnl_module_set_policy[ETHTOOL_A_MODULE_POWER_MODE_POLICY + 1]; extern const struct nla_policy ethnl_pse_get_policy[ETHTOOL_A_PSE_HEADER + 1]; extern const struct nla_policy ethnl_pse_set_policy[ETHTOOL_A_PSE_MAX + 1]; extern const struct nla_policy ethnl_rss_get_policy[ETHTOOL_A_RSS_START_CONTEXT + 1]; extern const struct nla_policy ethnl_rss_set_policy[ETHTOOL_A_RSS_FLOW_HASH + 1]; extern const struct nla_policy ethnl_rss_create_policy[ETHTOOL_A_RSS_INPUT_XFRM + 1]; extern const struct nla_policy ethnl_rss_delete_policy[ETHTOOL_A_RSS_CONTEXT + 1]; extern const struct nla_policy ethnl_plca_get_cfg_policy[ETHTOOL_A_PLCA_HEADER + 1]; extern const struct nla_policy ethnl_plca_set_cfg_policy[ETHTOOL_A_PLCA_MAX + 1]; extern const struct nla_policy ethnl_plca_get_status_policy[ETHTOOL_A_PLCA_HEADER + 1]; extern const struct nla_policy ethnl_mm_get_policy[ETHTOOL_A_MM_HEADER + 1]; extern const struct nla_policy ethnl_mm_set_policy[ETHTOOL_A_MM_MAX + 1]; extern const struct nla_policy ethnl_module_fw_flash_act_policy[ETHTOOL_A_MODULE_FW_FLASH_PASSWORD + 1]; extern const struct nla_policy ethnl_phy_get_policy[ETHTOOL_A_PHY_HEADER + 1]; extern const struct nla_policy ethnl_tsconfig_get_policy[ETHTOOL_A_TSCONFIG_HEADER + 1]; extern const struct nla_policy ethnl_tsconfig_set_policy[ETHTOOL_A_TSCONFIG_MAX + 1]; int ethnl_set_features(struct sk_buff *skb, struct genl_info *info); int ethnl_act_cable_test(struct sk_buff *skb, struct genl_info *info); int ethnl_act_cable_test_tdr(struct sk_buff *skb, struct genl_info *info); int ethnl_tunnel_info_doit(struct sk_buff *skb, struct genl_info *info); int ethnl_tunnel_info_start(struct netlink_callback *cb); int ethnl_tunnel_info_dumpit(struct sk_buff *skb, struct netlink_callback *cb); int ethnl_act_module_fw_flash(struct sk_buff *skb, struct genl_info *info); int ethnl_rss_dump_start(struct netlink_callback *cb); int ethnl_rss_dumpit(struct sk_buff *skb, struct netlink_callback *cb); int ethnl_tsinfo_start(struct netlink_callback *cb); int ethnl_tsinfo_dumpit(struct sk_buff *skb, struct netlink_callback *cb); int ethnl_tsinfo_done(struct netlink_callback *cb); int ethnl_rss_create_doit(struct sk_buff *skb, struct genl_info *info); int ethnl_rss_delete_doit(struct sk_buff *skb, struct genl_info *info); extern const char stats_std_names[__ETHTOOL_STATS_CNT][ETH_GSTRING_LEN]; extern const char stats_eth_phy_names[__ETHTOOL_A_STATS_ETH_PHY_CNT][ETH_GSTRING_LEN]; extern const char stats_eth_mac_names[__ETHTOOL_A_STATS_ETH_MAC_CNT][ETH_GSTRING_LEN]; extern const char stats_eth_ctrl_names[__ETHTOOL_A_STATS_ETH_CTRL_CNT][ETH_GSTRING_LEN]; extern const char stats_rmon_names[__ETHTOOL_A_STATS_RMON_CNT][ETH_GSTRING_LEN]; extern const char stats_phy_names[__ETHTOOL_A_STATS_PHY_CNT][ETH_GSTRING_LEN]; #endif /* _NET_ETHTOOL_NETLINK_H */
1 247 247 184 184 189 189 157 144 9 184 184 118 184 185 185 185 185 185 185 185 3 3 3 3 185 184 185 185 185 185 185 184 185 184 185 185 185 108 184 185 8 8 8 8 8 268 269 265 268 267 180 123 122 264 264 265 266 266 3 3 3 1 1 1 1 1 17 17 17 17 17 1 17 17 17 17 17 17 17 17 17 17 1 17 1 1 1 1 1 1 23 22 9 9 23 23 23 23 23 23 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 1517 1518 1519 1520 1521 1522 1523 1524 1525 1526 1527 1528 1529 1530 1531 1532 1533 1534 1535 1536 1537 1538 1539 1540 1541 1542 1543 1544 1545 1546 1547 1548 1549 1550 1551 1552 1553 1554 1555 1556 1557 1558 1559 1560 1561 1562 1563 1564 1565 1566 1567 1568 1569 1570 1571 1572 1573 1574 1575 1576 1577 1578 1579 1580 1581 1582 1583 1584 1585 1586 1587 1588 1589 1590 1591 1592 1593 1594 1595 1596 1597 1598 1599 1600 1601 1602 1603 1604 1605 1606 1607 1608 1609 1610 1611 1612 1613 1614 1615 1616 1617 1618 1619 1620 1621 1622 1623 1624 1625 1626 1627 1628 1629 1630 1631 1632 1633 1634 1635 1636 1637 1638 1639 1640 1641 1642 1643 1644 1645 1646 1647 1648 1649 1650 1651 1652 1653 1654 1655 1656 1657 1658 1659 1660 1661 1662 1663 1664 1665 1666 1667 1668 1669 1670 1671 1672 1673 1674 1675 1676 1677 1678 1679 1680 1681 1682 1683 1684 1685 1686 1687 1688 1689 1690 1691 1692 1693 1694 1695 1696 1697 1698 1699 1700 1701 1702 1703 1704 1705 1706 1707 1708 1709 1710 1711 1712 1713 1714 1715 1716 1717 1718 1719 1720 1721 1722 1723 1724 1725 1726 1727 1728 1729 1730 1731 1732 1733 1734 1735 1736 1737 1738 1739 1740 1741 1742 1743 1744 1745 1746 1747 1748 1749 1750 1751 1752 1753 1754 1755 1756 1757 1758 1759 1760 1761 1762 1763 1764 1765 1766 1767 1768 1769 1770 1771 1772 1773 1774 1775 1776 1777 1778 1779 1780 1781 1782 1783 1784 1785 1786 1787 1788 1789 1790 1791 1792 1793 1794 1795 1796 1797 1798 1799 1800 1801 1802 1803 1804 1805 1806 1807 1808 1809 1810 1811 1812 1813 1814 1815 1816 1817 1818 1819 1820 1821 1822 1823 1824 1825 1826 1827 1828 1829 1830 1831 1832 1833 1834 1835 1836 1837 1838 1839 1840 1841 1842 1843 1844 1845 1846 1847 1848 1849 1850 1851 1852 1853 1854 1855 1856 1857 1858 1859 1860 1861 1862 1863 1864 1865 1866 1867 1868 1869 1870 1871 1872 1873 1874 1875 1876 1877 1878 1879 1880 1881 1882 1883 1884 1885 1886 1887 1888 1889 1890 1891 1892 1893 1894 1895 1896 1897 1898 1899 1900 1901 1902 1903 1904 1905 1906 1907 1908 1909 1910 1911 1912 1913 1914 1915 1916 1917 1918 1919 1920 1921 1922 1923 1924 1925 1926 1927 1928 1929 1930 1931 1932 1933 1934 1935 1936 1937 1938 1939 1940 1941 1942 1943 1944 1945 1946 1947 1948 1949 1950 1951 1952 1953 1954 1955 1956 1957 1958 1959 1960 1961 1962 1963 1964 1965 1966 1967 1968 1969 1970 1971 1972 1973 1974 1975 1976 1977 1978 1979 1980 1981 1982 1983 1984 1985 1986 1987 1988 1989 1990 1991 1992 1993 1994 1995 1996 1997 1998 1999 2000 2001 2002 2003 2004 2005 2006 2007 2008 2009 2010 2011 2012 2013 2014 2015 2016 2017 2018 2019 2020 2021 2022 2023 2024 2025 2026 2027 2028 2029 2030 2031 2032 2033 2034 2035 2036 2037 2038 2039 2040 2041 2042 2043 2044 2045 2046 2047 2048 2049 2050 2051 2052 2053 2054 2055 2056 2057 2058 2059 2060 2061 2062 2063 2064 2065 2066 2067 2068 2069 2070 2071 2072 2073 2074 2075 2076 2077 2078 2079 2080 2081 2082 2083 2084 2085 2086 2087 2088 2089 2090 2091 2092 2093 2094 2095 2096 2097 2098 2099 2100 2101 2102 2103 2104 2105 2106 2107 2108 2109 2110 2111 2112 2113 2114 2115 2116 2117 2118 2119 2120 2121 2122 2123 2124 2125 2126 2127 2128 2129 2130 2131 2132 2133 2134 2135 2136 2137 2138 2139 2140 2141 2142 2143 2144 2145 2146 2147 2148 2149 2150 2151 2152 2153 2154 2155 2156 2157 2158 2159 2160 2161 2162 2163 2164 2165 2166 2167 2168 2169 2170 2171 2172 2173 2174 2175 2176 2177 2178 2179 2180 2181 2182 2183 2184 2185 2186 2187 2188 2189 2190 2191 2192 2193 2194 2195 2196 2197 2198 2199 2200 2201 2202 2203 2204 2205 2206 2207 2208 2209 2210 2211 2212 2213 2214 2215 2216 2217 2218 2219 2220 2221 2222 2223 2224 2225 2226 2227 2228 2229 2230 2231 2232 2233 2234 2235 2236 2237 2238 2239 2240 2241 2242 2243 2244 2245 2246 2247 2248 2249 2250 2251 2252 2253 2254 2255 2256 2257 2258 2259 2260 2261 2262 2263 2264 2265 2266 2267 2268 2269 2270 2271 2272 2273 2274 2275 2276 2277 2278 2279 2280 2281 2282 2283 2284 2285 2286 2287 2288 2289 2290 2291 2292 2293 2294 2295 2296 2297 2298 2299 2300 2301 2302 2303 2304 2305 2306 2307 2308 2309 2310 2311 2312 2313 2314 2315 2316 2317 2318 2319 2320 2321 2322 2323 2324 2325 2326 2327 2328 2329 2330 2331 2332 2333 2334 2335 2336 2337 2338 2339 2340 2341 2342 2343 2344 2345 2346 2347 2348 2349 2350 2351 2352 2353 2354 2355 2356 2357 2358 2359 2360 2361 2362 2363 2364 2365 2366 2367 2368 2369 2370 2371 2372 2373 2374 2375 2376 2377 2378 2379 2380 2381 2382 2383 2384 2385 2386 2387 2388 2389 2390 2391 2392 2393 2394 2395 2396 2397 2398 2399 2400 2401 2402 2403 2404 2405 2406 2407 2408 2409 2410 2411 2412 2413 2414 2415 2416 2417 2418 2419 2420 2421 2422 2423 2424 2425 2426 2427 2428 2429 2430 2431 2432 2433 2434 2435 2436 2437 2438 2439 2440 2441 2442 2443 2444 2445 2446 2447 2448 2449 2450 2451 2452 2453 2454 2455 2456 2457 2458 2459 2460 2461 2462 2463 2464 2465 2466 2467 2468 2469 2470 2471 2472 2473 2474 2475 2476 2477 2478 2479 2480 2481 2482 2483 2484 2485 2486 2487 2488 2489 2490 2491 2492 2493 2494 2495 2496 2497 2498 2499 2500 2501 2502 2503 2504 2505 2506 2507 2508 2509 2510 2511 2512 2513 2514 2515 2516 2517 2518 2519 2520 2521 2522 2523 2524 2525 2526 2527 2528 2529 2530 2531 2532 2533 2534 2535 2536 2537 2538 2539 2540 2541 2542 2543 2544 2545 2546 2547 2548 2549 2550 2551 2552 2553 2554 2555 2556 2557 2558 2559 2560 2561 2562 2563 2564 2565 2566 2567 2568 2569 2570 2571 2572 2573 2574 2575 2576 2577 2578 2579 2580 2581 2582 2583 2584 2585 2586 2587 2588 2589 2590 2591 2592 2593 2594 2595 2596 2597 2598 2599 2600 2601 2602 2603 2604 2605 2606 2607 2608 2609 2610 2611 2612 2613 2614 2615 2616 2617 2618 2619 2620 2621 2622 2623 2624 2625 2626 2627 2628 2629 2630 2631 2632 2633 2634 2635 2636 2637 2638 2639 2640 2641 2642 2643 2644 2645 2646 2647 2648 2649 2650 2651 2652 2653 2654 2655 2656 2657 2658 2659 2660 2661 2662 2663 2664 2665 2666 2667 2668 2669 2670 2671 2672 2673 2674 2675 2676 2677 2678 2679 2680 2681 2682 2683 2684 2685 2686 2687 2688 2689 2690 2691 2692 2693 2694 2695 2696 2697 2698 2699 2700 2701 2702 2703 2704 2705 2706 2707 2708 2709 2710 2711 2712 2713 2714 2715 2716 2717 2718 2719 2720 2721 2722 2723 2724 2725 2726 2727 2728 2729 2730 2731 2732 2733 2734 2735 2736 2737 2738 2739 2740 2741 2742 2743 2744 2745 2746 2747 2748 2749 2750 2751 2752 2753 2754 2755 2756 2757 2758 2759 2760 2761 2762 2763 2764 2765 2766 2767 2768 2769 2770 2771 2772 2773 2774 2775 2776 2777 2778 2779 2780 2781 2782 2783 2784 2785 2786 2787 2788 2789 2790 2791 2792 2793 2794 2795 2796 2797 2798 2799 2800 2801 2802 2803 2804 2805 2806 2807 2808 2809 2810 2811 2812 2813 2814 2815 2816 2817 2818 2819 2820 2821 2822 2823 2824 2825 2826 2827 2828 2829 2830 2831 2832 2833 2834 2835 2836 2837 2838 2839 2840 2841 2842 2843 2844 2845 2846 2847 2848 2849 2850 2851 2852 2853 2854 2855 2856 2857 2858 2859 2860 2861 2862 2863 2864 2865 2866 2867 2868 2869 2870 2871 2872 2873 2874 2875 2876 2877 2878 2879 2880 2881 2882 2883 2884 2885 2886 2887 2888 2889 2890 2891 2892 2893 2894 2895 2896 2897 2898 2899 2900 2901 2902 2903 2904 2905 2906 2907 2908 2909 2910 2911 2912 2913 2914 2915 2916 2917 2918 2919 2920 2921 2922 2923 2924 2925 2926 2927 2928 2929 2930 2931 2932 2933 2934 2935 2936 2937 2938 2939 2940 2941 2942 2943 2944 2945 2946 2947 2948 2949 2950 2951 2952 2953 2954 2955 2956 2957 2958 2959 2960 2961 2962 2963 2964 2965 2966 2967 2968 2969 2970 2971 2972 2973 2974 2975 2976 2977 2978 2979 2980 2981 2982 2983 2984 2985 2986 2987 2988 2989 2990 2991 2992 2993 2994 2995 2996 2997 2998 2999 3000 3001 3002 3003 3004 3005 3006 3007 3008 3009 3010 3011 3012 3013 3014 3015 3016 3017 3018 3019 3020 3021 3022 3023 3024 3025 3026 3027 3028 3029 3030 3031 3032 3033 3034 3035 3036 3037 3038 3039 3040 3041 3042 3043 3044 3045 3046 3047 3048 3049 3050 3051 3052 3053 3054 3055 3056 3057 3058 3059 3060 3061 3062 3063 3064 3065 3066 3067 3068 3069 3070 3071 3072 3073 3074 3075 3076 3077 3078 3079 3080 3081 3082 3083 3084 3085 3086 3087 3088 3089 3090 3091 3092 3093 3094 3095 3096 3097 3098 3099 3100 3101 3102 3103 3104 3105 3106 3107 3108 3109 3110 3111 3112 3113 3114 3115 3116 3117 3118 3119 3120 3121 3122 3123 3124 3125 3126 3127 3128 3129 3130 3131 3132 3133 3134 3135 3136 3137 3138 3139 3140 3141 3142 3143 3144 3145 3146 3147 3148 3149 3150 3151 3152 3153 3154 3155 3156 3157 3158 3159 3160 3161 3162 3163 3164 3165 3166 3167 3168 3169 3170 3171 3172 3173 3174 3175 3176 3177 3178 3179 3180 3181 3182 3183 3184 3185 3186 3187 3188 3189 3190 3191 3192 3193 3194 3195 3196 3197 3198 3199 3200 3201 3202 3203 3204 3205 3206 3207 3208 3209 3210 3211 3212 3213 3214 3215 3216 3217 3218 3219 3220 3221 3222 3223 3224 3225 3226 3227 3228 3229 3230 3231 3232 3233 3234 3235 3236 3237 3238 3239 3240 3241 3242 3243 3244 3245 3246 3247 3248 3249 3250 3251 3252 3253 3254 3255 3256 3257 3258 3259 3260 3261 3262 3263 3264 3265 3266 3267 3268 3269 3270 3271 3272 3273 3274 3275 3276 3277 3278 3279 3280 3281 3282 3283 3284 3285 3286 3287 3288 3289 3290 3291 3292 3293 3294 3295 3296 3297 3298 3299 3300 3301 3302 3303 3304 3305 3306 3307 3308 3309 3310 3311 3312 3313 3314 3315 3316 3317 3318 3319 3320 3321 3322 3323 3324 3325 3326 3327 3328 3329 3330 3331 3332 3333 3334 3335 3336 3337 3338 3339 3340 3341 3342 3343 3344 3345 3346 3347 3348 3349 3350 3351 3352 3353 3354 3355 3356 3357 3358 3359 3360 3361 3362 3363 3364 3365 3366 3367 3368 3369 3370 3371 3372 3373 3374 3375 3376 3377 3378 3379 3380 3381 3382 3383 3384 3385 3386 3387 3388 3389 3390 3391 3392 3393 3394 3395 3396 3397 3398 3399 3400 3401 3402 3403 3404 3405 3406 3407 3408 3409 3410 3411 3412 3413 3414 3415 3416 3417 3418 3419 3420 3421 3422 3423 3424 3425 3426 3427 3428 3429 3430 3431 3432 3433 3434 3435 3436 3437 3438 3439 3440 3441 3442 3443 3444 3445 3446 3447 3448 3449 3450 3451 3452 3453 3454 3455 3456 3457 3458 3459 3460 3461 3462 3463 3464 3465 3466 3467 3468 3469 3470 3471 3472 3473 3474 3475 3476 3477 3478 3479 3480 3481 3482 3483 3484 3485 3486 3487 3488 3489 3490 3491 3492 3493 3494 3495 3496 3497 3498 3499 3500 3501 3502 3503 3504 3505 3506 3507 3508 3509 3510 3511 3512 3513 3514 3515 3516 3517 3518 3519 3520 3521 3522 3523 3524 3525 3526 3527 3528 3529 3530 3531 3532 3533 3534 3535 3536 3537 3538 3539 3540 3541 3542 3543 3544 3545 3546 3547 3548 3549 3550 3551 3552 3553 3554 3555 3556 3557 3558 3559 3560 3561 3562 3563 3564 3565 3566 3567 3568 3569 3570 3571 3572 3573 3574 3575 3576 3577 3578 3579 3580 3581 3582 3583 3584 3585 3586 3587 3588 3589 3590 3591 3592 3593 3594 3595 3596 3597 3598 3599 3600 3601 3602 3603 3604 3605 3606 3607 3608 3609 3610 3611 3612 3613 3614 3615 3616 3617 3618 3619 3620 3621 3622 3623 3624 3625 3626 3627 3628 3629 3630 3631 3632 3633 3634 3635 3636 3637 3638 3639 3640 3641 3642 3643 3644 3645 3646 3647 3648 3649 3650 3651 3652 3653 3654 3655 3656 3657 3658 3659 3660 3661 3662 3663 3664 3665 3666 3667 3668 3669 3670 3671 3672 3673 3674 3675 3676 3677 3678 3679 3680 3681 3682 3683 3684 3685 3686 3687 3688 3689 3690 3691 3692 3693 3694 3695 3696 3697 3698 3699 3700 3701 3702 3703 3704 3705 3706 3707 3708 3709 3710 3711 3712 3713 3714 3715 3716 3717 3718 3719 3720 3721 3722 3723 3724 3725 3726 3727 3728 3729 3730 3731 3732 3733 3734 3735 3736 3737 3738 3739 3740 3741 3742 3743 3744 3745 3746 3747 3748 3749 3750 3751 3752 3753 3754 3755 3756 3757 3758 3759 3760 3761 3762 3763 3764 3765 3766 3767 3768 3769 3770 3771 3772 3773 3774 3775 3776 3777 3778 3779 3780 3781 3782 3783 3784 3785 3786 3787 3788 3789 3790 3791 3792 3793 3794 3795 3796 3797 3798 3799 3800 3801 3802 3803 3804 3805 3806 3807 3808 3809 3810 3811 3812 3813 3814 3815 3816 3817 3818 3819 3820 3821 3822 3823 3824 3825 3826 3827 3828 3829 3830 3831 3832 3833 3834 3835 3836 3837 3838 3839 3840 3841 3842 3843 3844 3845 3846 3847 3848 3849 3850 3851 3852 3853 3854 3855 3856 3857 3858 3859 3860 3861 3862 3863 3864 3865 3866 3867 3868 3869 3870 3871 3872 3873 3874 3875 3876 3877 3878 3879 3880 3881 3882 3883 3884 3885 3886 3887 3888 3889 3890 3891 3892 3893 3894 3895 3896 3897 3898 3899 3900 3901 3902 3903 3904 3905 3906 3907 3908 3909 3910 3911 3912 3913 3914 3915 3916 3917 3918 3919 3920 3921 3922 3923 3924 3925 3926 3927 3928 3929 3930 3931 3932 3933 3934 3935 3936 3937 3938 3939 3940 3941 3942 3943 3944 3945 3946 3947 3948 3949 3950 3951 3952 3953 3954 3955 3956 3957 3958 3959 3960 3961 3962 3963 3964 3965 3966 3967 3968 3969 3970 3971 3972 3973 3974 3975 3976 3977 3978 3979 3980 3981 3982 3983 3984 3985 3986 3987 3988 3989 3990 3991 3992 3993 3994 3995 3996 3997 3998 3999 4000 4001 4002 4003 4004 4005 4006 4007 4008 4009 4010 4011 4012 4013 4014 4015 4016 4017 4018 4019 4020 4021 4022 4023 4024 4025 4026 4027 4028 4029 4030 4031 4032 4033 4034 4035 4036 4037 4038 4039 4040 4041 4042 4043 4044 4045 4046 4047 4048 4049 4050 4051 4052 4053 4054 4055 4056 4057 4058 4059 4060 4061 4062 4063 4064 4065 4066 4067 4068 4069 4070 4071 4072 4073 4074 4075 4076 4077 4078 4079 4080 4081 4082 4083 4084 4085 4086 4087 4088 4089 4090 4091 4092 4093 4094 4095 4096 4097 4098 4099 4100 4101 4102 4103 4104 4105 4106 4107 4108 4109 4110 4111 4112 4113 4114 4115 4116 4117 4118 4119 4120 4121 4122 4123 4124 4125 4126 4127 4128 4129 4130 4131 4132 4133 4134 4135 4136 4137 4138 4139 4140 4141 4142 4143 4144 4145 4146 4147 4148 4149 4150 4151 4152 4153 4154 4155 4156 4157 4158 4159 4160 4161 4162 4163 4164 4165 4166 4167 4168 4169 4170 4171 4172 4173 4174 4175 4176 4177 4178 4179 4180 4181 4182 4183 4184 4185 4186 4187 4188 4189 4190 4191 4192 4193 4194 4195 4196 4197 4198 4199 4200 4201 4202 4203 4204 4205 4206 4207 4208 4209 4210 4211 4212 4213 4214 4215 4216 4217 4218 4219 4220 4221 4222 4223 4224 4225 4226 4227 4228 4229 4230 4231 4232 4233 4234 4235 4236 4237 4238 4239 4240 4241 4242 4243 4244 4245 4246 4247 4248 4249 4250 4251 4252 4253 4254 4255 4256 4257 4258 4259 4260 4261 4262 4263 4264 4265 4266 4267 4268 4269 4270 4271 4272 4273 4274 4275 4276 4277 4278 4279 4280 4281 4282 4283 4284 4285 4286 4287 4288 4289 4290 4291 4292 4293 4294 4295 4296 4297 4298 4299 4300 4301 4302 4303 4304 4305 4306 4307 4308 4309 4310 4311 4312 4313 4314 4315 4316 4317 4318 4319 4320 4321 4322 4323 4324 4325 4326 4327 4328 4329 4330 4331 4332 4333 4334 4335 4336 4337 4338 4339 4340 4341 4342 4343 4344 4345 4346 4347 4348 4349 4350 4351 4352 4353 4354 4355 4356 4357 4358 4359 4360 4361 4362 4363 4364 4365 4366 4367 4368 4369 4370 4371 4372 4373 4374 4375 4376 4377 4378 4379 4380 4381 4382 4383 4384 4385 4386 4387 4388 4389 4390 4391 4392 4393 4394 4395 4396 4397 4398 4399 4400 4401 4402 4403 4404 4405 4406 4407 4408 4409 4410 4411 4412 4413 4414 4415 4416 4417 4418 4419 4420 4421 4422 4423 4424 4425 4426 4427 4428 4429 4430 4431 4432 4433 4434 4435 4436 4437 4438 4439 4440 4441 4442 4443 4444 4445 4446 4447 4448 4449 4450 4451 4452 4453 4454 4455 4456 4457 4458 4459 4460 4461 4462 4463 4464 4465 4466 4467 4468 4469 4470 4471 4472 4473 4474 4475 4476 4477 4478 4479 4480 4481 4482 4483 4484 4485 4486 4487 4488 4489 4490 4491 4492 4493 4494 4495 4496 4497 4498 4499 4500 4501 4502 4503 4504 4505 4506 4507 4508 4509 4510 4511 4512 4513 4514 4515 4516 4517 4518 4519 4520 4521 4522 4523 4524 4525 4526 4527 4528 4529 4530 4531 4532 4533 4534 4535 4536 4537 4538 4539 4540 4541 4542 4543 4544 4545 4546 4547 4548 4549 4550 4551 4552 4553 4554 4555 4556 4557 4558 4559 4560 4561 4562 4563 4564 4565 4566 4567 4568 4569 4570 4571 4572 4573 4574 4575 4576 4577 4578 4579 4580 4581 4582 4583 4584 4585 4586 4587 4588 4589 4590 4591 4592 4593 4594 4595 4596 4597 4598 4599 4600 4601 4602 4603 4604 4605 4606 4607 4608 4609 4610 4611 4612 4613 4614 4615 4616 4617 4618 4619 4620 4621 4622 4623 4624 4625 4626 4627 4628 4629 4630 4631 4632 4633 4634 4635 4636 4637 4638 4639 4640 4641 4642 4643 4644 4645 4646 4647 4648 4649 4650 4651 4652 4653 4654 4655 4656 4657 4658 4659 4660 4661 4662 4663 4664 4665 4666 4667 4668 4669 4670 4671 4672 4673 4674 4675 4676 4677 4678 4679 4680 4681 4682 4683 4684 4685 4686 4687 4688 4689 4690 4691 4692 4693 4694 4695 4696 4697 4698 4699 4700 4701 4702 4703 4704 4705 4706 4707 4708 4709 4710 4711 4712 4713 4714 4715 4716 4717 4718 4719 4720 4721 4722 4723 4724 4725 4726 4727 4728 4729 4730 4731 4732 4733 4734 4735 4736 4737 4738 4739 4740 4741 4742 4743 4744 4745 4746 4747 4748 4749 4750 4751 4752 4753 4754 4755 4756 4757 4758 4759 4760 4761 4762 4763 4764 4765 4766 4767 4768 4769 4770 4771 4772 4773 4774 4775 4776 4777 4778 4779 4780 4781 4782 4783 4784 4785 4786 4787 4788 4789 4790 4791 4792 4793 4794 4795 4796 4797 4798 4799 4800 4801 4802 4803 4804 4805 4806 4807 4808 4809 4810 4811 4812 4813 4814 4815 4816 4817 4818 4819 4820 4821 4822 4823 4824 4825 4826 4827 4828 4829 4830 4831 4832 4833 4834 4835 4836 4837 4838 4839 4840 4841 4842 4843 4844 4845 4846 4847 4848 4849 4850 4851 4852 4853 4854 4855 4856 4857 4858 4859 4860 4861 4862 4863 4864 4865 4866 4867 4868 4869 4870 4871 4872 4873 4874 4875 4876 4877 4878 4879 4880 4881 4882 4883 4884 4885 4886 4887 4888 4889 4890 4891 4892 4893 4894 4895 4896 4897 4898 4899 4900 4901 4902 4903 4904 4905 4906 4907 4908 4909 4910 4911 4912 4913 4914 4915 4916 4917 4918 4919 4920 4921 4922 4923 4924 4925 4926 4927 4928 4929 4930 4931 4932 4933 4934 4935 4936 4937 4938 4939 4940 4941 4942 4943 4944 4945 4946 4947 4948 4949 4950 4951 4952 4953 4954 4955 4956 4957 4958 4959 4960 4961 4962 4963 4964 4965 4966 4967 4968 4969 4970 4971 4972 4973 4974 4975 4976 4977 4978 4979 4980 4981 4982 4983 4984 4985 4986 4987 4988 4989 4990 4991 4992 4993 4994 4995 4996 4997 4998 4999 5000 5001 5002 5003 5004 5005 5006 5007 5008 5009 5010 5011 5012 5013 5014 5015 5016 5017 5018 5019 5020 5021 5022 5023 5024 5025 5026 5027 5028 5029 5030 5031 5032 5033 5034 5035 5036 5037 5038 5039 5040 5041 5042 5043 5044 5045 5046 5047 5048 5049 5050 5051 5052 5053 5054 5055 5056 5057 5058 5059 5060 5061 5062 5063 5064 5065 5066 5067 5068 5069 5070 5071 5072 5073 5074 5075 5076 5077 5078 5079 5080 5081 5082 5083 5084 5085 5086 5087 5088 5089 5090 5091 5092 5093 5094 5095 5096 5097 5098 5099 5100 5101 5102 5103 5104 5105 5106 5107 5108 5109 5110 5111 5112 5113 5114 5115 5116 5117 5118 5119 5120 5121 5122 5123 5124 5125 5126 5127 5128 5129 5130 5131 5132 5133 5134 5135 5136 5137 5138 5139 5140 5141 5142 5143 5144 5145 5146 5147 5148 5149 5150 5151 5152 5153 5154 5155 5156 5157 5158 5159 5160 5161 5162 5163 5164 5165 5166 5167 5168 5169 5170 5171 5172 5173 5174 5175 5176 5177 5178 5179 5180 5181 5182 5183 5184 5185 5186 5187 5188 5189 5190 5191 5192 5193 5194 5195 5196 5197 5198 5199 5200 5201 5202 5203 5204 5205 5206 5207 5208 5209 5210 5211 5212 5213 5214 5215 5216 5217 5218 5219 5220 5221 5222 5223 5224 5225 5226 5227 5228 5229 5230 5231 5232 5233 5234 5235 5236 5237 5238 5239 5240 5241 5242 5243 5244 5245 5246 5247 5248 5249 5250 5251 5252 5253 5254 5255 5256 5257 5258 5259 5260 5261 5262 5263 5264 5265 5266 5267 5268 5269 5270 5271 5272 5273 5274 5275 5276 5277 5278 5279 5280 5281 5282 5283 5284 5285 5286 5287 5288 5289 5290 5291 5292 5293 5294 5295 5296 5297 5298 5299 5300 5301 5302 5303 5304 5305 5306 5307 5308 5309 5310 5311 5312 5313 5314 5315 5316 5317 5318 5319 5320 5321 5322 5323 5324 5325 5326 5327 5328 5329 5330 5331 5332 5333 5334 5335 5336 5337 5338 5339 5340 5341 5342 5343 5344 5345 5346 5347 5348 5349 5350 5351 5352 5353 5354 5355 5356 5357 5358 5359 5360 5361 5362 5363 5364 5365 5366 5367 5368 5369 5370 5371 5372 5373 5374 5375 5376 5377 5378 5379 5380 5381 5382 5383 5384 5385 5386 5387 5388 5389 5390 5391 5392 5393 5394 5395 5396 5397 5398 5399 5400 5401 5402 5403 5404 5405 5406 5407 5408 5409 5410 5411 5412 5413 5414 5415 5416 5417 5418 5419 5420 5421 5422 5423 5424 5425 5426 5427 5428 5429 5430 5431 5432 5433 5434 5435 5436 5437 5438 5439 5440 5441 5442 5443 5444 5445 5446 5447 5448 5449 5450 5451 5452 5453 5454 5455 5456 5457 5458 5459 5460 5461 5462 5463 5464 5465 5466 5467 5468 5469 5470 5471 5472 5473 5474 5475 5476 5477 5478 5479 5480 5481 5482 5483 5484 5485 5486 5487 5488 5489 5490 5491 5492 5493 5494 5495 5496 5497 5498 5499 5500 5501 5502 5503 5504 5505 5506 5507 5508 5509 5510 5511 5512 5513 5514 5515 5516 5517 5518 5519 5520 5521 5522 5523 5524 5525 5526 5527 5528 5529 5530 5531 5532 5533 5534 5535 5536 5537 5538 5539 5540 5541 5542 5543 5544 5545 5546 5547 5548 5549 5550 5551 5552 5553 5554 5555 5556 5557 5558 5559 5560 5561 5562 5563 5564 5565 5566 5567 5568 5569 5570 5571 5572 5573 5574 5575 5576 5577 5578 5579 5580 5581 5582 5583 5584 5585 5586 5587 5588 5589 5590 5591 5592 5593 5594 5595 5596 5597 5598 5599 5600 5601 5602 5603 5604 5605 5606 5607 5608 5609 5610 5611 5612 5613 5614 5615 5616 5617 5618 5619 5620 5621 5622 5623 5624 5625 5626 5627 5628 5629 5630 5631 5632 5633 5634 5635 5636 5637 5638 5639 5640 5641 5642 5643 5644 5645 5646 5647 5648 5649 5650 5651 5652 5653 5654 5655 5656 5657 5658 5659 5660 5661 5662 5663 5664 5665 5666 5667 5668 5669 5670 5671 5672 5673 5674 5675 5676 5677 5678 5679 5680 5681 5682 5683 5684 5685 5686 5687 5688 5689 5690 5691 5692 5693 5694 5695 5696 5697 5698 5699 5700 5701 5702 5703 5704 5705 5706 5707 5708 5709 5710 5711 5712 5713 5714 5715 5716 5717 5718 5719 5720 5721 5722 5723 5724 5725 5726 5727 5728 5729 5730 5731 5732 5733 5734 5735 5736 5737 5738 5739 5740 5741 5742 5743 5744 5745 5746 5747 5748 5749 5750 5751 5752 5753 5754 5755 5756 5757 5758 5759 5760 5761 5762 5763 5764 5765 5766 5767 5768 5769 5770 5771 5772 5773 5774 5775 5776 5777 5778 5779 5780 5781 5782 5783 5784 5785 5786 5787 5788 5789 5790 5791 5792 5793 5794 5795 5796 5797 5798 5799 5800 5801 5802 5803 5804 5805 5806 5807 5808 5809 5810 5811 5812 5813 5814 5815 5816 5817 5818 5819 5820 5821 5822 5823 5824 5825 5826 5827 5828 5829 5830 5831 5832 5833 5834 5835 5836 5837 5838 5839 5840 5841 5842 5843 5844 5845 5846 5847 5848 5849 5850 5851 5852 5853 5854 5855 5856 5857 5858 5859 5860 5861 5862 5863 5864 5865 5866 5867 5868 5869 5870 5871 5872 5873 5874 5875 5876 5877 5878 5879 5880 5881 5882 5883 5884 5885 5886 5887 5888 5889 5890 5891 5892 5893 5894 5895 5896 5897 5898 5899 5900 5901 5902 5903 5904 5905 5906 5907 5908 5909 5910 5911 5912 5913 5914 5915 5916 5917 5918 5919 5920 5921 5922 5923 5924 5925 5926 5927 5928 5929 5930 5931 5932 5933 5934 5935 5936 5937 5938 5939 5940 5941 5942 5943 5944 5945 5946 5947 5948 5949 5950 5951 5952 5953 5954 5955 5956 5957 5958 5959 5960 5961 5962 5963 5964 5965 5966 5967 5968 5969 5970 5971 5972 5973 5974 5975 5976 5977 5978 5979 5980 5981 5982 5983 5984 5985 5986 5987 5988 5989 5990 5991 5992 5993 5994 5995 5996 5997 5998 5999 6000 6001 6002 6003 6004 6005 6006 6007 6008 6009 6010 6011 6012 6013 6014 6015 6016 6017 6018 6019 6020 6021 6022 6023 6024 6025 6026 6027 6028 6029 6030 6031 6032 6033 6034 6035 6036 6037 6038 6039 6040 6041 6042 6043 6044 6045 6046 6047 6048 6049 6050 6051 6052 6053 6054 6055 6056 6057 6058 6059 6060 6061 6062 6063 6064 6065 6066 6067 6068 6069 6070 6071 6072 6073 6074 6075 6076 6077 6078 6079 6080 6081 6082 6083 6084 6085 6086 6087 6088 6089 6090 6091 6092 6093 6094 6095 6096 6097 6098 6099 6100 6101 6102 6103 6104 6105 6106 6107 6108 6109 6110 6111 6112 6113 6114 6115 6116 6117 6118 6119 6120 6121 6122 6123 6124 6125 6126 6127 6128 6129 6130 6131 6132 6133 6134 6135 6136 6137 6138 6139 6140 6141 6142 6143 6144 6145 6146 6147 6148 6149 6150 6151 6152 6153 6154 6155 6156 6157 6158 6159 6160 6161 6162 6163 6164 6165 6166 6167 6168 6169 6170 6171 6172 6173 6174 6175 6176 6177 6178 6179 6180 6181 6182 6183 6184 6185 6186 6187 6188 6189 6190 6191 6192 6193 6194 6195 6196 6197 6198 6199 6200 6201 6202 6203 6204 6205 6206 6207 6208 6209 6210 6211 6212 6213 6214 6215 6216 6217 6218 6219 6220 6221 6222 6223 6224 6225 6226 6227 6228 6229 6230 6231 6232 6233 6234 6235 6236 6237 6238 6239 6240 6241 6242 6243 6244 6245 6246 6247 6248 6249 6250 6251 6252 6253 6254 6255 6256 6257 6258 6259 6260 6261 6262 6263 6264 6265 6266 6267 6268 6269 6270 6271 6272 6273 6274 6275 6276 6277 6278 6279 6280 6281 6282 6283 6284 6285 6286 6287 6288 6289 6290 6291 6292 6293 6294 6295 6296 6297 6298 6299 6300 6301 6302 6303 6304 6305 6306 6307 6308 6309 6310 6311 6312 6313 6314 6315 6316 6317 6318 6319 6320 6321 6322 6323 6324 6325 6326 6327 6328 6329 6330 6331 6332 6333 6334 6335 6336 6337 6338 6339 6340 6341 6342 6343 6344 6345 6346 6347 6348 6349 6350 6351 6352 6353 6354 6355 6356 6357 6358 6359 6360 6361 6362 6363 6364 6365 6366 6367 6368 6369 6370 6371 6372 6373 6374 6375 6376 6377 6378 6379 6380 6381 6382 6383 6384 6385 6386 6387 6388 6389 6390 6391 6392 6393 6394 6395 6396 6397 6398 6399 6400 6401 6402 6403 6404 6405 6406 6407 6408 6409 6410 6411 6412 6413 6414 6415 6416 6417 6418 6419 6420 6421 6422 6423 6424 6425 6426 6427 6428 6429 6430 6431 6432 6433 6434 6435 6436 6437 6438 6439 6440 6441 6442 6443 6444 6445 6446 6447 6448 6449 6450 6451 6452 6453 6454 6455 6456 6457 6458 6459 6460 6461 6462 6463 6464 6465 6466 6467 6468 6469 6470 6471 6472 6473 6474 6475 6476 6477 6478 6479 6480 6481 6482 6483 6484 6485 6486 6487 6488 6489 6490 6491 6492 6493 6494 6495 6496 6497 6498 6499 6500 6501 6502 6503 6504 6505 6506 6507 6508 6509 6510 6511 6512 6513 6514 6515 6516 6517 6518 6519 6520 6521 6522 6523 6524 6525 6526 6527 6528 6529 6530 6531 6532 6533 6534 6535 6536 6537 6538 6539 6540 6541 6542 6543 6544 6545 6546 6547 6548 6549 6550 6551 6552 6553 6554 6555 6556 6557 6558 6559 6560 6561 6562 6563 6564 6565 6566 6567 6568 6569 6570 6571 6572 6573 6574 6575 6576 6577 6578 6579 6580 6581 6582 6583 6584 6585 6586 6587 6588 6589 6590 6591 6592 6593 6594 6595 6596 6597 6598 6599 6600 6601 6602 6603 6604 6605 6606 6607 6608 6609 6610 6611 6612 6613 6614 6615 6616 6617 6618 6619 6620 6621 6622 6623 6624 6625 6626 6627 6628 6629 6630 6631 6632 6633 6634 6635 6636 6637 6638 6639 6640 6641 6642 6643 6644 6645 6646 6647 6648 6649 6650 6651 6652 6653 6654 6655 6656 6657 6658 6659 6660 6661 6662 6663 6664 6665 6666 6667 6668 6669 6670 6671 6672 6673 6674 6675 6676 6677 6678 6679 6680 6681 6682 6683 6684 6685 6686 6687 6688 6689 6690 6691 6692 6693 6694 6695 6696 6697 6698 6699 6700 6701 6702 6703 6704 6705 6706 6707 6708 6709 6710 6711 6712 6713 6714 6715 6716 6717 6718 6719 6720 6721 6722 6723 6724 6725 6726 6727 6728 6729 6730 6731 6732 6733 6734 6735 6736 6737 6738 6739 6740 6741 6742 6743 6744 6745 6746 6747 6748 6749 6750 6751 6752 6753 6754 6755 6756 6757 6758 6759 6760 6761 6762 6763 6764 6765 6766 6767 6768 6769 6770 6771 6772 6773 6774 6775 6776 6777 6778 6779 6780 6781 6782 6783 6784 6785 6786 6787 6788 6789 6790 6791 6792 6793 6794 6795 6796 6797 6798 6799 6800 6801 6802 6803 6804 6805 6806 6807 6808 6809 6810 6811 6812 6813 6814 6815 6816 6817 6818 6819 6820 6821 6822 6823 6824 6825 6826 6827 6828 6829 6830 6831 6832 6833 6834 6835 6836 6837 6838 6839 6840 6841 6842 6843 6844 6845 6846 6847 6848 6849 6850 6851 6852 6853 6854 6855 6856 6857 6858 6859 6860 6861 6862 6863 6864 6865 6866 6867 6868 6869 6870 6871 6872 6873 6874 6875 6876 6877 6878 6879 6880 6881 6882 6883 6884 6885 6886 6887 6888 6889 6890 6891 6892 6893 6894 6895 6896 6897 6898 6899 6900 6901 6902 6903 6904 6905 6906 6907 6908 6909 6910 6911 6912 6913 6914 6915 6916 6917 6918 6919 6920 6921 6922 6923 6924 6925 6926 6927 6928 6929 6930 6931 6932 6933 6934 6935 6936 6937 6938 6939 6940 6941 6942 6943 6944 6945 6946 6947 6948 6949 6950 6951 6952 6953 6954 6955 6956 6957 6958 6959 6960 6961 6962 6963 6964 6965 6966 6967 6968 6969 6970 6971 6972 6973 6974 6975 6976 6977 6978 6979 6980 6981 6982 6983 6984 6985 6986 6987 6988 6989 6990 6991 6992 6993 6994 6995 6996 6997 6998 6999 7000 7001 7002 7003 7004 7005 7006 7007 7008 7009 7010 7011 7012 7013 7014 7015 7016 7017 7018 7019 7020 7021 7022 7023 7024 7025 7026 7027 7028 7029 7030 7031 7032 7033 7034 7035 7036 7037 7038 7039 7040 7041 7042 7043 7044 7045 7046 7047 7048 7049 7050 7051 7052 7053 7054 7055 7056 7057 7058 7059 7060 7061 7062 7063 7064 7065 7066 7067 7068 7069 7070 7071 7072 7073 7074 7075 7076 7077 7078 7079 7080 7081 7082 7083 7084 7085 7086 7087 7088 7089 7090 7091 7092 7093 7094 7095 7096 7097 7098 7099 7100 7101 7102 7103 7104 7105 7106 7107 7108 7109 7110 7111 7112 7113 7114 7115 7116 7117 7118 7119 7120 7121 7122 7123 7124 7125 7126 7127 7128 7129 7130 7131 7132 7133 7134 7135 7136 7137 7138 7139 7140 7141 7142 7143 7144 7145 7146 7147 7148 7149 7150 7151 7152 7153 7154 7155 7156 7157 7158 7159 7160 7161 7162 7163 7164 7165 7166 7167 7168 7169 7170 7171 7172 7173 7174 7175 7176 7177 7178 7179 7180 7181 7182 7183 7184 7185 7186 7187 7188 7189 7190 7191 7192 7193 7194 7195 7196 7197 7198 7199 7200 7201 7202 7203 7204 7205 7206 7207 7208 7209 7210 7211 7212 7213 7214 7215 7216 7217 7218 7219 7220 7221 7222 7223 7224 7225 7226 7227 7228 7229 7230 7231 7232 7233 7234 7235 7236 7237 7238 7239 7240 7241 7242 7243 7244 7245 7246 7247 7248 7249 7250 7251 7252 7253 7254 7255 7256 7257 7258 7259 7260 7261 7262 7263 7264 7265 7266 7267 7268 7269 7270 7271 7272 7273 7274 7275 7276 7277 7278 7279 7280 7281 7282 7283 7284 7285 7286 7287 7288 7289 7290 7291 7292 7293 7294 7295 7296 7297 7298 7299 7300 7301 7302 7303 7304 7305 7306 7307 7308 7309 7310 7311 7312 7313 7314 7315 7316 7317 7318 7319 7320 7321 7322 7323 7324 7325 7326 7327 7328 7329 7330 7331 7332 7333 7334 7335 7336 7337 7338 7339 7340 7341 7342 7343 7344 7345 7346 7347 7348 7349 7350 7351 7352 7353 7354 7355 7356 7357 7358 7359 7360 7361 7362 7363 7364 7365 7366 7367 7368 7369 7370 7371 7372 7373 7374 7375 7376 7377 7378 7379 7380 7381 7382 7383 7384 7385 7386 7387 7388 7389 7390 7391 7392 7393 7394 7395 7396 7397 7398 7399 7400 7401 7402 7403 7404 7405 7406 7407 7408 7409 7410 7411 7412 7413 7414 7415 7416 7417 7418 7419 7420 7421 7422 7423 7424 7425 7426 7427 7428 7429 7430 7431 7432 7433 7434 7435 7436 7437 7438 7439 7440 7441 7442 7443 7444 7445 7446 7447 7448 7449 7450 7451 7452 7453 7454 7455 7456 7457 7458 7459 7460 7461 7462 7463 7464 7465 7466 7467 7468 7469 7470 7471 7472 7473 7474 7475 7476 7477 7478 7479 7480 7481 7482 7483 7484 7485 7486 7487 7488 7489 7490 7491 7492 7493 7494 7495 7496 7497 7498 7499 7500 7501 7502 7503 7504 7505 7506 7507 7508 7509 7510 7511 7512 7513 7514 7515 7516 7517 7518 7519 7520 7521 7522 7523 7524 7525 7526 7527 7528 7529 7530 7531 7532 7533 7534 7535 7536 7537 7538 7539 7540 7541 7542 7543 7544 7545 7546 7547 7548 7549 7550 7551 7552 7553 7554 7555 7556 7557 7558 7559 7560 7561 7562 7563 7564 7565 7566 7567 7568 7569 7570 7571 7572 7573 7574 7575 7576 7577 7578 7579 7580 7581 7582 7583 7584 7585 7586 7587 7588 7589 7590 7591 7592 7593 7594 7595 7596 7597 7598 7599 7600 7601 7602 7603 7604 7605 7606 7607 7608 7609 7610 7611 7612 7613 7614 7615 7616 7617 7618 7619 7620 7621 7622 7623 7624 7625 7626 7627 7628 7629 7630 7631 7632 7633 7634 7635 7636 7637 7638 7639 7640 7641 7642 7643 7644 7645 7646 7647 7648 7649 7650 7651 7652 7653 7654 7655 7656 7657 7658 7659 7660 7661 7662 7663 7664 7665 7666 7667 7668 7669 7670 7671 7672 7673 7674 7675 7676 7677 7678 7679 7680 7681 7682 7683 7684 7685 7686 7687 7688 7689 7690 7691 7692 7693 7694 7695 7696 7697 7698 7699 7700 7701 7702 7703 7704 7705 7706 7707 7708 7709 7710 7711 7712 7713 7714 7715 7716 7717 7718 7719 7720 7721 7722 7723 7724 7725 7726 7727 7728 7729 7730 7731 7732 7733 7734 7735 7736 7737 7738 7739 7740 7741 7742 7743 7744 7745 7746 7747 7748 7749 7750 7751 7752 7753 7754 7755 7756 7757 7758 7759 7760 7761 7762 7763 7764 7765 7766 7767 7768 7769 7770 7771 7772 7773 7774 7775 7776 7777 7778 7779 7780 7781 7782 7783 7784 7785 7786 7787 7788 7789 7790 7791 7792 7793 7794 7795 7796 7797 7798 7799 7800 7801 7802 7803 7804 7805 7806 7807 7808 7809 7810 7811 7812 7813 7814 7815 7816 7817 7818 7819 7820 7821 7822 7823 7824 7825 7826 7827 7828 7829 7830 7831 7832 7833 7834 7835 7836 7837 7838 7839 7840 7841 7842 7843 7844 7845 7846 7847 7848 7849 7850 7851 7852 7853 7854 7855 7856 7857 7858 7859 7860 7861 7862 7863 7864 7865 7866 7867 7868 7869 7870 7871 7872 7873 7874 7875 7876 7877 7878 7879 7880 7881 7882 7883 7884 7885 7886 7887 7888 7889 7890 7891 7892 7893 7894 7895 7896 7897 7898 7899 7900 7901 7902 7903 7904 7905 7906 7907 7908 7909 7910 7911 7912 7913 7914 7915 7916 7917 7918 7919 7920 7921 7922 7923 7924 7925 7926 7927 7928 7929 7930 7931 7932 7933 7934 7935 7936 7937 7938 7939 7940 7941 7942 7943 7944 7945 7946 7947 7948 7949 7950 7951 7952 7953 7954 7955 7956 7957 7958 7959 7960 7961 7962 7963 7964 7965 7966 7967 7968 7969 7970 7971 7972 7973 7974 7975 7976 7977 7978 7979 7980 7981 7982 7983 7984 7985 7986 7987 7988 7989 7990 7991 7992 7993 7994 7995 7996 7997 7998 7999 8000 8001 8002 8003 8004 8005 8006 8007 8008 8009 8010 8011 8012 8013 8014 8015 8016 8017 8018 8019 8020 8021 8022 8023 8024 8025 8026 8027 8028 8029 8030 8031 8032 8033 8034 8035 8036 8037 8038 8039 8040 8041 8042 8043 8044 8045 8046 // SPDX-License-Identifier: GPL-2.0-only /* * kernel/workqueue.c - generic async execution with shared worker pool * * Copyright (C) 2002 Ingo Molnar * * Derived from the taskqueue/keventd code by: * David Woodhouse <dwmw2@infradead.org> * Andrew Morton * Kai Petzke <wpp@marie.physik.tu-berlin.de> * Theodore Ts'o <tytso@mit.edu> * * Made to use alloc_percpu by Christoph Lameter. * * Copyright (C) 2010 SUSE Linux Products GmbH * Copyright (C) 2010 Tejun Heo <tj@kernel.org> * * This is the generic async execution mechanism. Work items as are * executed in process context. The worker pool is shared and * automatically managed. There are two worker pools for each CPU (one for * normal work items and the other for high priority ones) and some extra * pools for workqueues which are not bound to any specific CPU - the * number of these backing pools is dynamic. * * Please read Documentation/core-api/workqueue.rst for details. */ #include <linux/export.h> #include <linux/kernel.h> #include <linux/sched.h> #include <linux/init.h> #include <linux/interrupt.h> #include <linux/signal.h> #include <linux/completion.h> #include <linux/workqueue.h> #include <linux/slab.h> #include <linux/cpu.h> #include <linux/notifier.h> #include <linux/kthread.h> #include <linux/hardirq.h> #include <linux/mempolicy.h> #include <linux/freezer.h> #include <linux/debug_locks.h> #include <linux/lockdep.h> #include <linux/idr.h> #include <linux/jhash.h> #include <linux/hashtable.h> #include <linux/rculist.h> #include <linux/nodemask.h> #include <linux/moduleparam.h> #include <linux/uaccess.h> #include <linux/sched/isolation.h> #include <linux/sched/debug.h> #include <linux/nmi.h> #include <linux/kvm_para.h> #include <linux/delay.h> #include <linux/irq_work.h> #include "workqueue_internal.h" enum worker_pool_flags { /* * worker_pool flags * * A bound pool is either associated or disassociated with its CPU. * While associated (!DISASSOCIATED), all workers are bound to the * CPU and none has %WORKER_UNBOUND set and concurrency management * is in effect. * * While DISASSOCIATED, the cpu may be offline and all workers have * %WORKER_UNBOUND set and concurrency management disabled, and may * be executing on any CPU. The pool behaves as an unbound one. * * Note that DISASSOCIATED should be flipped only while holding * wq_pool_attach_mutex to avoid changing binding state while * worker_attach_to_pool() is in progress. * * As there can only be one concurrent BH execution context per CPU, a * BH pool is per-CPU and always DISASSOCIATED. */ POOL_BH = 1 << 0, /* is a BH pool */ POOL_MANAGER_ACTIVE = 1 << 1, /* being managed */ POOL_DISASSOCIATED = 1 << 2, /* cpu can't serve workers */ POOL_BH_DRAINING = 1 << 3, /* draining after CPU offline */ }; enum worker_flags { /* worker flags */ WORKER_DIE = 1 << 1, /* die die die */ WORKER_IDLE = 1 << 2, /* is idle */ WORKER_PREP = 1 << 3, /* preparing to run works */ WORKER_CPU_INTENSIVE = 1 << 6, /* cpu intensive */ WORKER_UNBOUND = 1 << 7, /* worker is unbound */ WORKER_REBOUND = 1 << 8, /* worker was rebound */ WORKER_NOT_RUNNING = WORKER_PREP | WORKER_CPU_INTENSIVE | WORKER_UNBOUND | WORKER_REBOUND, }; enum work_cancel_flags { WORK_CANCEL_DELAYED = 1 << 0, /* canceling a delayed_work */ WORK_CANCEL_DISABLE = 1 << 1, /* canceling to disable */ }; enum wq_internal_consts { NR_STD_WORKER_POOLS = 2, /* # standard pools per cpu */ UNBOUND_POOL_HASH_ORDER = 6, /* hashed by pool->attrs */ BUSY_WORKER_HASH_ORDER = 6, /* 64 pointers */ MAX_IDLE_WORKERS_RATIO = 4, /* 1/4 of busy can be idle */ IDLE_WORKER_TIMEOUT = 300 * HZ, /* keep idle ones for 5 mins */ MAYDAY_INITIAL_TIMEOUT = HZ / 100 >= 2 ? HZ / 100 : 2, /* call for help after 10ms (min two ticks) */ MAYDAY_INTERVAL = HZ / 10, /* and then every 100ms */ CREATE_COOLDOWN = HZ, /* time to breath after fail */ /* * Rescue workers are used only on emergencies and shared by * all cpus. Give MIN_NICE. */ RESCUER_NICE_LEVEL = MIN_NICE, HIGHPRI_NICE_LEVEL = MIN_NICE, WQ_NAME_LEN = 32, WORKER_ID_LEN = 10 + WQ_NAME_LEN, /* "kworker/R-" + WQ_NAME_LEN */ }; /* * We don't want to trap softirq for too long. See MAX_SOFTIRQ_TIME and * MAX_SOFTIRQ_RESTART in kernel/softirq.c. These are macros because * msecs_to_jiffies() can't be an initializer. */ #define BH_WORKER_JIFFIES msecs_to_jiffies(2) #define BH_WORKER_RESTARTS 10 /* * Structure fields follow one of the following exclusion rules. * * I: Modifiable by initialization/destruction paths and read-only for * everyone else. * * P: Preemption protected. Disabling preemption is enough and should * only be modified and accessed from the local cpu. * * L: pool->lock protected. Access with pool->lock held. * * LN: pool->lock and wq_node_nr_active->lock protected for writes. Either for * reads. * * K: Only modified by worker while holding pool->lock. Can be safely read by * self, while holding pool->lock or from IRQ context if %current is the * kworker. * * S: Only modified by worker self. * * A: wq_pool_attach_mutex protected. * * PL: wq_pool_mutex protected. * * PR: wq_pool_mutex protected for writes. RCU protected for reads. * * PW: wq_pool_mutex and wq->mutex protected for writes. Either for reads. * * PWR: wq_pool_mutex and wq->mutex protected for writes. Either or * RCU for reads. * * WQ: wq->mutex protected. * * WR: wq->mutex protected for writes. RCU protected for reads. * * WO: wq->mutex protected for writes. Updated with WRITE_ONCE() and can be read * with READ_ONCE() without locking. * * MD: wq_mayday_lock protected. * * WD: Used internally by the watchdog. */ /* struct worker is defined in workqueue_internal.h */ struct worker_pool { raw_spinlock_t lock; /* the pool lock */ int cpu; /* I: the associated cpu */ int node; /* I: the associated node ID */ int id; /* I: pool ID */ unsigned int flags; /* L: flags */ unsigned long watchdog_ts; /* L: watchdog timestamp */ bool cpu_stall; /* WD: stalled cpu bound pool */ /* * The counter is incremented in a process context on the associated CPU * w/ preemption disabled, and decremented or reset in the same context * but w/ pool->lock held. The readers grab pool->lock and are * guaranteed to see if the counter reached zero. */ int nr_running; struct list_head worklist; /* L: list of pending works */ int nr_workers; /* L: total number of workers */ int nr_idle; /* L: currently idle workers */ struct list_head idle_list; /* L: list of idle workers */ struct timer_list idle_timer; /* L: worker idle timeout */ struct work_struct idle_cull_work; /* L: worker idle cleanup */ struct timer_list mayday_timer; /* L: SOS timer for workers */ /* a workers is either on busy_hash or idle_list, or the manager */ DECLARE_HASHTABLE(busy_hash, BUSY_WORKER_HASH_ORDER); /* L: hash of busy workers */ struct worker *manager; /* L: purely informational */ struct list_head workers; /* A: attached workers */ struct ida worker_ida; /* worker IDs for task name */ struct workqueue_attrs *attrs; /* I: worker attributes */ struct hlist_node hash_node; /* PL: unbound_pool_hash node */ int refcnt; /* PL: refcnt for unbound pools */ /* * Destruction of pool is RCU protected to allow dereferences * from get_work_pool(). */ struct rcu_head rcu; }; /* * Per-pool_workqueue statistics. These can be monitored using * tools/workqueue/wq_monitor.py. */ enum pool_workqueue_stats { PWQ_STAT_STARTED, /* work items started execution */ PWQ_STAT_COMPLETED, /* work items completed execution */ PWQ_STAT_CPU_TIME, /* total CPU time consumed */ PWQ_STAT_CPU_INTENSIVE, /* wq_cpu_intensive_thresh_us violations */ PWQ_STAT_CM_WAKEUP, /* concurrency-management worker wakeups */ PWQ_STAT_REPATRIATED, /* unbound workers brought back into scope */ PWQ_STAT_MAYDAY, /* maydays to rescuer */ PWQ_STAT_RESCUED, /* linked work items executed by rescuer */ PWQ_NR_STATS, }; /* * The per-pool workqueue. While queued, bits below WORK_PWQ_SHIFT * of work_struct->data are used for flags and the remaining high bits * point to the pwq; thus, pwqs need to be aligned at two's power of the * number of flag bits. */ struct pool_workqueue { struct worker_pool *pool; /* I: the associated pool */ struct workqueue_struct *wq; /* I: the owning workqueue */ int work_color; /* L: current color */ int flush_color; /* L: flushing color */ int refcnt; /* L: reference count */ int nr_in_flight[WORK_NR_COLORS]; /* L: nr of in_flight works */ bool plugged; /* L: execution suspended */ /* * nr_active management and WORK_STRUCT_INACTIVE: * * When pwq->nr_active >= max_active, new work item is queued to * pwq->inactive_works instead of pool->worklist and marked with * WORK_STRUCT_INACTIVE. * * All work items marked with WORK_STRUCT_INACTIVE do not participate in * nr_active and all work items in pwq->inactive_works are marked with * WORK_STRUCT_INACTIVE. But not all WORK_STRUCT_INACTIVE work items are * in pwq->inactive_works. Some of them are ready to run in * pool->worklist or worker->scheduled. Those work itmes are only struct * wq_barrier which is used for flush_work() and should not participate * in nr_active. For non-barrier work item, it is marked with * WORK_STRUCT_INACTIVE iff it is in pwq->inactive_works. */ int nr_active; /* L: nr of active works */ struct list_head inactive_works; /* L: inactive works */ struct list_head pending_node; /* LN: node on wq_node_nr_active->pending_pwqs */ struct list_head pwqs_node; /* WR: node on wq->pwqs */ struct list_head mayday_node; /* MD: node on wq->maydays */ u64 stats[PWQ_NR_STATS]; /* * Release of unbound pwq is punted to a kthread_worker. See put_pwq() * and pwq_release_workfn() for details. pool_workqueue itself is also * RCU protected so that the first pwq can be determined without * grabbing wq->mutex. */ struct kthread_work release_work; struct rcu_head rcu; } __aligned(1 << WORK_STRUCT_PWQ_SHIFT); /* * Structure used to wait for workqueue flush. */ struct wq_flusher { struct list_head list; /* WQ: list of flushers */ int flush_color; /* WQ: flush color waiting for */ struct completion done; /* flush completion */ }; struct wq_device; /* * Unlike in a per-cpu workqueue where max_active limits its concurrency level * on each CPU, in an unbound workqueue, max_active applies to the whole system. * As sharing a single nr_active across multiple sockets can be very expensive, * the counting and enforcement is per NUMA node. * * The following struct is used to enforce per-node max_active. When a pwq wants * to start executing a work item, it should increment ->nr using * tryinc_node_nr_active(). If acquisition fails due to ->nr already being over * ->max, the pwq is queued on ->pending_pwqs. As in-flight work items finish * and decrement ->nr, node_activate_pending_pwq() activates the pending pwqs in * round-robin order. */ struct wq_node_nr_active { int max; /* per-node max_active */ atomic_t nr; /* per-node nr_active */ raw_spinlock_t lock; /* nests inside pool locks */ struct list_head pending_pwqs; /* LN: pwqs with inactive works */ }; /* * The externally visible workqueue. It relays the issued work items to * the appropriate worker_pool through its pool_workqueues. */ struct workqueue_struct { struct list_head pwqs; /* WR: all pwqs of this wq */ struct list_head list; /* PR: list of all workqueues */ struct mutex mutex; /* protects this wq */ int work_color; /* WQ: current work color */ int flush_color; /* WQ: current flush color */ atomic_t nr_pwqs_to_flush; /* flush in progress */ struct wq_flusher *first_flusher; /* WQ: first flusher */ struct list_head flusher_queue; /* WQ: flush waiters */ struct list_head flusher_overflow; /* WQ: flush overflow list */ struct list_head maydays; /* MD: pwqs requesting rescue */ struct worker *rescuer; /* MD: rescue worker */ int nr_drainers; /* WQ: drain in progress */ /* See alloc_workqueue() function comment for info on min/max_active */ int max_active; /* WO: max active works */ int min_active; /* WO: min active works */ int saved_max_active; /* WQ: saved max_active */ int saved_min_active; /* WQ: saved min_active */ struct workqueue_attrs *unbound_attrs; /* PW: only for unbound wqs */ struct pool_workqueue __rcu *dfl_pwq; /* PW: only for unbound wqs */ #ifdef CONFIG_SYSFS struct wq_device *wq_dev; /* I: for sysfs interface */ #endif #ifdef CONFIG_LOCKDEP char *lock_name; struct lock_class_key key; struct lockdep_map __lockdep_map; struct lockdep_map *lockdep_map; #endif char name[WQ_NAME_LEN]; /* I: workqueue name */ /* * Destruction of workqueue_struct is RCU protected to allow walking * the workqueues list without grabbing wq_pool_mutex. * This is used to dump all workqueues from sysrq. */ struct rcu_head rcu; /* hot fields used during command issue, aligned to cacheline */ unsigned int flags ____cacheline_aligned; /* WQ: WQ_* flags */ struct pool_workqueue __rcu * __percpu *cpu_pwq; /* I: per-cpu pwqs */ struct wq_node_nr_active *node_nr_active[]; /* I: per-node nr_active */ }; /* * Each pod type describes how CPUs should be grouped for unbound workqueues. * See the comment above workqueue_attrs->affn_scope. */ struct wq_pod_type { int nr_pods; /* number of pods */ cpumask_var_t *pod_cpus; /* pod -> cpus */ int *pod_node; /* pod -> node */ int *cpu_pod; /* cpu -> pod */ }; struct work_offq_data { u32 pool_id; u32 disable; u32 flags; }; static const char *wq_affn_names[WQ_AFFN_NR_TYPES] = { [WQ_AFFN_DFL] = "default", [WQ_AFFN_CPU] = "cpu", [WQ_AFFN_SMT] = "smt", [WQ_AFFN_CACHE] = "cache", [WQ_AFFN_NUMA] = "numa", [WQ_AFFN_SYSTEM] = "system", }; /* * Per-cpu work items which run for longer than the following threshold are * automatically considered CPU intensive and excluded from concurrency * management to prevent them from noticeably delaying other per-cpu work items. * ULONG_MAX indicates that the user hasn't overridden it with a boot parameter. * The actual value is initialized in wq_cpu_intensive_thresh_init(). */ static unsigned long wq_cpu_intensive_thresh_us = ULONG_MAX; module_param_named(cpu_intensive_thresh_us, wq_cpu_intensive_thresh_us, ulong, 0644); #ifdef CONFIG_WQ_CPU_INTENSIVE_REPORT static unsigned int wq_cpu_intensive_warning_thresh = 4; module_param_named(cpu_intensive_warning_thresh, wq_cpu_intensive_warning_thresh, uint, 0644); #endif /* see the comment above the definition of WQ_POWER_EFFICIENT */ static bool wq_power_efficient = IS_ENABLED(CONFIG_WQ_POWER_EFFICIENT_DEFAULT); module_param_named(power_efficient, wq_power_efficient, bool, 0444); static bool wq_online; /* can kworkers be created yet? */ static bool wq_topo_initialized __read_mostly = false; static struct kmem_cache *pwq_cache; static struct wq_pod_type wq_pod_types[WQ_AFFN_NR_TYPES]; static enum wq_affn_scope wq_affn_dfl = WQ_AFFN_CACHE; /* buf for wq_update_unbound_pod_attrs(), protected by CPU hotplug exclusion */ static struct workqueue_attrs *unbound_wq_update_pwq_attrs_buf; static DEFINE_MUTEX(wq_pool_mutex); /* protects pools and workqueues list */ static DEFINE_MUTEX(wq_pool_attach_mutex); /* protects worker attach/detach */ static DEFINE_RAW_SPINLOCK(wq_mayday_lock); /* protects wq->maydays list */ /* wait for manager to go away */ static struct rcuwait manager_wait = __RCUWAIT_INITIALIZER(manager_wait); static LIST_HEAD(workqueues); /* PR: list of all workqueues */ static bool workqueue_freezing; /* PL: have wqs started freezing? */ /* PL: mirror the cpu_online_mask excluding the CPU in the midst of hotplugging */ static cpumask_var_t wq_online_cpumask; /* PL&A: allowable cpus for unbound wqs and work items */ static cpumask_var_t wq_unbound_cpumask; /* PL: user requested unbound cpumask via sysfs */ static cpumask_var_t wq_requested_unbound_cpumask; /* PL: isolated cpumask to be excluded from unbound cpumask */ static cpumask_var_t wq_isolated_cpumask; /* for further constrain wq_unbound_cpumask by cmdline parameter*/ static struct cpumask wq_cmdline_cpumask __initdata; /* CPU where unbound work was last round robin scheduled from this CPU */ static DEFINE_PER_CPU(int, wq_rr_cpu_last); /* * Local execution of unbound work items is no longer guaranteed. The * following always forces round-robin CPU selection on unbound work items * to uncover usages which depend on it. */ #ifdef CONFIG_DEBUG_WQ_FORCE_RR_CPU static bool wq_debug_force_rr_cpu = true; #else static bool wq_debug_force_rr_cpu = false; #endif module_param_named(debug_force_rr_cpu, wq_debug_force_rr_cpu, bool, 0644); /* to raise softirq for the BH worker pools on other CPUs */ static DEFINE_PER_CPU_SHARED_ALIGNED(struct irq_work [NR_STD_WORKER_POOLS], bh_pool_irq_works); /* the BH worker pools */ static DEFINE_PER_CPU_SHARED_ALIGNED(struct worker_pool [NR_STD_WORKER_POOLS], bh_worker_pools); /* the per-cpu worker pools */ static DEFINE_PER_CPU_SHARED_ALIGNED(struct worker_pool [NR_STD_WORKER_POOLS], cpu_worker_pools); static DEFINE_IDR(worker_pool_idr); /* PR: idr of all pools */ /* PL: hash of all unbound pools keyed by pool->attrs */ static DEFINE_HASHTABLE(unbound_pool_hash, UNBOUND_POOL_HASH_ORDER); /* I: attributes used when instantiating standard unbound pools on demand */ static struct workqueue_attrs *unbound_std_wq_attrs[NR_STD_WORKER_POOLS]; /* I: attributes used when instantiating ordered pools on demand */ static struct workqueue_attrs *ordered_wq_attrs[NR_STD_WORKER_POOLS]; /* * I: kthread_worker to release pwq's. pwq release needs to be bounced to a * process context while holding a pool lock. Bounce to a dedicated kthread * worker to avoid A-A deadlocks. */ static struct kthread_worker *pwq_release_worker __ro_after_init; struct workqueue_struct *system_wq __ro_after_init; EXPORT_SYMBOL(system_wq); struct workqueue_struct *system_percpu_wq __ro_after_init; EXPORT_SYMBOL(system_percpu_wq); struct workqueue_struct *system_highpri_wq __ro_after_init; EXPORT_SYMBOL_GPL(system_highpri_wq); struct workqueue_struct *system_long_wq __ro_after_init; EXPORT_SYMBOL_GPL(system_long_wq); struct workqueue_struct *system_unbound_wq __ro_after_init; EXPORT_SYMBOL_GPL(system_unbound_wq); struct workqueue_struct *system_dfl_wq __ro_after_init; EXPORT_SYMBOL_GPL(system_dfl_wq); struct workqueue_struct *system_freezable_wq __ro_after_init; EXPORT_SYMBOL_GPL(system_freezable_wq); struct workqueue_struct *system_power_efficient_wq __ro_after_init; EXPORT_SYMBOL_GPL(system_power_efficient_wq); struct workqueue_struct *system_freezable_power_efficient_wq __ro_after_init; EXPORT_SYMBOL_GPL(system_freezable_power_efficient_wq); struct workqueue_struct *system_bh_wq; EXPORT_SYMBOL_GPL(system_bh_wq); struct workqueue_struct *system_bh_highpri_wq; EXPORT_SYMBOL_GPL(system_bh_highpri_wq); static int worker_thread(void *__worker); static void workqueue_sysfs_unregister(struct workqueue_struct *wq); static void show_pwq(struct pool_workqueue *pwq); static void show_one_worker_pool(struct worker_pool *pool); #define CREATE_TRACE_POINTS #include <trace/events/workqueue.h> #define assert_rcu_or_pool_mutex() \ RCU_LOCKDEP_WARN(!rcu_read_lock_any_held() && \ !lockdep_is_held(&wq_pool_mutex), \ "RCU or wq_pool_mutex should be held") #define assert_rcu_or_wq_mutex_or_pool_mutex(wq) \ RCU_LOCKDEP_WARN(!rcu_read_lock_any_held() && \ !lockdep_is_held(&wq->mutex) && \ !lockdep_is_held(&wq_pool_mutex), \ "RCU, wq->mutex or wq_pool_mutex should be held") #define for_each_bh_worker_pool(pool, cpu) \ for ((pool) = &per_cpu(bh_worker_pools, cpu)[0]; \ (pool) < &per_cpu(bh_worker_pools, cpu)[NR_STD_WORKER_POOLS]; \ (pool)++) #define for_each_cpu_worker_pool(pool, cpu) \ for ((pool) = &per_cpu(cpu_worker_pools, cpu)[0]; \ (pool) < &per_cpu(cpu_worker_pools, cpu)[NR_STD_WORKER_POOLS]; \ (pool)++) /** * for_each_pool - iterate through all worker_pools in the system * @pool: iteration cursor * @pi: integer used for iteration * * This must be called either with wq_pool_mutex held or RCU read * locked. If the pool needs to be used beyond the locking in effect, the * caller is responsible for guaranteeing that the pool stays online. * * The if/else clause exists only for the lockdep assertion and can be * ignored. */ #define for_each_pool(pool, pi) \ idr_for_each_entry(&worker_pool_idr, pool, pi) \ if (({ assert_rcu_or_pool_mutex(); false; })) { } \ else /** * for_each_pool_worker - iterate through all workers of a worker_pool * @worker: iteration cursor * @pool: worker_pool to iterate workers of * * This must be called with wq_pool_attach_mutex. * * The if/else clause exists only for the lockdep assertion and can be * ignored. */ #define for_each_pool_worker(worker, pool) \ list_for_each_entry((worker), &(pool)->workers, node) \ if (({ lockdep_assert_held(&wq_pool_attach_mutex); false; })) { } \ else /** * for_each_pwq - iterate through all pool_workqueues of the specified workqueue * @pwq: iteration cursor * @wq: the target workqueue * * This must be called either with wq->mutex held or RCU read locked. * If the pwq needs to be used beyond the locking in effect, the caller is * responsible for guaranteeing that the pwq stays online. * * The if/else clause exists only for the lockdep assertion and can be * ignored. */ #define for_each_pwq(pwq, wq) \ list_for_each_entry_rcu((pwq), &(wq)->pwqs, pwqs_node, \ lockdep_is_held(&(wq->mutex))) #ifdef CONFIG_DEBUG_OBJECTS_WORK static const struct debug_obj_descr work_debug_descr; static void *work_debug_hint(void *addr) { return ((struct work_struct *) addr)->func; } static bool work_is_static_object(void *addr) { struct work_struct *work = addr; return test_bit(WORK_STRUCT_STATIC_BIT, work_data_bits(work)); } /* * fixup_init is called when: * - an active object is initialized */ static bool work_fixup_init(void *addr, enum debug_obj_state state) { struct work_struct *work = addr; switch (state) { case ODEBUG_STATE_ACTIVE: cancel_work_sync(work); debug_object_init(work, &work_debug_descr); return true; default: return false; } } /* * fixup_free is called when: * - an active object is freed */ static bool work_fixup_free(void *addr, enum debug_obj_state state) { struct work_struct *work = addr; switch (state) { case ODEBUG_STATE_ACTIVE: cancel_work_sync(work); debug_object_free(work, &work_debug_descr); return true; default: return false; } } static const struct debug_obj_descr work_debug_descr = { .name = "work_struct", .debug_hint = work_debug_hint, .is_static_object = work_is_static_object, .fixup_init = work_fixup_init, .fixup_free = work_fixup_free, }; static inline void debug_work_activate(struct work_struct *work) { debug_object_activate(work, &work_debug_descr); } static inline void debug_work_deactivate(struct work_struct *work) { debug_object_deactivate(work, &work_debug_descr); } void __init_work(struct work_struct *work, int onstack) { if (onstack) debug_object_init_on_stack(work, &work_debug_descr); else debug_object_init(work, &work_debug_descr); } EXPORT_SYMBOL_GPL(__init_work); void destroy_work_on_stack(struct work_struct *work) { debug_object_free(work, &work_debug_descr); } EXPORT_SYMBOL_GPL(destroy_work_on_stack); void destroy_delayed_work_on_stack(struct delayed_work *work) { timer_destroy_on_stack(&work->timer); debug_object_free(&work->work, &work_debug_descr); } EXPORT_SYMBOL_GPL(destroy_delayed_work_on_stack); #else static inline void debug_work_activate(struct work_struct *work) { } static inline void debug_work_deactivate(struct work_struct *work) { } #endif /** * worker_pool_assign_id - allocate ID and assign it to @pool * @pool: the pool pointer of interest * * Returns 0 if ID in [0, WORK_OFFQ_POOL_NONE) is allocated and assigned * successfully, -errno on failure. */ static int worker_pool_assign_id(struct worker_pool *pool) { int ret; lockdep_assert_held(&wq_pool_mutex); ret = idr_alloc(&worker_pool_idr, pool, 0, WORK_OFFQ_POOL_NONE, GFP_KERNEL); if (ret >= 0) { pool->id = ret; return 0; } return ret; } static struct pool_workqueue __rcu ** unbound_pwq_slot(struct workqueue_struct *wq, int cpu) { if (cpu >= 0) return per_cpu_ptr(wq->cpu_pwq, cpu); else return &wq->dfl_pwq; } /* @cpu < 0 for dfl_pwq */ static struct pool_workqueue *unbound_pwq(struct workqueue_struct *wq, int cpu) { return rcu_dereference_check(*unbound_pwq_slot(wq, cpu), lockdep_is_held(&wq_pool_mutex) || lockdep_is_held(&wq->mutex)); } /** * unbound_effective_cpumask - effective cpumask of an unbound workqueue * @wq: workqueue of interest * * @wq->unbound_attrs->cpumask contains the cpumask requested by the user which * is masked with wq_unbound_cpumask to determine the effective cpumask. The * default pwq is always mapped to the pool with the current effective cpumask. */ static struct cpumask *unbound_effective_cpumask(struct workqueue_struct *wq) { return unbound_pwq(wq, -1)->pool->attrs->__pod_cpumask; } static unsigned int work_color_to_flags(int color) { return color << WORK_STRUCT_COLOR_SHIFT; } static int get_work_color(unsigned long work_data) { return (work_data >> WORK_STRUCT_COLOR_SHIFT) & ((1 << WORK_STRUCT_COLOR_BITS) - 1); } static int work_next_color(int color) { return (color + 1) % WORK_NR_COLORS; } static unsigned long pool_offq_flags(struct worker_pool *pool) { return (pool->flags & POOL_BH) ? WORK_OFFQ_BH : 0; } /* * While queued, %WORK_STRUCT_PWQ is set and non flag bits of a work's data * contain the pointer to the queued pwq. Once execution starts, the flag * is cleared and the high bits contain OFFQ flags and pool ID. * * set_work_pwq(), set_work_pool_and_clear_pending() and mark_work_canceling() * can be used to set the pwq, pool or clear work->data. These functions should * only be called while the work is owned - ie. while the PENDING bit is set. * * get_work_pool() and get_work_pwq() can be used to obtain the pool or pwq * corresponding to a work. Pool is available once the work has been * queued anywhere after initialization until it is sync canceled. pwq is * available only while the work item is queued. */ static inline void set_work_data(struct work_struct *work, unsigned long data) { WARN_ON_ONCE(!work_pending(work)); atomic_long_set(&work->data, data | work_static(work)); } static void set_work_pwq(struct work_struct *work, struct pool_workqueue *pwq, unsigned long flags) { set_work_data(work, (unsigned long)pwq | WORK_STRUCT_PENDING | WORK_STRUCT_PWQ | flags); } static void set_work_pool_and_keep_pending(struct work_struct *work, int pool_id, unsigned long flags) { set_work_data(work, ((unsigned long)pool_id << WORK_OFFQ_POOL_SHIFT) | WORK_STRUCT_PENDING | flags); } static void set_work_pool_and_clear_pending(struct work_struct *work, int pool_id, unsigned long flags) { /* * The following wmb is paired with the implied mb in * test_and_set_bit(PENDING) and ensures all updates to @work made * here are visible to and precede any updates by the next PENDING * owner. */ smp_wmb(); set_work_data(work, ((unsigned long)pool_id << WORK_OFFQ_POOL_SHIFT) | flags); /* * The following mb guarantees that previous clear of a PENDING bit * will not be reordered with any speculative LOADS or STORES from * work->current_func, which is executed afterwards. This possible * reordering can lead to a missed execution on attempt to queue * the same @work. E.g. consider this case: * * CPU#0 CPU#1 * ---------------------------- -------------------------------- * * 1 STORE event_indicated * 2 queue_work_on() { * 3 test_and_set_bit(PENDING) * 4 } set_..._and_clear_pending() { * 5 set_work_data() # clear bit * 6 smp_mb() * 7 work->current_func() { * 8 LOAD event_indicated * } * * Without an explicit full barrier speculative LOAD on line 8 can * be executed before CPU#0 does STORE on line 1. If that happens, * CPU#0 observes the PENDING bit is still set and new execution of * a @work is not queued in a hope, that CPU#1 will eventually * finish the queued @work. Meanwhile CPU#1 does not see * event_indicated is set, because speculative LOAD was executed * before actual STORE. */ smp_mb(); } static inline struct pool_workqueue *work_struct_pwq(unsigned long data) { return (struct pool_workqueue *)(data & WORK_STRUCT_PWQ_MASK); } static struct pool_workqueue *get_work_pwq(struct work_struct *work) { unsigned long data = atomic_long_read(&work->data); if (data & WORK_STRUCT_PWQ) return work_struct_pwq(data); else return NULL; } /** * get_work_pool - return the worker_pool a given work was associated with * @work: the work item of interest * * Pools are created and destroyed under wq_pool_mutex, and allows read * access under RCU read lock. As such, this function should be * called under wq_pool_mutex or inside of a rcu_read_lock() region. * * All fields of the returned pool are accessible as long as the above * mentioned locking is in effect. If the returned pool needs to be used * beyond the critical section, the caller is responsible for ensuring the * returned pool is and stays online. * * Return: The worker_pool @work was last associated with. %NULL if none. */ static struct worker_pool *get_work_pool(struct work_struct *work) { unsigned long data = atomic_long_read(&work->data); int pool_id; assert_rcu_or_pool_mutex(); if (data & WORK_STRUCT_PWQ) return work_struct_pwq(data)->pool; pool_id = data >> WORK_OFFQ_POOL_SHIFT; if (pool_id == WORK_OFFQ_POOL_NONE) return NULL; return idr_find(&worker_pool_idr, pool_id); } static unsigned long shift_and_mask(unsigned long v, u32 shift, u32 bits) { return (v >> shift) & ((1U << bits) - 1); } static void work_offqd_unpack(struct work_offq_data *offqd, unsigned long data) { WARN_ON_ONCE(data & WORK_STRUCT_PWQ); offqd->pool_id = shift_and_mask(data, WORK_OFFQ_POOL_SHIFT, WORK_OFFQ_POOL_BITS); offqd->disable = shift_and_mask(data, WORK_OFFQ_DISABLE_SHIFT, WORK_OFFQ_DISABLE_BITS); offqd->flags = data & WORK_OFFQ_FLAG_MASK; } static unsigned long work_offqd_pack_flags(struct work_offq_data *offqd) { return ((unsigned long)offqd->disable << WORK_OFFQ_DISABLE_SHIFT) | ((unsigned long)offqd->flags); } /* * Policy functions. These define the policies on how the global worker * pools are managed. Unless noted otherwise, these functions assume that * they're being called with pool->lock held. */ /* * Need to wake up a worker? Called from anything but currently * running workers. * * Note that, because unbound workers never contribute to nr_running, this * function will always return %true for unbound pools as long as the * worklist isn't empty. */ static bool need_more_worker(struct worker_pool *pool) { return !list_empty(&pool->worklist) && !pool->nr_running; } /* Can I start working? Called from busy but !running workers. */ static bool may_start_working(struct worker_pool *pool) { return pool->nr_idle; } /* Do I need to keep working? Called from currently running workers. */ static bool keep_working(struct worker_pool *pool) { return !list_empty(&pool->worklist) && (pool->nr_running <= 1); } /* Do we need a new worker? Called from manager. */ static bool need_to_create_worker(struct worker_pool *pool) { return need_more_worker(pool) && !may_start_working(pool); } /* Do we have too many workers and should some go away? */ static bool too_many_workers(struct worker_pool *pool) { bool managing = pool->flags & POOL_MANAGER_ACTIVE; int nr_idle = pool->nr_idle + managing; /* manager is considered idle */ int nr_busy = pool->nr_workers - nr_idle; return nr_idle > 2 && (nr_idle - 2) * MAX_IDLE_WORKERS_RATIO >= nr_busy; } /** * worker_set_flags - set worker flags and adjust nr_running accordingly * @worker: self * @flags: flags to set * * Set @flags in @worker->flags and adjust nr_running accordingly. */ static inline void worker_set_flags(struct worker *worker, unsigned int flags) { struct worker_pool *pool = worker->pool; lockdep_assert_held(&pool->lock); /* If transitioning into NOT_RUNNING, adjust nr_running. */ if ((flags & WORKER_NOT_RUNNING) && !(worker->flags & WORKER_NOT_RUNNING)) { pool->nr_running--; } worker->flags |= flags; } /** * worker_clr_flags - clear worker flags and adjust nr_running accordingly * @worker: self * @flags: flags to clear * * Clear @flags in @worker->flags and adjust nr_running accordingly. */ static inline void worker_clr_flags(struct worker *worker, unsigned int flags) { struct worker_pool *pool = worker->pool; unsigned int oflags = worker->flags; lockdep_assert_held(&pool->lock); worker->flags &= ~flags; /* * If transitioning out of NOT_RUNNING, increment nr_running. Note * that the nested NOT_RUNNING is not a noop. NOT_RUNNING is mask * of multiple flags, not a single flag. */ if ((flags & WORKER_NOT_RUNNING) && (oflags & WORKER_NOT_RUNNING)) if (!(worker->flags & WORKER_NOT_RUNNING)) pool->nr_running++; } /* Return the first idle worker. Called with pool->lock held. */ static struct worker *first_idle_worker(struct worker_pool *pool) { if (unlikely(list_empty(&pool->idle_list))) return NULL; return list_first_entry(&pool->idle_list, struct worker, entry); } /** * worker_enter_idle - enter idle state * @worker: worker which is entering idle state * * @worker is entering idle state. Update stats and idle timer if * necessary. * * LOCKING: * raw_spin_lock_irq(pool->lock). */ static void worker_enter_idle(struct worker *worker) { struct worker_pool *pool = worker->pool; if (WARN_ON_ONCE(worker->flags & WORKER_IDLE) || WARN_ON_ONCE(!list_empty(&worker->entry) && (worker->hentry.next || worker->hentry.pprev))) return; /* can't use worker_set_flags(), also called from create_worker() */ worker->flags |= WORKER_IDLE; pool->nr_idle++; worker->last_active = jiffies; /* idle_list is LIFO */ list_add(&worker->entry, &pool->idle_list); if (too_many_workers(pool) && !timer_pending(&pool->idle_timer)) mod_timer(&pool->idle_timer, jiffies + IDLE_WORKER_TIMEOUT); /* Sanity check nr_running. */ WARN_ON_ONCE(pool->nr_workers == pool->nr_idle && pool->nr_running); } /** * worker_leave_idle - leave idle state * @worker: worker which is leaving idle state * * @worker is leaving idle state. Update stats. * * LOCKING: * raw_spin_lock_irq(pool->lock). */ static void worker_leave_idle(struct worker *worker) { struct worker_pool *pool = worker->pool; if (WARN_ON_ONCE(!(worker->flags & WORKER_IDLE))) return; worker_clr_flags(worker, WORKER_IDLE); pool->nr_idle--; list_del_init(&worker->entry); } /** * find_worker_executing_work - find worker which is executing a work * @pool: pool of interest * @work: work to find worker for * * Find a worker which is executing @work on @pool by searching * @pool->busy_hash which is keyed by the address of @work. For a worker * to match, its current execution should match the address of @work and * its work function. This is to avoid unwanted dependency between * unrelated work executions through a work item being recycled while still * being executed. * * This is a bit tricky. A work item may be freed once its execution * starts and nothing prevents the freed area from being recycled for * another work item. If the same work item address ends up being reused * before the original execution finishes, workqueue will identify the * recycled work item as currently executing and make it wait until the * current execution finishes, introducing an unwanted dependency. * * This function checks the work item address and work function to avoid * false positives. Note that this isn't complete as one may construct a * work function which can introduce dependency onto itself through a * recycled work item. Well, if somebody wants to shoot oneself in the * foot that badly, there's only so much we can do, and if such deadlock * actually occurs, it should be easy to locate the culprit work function. * * CONTEXT: * raw_spin_lock_irq(pool->lock). * * Return: * Pointer to worker which is executing @work if found, %NULL * otherwise. */ static struct worker *find_worker_executing_work(struct worker_pool *pool, struct work_struct *work) { struct worker *worker; hash_for_each_possible(pool->busy_hash, worker, hentry, (unsigned long)work) if (worker->current_work == work && worker->current_func == work->func) return worker; return NULL; } /** * move_linked_works - move linked works to a list * @work: start of series of works to be scheduled * @head: target list to append @work to * @nextp: out parameter for nested worklist walking * * Schedule linked works starting from @work to @head. Work series to be * scheduled starts at @work and includes any consecutive work with * WORK_STRUCT_LINKED set in its predecessor. See assign_work() for details on * @nextp. * * CONTEXT: * raw_spin_lock_irq(pool->lock). */ static void move_linked_works(struct work_struct *work, struct list_head *head, struct work_struct **nextp) { struct work_struct *n; /* * Linked worklist will always end before the end of the list, * use NULL for list head. */ list_for_each_entry_safe_from(work, n, NULL, entry) { list_move_tail(&work->entry, head); if (!(*work_data_bits(work) & WORK_STRUCT_LINKED)) break; } /* * If we're already inside safe list traversal and have moved * multiple works to the scheduled queue, the next position * needs to be updated. */ if (nextp) *nextp = n; } /** * assign_work - assign a work item and its linked work items to a worker * @work: work to assign * @worker: worker to assign to * @nextp: out parameter for nested worklist walking * * Assign @work and its linked work items to @worker. If @work is already being * executed by another worker in the same pool, it'll be punted there. * * If @nextp is not NULL, it's updated to point to the next work of the last * scheduled work. This allows assign_work() to be nested inside * list_for_each_entry_safe(). * * Returns %true if @work was successfully assigned to @worker. %false if @work * was punted to another worker already executing it. */ static bool assign_work(struct work_struct *work, struct worker *worker, struct work_struct **nextp) { struct worker_pool *pool = worker->pool; struct worker *collision; lockdep_assert_held(&pool->lock); /* * A single work shouldn't be executed concurrently by multiple workers. * __queue_work() ensures that @work doesn't jump to a different pool * while still running in the previous pool. Here, we should ensure that * @work is not executed concurrently by multiple workers from the same * pool. Check whether anyone is already processing the work. If so, * defer the work to the currently executing one. */ collision = find_worker_executing_work(pool, work); if (unlikely(collision)) { move_linked_works(work, &collision->scheduled, nextp); return false; } move_linked_works(work, &worker->scheduled, nextp); return true; } static struct irq_work *bh_pool_irq_work(struct worker_pool *pool) { int high = pool->attrs->nice == HIGHPRI_NICE_LEVEL ? 1 : 0; return &per_cpu(bh_pool_irq_works, pool->cpu)[high]; } static void kick_bh_pool(struct worker_pool *pool) { #ifdef CONFIG_SMP /* see drain_dead_softirq_workfn() for BH_DRAINING */ if (unlikely(pool->cpu != smp_processor_id() && !(pool->flags & POOL_BH_DRAINING))) { irq_work_queue_on(bh_pool_irq_work(pool), pool->cpu); return; } #endif if (pool->attrs->nice == HIGHPRI_NICE_LEVEL) raise_softirq_irqoff(HI_SOFTIRQ); else raise_softirq_irqoff(TASKLET_SOFTIRQ); } /** * kick_pool - wake up an idle worker if necessary * @pool: pool to kick * * @pool may have pending work items. Wake up worker if necessary. Returns * whether a worker was woken up. */ static bool kick_pool(struct worker_pool *pool) { struct worker *worker = first_idle_worker(pool); struct task_struct *p; lockdep_assert_held(&pool->lock); if (!need_more_worker(pool) || !worker) return false; if (pool->flags & POOL_BH) { kick_bh_pool(pool); return true; } p = worker->task; #ifdef CONFIG_SMP /* * Idle @worker is about to execute @work and waking up provides an * opportunity to migrate @worker at a lower cost by setting the task's * wake_cpu field. Let's see if we want to move @worker to improve * execution locality. * * We're waking the worker that went idle the latest and there's some * chance that @worker is marked idle but hasn't gone off CPU yet. If * so, setting the wake_cpu won't do anything. As this is a best-effort * optimization and the race window is narrow, let's leave as-is for * now. If this becomes pronounced, we can skip over workers which are * still on cpu when picking an idle worker. * * If @pool has non-strict affinity, @worker might have ended up outside * its affinity scope. Repatriate. */ if (!pool->attrs->affn_strict && !cpumask_test_cpu(p->wake_cpu, pool->attrs->__pod_cpumask)) { struct work_struct *work = list_first_entry(&pool->worklist, struct work_struct, entry); int wake_cpu = cpumask_any_and_distribute(pool->attrs->__pod_cpumask, cpu_online_mask); if (wake_cpu < nr_cpu_ids) { p->wake_cpu = wake_cpu; get_work_pwq(work)->stats[PWQ_STAT_REPATRIATED]++; } } #endif wake_up_process(p); return true; } #ifdef CONFIG_WQ_CPU_INTENSIVE_REPORT /* * Concurrency-managed per-cpu work items that hog CPU for longer than * wq_cpu_intensive_thresh_us trigger the automatic CPU_INTENSIVE mechanism, * which prevents them from stalling other concurrency-managed work items. If a * work function keeps triggering this mechanism, it's likely that the work item * should be using an unbound workqueue instead. * * wq_cpu_intensive_report() tracks work functions which trigger such conditions * and report them so that they can be examined and converted to use unbound * workqueues as appropriate. To avoid flooding the console, each violating work * function is tracked and reported with exponential backoff. */ #define WCI_MAX_ENTS 128 struct wci_ent { work_func_t func; atomic64_t cnt; struct hlist_node hash_node; }; static struct wci_ent wci_ents[WCI_MAX_ENTS]; static int wci_nr_ents; static DEFINE_RAW_SPINLOCK(wci_lock); static DEFINE_HASHTABLE(wci_hash, ilog2(WCI_MAX_ENTS)); static struct wci_ent *wci_find_ent(work_func_t func) { struct wci_ent *ent; hash_for_each_possible_rcu(wci_hash, ent, hash_node, (unsigned long)func) { if (ent->func == func) return ent; } return NULL; } static void wq_cpu_intensive_report(work_func_t func) { struct wci_ent *ent; restart: ent = wci_find_ent(func); if (ent) { u64 cnt; /* * Start reporting from the warning_thresh and back off * exponentially. */ cnt = atomic64_inc_return_relaxed(&ent->cnt); if (wq_cpu_intensive_warning_thresh && cnt >= wq_cpu_intensive_warning_thresh && is_power_of_2(cnt + 1 - wq_cpu_intensive_warning_thresh)) printk_deferred(KERN_WARNING "workqueue: %ps hogged CPU for >%luus %llu times, consider switching to WQ_UNBOUND\n", ent->func, wq_cpu_intensive_thresh_us, atomic64_read(&ent->cnt)); return; } /* * @func is a new violation. Allocate a new entry for it. If wcn_ents[] * is exhausted, something went really wrong and we probably made enough * noise already. */ if (wci_nr_ents >= WCI_MAX_ENTS) return; raw_spin_lock(&wci_lock); if (wci_nr_ents >= WCI_MAX_ENTS) { raw_spin_unlock(&wci_lock); return; } if (wci_find_ent(func)) { raw_spin_unlock(&wci_lock); goto restart; } ent = &wci_ents[wci_nr_ents++]; ent->func = func; atomic64_set(&ent->cnt, 0); hash_add_rcu(wci_hash, &ent->hash_node, (unsigned long)func); raw_spin_unlock(&wci_lock); goto restart; } #else /* CONFIG_WQ_CPU_INTENSIVE_REPORT */ static void wq_cpu_intensive_report(work_func_t func) {} #endif /* CONFIG_WQ_CPU_INTENSIVE_REPORT */ /** * wq_worker_running - a worker is running again * @task: task waking up * * This function is called when a worker returns from schedule() */ void wq_worker_running(struct task_struct *task) { struct worker *worker = kthread_data(task); if (!READ_ONCE(worker->sleeping)) return; /* * If preempted by unbind_workers() between the WORKER_NOT_RUNNING check * and the nr_running increment below, we may ruin the nr_running reset * and leave with an unexpected pool->nr_running == 1 on the newly unbound * pool. Protect against such race. */ preempt_disable(); if (!(worker->flags & WORKER_NOT_RUNNING)) worker->pool->nr_running++; preempt_enable(); /* * CPU intensive auto-detection cares about how long a work item hogged * CPU without sleeping. Reset the starting timestamp on wakeup. */ worker->current_at = worker->task->se.sum_exec_runtime; WRITE_ONCE(worker->sleeping, 0); } /** * wq_worker_sleeping - a worker is going to sleep * @task: task going to sleep * * This function is called from schedule() when a busy worker is * going to sleep. */ void wq_worker_sleeping(struct task_struct *task) { struct worker *worker = kthread_data(task); struct worker_pool *pool; /* * Rescuers, which may not have all the fields set up like normal * workers, also reach here, let's not access anything before * checking NOT_RUNNING. */ if (worker->flags & WORKER_NOT_RUNNING) return; pool = worker->pool; /* Return if preempted before wq_worker_running() was reached */ if (READ_ONCE(worker->sleeping)) return; WRITE_ONCE(worker->sleeping, 1); raw_spin_lock_irq(&pool->lock); /* * Recheck in case unbind_workers() preempted us. We don't * want to decrement nr_running after the worker is unbound * and nr_running has been reset. */ if (worker->flags & WORKER_NOT_RUNNING) { raw_spin_unlock_irq(&pool->lock); return; } pool->nr_running--; if (kick_pool(pool)) worker->current_pwq->stats[PWQ_STAT_CM_WAKEUP]++; raw_spin_unlock_irq(&pool->lock); } /** * wq_worker_tick - a scheduler tick occurred while a kworker is running * @task: task currently running * * Called from sched_tick(). We're in the IRQ context and the current * worker's fields which follow the 'K' locking rule can be accessed safely. */ void wq_worker_tick(struct task_struct *task) { struct worker *worker = kthread_data(task); struct pool_workqueue *pwq = worker->current_pwq; struct worker_pool *pool = worker->pool; if (!pwq) return; pwq->stats[PWQ_STAT_CPU_TIME] += TICK_USEC; if (!wq_cpu_intensive_thresh_us) return; /* * If the current worker is concurrency managed and hogged the CPU for * longer than wq_cpu_intensive_thresh_us, it's automatically marked * CPU_INTENSIVE to avoid stalling other concurrency-managed work items. * * Set @worker->sleeping means that @worker is in the process of * switching out voluntarily and won't be contributing to * @pool->nr_running until it wakes up. As wq_worker_sleeping() also * decrements ->nr_running, setting CPU_INTENSIVE here can lead to * double decrements. The task is releasing the CPU anyway. Let's skip. * We probably want to make this prettier in the future. */ if ((worker->flags & WORKER_NOT_RUNNING) || READ_ONCE(worker->sleeping) || worker->task->se.sum_exec_runtime - worker->current_at < wq_cpu_intensive_thresh_us * NSEC_PER_USEC) return; raw_spin_lock(&pool->lock); worker_set_flags(worker, WORKER_CPU_INTENSIVE); wq_cpu_intensive_report(worker->current_func); pwq->stats[PWQ_STAT_CPU_INTENSIVE]++; if (kick_pool(pool)) pwq->stats[PWQ_STAT_CM_WAKEUP]++; raw_spin_unlock(&pool->lock); } /** * wq_worker_last_func - retrieve worker's last work function * @task: Task to retrieve last work function of. * * Determine the last function a worker executed. This is called from * the scheduler to get a worker's last known identity. * * CONTEXT: * raw_spin_lock_irq(rq->lock) * * This function is called during schedule() when a kworker is going * to sleep. It's used by psi to identify aggregation workers during * dequeuing, to allow periodic aggregation to shut-off when that * worker is the last task in the system or cgroup to go to sleep. * * As this function doesn't involve any workqueue-related locking, it * only returns stable values when called from inside the scheduler's * queuing and dequeuing paths, when @task, which must be a kworker, * is guaranteed to not be processing any works. * * Return: * The last work function %current executed as a worker, NULL if it * hasn't executed any work yet. */ work_func_t wq_worker_last_func(struct task_struct *task) { struct worker *worker = kthread_data(task); return worker->last_func; } /** * wq_node_nr_active - Determine wq_node_nr_active to use * @wq: workqueue of interest * @node: NUMA node, can be %NUMA_NO_NODE * * Determine wq_node_nr_active to use for @wq on @node. Returns: * * - %NULL for per-cpu workqueues as they don't need to use shared nr_active. * * - node_nr_active[nr_node_ids] if @node is %NUMA_NO_NODE. * * - Otherwise, node_nr_active[@node]. */ static struct wq_node_nr_active *wq_node_nr_active(struct workqueue_struct *wq, int node) { if (!(wq->flags & WQ_UNBOUND)) return NULL; if (node == NUMA_NO_NODE) node = nr_node_ids; return wq->node_nr_active[node]; } /** * wq_update_node_max_active - Update per-node max_actives to use * @wq: workqueue to update * @off_cpu: CPU that's going down, -1 if a CPU is not going down * * Update @wq->node_nr_active[]->max. @wq must be unbound. max_active is * distributed among nodes according to the proportions of numbers of online * cpus. The result is always between @wq->min_active and max_active. */ static void wq_update_node_max_active(struct workqueue_struct *wq, int off_cpu) { struct cpumask *effective = unbound_effective_cpumask(wq); int min_active = READ_ONCE(wq->min_active); int max_active = READ_ONCE(wq->max_active); int total_cpus, node; lockdep_assert_held(&wq->mutex); if (!wq_topo_initialized) return; if (off_cpu >= 0 && !cpumask_test_cpu(off_cpu, effective)) off_cpu = -1; total_cpus = cpumask_weight_and(effective, cpu_online_mask); if (off_cpu >= 0) total_cpus--; /* If all CPUs of the wq get offline, use the default values */ if (unlikely(!total_cpus)) { for_each_node(node) wq_node_nr_active(wq, node)->max = min_active; wq_node_nr_active(wq, NUMA_NO_NODE)->max = max_active; return; } for_each_node(node) { int node_cpus; node_cpus = cpumask_weight_and(effective, cpumask_of_node(node)); if (off_cpu >= 0 && cpu_to_node(off_cpu) == node) node_cpus--; wq_node_nr_active(wq, node)->max = clamp(DIV_ROUND_UP(max_active * node_cpus, total_cpus), min_active, max_active); } wq_node_nr_active(wq, NUMA_NO_NODE)->max = max_active; } /** * get_pwq - get an extra reference on the specified pool_workqueue * @pwq: pool_workqueue to get * * Obtain an extra reference on @pwq. The caller should guarantee that * @pwq has positive refcnt and be holding the matching pool->lock. */ static void get_pwq(struct pool_workqueue *pwq) { lockdep_assert_held(&pwq->pool->lock); WARN_ON_ONCE(pwq->refcnt <= 0); pwq->refcnt++; } /** * put_pwq - put a pool_workqueue reference * @pwq: pool_workqueue to put * * Drop a reference of @pwq. If its refcnt reaches zero, schedule its * destruction. The caller should be holding the matching pool->lock. */ static void put_pwq(struct pool_workqueue *pwq) { lockdep_assert_held(&pwq->pool->lock); if (likely(--pwq->refcnt)) return; /* * @pwq can't be released under pool->lock, bounce to a dedicated * kthread_worker to avoid A-A deadlocks. */ kthread_queue_work(pwq_release_worker, &pwq->release_work); } /** * put_pwq_unlocked - put_pwq() with surrounding pool lock/unlock * @pwq: pool_workqueue to put (can be %NULL) * * put_pwq() with locking. This function also allows %NULL @pwq. */ static void put_pwq_unlocked(struct pool_workqueue *pwq) { if (pwq) { /* * As both pwqs and pools are RCU protected, the * following lock operations are safe. */ raw_spin_lock_irq(&pwq->pool->lock); put_pwq(pwq); raw_spin_unlock_irq(&pwq->pool->lock); } } static bool pwq_is_empty(struct pool_workqueue *pwq) { return !pwq->nr_active && list_empty(&pwq->inactive_works); } static void __pwq_activate_work(struct pool_workqueue *pwq, struct work_struct *work) { unsigned long *wdb = work_data_bits(work); WARN_ON_ONCE(!(*wdb & WORK_STRUCT_INACTIVE)); trace_workqueue_activate_work(work); if (list_empty(&pwq->pool->worklist)) pwq->pool->watchdog_ts = jiffies; move_linked_works(work, &pwq->pool->worklist, NULL); __clear_bit(WORK_STRUCT_INACTIVE_BIT, wdb); } static bool tryinc_node_nr_active(struct wq_node_nr_active *nna) { int max = READ_ONCE(nna->max); int old = atomic_read(&nna->nr); do { if (old >= max) return false; } while (!atomic_try_cmpxchg_relaxed(&nna->nr, &old, old + 1)); return true; } /** * pwq_tryinc_nr_active - Try to increment nr_active for a pwq * @pwq: pool_workqueue of interest * @fill: max_active may have increased, try to increase concurrency level * * Try to increment nr_active for @pwq. Returns %true if an nr_active count is * successfully obtained. %false otherwise. */ static bool pwq_tryinc_nr_active(struct pool_workqueue *pwq, bool fill) { struct workqueue_struct *wq = pwq->wq; struct worker_pool *pool = pwq->pool; struct wq_node_nr_active *nna = wq_node_nr_active(wq, pool->node); bool obtained = false; lockdep_assert_held(&pool->lock); if (!nna) { /* BH or per-cpu workqueue, pwq->nr_active is sufficient */ obtained = pwq->nr_active < READ_ONCE(wq->max_active); goto out; } if (unlikely(pwq->plugged)) return false; /* * Unbound workqueue uses per-node shared nr_active $nna. If @pwq is * already waiting on $nna, pwq_dec_nr_active() will maintain the * concurrency level. Don't jump the line. * * We need to ignore the pending test after max_active has increased as * pwq_dec_nr_active() can only maintain the concurrency level but not * increase it. This is indicated by @fill. */ if (!list_empty(&pwq->pending_node) && likely(!fill)) goto out; obtained = tryinc_node_nr_active(nna); if (obtained) goto out; /* * Lockless acquisition failed. Lock, add ourself to $nna->pending_pwqs * and try again. The smp_mb() is paired with the implied memory barrier * of atomic_dec_return() in pwq_dec_nr_active() to ensure that either * we see the decremented $nna->nr or they see non-empty * $nna->pending_pwqs. */ raw_spin_lock(&nna->lock); if (list_empty(&pwq->pending_node)) list_add_tail(&pwq->pending_node, &nna->pending_pwqs); else if (likely(!fill)) goto out_unlock; smp_mb(); obtained = tryinc_node_nr_active(nna); /* * If @fill, @pwq might have already been pending. Being spuriously * pending in cold paths doesn't affect anything. Let's leave it be. */ if (obtained && likely(!fill)) list_del_init(&pwq->pending_node); out_unlock: raw_spin_unlock(&nna->lock); out: if (obtained) pwq->nr_active++; return obtained; } /** * pwq_activate_first_inactive - Activate the first inactive work item on a pwq * @pwq: pool_workqueue of interest * @fill: max_active may have increased, try to increase concurrency level * * Activate the first inactive work item of @pwq if available and allowed by * max_active limit. * * Returns %true if an inactive work item has been activated. %false if no * inactive work item is found or max_active limit is reached. */ static bool pwq_activate_first_inactive(struct pool_workqueue *pwq, bool fill) { struct work_struct *work = list_first_entry_or_null(&pwq->inactive_works, struct work_struct, entry); if (work && pwq_tryinc_nr_active(pwq, fill)) { __pwq_activate_work(pwq, work); return true; } else { return false; } } /** * unplug_oldest_pwq - unplug the oldest pool_workqueue * @wq: workqueue_struct where its oldest pwq is to be unplugged * * This function should only be called for ordered workqueues where only the * oldest pwq is unplugged, the others are plugged to suspend execution to * ensure proper work item ordering:: * * dfl_pwq --------------+ [P] - plugged * | * v * pwqs -> A -> B [P] -> C [P] (newest) * | | | * 1 3 5 * | | | * 2 4 6 * * When the oldest pwq is drained and removed, this function should be called * to unplug the next oldest one to start its work item execution. Note that * pwq's are linked into wq->pwqs with the oldest first, so the first one in * the list is the oldest. */ static void unplug_oldest_pwq(struct workqueue_struct *wq) { struct pool_workqueue *pwq; lockdep_assert_held(&wq->mutex); /* Caller should make sure that pwqs isn't empty before calling */ pwq = list_first_entry_or_null(&wq->pwqs, struct pool_workqueue, pwqs_node); raw_spin_lock_irq(&pwq->pool->lock); if (pwq->plugged) { pwq->plugged = false; if (pwq_activate_first_inactive(pwq, true)) kick_pool(pwq->pool); } raw_spin_unlock_irq(&pwq->pool->lock); } /** * node_activate_pending_pwq - Activate a pending pwq on a wq_node_nr_active * @nna: wq_node_nr_active to activate a pending pwq for * @caller_pool: worker_pool the caller is locking * * Activate a pwq in @nna->pending_pwqs. Called with @caller_pool locked. * @caller_pool may be unlocked and relocked to lock other worker_pools. */ static void node_activate_pending_pwq(struct wq_node_nr_active *nna, struct worker_pool *caller_pool) { struct worker_pool *locked_pool = caller_pool; struct pool_workqueue *pwq; struct work_struct *work; lockdep_assert_held(&caller_pool->lock); raw_spin_lock(&nna->lock); retry: pwq = list_first_entry_or_null(&nna->pending_pwqs, struct pool_workqueue, pending_node); if (!pwq) goto out_unlock; /* * If @pwq is for a different pool than @locked_pool, we need to lock * @pwq->pool->lock. Let's trylock first. If unsuccessful, do the unlock * / lock dance. For that, we also need to release @nna->lock as it's * nested inside pool locks. */ if (pwq->pool != locked_pool) { raw_spin_unlock(&locked_pool->lock); locked_pool = pwq->pool; if (!raw_spin_trylock(&locked_pool->lock)) { raw_spin_unlock(&nna->lock); raw_spin_lock(&locked_pool->lock); raw_spin_lock(&nna->lock); goto retry; } } /* * $pwq may not have any inactive work items due to e.g. cancellations. * Drop it from pending_pwqs and see if there's another one. */ work = list_first_entry_or_null(&pwq->inactive_works, struct work_struct, entry); if (!work) { list_del_init(&pwq->pending_node); goto retry; } /* * Acquire an nr_active count and activate the inactive work item. If * $pwq still has inactive work items, rotate it to the end of the * pending_pwqs so that we round-robin through them. This means that * inactive work items are not activated in queueing order which is fine * given that there has never been any ordering across different pwqs. */ if (likely(tryinc_node_nr_active(nna))) { pwq->nr_active++; __pwq_activate_work(pwq, work); if (list_empty(&pwq->inactive_works)) list_del_init(&pwq->pending_node); else list_move_tail(&pwq->pending_node, &nna->pending_pwqs); /* if activating a foreign pool, make sure it's running */ if (pwq->pool != caller_pool) kick_pool(pwq->pool); } out_unlock: raw_spin_unlock(&nna->lock); if (locked_pool != caller_pool) { raw_spin_unlock(&locked_pool->lock); raw_spin_lock(&caller_pool->lock); } } /** * pwq_dec_nr_active - Retire an active count * @pwq: pool_workqueue of interest * * Decrement @pwq's nr_active and try to activate the first inactive work item. * For unbound workqueues, this function may temporarily drop @pwq->pool->lock. */ static void pwq_dec_nr_active(struct pool_workqueue *pwq) { struct worker_pool *pool = pwq->pool; struct wq_node_nr_active *nna = wq_node_nr_active(pwq->wq, pool->node); lockdep_assert_held(&pool->lock); /* * @pwq->nr_active should be decremented for both percpu and unbound * workqueues. */ pwq->nr_active--; /* * For a percpu workqueue, it's simple. Just need to kick the first * inactive work item on @pwq itself. */ if (!nna) { pwq_activate_first_inactive(pwq, false); return; } /* * If @pwq is for an unbound workqueue, it's more complicated because * multiple pwqs and pools may be sharing the nr_active count. When a * pwq needs to wait for an nr_active count, it puts itself on * $nna->pending_pwqs. The following atomic_dec_return()'s implied * memory barrier is paired with smp_mb() in pwq_tryinc_nr_active() to * guarantee that either we see non-empty pending_pwqs or they see * decremented $nna->nr. * * $nna->max may change as CPUs come online/offline and @pwq->wq's * max_active gets updated. However, it is guaranteed to be equal to or * larger than @pwq->wq->min_active which is above zero unless freezing. * This maintains the forward progress guarantee. */ if (atomic_dec_return(&nna->nr) >= READ_ONCE(nna->max)) return; if (!list_empty(&nna->pending_pwqs)) node_activate_pending_pwq(nna, pool); } /** * pwq_dec_nr_in_flight - decrement pwq's nr_in_flight * @pwq: pwq of interest * @work_data: work_data of work which left the queue * * A work either has completed or is removed from pending queue, * decrement nr_in_flight of its pwq and handle workqueue flushing. * * NOTE: * For unbound workqueues, this function may temporarily drop @pwq->pool->lock * and thus should be called after all other state updates for the in-flight * work item is complete. * * CONTEXT: * raw_spin_lock_irq(pool->lock). */ static void pwq_dec_nr_in_flight(struct pool_workqueue *pwq, unsigned long work_data) { int color = get_work_color(work_data); if (!(work_data & WORK_STRUCT_INACTIVE)) pwq_dec_nr_active(pwq); pwq->nr_in_flight[color]--; /* is flush in progress and are we at the flushing tip? */ if (likely(pwq->flush_color != color)) goto out_put; /* are there still in-flight works? */ if (pwq->nr_in_flight[color]) goto out_put; /* this pwq is done, clear flush_color */ pwq->flush_color = -1; /* * If this was the last pwq, wake up the first flusher. It * will handle the rest. */ if (atomic_dec_and_test(&pwq->wq->nr_pwqs_to_flush)) complete(&pwq->wq->first_flusher->done); out_put: put_pwq(pwq); } /** * try_to_grab_pending - steal work item from worklist and disable irq * @work: work item to steal * @cflags: %WORK_CANCEL_ flags * @irq_flags: place to store irq state * * Try to grab PENDING bit of @work. This function can handle @work in any * stable state - idle, on timer or on worklist. * * Return: * * ======== ================================================================ * 1 if @work was pending and we successfully stole PENDING * 0 if @work was idle and we claimed PENDING * -EAGAIN if PENDING couldn't be grabbed at the moment, safe to busy-retry * ======== ================================================================ * * Note: * On >= 0 return, the caller owns @work's PENDING bit. To avoid getting * interrupted while holding PENDING and @work off queue, irq must be * disabled on entry. This, combined with delayed_work->timer being * irqsafe, ensures that we return -EAGAIN for finite short period of time. * * On successful return, >= 0, irq is disabled and the caller is * responsible for releasing it using local_irq_restore(*@irq_flags). * * This function is safe to call from any context including IRQ handler. */ static int try_to_grab_pending(struct work_struct *work, u32 cflags, unsigned long *irq_flags) { struct worker_pool *pool; struct pool_workqueue *pwq; local_irq_save(*irq_flags); /* try to steal the timer if it exists */ if (cflags & WORK_CANCEL_DELAYED) { struct delayed_work *dwork = to_delayed_work(work); /* * dwork->timer is irqsafe. If timer_delete() fails, it's * guaranteed that the timer is not queued anywhere and not * running on the local CPU. */ if (likely(timer_delete(&dwork->timer))) return 1; } /* try to claim PENDING the normal way */ if (!test_and_set_bit(WORK_STRUCT_PENDING_BIT, work_data_bits(work))) return 0; rcu_read_lock(); /* * The queueing is in progress, or it is already queued. Try to * steal it from ->worklist without clearing WORK_STRUCT_PENDING. */ pool = get_work_pool(work); if (!pool) goto fail; raw_spin_lock(&pool->lock); /* * work->data is guaranteed to point to pwq only while the work * item is queued on pwq->wq, and both updating work->data to point * to pwq on queueing and to pool on dequeueing are done under * pwq->pool->lock. This in turn guarantees that, if work->data * points to pwq which is associated with a locked pool, the work * item is currently queued on that pool. */ pwq = get_work_pwq(work); if (pwq && pwq->pool == pool) { unsigned long work_data = *work_data_bits(work); debug_work_deactivate(work); /* * A cancelable inactive work item must be in the * pwq->inactive_works since a queued barrier can't be * canceled (see the comments in insert_wq_barrier()). * * An inactive work item cannot be deleted directly because * it might have linked barrier work items which, if left * on the inactive_works list, will confuse pwq->nr_active * management later on and cause stall. Move the linked * barrier work items to the worklist when deleting the grabbed * item. Also keep WORK_STRUCT_INACTIVE in work_data, so that * it doesn't participate in nr_active management in later * pwq_dec_nr_in_flight(). */ if (work_data & WORK_STRUCT_INACTIVE) move_linked_works(work, &pwq->pool->worklist, NULL); list_del_init(&work->entry); /* * work->data points to pwq iff queued. Let's point to pool. As * this destroys work->data needed by the next step, stash it. */ set_work_pool_and_keep_pending(work, pool->id, pool_offq_flags(pool)); /* must be the last step, see the function comment */ pwq_dec_nr_in_flight(pwq, work_data); raw_spin_unlock(&pool->lock); rcu_read_unlock(); return 1; } raw_spin_unlock(&pool->lock); fail: rcu_read_unlock(); local_irq_restore(*irq_flags); return -EAGAIN; } /** * work_grab_pending - steal work item from worklist and disable irq * @work: work item to steal * @cflags: %WORK_CANCEL_ flags * @irq_flags: place to store IRQ state * * Grab PENDING bit of @work. @work can be in any stable state - idle, on timer * or on worklist. * * Can be called from any context. IRQ is disabled on return with IRQ state * stored in *@irq_flags. The caller is responsible for re-enabling it using * local_irq_restore(). * * Returns %true if @work was pending. %false if idle. */ static bool work_grab_pending(struct work_struct *work, u32 cflags, unsigned long *irq_flags) { int ret; while (true) { ret = try_to_grab_pending(work, cflags, irq_flags); if (ret >= 0) return ret; cpu_relax(); } } /** * insert_work - insert a work into a pool * @pwq: pwq @work belongs to * @work: work to insert * @head: insertion point * @extra_flags: extra WORK_STRUCT_* flags to set * * Insert @work which belongs to @pwq after @head. @extra_flags is or'd to * work_struct flags. * * CONTEXT: * raw_spin_lock_irq(pool->lock). */ static void insert_work(struct pool_workqueue *pwq, struct work_struct *work, struct list_head *head, unsigned int extra_flags) { debug_work_activate(work); /* record the work call stack in order to print it in KASAN reports */ kasan_record_aux_stack(work); /* we own @work, set data and link */ set_work_pwq(work, pwq, extra_flags); list_add_tail(&work->entry, head); get_pwq(pwq); } /* * Test whether @work is being queued from another work executing on the * same workqueue. */ static bool is_chained_work(struct workqueue_struct *wq) { struct worker *worker; worker = current_wq_worker(); /* * Return %true iff I'm a worker executing a work item on @wq. If * I'm @worker, it's safe to dereference it without locking. */ return worker && worker->current_pwq->wq == wq; } /* * When queueing an unbound work item to a wq, prefer local CPU if allowed * by wq_unbound_cpumask. Otherwise, round robin among the allowed ones to * avoid perturbing sensitive tasks. */ static int wq_select_unbound_cpu(int cpu) { int new_cpu; if (likely(!wq_debug_force_rr_cpu)) { if (cpumask_test_cpu(cpu, wq_unbound_cpumask)) return cpu; } else { pr_warn_once("workqueue: round-robin CPU selection forced, expect performance impact\n"); } new_cpu = __this_cpu_read(wq_rr_cpu_last); new_cpu = cpumask_next_and_wrap(new_cpu, wq_unbound_cpumask, cpu_online_mask); if (unlikely(new_cpu >= nr_cpu_ids)) return cpu; __this_cpu_write(wq_rr_cpu_last, new_cpu); return new_cpu; } static void __queue_work(int cpu, struct workqueue_struct *wq, struct work_struct *work) { struct pool_workqueue *pwq; struct worker_pool *last_pool, *pool; unsigned int work_flags; unsigned int req_cpu = cpu; /* * While a work item is PENDING && off queue, a task trying to * steal the PENDING will busy-loop waiting for it to either get * queued or lose PENDING. Grabbing PENDING and queueing should * happen with IRQ disabled. */ lockdep_assert_irqs_disabled(); /* * For a draining wq, only works from the same workqueue are * allowed. The __WQ_DESTROYING helps to spot the issue that * queues a new work item to a wq after destroy_workqueue(wq). */ if (unlikely(wq->flags & (__WQ_DESTROYING | __WQ_DRAINING) && WARN_ONCE(!is_chained_work(wq), "workqueue: cannot queue %ps on wq %s\n", work->func, wq->name))) { return; } rcu_read_lock(); retry: /* pwq which will be used unless @work is executing elsewhere */ if (req_cpu == WORK_CPU_UNBOUND) { if (wq->flags & WQ_UNBOUND) cpu = wq_select_unbound_cpu(raw_smp_processor_id()); else cpu = raw_smp_processor_id(); } pwq = rcu_dereference(*per_cpu_ptr(wq->cpu_pwq, cpu)); pool = pwq->pool; /* * If @work was previously on a different pool, it might still be * running there, in which case the work needs to be queued on that * pool to guarantee non-reentrancy. * * For ordered workqueue, work items must be queued on the newest pwq * for accurate order management. Guaranteed order also guarantees * non-reentrancy. See the comments above unplug_oldest_pwq(). */ last_pool = get_work_pool(work); if (last_pool && last_pool != pool && !(wq->flags & __WQ_ORDERED)) { struct worker *worker; raw_spin_lock(&last_pool->lock); worker = find_worker_executing_work(last_pool, work); if (worker && worker->current_pwq->wq == wq) { pwq = worker->current_pwq; pool = pwq->pool; WARN_ON_ONCE(pool != last_pool); } else { /* meh... not running there, queue here */ raw_spin_unlock(&last_pool->lock); raw_spin_lock(&pool->lock); } } else { raw_spin_lock(&pool->lock); } /* * pwq is determined and locked. For unbound pools, we could have raced * with pwq release and it could already be dead. If its refcnt is zero, * repeat pwq selection. Note that unbound pwqs never die without * another pwq replacing it in cpu_pwq or while work items are executing * on it, so the retrying is guaranteed to make forward-progress. */ if (unlikely(!pwq->refcnt)) { if (wq->flags & WQ_UNBOUND) { raw_spin_unlock(&pool->lock); cpu_relax(); goto retry; } /* oops */ WARN_ONCE(true, "workqueue: per-cpu pwq for %s on cpu%d has 0 refcnt", wq->name, cpu); } /* pwq determined, queue */ trace_workqueue_queue_work(req_cpu, pwq, work); if (WARN_ON(!list_empty(&work->entry))) goto out; pwq->nr_in_flight[pwq->work_color]++; work_flags = work_color_to_flags(pwq->work_color); /* * Limit the number of concurrently active work items to max_active. * @work must also queue behind existing inactive work items to maintain * ordering when max_active changes. See wq_adjust_max_active(). */ if (list_empty(&pwq->inactive_works) && pwq_tryinc_nr_active(pwq, false)) { if (list_empty(&pool->worklist)) pool->watchdog_ts = jiffies; trace_workqueue_activate_work(work); insert_work(pwq, work, &pool->worklist, work_flags); kick_pool(pool); } else { work_flags |= WORK_STRUCT_INACTIVE; insert_work(pwq, work, &pwq->inactive_works, work_flags); } out: raw_spin_unlock(&pool->lock); rcu_read_unlock(); } static bool clear_pending_if_disabled(struct work_struct *work) { unsigned long data = *work_data_bits(work); struct work_offq_data offqd; if (likely((data & WORK_STRUCT_PWQ) || !(data & WORK_OFFQ_DISABLE_MASK))) return false; work_offqd_unpack(&offqd, data); set_work_pool_and_clear_pending(work, offqd.pool_id, work_offqd_pack_flags(&offqd)); return true; } /** * queue_work_on - queue work on specific cpu * @cpu: CPU number to execute work on * @wq: workqueue to use * @work: work to queue * * We queue the work to a specific CPU, the caller must ensure it * can't go away. Callers that fail to ensure that the specified * CPU cannot go away will execute on a randomly chosen CPU. * But note well that callers specifying a CPU that never has been * online will get a splat. * * Return: %false if @work was already on a queue, %true otherwise. */ bool queue_work_on(int cpu, struct workqueue_struct *wq, struct work_struct *work) { bool ret = false; unsigned long irq_flags; local_irq_save(irq_flags); if (!test_and_set_bit(WORK_STRUCT_PENDING_BIT, work_data_bits(work)) && !clear_pending_if_disabled(work)) { __queue_work(cpu, wq, work); ret = true; } local_irq_restore(irq_flags); return ret; } EXPORT_SYMBOL(queue_work_on); /** * select_numa_node_cpu - Select a CPU based on NUMA node * @node: NUMA node ID that we want to select a CPU from * * This function will attempt to find a "random" cpu available on a given * node. If there are no CPUs available on the given node it will return * WORK_CPU_UNBOUND indicating that we should just schedule to any * available CPU if we need to schedule this work. */ static int select_numa_node_cpu(int node) { int cpu; /* Delay binding to CPU if node is not valid or online */ if (node < 0 || node >= MAX_NUMNODES || !node_online(node)) return WORK_CPU_UNBOUND; /* Use local node/cpu if we are already there */ cpu = raw_smp_processor_id(); if (node == cpu_to_node(cpu)) return cpu; /* Use "random" otherwise know as "first" online CPU of node */ cpu = cpumask_any_and(cpumask_of_node(node), cpu_online_mask); /* If CPU is valid return that, otherwise just defer */ return cpu < nr_cpu_ids ? cpu : WORK_CPU_UNBOUND; } /** * queue_work_node - queue work on a "random" cpu for a given NUMA node * @node: NUMA node that we are targeting the work for * @wq: workqueue to use * @work: work to queue * * We queue the work to a "random" CPU within a given NUMA node. The basic * idea here is to provide a way to somehow associate work with a given * NUMA node. * * This function will only make a best effort attempt at getting this onto * the right NUMA node. If no node is requested or the requested node is * offline then we just fall back to standard queue_work behavior. * * Currently the "random" CPU ends up being the first available CPU in the * intersection of cpu_online_mask and the cpumask of the node, unless we * are running on the node. In that case we just use the current CPU. * * Return: %false if @work was already on a queue, %true otherwise. */ bool queue_work_node(int node, struct workqueue_struct *wq, struct work_struct *work) { unsigned long irq_flags; bool ret = false; /* * This current implementation is specific to unbound workqueues. * Specifically we only return the first available CPU for a given * node instead of cycling through individual CPUs within the node. * * If this is used with a per-cpu workqueue then the logic in * workqueue_select_cpu_near would need to be updated to allow for * some round robin type logic. */ WARN_ON_ONCE(!(wq->flags & WQ_UNBOUND)); local_irq_save(irq_flags); if (!test_and_set_bit(WORK_STRUCT_PENDING_BIT, work_data_bits(work)) && !clear_pending_if_disabled(work)) { int cpu = select_numa_node_cpu(node); __queue_work(cpu, wq, work); ret = true; } local_irq_restore(irq_flags); return ret; } EXPORT_SYMBOL_GPL(queue_work_node); void delayed_work_timer_fn(struct timer_list *t) { struct delayed_work *dwork = timer_container_of(dwork, t, timer); /* should have been called from irqsafe timer with irq already off */ __queue_work(dwork->cpu, dwork->wq, &dwork->work); } EXPORT_SYMBOL(delayed_work_timer_fn); static void __queue_delayed_work(int cpu, struct workqueue_struct *wq, struct delayed_work *dwork, unsigned long delay) { struct timer_list *timer = &dwork->timer; struct work_struct *work = &dwork->work; WARN_ON_ONCE(!wq); WARN_ON_ONCE(timer->function != delayed_work_timer_fn); WARN_ON_ONCE(timer_pending(timer)); WARN_ON_ONCE(!list_empty(&work->entry)); /* * If @delay is 0, queue @dwork->work immediately. This is for * both optimization and correctness. The earliest @timer can * expire is on the closest next tick and delayed_work users depend * on that there's no such delay when @delay is 0. */ if (!delay) { __queue_work(cpu, wq, &dwork->work); return; } WARN_ON_ONCE(cpu != WORK_CPU_UNBOUND && !cpu_online(cpu)); dwork->wq = wq; dwork->cpu = cpu; timer->expires = jiffies + delay; if (housekeeping_enabled(HK_TYPE_TIMER)) { /* If the current cpu is a housekeeping cpu, use it. */ cpu = smp_processor_id(); if (!housekeeping_test_cpu(cpu, HK_TYPE_TIMER)) cpu = housekeeping_any_cpu(HK_TYPE_TIMER); add_timer_on(timer, cpu); } else { if (likely(cpu == WORK_CPU_UNBOUND)) add_timer_global(timer); else add_timer_on(timer, cpu); } } /** * queue_delayed_work_on - queue work on specific CPU after delay * @cpu: CPU number to execute work on * @wq: workqueue to use * @dwork: work to queue * @delay: number of jiffies to wait before queueing * * We queue the delayed_work to a specific CPU, for non-zero delays the * caller must ensure it is online and can't go away. Callers that fail * to ensure this, may get @dwork->timer queued to an offlined CPU and * this will prevent queueing of @dwork->work unless the offlined CPU * becomes online again. * * Return: %false if @work was already on a queue, %true otherwise. If * @delay is zero and @dwork is idle, it will be scheduled for immediate * execution. */ bool queue_delayed_work_on(int cpu, struct workqueue_struct *wq, struct delayed_work *dwork, unsigned long delay) { struct work_struct *work = &dwork->work; bool ret = false; unsigned long irq_flags; /* read the comment in __queue_work() */ local_irq_save(irq_flags); if (!test_and_set_bit(WORK_STRUCT_PENDING_BIT, work_data_bits(work)) && !clear_pending_if_disabled(work)) { __queue_delayed_work(cpu, wq, dwork, delay); ret = true; } local_irq_restore(irq_flags); return ret; } EXPORT_SYMBOL(queue_delayed_work_on); /** * mod_delayed_work_on - modify delay of or queue a delayed work on specific CPU * @cpu: CPU number to execute work on * @wq: workqueue to use * @dwork: work to queue * @delay: number of jiffies to wait before queueing * * If @dwork is idle, equivalent to queue_delayed_work_on(); otherwise, * modify @dwork's timer so that it expires after @delay. If @delay is * zero, @work is guaranteed to be scheduled immediately regardless of its * current state. * * Return: %false if @dwork was idle and queued, %true if @dwork was * pending and its timer was modified. * * This function is safe to call from any context including IRQ handler. * See try_to_grab_pending() for details. */ bool mod_delayed_work_on(int cpu, struct workqueue_struct *wq, struct delayed_work *dwork, unsigned long delay) { unsigned long irq_flags; bool ret; ret = work_grab_pending(&dwork->work, WORK_CANCEL_DELAYED, &irq_flags); if (!clear_pending_if_disabled(&dwork->work)) __queue_delayed_work(cpu, wq, dwork, delay); local_irq_restore(irq_flags); return ret; } EXPORT_SYMBOL_GPL(mod_delayed_work_on); static void rcu_work_rcufn(struct rcu_head *rcu) { struct rcu_work *rwork = container_of(rcu, struct rcu_work, rcu); /* read the comment in __queue_work() */ local_irq_disable(); __queue_work(WORK_CPU_UNBOUND, rwork->wq, &rwork->work); local_irq_enable(); } /** * queue_rcu_work - queue work after a RCU grace period * @wq: workqueue to use * @rwork: work to queue * * Return: %false if @rwork was already pending, %true otherwise. Note * that a full RCU grace period is guaranteed only after a %true return. * While @rwork is guaranteed to be executed after a %false return, the * execution may happen before a full RCU grace period has passed. */ bool queue_rcu_work(struct workqueue_struct *wq, struct rcu_work *rwork) { struct work_struct *work = &rwork->work; /* * rcu_work can't be canceled or disabled. Warn if the user reached * inside @rwork and disabled the inner work. */ if (!test_and_set_bit(WORK_STRUCT_PENDING_BIT, work_data_bits(work)) && !WARN_ON_ONCE(clear_pending_if_disabled(work))) { rwork->wq = wq; call_rcu_hurry(&rwork->rcu, rcu_work_rcufn); return true; } return false; } EXPORT_SYMBOL(queue_rcu_work); static struct worker *alloc_worker(int node) { struct worker *worker; worker = kzalloc_node(sizeof(*worker), GFP_KERNEL, node); if (worker) { INIT_LIST_HEAD(&worker->entry); INIT_LIST_HEAD(&worker->scheduled); INIT_LIST_HEAD(&worker->node); /* on creation a worker is in !idle && prep state */ worker->flags = WORKER_PREP; } return worker; } static cpumask_t *pool_allowed_cpus(struct worker_pool *pool) { if (pool->cpu < 0 && pool->attrs->affn_strict) return pool->attrs->__pod_cpumask; else return pool->attrs->cpumask; } /** * worker_attach_to_pool() - attach a worker to a pool * @worker: worker to be attached * @pool: the target pool * * Attach @worker to @pool. Once attached, the %WORKER_UNBOUND flag and * cpu-binding of @worker are kept coordinated with the pool across * cpu-[un]hotplugs. */ static void worker_attach_to_pool(struct worker *worker, struct worker_pool *pool) { mutex_lock(&wq_pool_attach_mutex); /* * The wq_pool_attach_mutex ensures %POOL_DISASSOCIATED remains stable * across this function. See the comments above the flag definition for * details. BH workers are, while per-CPU, always DISASSOCIATED. */ if (pool->flags & POOL_DISASSOCIATED) { worker->flags |= WORKER_UNBOUND; } else { WARN_ON_ONCE(pool->flags & POOL_BH); kthread_set_per_cpu(worker->task, pool->cpu); } if (worker->rescue_wq) set_cpus_allowed_ptr(worker->task, pool_allowed_cpus(pool)); list_add_tail(&worker->node, &pool->workers); worker->pool = pool; mutex_unlock(&wq_pool_attach_mutex); } static void unbind_worker(struct worker *worker) { lockdep_assert_held(&wq_pool_attach_mutex); kthread_set_per_cpu(worker->task, -1); if (cpumask_intersects(wq_unbound_cpumask, cpu_active_mask)) WARN_ON_ONCE(set_cpus_allowed_ptr(worker->task, wq_unbound_cpumask) < 0); else WARN_ON_ONCE(set_cpus_allowed_ptr(worker->task, cpu_possible_mask) < 0); } static void detach_worker(struct worker *worker) { lockdep_assert_held(&wq_pool_attach_mutex); unbind_worker(worker); list_del(&worker->node); } /** * worker_detach_from_pool() - detach a worker from its pool * @worker: worker which is attached to its pool * * Undo the attaching which had been done in worker_attach_to_pool(). The * caller worker shouldn't access to the pool after detached except it has * other reference to the pool. */ static void worker_detach_from_pool(struct worker *worker) { struct worker_pool *pool = worker->pool; /* there is one permanent BH worker per CPU which should never detach */ WARN_ON_ONCE(pool->flags & POOL_BH); mutex_lock(&wq_pool_attach_mutex); detach_worker(worker); worker->pool = NULL; mutex_unlock(&wq_pool_attach_mutex); /* clear leftover flags without pool->lock after it is detached */ worker->flags &= ~(WORKER_UNBOUND | WORKER_REBOUND); } static int format_worker_id(char *buf, size_t size, struct worker *worker, struct worker_pool *pool) { if (worker->rescue_wq) return scnprintf(buf, size, "kworker/R-%s", worker->rescue_wq->name); if (pool) { if (pool->cpu >= 0) return scnprintf(buf, size, "kworker/%d:%d%s", pool->cpu, worker->id, pool->attrs->nice < 0 ? "H" : ""); else return scnprintf(buf, size, "kworker/u%d:%d", pool->id, worker->id); } else { return scnprintf(buf, size, "kworker/dying"); } } /** * create_worker - create a new workqueue worker * @pool: pool the new worker will belong to * * Create and start a new worker which is attached to @pool. * * CONTEXT: * Might sleep. Does GFP_KERNEL allocations. * * Return: * Pointer to the newly created worker. */ static struct worker *create_worker(struct worker_pool *pool) { struct worker *worker; int id; /* ID is needed to determine kthread name */ id = ida_alloc(&pool->worker_ida, GFP_KERNEL); if (id < 0) { pr_err_once("workqueue: Failed to allocate a worker ID: %pe\n", ERR_PTR(id)); return NULL; } worker = alloc_worker(pool->node); if (!worker) { pr_err_once("workqueue: Failed to allocate a worker\n"); goto fail; } worker->id = id; if (!(pool->flags & POOL_BH)) { char id_buf[WORKER_ID_LEN]; format_worker_id(id_buf, sizeof(id_buf), worker, pool); worker->task = kthread_create_on_node(worker_thread, worker, pool->node, "%s", id_buf); if (IS_ERR(worker->task)) { if (PTR_ERR(worker->task) == -EINTR) { pr_err("workqueue: Interrupted when creating a worker thread \"%s\"\n", id_buf); } else { pr_err_once("workqueue: Failed to create a worker thread: %pe", worker->task); } goto fail; } set_user_nice(worker->task, pool->attrs->nice); kthread_bind_mask(worker->task, pool_allowed_cpus(pool)); } /* successful, attach the worker to the pool */ worker_attach_to_pool(worker, pool); /* start the newly created worker */ raw_spin_lock_irq(&pool->lock); worker->pool->nr_workers++; worker_enter_idle(worker); /* * @worker is waiting on a completion in kthread() and will trigger hung * check if not woken up soon. As kick_pool() is noop if @pool is empty, * wake it up explicitly. */ if (worker->task) wake_up_process(worker->task); raw_spin_unlock_irq(&pool->lock); return worker; fail: ida_free(&pool->worker_ida, id); kfree(worker); return NULL; } static void detach_dying_workers(struct list_head *cull_list) { struct worker *worker; list_for_each_entry(worker, cull_list, entry) detach_worker(worker); } static void reap_dying_workers(struct list_head *cull_list) { struct worker *worker, *tmp; list_for_each_entry_safe(worker, tmp, cull_list, entry) { list_del_init(&worker->entry); kthread_stop_put(worker->task); kfree(worker); } } /** * set_worker_dying - Tag a worker for destruction * @worker: worker to be destroyed * @list: transfer worker away from its pool->idle_list and into list * * Tag @worker for destruction and adjust @pool stats accordingly. The worker * should be idle. * * CONTEXT: * raw_spin_lock_irq(pool->lock). */ static void set_worker_dying(struct worker *worker, struct list_head *list) { struct worker_pool *pool = worker->pool; lockdep_assert_held(&pool->lock); lockdep_assert_held(&wq_pool_attach_mutex); /* sanity check frenzy */ if (WARN_ON(worker->current_work) || WARN_ON(!list_empty(&worker->scheduled)) || WARN_ON(!(worker->flags & WORKER_IDLE))) return; pool->nr_workers--; pool->nr_idle--; worker->flags |= WORKER_DIE; list_move(&worker->entry, list); /* get an extra task struct reference for later kthread_stop_put() */ get_task_struct(worker->task); } /** * idle_worker_timeout - check if some idle workers can now be deleted. * @t: The pool's idle_timer that just expired * * The timer is armed in worker_enter_idle(). Note that it isn't disarmed in * worker_leave_idle(), as a worker flicking between idle and active while its * pool is at the too_many_workers() tipping point would cause too much timer * housekeeping overhead. Since IDLE_WORKER_TIMEOUT is long enough, we just let * it expire and re-evaluate things from there. */ static void idle_worker_timeout(struct timer_list *t) { struct worker_pool *pool = timer_container_of(pool, t, idle_timer); bool do_cull = false; if (work_pending(&pool->idle_cull_work)) return; raw_spin_lock_irq(&pool->lock); if (too_many_workers(pool)) { struct worker *worker; unsigned long expires; /* idle_list is kept in LIFO order, check the last one */ worker = list_last_entry(&pool->idle_list, struct worker, entry); expires = worker->last_active + IDLE_WORKER_TIMEOUT; do_cull = !time_before(jiffies, expires); if (!do_cull) mod_timer(&pool->idle_timer, expires); } raw_spin_unlock_irq(&pool->lock); if (do_cull) queue_work(system_unbound_wq, &pool->idle_cull_work); } /** * idle_cull_fn - cull workers that have been idle for too long. * @work: the pool's work for handling these idle workers * * This goes through a pool's idle workers and gets rid of those that have been * idle for at least IDLE_WORKER_TIMEOUT seconds. * * We don't want to disturb isolated CPUs because of a pcpu kworker being * culled, so this also resets worker affinity. This requires a sleepable * context, hence the split between timer callback and work item. */ static void idle_cull_fn(struct work_struct *work) { struct worker_pool *pool = container_of(work, struct worker_pool, idle_cull_work); LIST_HEAD(cull_list); /* * Grabbing wq_pool_attach_mutex here ensures an already-running worker * cannot proceed beyong set_pf_worker() in its self-destruct path. * This is required as a previously-preempted worker could run after * set_worker_dying() has happened but before detach_dying_workers() did. */ mutex_lock(&wq_pool_attach_mutex); raw_spin_lock_irq(&pool->lock); while (too_many_workers(pool)) { struct worker *worker; unsigned long expires; worker = list_last_entry(&pool->idle_list, struct worker, entry); expires = worker->last_active + IDLE_WORKER_TIMEOUT; if (time_before(jiffies, expires)) { mod_timer(&pool->idle_timer, expires); break; } set_worker_dying(worker, &cull_list); } raw_spin_unlock_irq(&pool->lock); detach_dying_workers(&cull_list); mutex_unlock(&wq_pool_attach_mutex); reap_dying_workers(&cull_list); } static void send_mayday(struct work_struct *work) { struct pool_workqueue *pwq = get_work_pwq(work); struct workqueue_struct *wq = pwq->wq; lockdep_assert_held(&wq_mayday_lock); if (!wq->rescuer) return; /* mayday mayday mayday */ if (list_empty(&pwq->mayday_node)) { /* * If @pwq is for an unbound wq, its base ref may be put at * any time due to an attribute change. Pin @pwq until the * rescuer is done with it. */ get_pwq(pwq); list_add_tail(&pwq->mayday_node, &wq->maydays); wake_up_process(wq->rescuer->task); pwq->stats[PWQ_STAT_MAYDAY]++; } } static void pool_mayday_timeout(struct timer_list *t) { struct worker_pool *pool = timer_container_of(pool, t, mayday_timer); struct work_struct *work; raw_spin_lock_irq(&pool->lock); raw_spin_lock(&wq_mayday_lock); /* for wq->maydays */ if (need_to_create_worker(pool)) { /* * We've been trying to create a new worker but * haven't been successful. We might be hitting an * allocation deadlock. Send distress signals to * rescuers. */ list_for_each_entry(work, &pool->worklist, entry) send_mayday(work); } raw_spin_unlock(&wq_mayday_lock); raw_spin_unlock_irq(&pool->lock); mod_timer(&pool->mayday_timer, jiffies + MAYDAY_INTERVAL); } /** * maybe_create_worker - create a new worker if necessary * @pool: pool to create a new worker for * * Create a new worker for @pool if necessary. @pool is guaranteed to * have at least one idle worker on return from this function. If * creating a new worker takes longer than MAYDAY_INTERVAL, mayday is * sent to all rescuers with works scheduled on @pool to resolve * possible allocation deadlock. * * On return, need_to_create_worker() is guaranteed to be %false and * may_start_working() %true. * * LOCKING: * raw_spin_lock_irq(pool->lock) which may be released and regrabbed * multiple times. Does GFP_KERNEL allocations. Called only from * manager. */ static void maybe_create_worker(struct worker_pool *pool) __releases(&pool->lock) __acquires(&pool->lock) { restart: raw_spin_unlock_irq(&pool->lock); /* if we don't make progress in MAYDAY_INITIAL_TIMEOUT, call for help */ mod_timer(&pool->mayday_timer, jiffies + MAYDAY_INITIAL_TIMEOUT); while (true) { if (create_worker(pool) || !need_to_create_worker(pool)) break; schedule_timeout_interruptible(CREATE_COOLDOWN); if (!need_to_create_worker(pool)) break; } timer_delete_sync(&pool->mayday_timer); raw_spin_lock_irq(&pool->lock); /* * This is necessary even after a new worker was just successfully * created as @pool->lock was dropped and the new worker might have * already become busy. */ if (need_to_create_worker(pool)) goto restart; } /** * manage_workers - manage worker pool * @worker: self * * Assume the manager role and manage the worker pool @worker belongs * to. At any given time, there can be only zero or one manager per * pool. The exclusion is handled automatically by this function. * * The caller can safely start processing works on false return. On * true return, it's guaranteed that need_to_create_worker() is false * and may_start_working() is true. * * CONTEXT: * raw_spin_lock_irq(pool->lock) which may be released and regrabbed * multiple times. Does GFP_KERNEL allocations. * * Return: * %false if the pool doesn't need management and the caller can safely * start processing works, %true if management function was performed and * the conditions that the caller verified before calling the function may * no longer be true. */ static bool manage_workers(struct worker *worker) { struct worker_pool *pool = worker->pool; if (pool->flags & POOL_MANAGER_ACTIVE) return false; pool->flags |= POOL_MANAGER_ACTIVE; pool->manager = worker; maybe_create_worker(pool); pool->manager = NULL; pool->flags &= ~POOL_MANAGER_ACTIVE; rcuwait_wake_up(&manager_wait); return true; } /** * process_one_work - process single work * @worker: self * @work: work to process * * Process @work. This function contains all the logics necessary to * process a single work including synchronization against and * interaction with other workers on the same cpu, queueing and * flushing. As long as context requirement is met, any worker can * call this function to process a work. * * CONTEXT: * raw_spin_lock_irq(pool->lock) which is released and regrabbed. */ static void process_one_work(struct worker *worker, struct work_struct *work) __releases(&pool->lock) __acquires(&pool->lock) { struct pool_workqueue *pwq = get_work_pwq(work); struct worker_pool *pool = worker->pool; unsigned long work_data; int lockdep_start_depth, rcu_start_depth; bool bh_draining = pool->flags & POOL_BH_DRAINING; #ifdef CONFIG_LOCKDEP /* * It is permissible to free the struct work_struct from * inside the function that is called from it, this we need to * take into account for lockdep too. To avoid bogus "held * lock freed" warnings as well as problems when looking into * work->lockdep_map, make a copy and use that here. */ struct lockdep_map lockdep_map; lockdep_copy_map(&lockdep_map, &work->lockdep_map); #endif /* ensure we're on the correct CPU */ WARN_ON_ONCE(!(pool->flags & POOL_DISASSOCIATED) && raw_smp_processor_id() != pool->cpu); /* claim and dequeue */ debug_work_deactivate(work); hash_add(pool->busy_hash, &worker->hentry, (unsigned long)work); worker->current_work = work; worker->current_func = work->func; worker->current_pwq = pwq; if (worker->task) worker->current_at = worker->task->se.sum_exec_runtime; work_data = *work_data_bits(work); worker->current_color = get_work_color(work_data); /* * Record wq name for cmdline and debug reporting, may get * overridden through set_worker_desc(). */ strscpy(worker->desc, pwq->wq->name, WORKER_DESC_LEN); list_del_init(&work->entry); /* * CPU intensive works don't participate in concurrency management. * They're the scheduler's responsibility. This takes @worker out * of concurrency management and the next code block will chain * execution of the pending work items. */ if (unlikely(pwq->wq->flags & WQ_CPU_INTENSIVE)) worker_set_flags(worker, WORKER_CPU_INTENSIVE); /* * Kick @pool if necessary. It's always noop for per-cpu worker pools * since nr_running would always be >= 1 at this point. This is used to * chain execution of the pending work items for WORKER_NOT_RUNNING * workers such as the UNBOUND and CPU_INTENSIVE ones. */ kick_pool(pool); /* * Record the last pool and clear PENDING which should be the last * update to @work. Also, do this inside @pool->lock so that * PENDING and queued state changes happen together while IRQ is * disabled. */ set_work_pool_and_clear_pending(work, pool->id, pool_offq_flags(pool)); pwq->stats[PWQ_STAT_STARTED]++; raw_spin_unlock_irq(&pool->lock); rcu_start_depth = rcu_preempt_depth(); lockdep_start_depth = lockdep_depth(current); /* see drain_dead_softirq_workfn() */ if (!bh_draining) lock_map_acquire(pwq->wq->lockdep_map); lock_map_acquire(&lockdep_map); /* * Strictly speaking we should mark the invariant state without holding * any locks, that is, before these two lock_map_acquire()'s. * * However, that would result in: * * A(W1) * WFC(C) * A(W1) * C(C) * * Which would create W1->C->W1 dependencies, even though there is no * actual deadlock possible. There are two solutions, using a * read-recursive acquire on the work(queue) 'locks', but this will then * hit the lockdep limitation on recursive locks, or simply discard * these locks. * * AFAICT there is no possible deadlock scenario between the * flush_work() and complete() primitives (except for single-threaded * workqueues), so hiding them isn't a problem. */ lockdep_invariant_state(true); trace_workqueue_execute_start(work); worker->current_func(work); /* * While we must be careful to not use "work" after this, the trace * point will only record its address. */ trace_workqueue_execute_end(work, worker->current_func); lock_map_release(&lockdep_map); if (!bh_draining) lock_map_release(pwq->wq->lockdep_map); if (unlikely((worker->task && in_atomic()) || lockdep_depth(current) != lockdep_start_depth || rcu_preempt_depth() != rcu_start_depth)) { pr_err("BUG: workqueue leaked atomic, lock or RCU: %s[%d]\n" " preempt=0x%08x lock=%d->%d RCU=%d->%d workfn=%ps\n", current->comm, task_pid_nr(current), preempt_count(), lockdep_start_depth, lockdep_depth(current), rcu_start_depth, rcu_preempt_depth(), worker->current_func); debug_show_held_locks(current); dump_stack(); } /* * The following prevents a kworker from hogging CPU on !PREEMPTION * kernels, where a requeueing work item waiting for something to * happen could deadlock with stop_machine as such work item could * indefinitely requeue itself while all other CPUs are trapped in * stop_machine. At the same time, report a quiescent RCU state so * the same condition doesn't freeze RCU. */ if (worker->task) cond_resched(); raw_spin_lock_irq(&pool->lock); pwq->stats[PWQ_STAT_COMPLETED]++; /* * In addition to %WQ_CPU_INTENSIVE, @worker may also have been marked * CPU intensive by wq_worker_tick() if @work hogged CPU longer than * wq_cpu_intensive_thresh_us. Clear it. */ worker_clr_flags(worker, WORKER_CPU_INTENSIVE); /* tag the worker for identification in schedule() */ worker->last_func = worker->current_func; /* we're done with it, release */ hash_del(&worker->hentry); worker->current_work = NULL; worker->current_func = NULL; worker->current_pwq = NULL; worker->current_color = INT_MAX; /* must be the last step, see the function comment */ pwq_dec_nr_in_flight(pwq, work_data); } /** * process_scheduled_works - process scheduled works * @worker: self * * Process all scheduled works. Please note that the scheduled list * may change while processing a work, so this function repeatedly * fetches a work from the top and executes it. * * CONTEXT: * raw_spin_lock_irq(pool->lock) which may be released and regrabbed * multiple times. */ static void process_scheduled_works(struct worker *worker) { struct work_struct *work; bool first = true; while ((work = list_first_entry_or_null(&worker->scheduled, struct work_struct, entry))) { if (first) { worker->pool->watchdog_ts = jiffies; first = false; } process_one_work(worker, work); } } static void set_pf_worker(bool val) { mutex_lock(&wq_pool_attach_mutex); if (val) current->flags |= PF_WQ_WORKER; else current->flags &= ~PF_WQ_WORKER; mutex_unlock(&wq_pool_attach_mutex); } /** * worker_thread - the worker thread function * @__worker: self * * The worker thread function. All workers belong to a worker_pool - * either a per-cpu one or dynamic unbound one. These workers process all * work items regardless of their specific target workqueue. The only * exception is work items which belong to workqueues with a rescuer which * will be explained in rescuer_thread(). * * Return: 0 */ static int worker_thread(void *__worker) { struct worker *worker = __worker; struct worker_pool *pool = worker->pool; /* tell the scheduler that this is a workqueue worker */ set_pf_worker(true); woke_up: raw_spin_lock_irq(&pool->lock); /* am I supposed to die? */ if (unlikely(worker->flags & WORKER_DIE)) { raw_spin_unlock_irq(&pool->lock); set_pf_worker(false); /* * The worker is dead and PF_WQ_WORKER is cleared, worker->pool * shouldn't be accessed, reset it to NULL in case otherwise. */ worker->pool = NULL; ida_free(&pool->worker_ida, worker->id); return 0; } worker_leave_idle(worker); recheck: /* no more worker necessary? */ if (!need_more_worker(pool)) goto sleep; /* do we need to manage? */ if (unlikely(!may_start_working(pool)) && manage_workers(worker)) goto recheck; /* * ->scheduled list can only be filled while a worker is * preparing to process a work or actually processing it. * Make sure nobody diddled with it while I was sleeping. */ WARN_ON_ONCE(!list_empty(&worker->scheduled)); /* * Finish PREP stage. We're guaranteed to have at least one idle * worker or that someone else has already assumed the manager * role. This is where @worker starts participating in concurrency * management if applicable and concurrency management is restored * after being rebound. See rebind_workers() for details. */ worker_clr_flags(worker, WORKER_PREP | WORKER_REBOUND); do { struct work_struct *work = list_first_entry(&pool->worklist, struct work_struct, entry); if (assign_work(work, worker, NULL)) process_scheduled_works(worker); } while (keep_working(pool)); worker_set_flags(worker, WORKER_PREP); sleep: /* * pool->lock is held and there's no work to process and no need to * manage, sleep. Workers are woken up only while holding * pool->lock or from local cpu, so setting the current state * before releasing pool->lock is enough to prevent losing any * event. */ worker_enter_idle(worker); __set_current_state(TASK_IDLE); raw_spin_unlock_irq(&pool->lock); schedule(); goto woke_up; } /** * rescuer_thread - the rescuer thread function * @__rescuer: self * * Workqueue rescuer thread function. There's one rescuer for each * workqueue which has WQ_MEM_RECLAIM set. * * Regular work processing on a pool may block trying to create a new * worker which uses GFP_KERNEL allocation which has slight chance of * developing into deadlock if some works currently on the same queue * need to be processed to satisfy the GFP_KERNEL allocation. This is * the problem rescuer solves. * * When such condition is possible, the pool summons rescuers of all * workqueues which have works queued on the pool and let them process * those works so that forward progress can be guaranteed. * * This should happen rarely. * * Return: 0 */ static int rescuer_thread(void *__rescuer) { struct worker *rescuer = __rescuer; struct workqueue_struct *wq = rescuer->rescue_wq; bool should_stop; set_user_nice(current, RESCUER_NICE_LEVEL); /* * Mark rescuer as worker too. As WORKER_PREP is never cleared, it * doesn't participate in concurrency management. */ set_pf_worker(true); repeat: set_current_state(TASK_IDLE); /* * By the time the rescuer is requested to stop, the workqueue * shouldn't have any work pending, but @wq->maydays may still have * pwq(s) queued. This can happen by non-rescuer workers consuming * all the work items before the rescuer got to them. Go through * @wq->maydays processing before acting on should_stop so that the * list is always empty on exit. */ should_stop = kthread_should_stop(); /* see whether any pwq is asking for help */ raw_spin_lock_irq(&wq_mayday_lock); while (!list_empty(&wq->maydays)) { struct pool_workqueue *pwq = list_first_entry(&wq->maydays, struct pool_workqueue, mayday_node); struct worker_pool *pool = pwq->pool; struct work_struct *work, *n; __set_current_state(TASK_RUNNING); list_del_init(&pwq->mayday_node); raw_spin_unlock_irq(&wq_mayday_lock); worker_attach_to_pool(rescuer, pool); raw_spin_lock_irq(&pool->lock); /* * Slurp in all works issued via this workqueue and * process'em. */ WARN_ON_ONCE(!list_empty(&rescuer->scheduled)); list_for_each_entry_safe(work, n, &pool->worklist, entry) { if (get_work_pwq(work) == pwq && assign_work(work, rescuer, &n)) pwq->stats[PWQ_STAT_RESCUED]++; } if (!list_empty(&rescuer->scheduled)) { process_scheduled_works(rescuer); /* * The above execution of rescued work items could * have created more to rescue through * pwq_activate_first_inactive() or chained * queueing. Let's put @pwq back on mayday list so * that such back-to-back work items, which may be * being used to relieve memory pressure, don't * incur MAYDAY_INTERVAL delay inbetween. */ if (pwq->nr_active && need_to_create_worker(pool)) { raw_spin_lock(&wq_mayday_lock); /* * Queue iff we aren't racing destruction * and somebody else hasn't queued it already. */ if (wq->rescuer && list_empty(&pwq->mayday_node)) { get_pwq(pwq); list_add_tail(&pwq->mayday_node, &wq->maydays); } raw_spin_unlock(&wq_mayday_lock); } } /* * Leave this pool. Notify regular workers; otherwise, we end up * with 0 concurrency and stalling the execution. */ kick_pool(pool); raw_spin_unlock_irq(&pool->lock); worker_detach_from_pool(rescuer); /* * Put the reference grabbed by send_mayday(). @pool might * go away any time after it. */ put_pwq_unlocked(pwq); raw_spin_lock_irq(&wq_mayday_lock); } raw_spin_unlock_irq(&wq_mayday_lock); if (should_stop) { __set_current_state(TASK_RUNNING); set_pf_worker(false); return 0; } /* rescuers should never participate in concurrency management */ WARN_ON_ONCE(!(rescuer->flags & WORKER_NOT_RUNNING)); schedule(); goto repeat; } static void bh_worker(struct worker *worker) { struct worker_pool *pool = worker->pool; int nr_restarts = BH_WORKER_RESTARTS; unsigned long end = jiffies + BH_WORKER_JIFFIES; raw_spin_lock_irq(&pool->lock); worker_leave_idle(worker); /* * This function follows the structure of worker_thread(). See there for * explanations on each step. */ if (!need_more_worker(pool)) goto done; WARN_ON_ONCE(!list_empty(&worker->scheduled)); worker_clr_flags(worker, WORKER_PREP | WORKER_REBOUND); do { struct work_struct *work = list_first_entry(&pool->worklist, struct work_struct, entry); if (assign_work(work, worker, NULL)) process_scheduled_works(worker); } while (keep_working(pool) && --nr_restarts && time_before(jiffies, end)); worker_set_flags(worker, WORKER_PREP); done: worker_enter_idle(worker); kick_pool(pool); raw_spin_unlock_irq(&pool->lock); } /* * TODO: Convert all tasklet users to workqueue and use softirq directly. * * This is currently called from tasklet[_hi]action() and thus is also called * whenever there are tasklets to run. Let's do an early exit if there's nothing * queued. Once conversion from tasklet is complete, the need_more_worker() test * can be dropped. * * After full conversion, we'll add worker->softirq_action, directly use the * softirq action and obtain the worker pointer from the softirq_action pointer. */ void workqueue_softirq_action(bool highpri) { struct worker_pool *pool = &per_cpu(bh_worker_pools, smp_processor_id())[highpri]; if (need_more_worker(pool)) bh_worker(list_first_entry(&pool->workers, struct worker, node)); } struct wq_drain_dead_softirq_work { struct work_struct work; struct worker_pool *pool; struct completion done; }; static void drain_dead_softirq_workfn(struct work_struct *work) { struct wq_drain_dead_softirq_work *dead_work = container_of(work, struct wq_drain_dead_softirq_work, work); struct worker_pool *pool = dead_work->pool; bool repeat; /* * @pool's CPU is dead and we want to execute its still pending work * items from this BH work item which is running on a different CPU. As * its CPU is dead, @pool can't be kicked and, as work execution path * will be nested, a lockdep annotation needs to be suppressed. Mark * @pool with %POOL_BH_DRAINING for the special treatments. */ raw_spin_lock_irq(&pool->lock); pool->flags |= POOL_BH_DRAINING; raw_spin_unlock_irq(&pool->lock); bh_worker(list_first_entry(&pool->workers, struct worker, node)); raw_spin_lock_irq(&pool->lock); pool->flags &= ~POOL_BH_DRAINING; repeat = need_more_worker(pool); raw_spin_unlock_irq(&pool->lock); /* * bh_worker() might hit consecutive execution limit and bail. If there * still are pending work items, reschedule self and return so that we * don't hog this CPU's BH. */ if (repeat) { if (pool->attrs->nice == HIGHPRI_NICE_LEVEL) queue_work(system_bh_highpri_wq, work); else queue_work(system_bh_wq, work); } else { complete(&dead_work->done); } } /* * @cpu is dead. Drain the remaining BH work items on the current CPU. It's * possible to allocate dead_work per CPU and avoid flushing. However, then we * have to worry about draining overlapping with CPU coming back online or * nesting (one CPU's dead_work queued on another CPU which is also dead and so * on). Let's keep it simple and drain them synchronously. These are BH work * items which shouldn't be requeued on the same pool. Shouldn't take long. */ void workqueue_softirq_dead(unsigned int cpu) { int i; for (i = 0; i < NR_STD_WORKER_POOLS; i++) { struct worker_pool *pool = &per_cpu(bh_worker_pools, cpu)[i]; struct wq_drain_dead_softirq_work dead_work; if (!need_more_worker(pool)) continue; INIT_WORK_ONSTACK(&dead_work.work, drain_dead_softirq_workfn); dead_work.pool = pool; init_completion(&dead_work.done); if (pool->attrs->nice == HIGHPRI_NICE_LEVEL) queue_work(system_bh_highpri_wq, &dead_work.work); else queue_work(system_bh_wq, &dead_work.work); wait_for_completion(&dead_work.done); destroy_work_on_stack(&dead_work.work); } } /** * check_flush_dependency - check for flush dependency sanity * @target_wq: workqueue being flushed * @target_work: work item being flushed (NULL for workqueue flushes) * @from_cancel: are we called from the work cancel path * * %current is trying to flush the whole @target_wq or @target_work on it. * If this is not the cancel path (which implies work being flushed is either * already running, or will not be at all), check if @target_wq doesn't have * %WQ_MEM_RECLAIM and verify that %current is not reclaiming memory or running * on a workqueue which doesn't have %WQ_MEM_RECLAIM as that can break forward- * progress guarantee leading to a deadlock. */ static void check_flush_dependency(struct workqueue_struct *target_wq, struct work_struct *target_work, bool from_cancel) { work_func_t target_func; struct worker *worker; if (from_cancel || target_wq->flags & WQ_MEM_RECLAIM) return; worker = current_wq_worker(); target_func = target_work ? target_work->func : NULL; WARN_ONCE(current->flags & PF_MEMALLOC, "workqueue: PF_MEMALLOC task %d(%s) is flushing !WQ_MEM_RECLAIM %s:%ps", current->pid, current->comm, target_wq->name, target_func); WARN_ONCE(worker && ((worker->current_pwq->wq->flags & (WQ_MEM_RECLAIM | __WQ_LEGACY)) == WQ_MEM_RECLAIM), "workqueue: WQ_MEM_RECLAIM %s:%ps is flushing !WQ_MEM_RECLAIM %s:%ps", worker->current_pwq->wq->name, worker->current_func, target_wq->name, target_func); } struct wq_barrier { struct work_struct work; struct completion done; struct task_struct *task; /* purely informational */ }; static void wq_barrier_func(struct work_struct *work) { struct wq_barrier *barr = container_of(work, struct wq_barrier, work); complete(&barr->done); } /** * insert_wq_barrier - insert a barrier work * @pwq: pwq to insert barrier into * @barr: wq_barrier to insert * @target: target work to attach @barr to * @worker: worker currently executing @target, NULL if @target is not executing * * @barr is linked to @target such that @barr is completed only after * @target finishes execution. Please note that the ordering * guarantee is observed only with respect to @target and on the local * cpu. * * Currently, a queued barrier can't be canceled. This is because * try_to_grab_pending() can't determine whether the work to be * grabbed is at the head of the queue and thus can't clear LINKED * flag of the previous work while there must be a valid next work * after a work with LINKED flag set. * * Note that when @worker is non-NULL, @target may be modified * underneath us, so we can't reliably determine pwq from @target. * * CONTEXT: * raw_spin_lock_irq(pool->lock). */ static void insert_wq_barrier(struct pool_workqueue *pwq, struct wq_barrier *barr, struct work_struct *target, struct worker *worker) { static __maybe_unused struct lock_class_key bh_key, thr_key; unsigned int work_flags = 0; unsigned int work_color; struct list_head *head; /* * debugobject calls are safe here even with pool->lock locked * as we know for sure that this will not trigger any of the * checks and call back into the fixup functions where we * might deadlock. * * BH and threaded workqueues need separate lockdep keys to avoid * spuriously triggering "inconsistent {SOFTIRQ-ON-W} -> {IN-SOFTIRQ-W} * usage". */ INIT_WORK_ONSTACK_KEY(&barr->work, wq_barrier_func, (pwq->wq->flags & WQ_BH) ? &bh_key : &thr_key); __set_bit(WORK_STRUCT_PENDING_BIT, work_data_bits(&barr->work)); init_completion_map(&barr->done, &target->lockdep_map); barr->task = current; /* The barrier work item does not participate in nr_active. */ work_flags |= WORK_STRUCT_INACTIVE; /* * If @target is currently being executed, schedule the * barrier to the worker; otherwise, put it after @target. */ if (worker) { head = worker->scheduled.next; work_color = worker->current_color; } else { unsigned long *bits = work_data_bits(target); head = target->entry.next; /* there can already be other linked works, inherit and set */ work_flags |= *bits & WORK_STRUCT_LINKED; work_color = get_work_color(*bits); __set_bit(WORK_STRUCT_LINKED_BIT, bits); } pwq->nr_in_flight[work_color]++; work_flags |= work_color_to_flags(work_color); insert_work(pwq, &barr->work, head, work_flags); } /** * flush_workqueue_prep_pwqs - prepare pwqs for workqueue flushing * @wq: workqueue being flushed * @flush_color: new flush color, < 0 for no-op * @work_color: new work color, < 0 for no-op * * Prepare pwqs for workqueue flushing. * * If @flush_color is non-negative, flush_color on all pwqs should be * -1. If no pwq has in-flight commands at the specified color, all * pwq->flush_color's stay at -1 and %false is returned. If any pwq * has in flight commands, its pwq->flush_color is set to * @flush_color, @wq->nr_pwqs_to_flush is updated accordingly, pwq * wakeup logic is armed and %true is returned. * * The caller should have initialized @wq->first_flusher prior to * calling this function with non-negative @flush_color. If * @flush_color is negative, no flush color update is done and %false * is returned. * * If @work_color is non-negative, all pwqs should have the same * work_color which is previous to @work_color and all will be * advanced to @work_color. * * CONTEXT: * mutex_lock(wq->mutex). * * Return: * %true if @flush_color >= 0 and there's something to flush. %false * otherwise. */ static bool flush_workqueue_prep_pwqs(struct workqueue_struct *wq, int flush_color, int work_color) { bool wait = false; struct pool_workqueue *pwq; struct worker_pool *current_pool = NULL; if (flush_color >= 0) { WARN_ON_ONCE(atomic_read(&wq->nr_pwqs_to_flush)); atomic_set(&wq->nr_pwqs_to_flush, 1); } /* * For unbound workqueue, pwqs will map to only a few pools. * Most of the time, pwqs within the same pool will be linked * sequentially to wq->pwqs by cpu index. So in the majority * of pwq iters, the pool is the same, only doing lock/unlock * if the pool has changed. This can largely reduce expensive * lock operations. */ for_each_pwq(pwq, wq) { if (current_pool != pwq->pool) { if (likely(current_pool)) raw_spin_unlock_irq(&current_pool->lock); current_pool = pwq->pool; raw_spin_lock_irq(&current_pool->lock); } if (flush_color >= 0) { WARN_ON_ONCE(pwq->flush_color != -1); if (pwq->nr_in_flight[flush_color]) { pwq->flush_color = flush_color; atomic_inc(&wq->nr_pwqs_to_flush); wait = true; } } if (work_color >= 0) { WARN_ON_ONCE(work_color != work_next_color(pwq->work_color)); pwq->work_color = work_color; } } if (current_pool) raw_spin_unlock_irq(&current_pool->lock); if (flush_color >= 0 && atomic_dec_and_test(&wq->nr_pwqs_to_flush)) complete(&wq->first_flusher->done); return wait; } static void touch_wq_lockdep_map(struct workqueue_struct *wq) { #ifdef CONFIG_LOCKDEP if (unlikely(!wq->lockdep_map)) return; if (wq->flags & WQ_BH) local_bh_disable(); lock_map_acquire(wq->lockdep_map); lock_map_release(wq->lockdep_map); if (wq->flags & WQ_BH) local_bh_enable(); #endif } static void touch_work_lockdep_map(struct work_struct *work, struct workqueue_struct *wq) { #ifdef CONFIG_LOCKDEP if (wq->flags & WQ_BH) local_bh_disable(); lock_map_acquire(&work->lockdep_map); lock_map_release(&work->lockdep_map); if (wq->flags & WQ_BH) local_bh_enable(); #endif } /** * __flush_workqueue - ensure that any scheduled work has run to completion. * @wq: workqueue to flush * * This function sleeps until all work items which were queued on entry * have finished execution, but it is not livelocked by new incoming ones. */ void __flush_workqueue(struct workqueue_struct *wq) { struct wq_flusher this_flusher = { .list = LIST_HEAD_INIT(this_flusher.list), .flush_color = -1, .done = COMPLETION_INITIALIZER_ONSTACK_MAP(this_flusher.done, (*wq->lockdep_map)), }; int next_color; if (WARN_ON(!wq_online)) return; touch_wq_lockdep_map(wq); mutex_lock(&wq->mutex); /* * Start-to-wait phase */ next_color = work_next_color(wq->work_color); if (next_color != wq->flush_color) { /* * Color space is not full. The current work_color * becomes our flush_color and work_color is advanced * by one. */ WARN_ON_ONCE(!list_empty(&wq->flusher_overflow)); this_flusher.flush_color = wq->work_color; wq->work_color = next_color; if (!wq->first_flusher) { /* no flush in progress, become the first flusher */ WARN_ON_ONCE(wq->flush_color != this_flusher.flush_color); wq->first_flusher = &this_flusher; if (!flush_workqueue_prep_pwqs(wq, wq->flush_color, wq->work_color)) { /* nothing to flush, done */ wq->flush_color = next_color; wq->first_flusher = NULL; goto out_unlock; } } else { /* wait in queue */ WARN_ON_ONCE(wq->flush_color == this_flusher.flush_color); list_add_tail(&this_flusher.list, &wq->flusher_queue); flush_workqueue_prep_pwqs(wq, -1, wq->work_color); } } else { /* * Oops, color space is full, wait on overflow queue. * The next flush completion will assign us * flush_color and transfer to flusher_queue. */ list_add_tail(&this_flusher.list, &wq->flusher_overflow); } check_flush_dependency(wq, NULL, false); mutex_unlock(&wq->mutex); wait_for_completion(&this_flusher.done); /* * Wake-up-and-cascade phase * * First flushers are responsible for cascading flushes and * handling overflow. Non-first flushers can simply return. */ if (READ_ONCE(wq->first_flusher) != &this_flusher) return; mutex_lock(&wq->mutex); /* we might have raced, check again with mutex held */ if (wq->first_flusher != &this_flusher) goto out_unlock; WRITE_ONCE(wq->first_flusher, NULL); WARN_ON_ONCE(!list_empty(&this_flusher.list)); WARN_ON_ONCE(wq->flush_color != this_flusher.flush_color); while (true) { struct wq_flusher *next, *tmp; /* complete all the flushers sharing the current flush color */ list_for_each_entry_safe(next, tmp, &wq->flusher_queue, list) { if (next->flush_color != wq->flush_color) break; list_del_init(&next->list); complete(&next->done); } WARN_ON_ONCE(!list_empty(&wq->flusher_overflow) && wq->flush_color != work_next_color(wq->work_color)); /* this flush_color is finished, advance by one */ wq->flush_color = work_next_color(wq->flush_color); /* one color has been freed, handle overflow queue */ if (!list_empty(&wq->flusher_overflow)) { /* * Assign the same color to all overflowed * flushers, advance work_color and append to * flusher_queue. This is the start-to-wait * phase for these overflowed flushers. */ list_for_each_entry(tmp, &wq->flusher_overflow, list) tmp->flush_color = wq->work_color; wq->work_color = work_next_color(wq->work_color); list_splice_tail_init(&wq->flusher_overflow, &wq->flusher_queue); flush_workqueue_prep_pwqs(wq, -1, wq->work_color); } if (list_empty(&wq->flusher_queue)) { WARN_ON_ONCE(wq->flush_color != wq->work_color); break; } /* * Need to flush more colors. Make the next flusher * the new first flusher and arm pwqs. */ WARN_ON_ONCE(wq->flush_color == wq->work_color); WARN_ON_ONCE(wq->flush_color != next->flush_color); list_del_init(&next->list); wq->first_flusher = next; if (flush_workqueue_prep_pwqs(wq, wq->flush_color, -1)) break; /* * Meh... this color is already done, clear first * flusher and repeat cascading. */ wq->first_flusher = NULL; } out_unlock: mutex_unlock(&wq->mutex); } EXPORT_SYMBOL(__flush_workqueue); /** * drain_workqueue - drain a workqueue * @wq: workqueue to drain * * Wait until the workqueue becomes empty. While draining is in progress, * only chain queueing is allowed. IOW, only currently pending or running * work items on @wq can queue further work items on it. @wq is flushed * repeatedly until it becomes empty. The number of flushing is determined * by the depth of chaining and should be relatively short. Whine if it * takes too long. */ void drain_workqueue(struct workqueue_struct *wq) { unsigned int flush_cnt = 0; struct pool_workqueue *pwq; /* * __queue_work() needs to test whether there are drainers, is much * hotter than drain_workqueue() and already looks at @wq->flags. * Use __WQ_DRAINING so that queue doesn't have to check nr_drainers. */ mutex_lock(&wq->mutex); if (!wq->nr_drainers++) wq->flags |= __WQ_DRAINING; mutex_unlock(&wq->mutex); reflush: __flush_workqueue(wq); mutex_lock(&wq->mutex); for_each_pwq(pwq, wq) { bool drained; raw_spin_lock_irq(&pwq->pool->lock); drained = pwq_is_empty(pwq); raw_spin_unlock_irq(&pwq->pool->lock); if (drained) continue; if (++flush_cnt == 10 || (flush_cnt % 100 == 0 && flush_cnt <= 1000)) pr_warn("workqueue %s: %s() isn't complete after %u tries\n", wq->name, __func__, flush_cnt); mutex_unlock(&wq->mutex); goto reflush; } if (!--wq->nr_drainers) wq->flags &= ~__WQ_DRAINING; mutex_unlock(&wq->mutex); } EXPORT_SYMBOL_GPL(drain_workqueue); static bool start_flush_work(struct work_struct *work, struct wq_barrier *barr, bool from_cancel) { struct worker *worker = NULL; struct worker_pool *pool; struct pool_workqueue *pwq; struct workqueue_struct *wq; rcu_read_lock(); pool = get_work_pool(work); if (!pool) { rcu_read_unlock(); return false; } raw_spin_lock_irq(&pool->lock); /* see the comment in try_to_grab_pending() with the same code */ pwq = get_work_pwq(work); if (pwq) { if (unlikely(pwq->pool != pool)) goto already_gone; } else { worker = find_worker_executing_work(pool, work); if (!worker) goto already_gone; pwq = worker->current_pwq; } wq = pwq->wq; check_flush_dependency(wq, work, from_cancel); insert_wq_barrier(pwq, barr, work, worker); raw_spin_unlock_irq(&pool->lock); touch_work_lockdep_map(work, wq); /* * Force a lock recursion deadlock when using flush_work() inside a * single-threaded or rescuer equipped workqueue. * * For single threaded workqueues the deadlock happens when the work * is after the work issuing the flush_work(). For rescuer equipped * workqueues the deadlock happens when the rescuer stalls, blocking * forward progress. */ if (!from_cancel && (wq->saved_max_active == 1 || wq->rescuer)) touch_wq_lockdep_map(wq); rcu_read_unlock(); return true; already_gone: raw_spin_unlock_irq(&pool->lock); rcu_read_unlock(); return false; } static bool __flush_work(struct work_struct *work, bool from_cancel) { struct wq_barrier barr; if (WARN_ON(!wq_online)) return false; if (WARN_ON(!work->func)) return false; if (!start_flush_work(work, &barr, from_cancel)) return false; /* * start_flush_work() returned %true. If @from_cancel is set, we know * that @work must have been executing during start_flush_work() and * can't currently be queued. Its data must contain OFFQ bits. If @work * was queued on a BH workqueue, we also know that it was running in the * BH context and thus can be busy-waited. */ if (from_cancel) { unsigned long data = *work_data_bits(work); if (!WARN_ON_ONCE(data & WORK_STRUCT_PWQ) && (data & WORK_OFFQ_BH)) { /* * On RT, prevent a live lock when %current preempted * soft interrupt processing or prevents ksoftirqd from * running by keeping flipping BH. If the BH work item * runs on a different CPU then this has no effect other * than doing the BH disable/enable dance for nothing. * This is copied from * kernel/softirq.c::tasklet_unlock_spin_wait(). */ while (!try_wait_for_completion(&barr.done)) { if (IS_ENABLED(CONFIG_PREEMPT_RT)) { local_bh_disable(); local_bh_enable(); } else { cpu_relax(); } } goto out_destroy; } } wait_for_completion(&barr.done); out_destroy: destroy_work_on_stack(&barr.work); return true; } /** * flush_work - wait for a work to finish executing the last queueing instance * @work: the work to flush * * Wait until @work has finished execution. @work is guaranteed to be idle * on return if it hasn't been requeued since flush started. * * Return: * %true if flush_work() waited for the work to finish execution, * %false if it was already idle. */ bool flush_work(struct work_struct *work) { might_sleep(); return __flush_work(work, false); } EXPORT_SYMBOL_GPL(flush_work); /** * flush_delayed_work - wait for a dwork to finish executing the last queueing * @dwork: the delayed work to flush * * Delayed timer is cancelled and the pending work is queued for * immediate execution. Like flush_work(), this function only * considers the last queueing instance of @dwork. * * Return: * %true if flush_work() waited for the work to finish execution, * %false if it was already idle. */ bool flush_delayed_work(struct delayed_work *dwork) { local_irq_disable(); if (timer_delete_sync(&dwork->timer)) __queue_work(dwork->cpu, dwork->wq, &dwork->work); local_irq_enable(); return flush_work(&dwork->work); } EXPORT_SYMBOL(flush_delayed_work); /** * flush_rcu_work - wait for a rwork to finish executing the last queueing * @rwork: the rcu work to flush * * Return: * %true if flush_rcu_work() waited for the work to finish execution, * %false if it was already idle. */ bool flush_rcu_work(struct rcu_work *rwork) { if (test_bit(WORK_STRUCT_PENDING_BIT, work_data_bits(&rwork->work))) { rcu_barrier(); flush_work(&rwork->work); return true; } else { return flush_work(&rwork->work); } } EXPORT_SYMBOL(flush_rcu_work); static void work_offqd_disable(struct work_offq_data *offqd) { const unsigned long max = (1lu << WORK_OFFQ_DISABLE_BITS) - 1; if (likely(offqd->disable < max)) offqd->disable++; else WARN_ONCE(true, "workqueue: work disable count overflowed\n"); } static void work_offqd_enable(struct work_offq_data *offqd) { if (likely(offqd->disable > 0)) offqd->disable--; else WARN_ONCE(true, "workqueue: work disable count underflowed\n"); } static bool __cancel_work(struct work_struct *work, u32 cflags) { struct work_offq_data offqd; unsigned long irq_flags; int ret; ret = work_grab_pending(work, cflags, &irq_flags); work_offqd_unpack(&offqd, *work_data_bits(work)); if (cflags & WORK_CANCEL_DISABLE) work_offqd_disable(&offqd); set_work_pool_and_clear_pending(work, offqd.pool_id, work_offqd_pack_flags(&offqd)); local_irq_restore(irq_flags); return ret; } static bool __cancel_work_sync(struct work_struct *work, u32 cflags) { bool ret; ret = __cancel_work(work, cflags | WORK_CANCEL_DISABLE); if (*work_data_bits(work) & WORK_OFFQ_BH) WARN_ON_ONCE(in_hardirq()); else might_sleep(); /* * Skip __flush_work() during early boot when we know that @work isn't * executing. This allows canceling during early boot. */ if (wq_online) __flush_work(work, true); if (!(cflags & WORK_CANCEL_DISABLE)) enable_work(work); return ret; } /* * See cancel_delayed_work() */ bool cancel_work(struct work_struct *work) { return __cancel_work(work, 0); } EXPORT_SYMBOL(cancel_work); /** * cancel_work_sync - cancel a work and wait for it to finish * @work: the work to cancel * * Cancel @work and wait for its execution to finish. This function can be used * even if the work re-queues itself or migrates to another workqueue. On return * from this function, @work is guaranteed to be not pending or executing on any * CPU as long as there aren't racing enqueues. * * cancel_work_sync(&delayed_work->work) must not be used for delayed_work's. * Use cancel_delayed_work_sync() instead. * * Must be called from a sleepable context if @work was last queued on a non-BH * workqueue. Can also be called from non-hardirq atomic contexts including BH * if @work was last queued on a BH workqueue. * * Returns %true if @work was pending, %false otherwise. */ bool cancel_work_sync(struct work_struct *work) { return __cancel_work_sync(work, 0); } EXPORT_SYMBOL_GPL(cancel_work_sync); /** * cancel_delayed_work - cancel a delayed work * @dwork: delayed_work to cancel * * Kill off a pending delayed_work. * * Return: %true if @dwork was pending and canceled; %false if it wasn't * pending. * * Note: * The work callback function may still be running on return, unless * it returns %true and the work doesn't re-arm itself. Explicitly flush or * use cancel_delayed_work_sync() to wait on it. * * This function is safe to call from any context including IRQ handler. */ bool cancel_delayed_work(struct delayed_work *dwork) { return __cancel_work(&dwork->work, WORK_CANCEL_DELAYED); } EXPORT_SYMBOL(cancel_delayed_work); /** * cancel_delayed_work_sync - cancel a delayed work and wait for it to finish * @dwork: the delayed work cancel * * This is cancel_work_sync() for delayed works. * * Return: * %true if @dwork was pending, %false otherwise. */ bool cancel_delayed_work_sync(struct delayed_work *dwork) { return __cancel_work_sync(&dwork->work, WORK_CANCEL_DELAYED); } EXPORT_SYMBOL(cancel_delayed_work_sync); /** * disable_work - Disable and cancel a work item * @work: work item to disable * * Disable @work by incrementing its disable count and cancel it if currently * pending. As long as the disable count is non-zero, any attempt to queue @work * will fail and return %false. The maximum supported disable depth is 2 to the * power of %WORK_OFFQ_DISABLE_BITS, currently 65536. * * Can be called from any context. Returns %true if @work was pending, %false * otherwise. */ bool disable_work(struct work_struct *work) { return __cancel_work(work, WORK_CANCEL_DISABLE); } EXPORT_SYMBOL_GPL(disable_work); /** * disable_work_sync - Disable, cancel and drain a work item * @work: work item to disable * * Similar to disable_work() but also wait for @work to finish if currently * executing. * * Must be called from a sleepable context if @work was last queued on a non-BH * workqueue. Can also be called from non-hardirq atomic contexts including BH * if @work was last queued on a BH workqueue. * * Returns %true if @work was pending, %false otherwise. */ bool disable_work_sync(struct work_struct *work) { return __cancel_work_sync(work, WORK_CANCEL_DISABLE); } EXPORT_SYMBOL_GPL(disable_work_sync); /** * enable_work - Enable a work item * @work: work item to enable * * Undo disable_work[_sync]() by decrementing @work's disable count. @work can * only be queued if its disable count is 0. * * Can be called from any context. Returns %true if the disable count reached 0. * Otherwise, %false. */ bool enable_work(struct work_struct *work) { struct work_offq_data offqd; unsigned long irq_flags; work_grab_pending(work, 0, &irq_flags); work_offqd_unpack(&offqd, *work_data_bits(work)); work_offqd_enable(&offqd); set_work_pool_and_clear_pending(work, offqd.pool_id, work_offqd_pack_flags(&offqd)); local_irq_restore(irq_flags); return !offqd.disable; } EXPORT_SYMBOL_GPL(enable_work); /** * disable_delayed_work - Disable and cancel a delayed work item * @dwork: delayed work item to disable * * disable_work() for delayed work items. */ bool disable_delayed_work(struct delayed_work *dwork) { return __cancel_work(&dwork->work, WORK_CANCEL_DELAYED | WORK_CANCEL_DISABLE); } EXPORT_SYMBOL_GPL(disable_delayed_work); /** * disable_delayed_work_sync - Disable, cancel and drain a delayed work item * @dwork: delayed work item to disable * * disable_work_sync() for delayed work items. */ bool disable_delayed_work_sync(struct delayed_work *dwork) { return __cancel_work_sync(&dwork->work, WORK_CANCEL_DELAYED | WORK_CANCEL_DISABLE); } EXPORT_SYMBOL_GPL(disable_delayed_work_sync); /** * enable_delayed_work - Enable a delayed work item * @dwork: delayed work item to enable * * enable_work() for delayed work items. */ bool enable_delayed_work(struct delayed_work *dwork) { return enable_work(&dwork->work); } EXPORT_SYMBOL_GPL(enable_delayed_work); /** * schedule_on_each_cpu - execute a function synchronously on each online CPU * @func: the function to call * * schedule_on_each_cpu() executes @func on each online CPU using the * system workqueue and blocks until all CPUs have completed. * schedule_on_each_cpu() is very slow. * * Return: * 0 on success, -errno on failure. */ int schedule_on_each_cpu(work_func_t func) { int cpu; struct work_struct __percpu *works; works = alloc_percpu(struct work_struct); if (!works) return -ENOMEM; cpus_read_lock(); for_each_online_cpu(cpu) { struct work_struct *work = per_cpu_ptr(works, cpu); INIT_WORK(work, func); schedule_work_on(cpu, work); } for_each_online_cpu(cpu) flush_work(per_cpu_ptr(works, cpu)); cpus_read_unlock(); free_percpu(works); return 0; } /** * execute_in_process_context - reliably execute the routine with user context * @fn: the function to execute * @ew: guaranteed storage for the execute work structure (must * be available when the work executes) * * Executes the function immediately if process context is available, * otherwise schedules the function for delayed execution. * * Return: 0 - function was executed * 1 - function was scheduled for execution */ int execute_in_process_context(work_func_t fn, struct execute_work *ew) { if (!in_interrupt()) { fn(&ew->work); return 0; } INIT_WORK(&ew->work, fn); schedule_work(&ew->work); return 1; } EXPORT_SYMBOL_GPL(execute_in_process_context); /** * free_workqueue_attrs - free a workqueue_attrs * @attrs: workqueue_attrs to free * * Undo alloc_workqueue_attrs(). */ void free_workqueue_attrs(struct workqueue_attrs *attrs) { if (attrs) { free_cpumask_var(attrs->cpumask); free_cpumask_var(attrs->__pod_cpumask); kfree(attrs); } } /** * alloc_workqueue_attrs - allocate a workqueue_attrs * * Allocate a new workqueue_attrs, initialize with default settings and * return it. * * Return: The allocated new workqueue_attr on success. %NULL on failure. */ struct workqueue_attrs *alloc_workqueue_attrs_noprof(void) { struct workqueue_attrs *attrs; attrs = kzalloc(sizeof(*attrs), GFP_KERNEL); if (!attrs) goto fail; if (!alloc_cpumask_var(&attrs->cpumask, GFP_KERNEL)) goto fail; if (!alloc_cpumask_var(&attrs->__pod_cpumask, GFP_KERNEL)) goto fail; cpumask_copy(attrs->cpumask, cpu_possible_mask); attrs->affn_scope = WQ_AFFN_DFL; return attrs; fail: free_workqueue_attrs(attrs); return NULL; } static void copy_workqueue_attrs(struct workqueue_attrs *to, const struct workqueue_attrs *from) { to->nice = from->nice; cpumask_copy(to->cpumask, from->cpumask); cpumask_copy(to->__pod_cpumask, from->__pod_cpumask); to->affn_strict = from->affn_strict; /* * Unlike hash and equality test, copying shouldn't ignore wq-only * fields as copying is used for both pool and wq attrs. Instead, * get_unbound_pool() explicitly clears the fields. */ to->affn_scope = from->affn_scope; to->ordered = from->ordered; } /* * Some attrs fields are workqueue-only. Clear them for worker_pool's. See the * comments in 'struct workqueue_attrs' definition. */ static void wqattrs_clear_for_pool(struct workqueue_attrs *attrs) { attrs->affn_scope = WQ_AFFN_NR_TYPES; attrs->ordered = false; if (attrs->affn_strict) cpumask_copy(attrs->cpumask, cpu_possible_mask); } /* hash value of the content of @attr */ static u32 wqattrs_hash(const struct workqueue_attrs *attrs) { u32 hash = 0; hash = jhash_1word(attrs->nice, hash); hash = jhash_1word(attrs->affn_strict, hash); hash = jhash(cpumask_bits(attrs->__pod_cpumask), BITS_TO_LONGS(nr_cpumask_bits) * sizeof(long), hash); if (!attrs->affn_strict) hash = jhash(cpumask_bits(attrs->cpumask), BITS_TO_LONGS(nr_cpumask_bits) * sizeof(long), hash); return hash; } /* content equality test */ static bool wqattrs_equal(const struct workqueue_attrs *a, const struct workqueue_attrs *b) { if (a->nice != b->nice) return false; if (a->affn_strict != b->affn_strict) return false; if (!cpumask_equal(a->__pod_cpumask, b->__pod_cpumask)) return false; if (!a->affn_strict && !cpumask_equal(a->cpumask, b->cpumask)) return false; return true; } /* Update @attrs with actually available CPUs */ static void wqattrs_actualize_cpumask(struct workqueue_attrs *attrs, const cpumask_t *unbound_cpumask) { /* * Calculate the effective CPU mask of @attrs given @unbound_cpumask. If * @attrs->cpumask doesn't overlap with @unbound_cpumask, we fallback to * @unbound_cpumask. */ cpumask_and(attrs->cpumask, attrs->cpumask, unbound_cpumask); if (unlikely(cpumask_empty(attrs->cpumask))) cpumask_copy(attrs->cpumask, unbound_cpumask); } /* find wq_pod_type to use for @attrs */ static const struct wq_pod_type * wqattrs_pod_type(const struct workqueue_attrs *attrs) { enum wq_affn_scope scope; struct wq_pod_type *pt; /* to synchronize access to wq_affn_dfl */ lockdep_assert_held(&wq_pool_mutex); if (attrs->affn_scope == WQ_AFFN_DFL) scope = wq_affn_dfl; else scope = attrs->affn_scope; pt = &wq_pod_types[scope]; if (!WARN_ON_ONCE(attrs->affn_scope == WQ_AFFN_NR_TYPES) && likely(pt->nr_pods)) return pt; /* * Before workqueue_init_topology(), only SYSTEM is available which is * initialized in workqueue_init_early(). */ pt = &wq_pod_types[WQ_AFFN_SYSTEM]; BUG_ON(!pt->nr_pods); return pt; } /** * init_worker_pool - initialize a newly zalloc'd worker_pool * @pool: worker_pool to initialize * * Initialize a newly zalloc'd @pool. It also allocates @pool->attrs. * * Return: 0 on success, -errno on failure. Even on failure, all fields * inside @pool proper are initialized and put_unbound_pool() can be called * on @pool safely to release it. */ static int init_worker_pool(struct worker_pool *pool) { raw_spin_lock_init(&pool->lock); pool->id = -1; pool->cpu = -1; pool->node = NUMA_NO_NODE; pool->flags |= POOL_DISASSOCIATED; pool->watchdog_ts = jiffies; INIT_LIST_HEAD(&pool->worklist); INIT_LIST_HEAD(&pool->idle_list); hash_init(pool->busy_hash); timer_setup(&pool->idle_timer, idle_worker_timeout, TIMER_DEFERRABLE); INIT_WORK(&pool->idle_cull_work, idle_cull_fn); timer_setup(&pool->mayday_timer, pool_mayday_timeout, 0); INIT_LIST_HEAD(&pool->workers); ida_init(&pool->worker_ida); INIT_HLIST_NODE(&pool->hash_node); pool->refcnt = 1; /* shouldn't fail above this point */ pool->attrs = alloc_workqueue_attrs(); if (!pool->attrs) return -ENOMEM; wqattrs_clear_for_pool(pool->attrs); return 0; } #ifdef CONFIG_LOCKDEP static void wq_init_lockdep(struct workqueue_struct *wq) { char *lock_name; lockdep_register_key(&wq->key); lock_name = kasprintf(GFP_KERNEL, "%s%s", "(wq_completion)", wq->name); if (!lock_name) lock_name = wq->name; wq->lock_name = lock_name; wq->lockdep_map = &wq->__lockdep_map; lockdep_init_map(wq->lockdep_map, lock_name, &wq->key, 0); } static void wq_unregister_lockdep(struct workqueue_struct *wq) { if (wq->lockdep_map != &wq->__lockdep_map) return; lockdep_unregister_key(&wq->key); } static void wq_free_lockdep(struct workqueue_struct *wq) { if (wq->lockdep_map != &wq->__lockdep_map) return; if (wq->lock_name != wq->name) kfree(wq->lock_name); } #else static void wq_init_lockdep(struct workqueue_struct *wq) { } static void wq_unregister_lockdep(struct workqueue_struct *wq) { } static void wq_free_lockdep(struct workqueue_struct *wq) { } #endif static void free_node_nr_active(struct wq_node_nr_active **nna_ar) { int node; for_each_node(node) { kfree(nna_ar[node]); nna_ar[node] = NULL; } kfree(nna_ar[nr_node_ids]); nna_ar[nr_node_ids] = NULL; } static void init_node_nr_active(struct wq_node_nr_active *nna) { nna->max = WQ_DFL_MIN_ACTIVE; atomic_set(&nna->nr, 0); raw_spin_lock_init(&nna->lock); INIT_LIST_HEAD(&nna->pending_pwqs); } /* * Each node's nr_active counter will be accessed mostly from its own node and * should be allocated in the node. */ static int alloc_node_nr_active(struct wq_node_nr_active **nna_ar) { struct wq_node_nr_active *nna; int node; for_each_node(node) { nna = kzalloc_node(sizeof(*nna), GFP_KERNEL, node); if (!nna) goto err_free; init_node_nr_active(nna); nna_ar[node] = nna; } /* [nr_node_ids] is used as the fallback */ nna = kzalloc_node(sizeof(*nna), GFP_KERNEL, NUMA_NO_NODE); if (!nna) goto err_free; init_node_nr_active(nna); nna_ar[nr_node_ids] = nna; return 0; err_free: free_node_nr_active(nna_ar); return -ENOMEM; } static void rcu_free_wq(struct rcu_head *rcu) { struct workqueue_struct *wq = container_of(rcu, struct workqueue_struct, rcu); if (wq->flags & WQ_UNBOUND) free_node_nr_active(wq->node_nr_active); wq_free_lockdep(wq); free_percpu(wq->cpu_pwq); free_workqueue_attrs(wq->unbound_attrs); kfree(wq); } static void rcu_free_pool(struct rcu_head *rcu) { struct worker_pool *pool = container_of(rcu, struct worker_pool, rcu); ida_destroy(&pool->worker_ida); free_workqueue_attrs(pool->attrs); kfree(pool); } /** * put_unbound_pool - put a worker_pool * @pool: worker_pool to put * * Put @pool. If its refcnt reaches zero, it gets destroyed in RCU * safe manner. get_unbound_pool() calls this function on its failure path * and this function should be able to release pools which went through, * successfully or not, init_worker_pool(). * * Should be called with wq_pool_mutex held. */ static void put_unbound_pool(struct worker_pool *pool) { struct worker *worker; LIST_HEAD(cull_list); lockdep_assert_held(&wq_pool_mutex); if (--pool->refcnt) return; /* sanity checks */ if (WARN_ON(!(pool->cpu < 0)) || WARN_ON(!list_empty(&pool->worklist))) return; /* release id and unhash */ if (pool->id >= 0) idr_remove(&worker_pool_idr, pool->id); hash_del(&pool->hash_node); /* * Become the manager and destroy all workers. This prevents * @pool's workers from blocking on attach_mutex. We're the last * manager and @pool gets freed with the flag set. * * Having a concurrent manager is quite unlikely to happen as we can * only get here with * pwq->refcnt == pool->refcnt == 0 * which implies no work queued to the pool, which implies no worker can * become the manager. However a worker could have taken the role of * manager before the refcnts dropped to 0, since maybe_create_worker() * drops pool->lock */ while (true) { rcuwait_wait_event(&manager_wait, !(pool->flags & POOL_MANAGER_ACTIVE), TASK_UNINTERRUPTIBLE); mutex_lock(&wq_pool_attach_mutex); raw_spin_lock_irq(&pool->lock); if (!(pool->flags & POOL_MANAGER_ACTIVE)) { pool->flags |= POOL_MANAGER_ACTIVE; break; } raw_spin_unlock_irq(&pool->lock); mutex_unlock(&wq_pool_attach_mutex); } while ((worker = first_idle_worker(pool))) set_worker_dying(worker, &cull_list); WARN_ON(pool->nr_workers || pool->nr_idle); raw_spin_unlock_irq(&pool->lock); detach_dying_workers(&cull_list); mutex_unlock(&wq_pool_attach_mutex); reap_dying_workers(&cull_list); /* shut down the timers */ timer_delete_sync(&pool->idle_timer); cancel_work_sync(&pool->idle_cull_work); timer_delete_sync(&pool->mayday_timer); /* RCU protected to allow dereferences from get_work_pool() */ call_rcu(&pool->rcu, rcu_free_pool); } /** * get_unbound_pool - get a worker_pool with the specified attributes * @attrs: the attributes of the worker_pool to get * * Obtain a worker_pool which has the same attributes as @attrs, bump the * reference count and return it. If there already is a matching * worker_pool, it will be used; otherwise, this function attempts to * create a new one. * * Should be called with wq_pool_mutex held. * * Return: On success, a worker_pool with the same attributes as @attrs. * On failure, %NULL. */ static struct worker_pool *get_unbound_pool(const struct workqueue_attrs *attrs) { struct wq_pod_type *pt = &wq_pod_types[WQ_AFFN_NUMA]; u32 hash = wqattrs_hash(attrs); struct worker_pool *pool; int pod, node = NUMA_NO_NODE; lockdep_assert_held(&wq_pool_mutex); /* do we already have a matching pool? */ hash_for_each_possible(unbound_pool_hash, pool, hash_node, hash) { if (wqattrs_equal(pool->attrs, attrs)) { pool->refcnt++; return pool; } } /* If __pod_cpumask is contained inside a NUMA pod, that's our node */ for (pod = 0; pod < pt->nr_pods; pod++) { if (cpumask_subset(attrs->__pod_cpumask, pt->pod_cpus[pod])) { node = pt->pod_node[pod]; break; } } /* nope, create a new one */ pool = kzalloc_node(sizeof(*pool), GFP_KERNEL, node); if (!pool || init_worker_pool(pool) < 0) goto fail; pool->node = node; copy_workqueue_attrs(pool->attrs, attrs); wqattrs_clear_for_pool(pool->attrs); if (worker_pool_assign_id(pool) < 0) goto fail; /* create and start the initial worker */ if (wq_online && !create_worker(pool)) goto fail; /* install */ hash_add(unbound_pool_hash, &pool->hash_node, hash); return pool; fail: if (pool) put_unbound_pool(pool); return NULL; } /* * Scheduled on pwq_release_worker by put_pwq() when an unbound pwq hits zero * refcnt and needs to be destroyed. */ static void pwq_release_workfn(struct kthread_work *work) { struct pool_workqueue *pwq = container_of(work, struct pool_workqueue, release_work); struct workqueue_struct *wq = pwq->wq; struct worker_pool *pool = pwq->pool; bool is_last = false; /* * When @pwq is not linked, it doesn't hold any reference to the * @wq, and @wq is invalid to access. */ if (!list_empty(&pwq->pwqs_node)) { mutex_lock(&wq->mutex); list_del_rcu(&pwq->pwqs_node); is_last = list_empty(&wq->pwqs); /* * For ordered workqueue with a plugged dfl_pwq, restart it now. */ if (!is_last && (wq->flags & __WQ_ORDERED)) unplug_oldest_pwq(wq); mutex_unlock(&wq->mutex); } if (wq->flags & WQ_UNBOUND) { mutex_lock(&wq_pool_mutex); put_unbound_pool(pool); mutex_unlock(&wq_pool_mutex); } if (!list_empty(&pwq->pending_node)) { struct wq_node_nr_active *nna = wq_node_nr_active(pwq->wq, pwq->pool->node); raw_spin_lock_irq(&nna->lock); list_del_init(&pwq->pending_node); raw_spin_unlock_irq(&nna->lock); } kfree_rcu(pwq, rcu); /* * If we're the last pwq going away, @wq is already dead and no one * is gonna access it anymore. Schedule RCU free. */ if (is_last) { wq_unregister_lockdep(wq); call_rcu(&wq->rcu, rcu_free_wq); } } /* initialize newly allocated @pwq which is associated with @wq and @pool */ static void init_pwq(struct pool_workqueue *pwq, struct workqueue_struct *wq, struct worker_pool *pool) { BUG_ON((unsigned long)pwq & ~WORK_STRUCT_PWQ_MASK); memset(pwq, 0, sizeof(*pwq)); pwq->pool = pool; pwq->wq = wq; pwq->flush_color = -1; pwq->refcnt = 1; INIT_LIST_HEAD(&pwq->inactive_works); INIT_LIST_HEAD(&pwq->pending_node); INIT_LIST_HEAD(&pwq->pwqs_node); INIT_LIST_HEAD(&pwq->mayday_node); kthread_init_work(&pwq->release_work, pwq_release_workfn); } /* sync @pwq with the current state of its associated wq and link it */ static void link_pwq(struct pool_workqueue *pwq) { struct workqueue_struct *wq = pwq->wq; lockdep_assert_held(&wq->mutex); /* may be called multiple times, ignore if already linked */ if (!list_empty(&pwq->pwqs_node)) return; /* set the matching work_color */ pwq->work_color = wq->work_color; /* link in @pwq */ list_add_tail_rcu(&pwq->pwqs_node, &wq->pwqs); } /* obtain a pool matching @attr and create a pwq associating the pool and @wq */ static struct pool_workqueue *alloc_unbound_pwq(struct workqueue_struct *wq, const struct workqueue_attrs *attrs) { struct worker_pool *pool; struct pool_workqueue *pwq; lockdep_assert_held(&wq_pool_mutex); pool = get_unbound_pool(attrs); if (!pool) return NULL; pwq = kmem_cache_alloc_node(pwq_cache, GFP_KERNEL, pool->node); if (!pwq) { put_unbound_pool(pool); return NULL; } init_pwq(pwq, wq, pool); return pwq; } static void apply_wqattrs_lock(void) { mutex_lock(&wq_pool_mutex); } static void apply_wqattrs_unlock(void) { mutex_unlock(&wq_pool_mutex); } /** * wq_calc_pod_cpumask - calculate a wq_attrs' cpumask for a pod * @attrs: the wq_attrs of the default pwq of the target workqueue * @cpu: the target CPU * * Calculate the cpumask a workqueue with @attrs should use on @pod. * The result is stored in @attrs->__pod_cpumask. * * If pod affinity is not enabled, @attrs->cpumask is always used. If enabled * and @pod has online CPUs requested by @attrs, the returned cpumask is the * intersection of the possible CPUs of @pod and @attrs->cpumask. * * The caller is responsible for ensuring that the cpumask of @pod stays stable. */ static void wq_calc_pod_cpumask(struct workqueue_attrs *attrs, int cpu) { const struct wq_pod_type *pt = wqattrs_pod_type(attrs); int pod = pt->cpu_pod[cpu]; /* calculate possible CPUs in @pod that @attrs wants */ cpumask_and(attrs->__pod_cpumask, pt->pod_cpus[pod], attrs->cpumask); /* does @pod have any online CPUs @attrs wants? */ if (!cpumask_intersects(attrs->__pod_cpumask, wq_online_cpumask)) { cpumask_copy(attrs->__pod_cpumask, attrs->cpumask); return; } } /* install @pwq into @wq and return the old pwq, @cpu < 0 for dfl_pwq */ static struct pool_workqueue *install_unbound_pwq(struct workqueue_struct *wq, int cpu, struct pool_workqueue *pwq) { struct pool_workqueue __rcu **slot = unbound_pwq_slot(wq, cpu); struct pool_workqueue *old_pwq; lockdep_assert_held(&wq_pool_mutex); lockdep_assert_held(&wq->mutex); /* link_pwq() can handle duplicate calls */ link_pwq(pwq); old_pwq = rcu_access_pointer(*slot); rcu_assign_pointer(*slot, pwq); return old_pwq; } /* context to store the prepared attrs & pwqs before applying */ struct apply_wqattrs_ctx { struct workqueue_struct *wq; /* target workqueue */ struct workqueue_attrs *attrs; /* attrs to apply */ struct list_head list; /* queued for batching commit */ struct pool_workqueue *dfl_pwq; struct pool_workqueue *pwq_tbl[]; }; /* free the resources after success or abort */ static void apply_wqattrs_cleanup(struct apply_wqattrs_ctx *ctx) { if (ctx) { int cpu; for_each_possible_cpu(cpu) put_pwq_unlocked(ctx->pwq_tbl[cpu]); put_pwq_unlocked(ctx->dfl_pwq); free_workqueue_attrs(ctx->attrs); kfree(ctx); } } /* allocate the attrs and pwqs for later installation */ static struct apply_wqattrs_ctx * apply_wqattrs_prepare(struct workqueue_struct *wq, const struct workqueue_attrs *attrs, const cpumask_var_t unbound_cpumask) { struct apply_wqattrs_ctx *ctx; struct workqueue_attrs *new_attrs; int cpu; lockdep_assert_held(&wq_pool_mutex); if (WARN_ON(attrs->affn_scope < 0 || attrs->affn_scope >= WQ_AFFN_NR_TYPES)) return ERR_PTR(-EINVAL); ctx = kzalloc(struct_size(ctx, pwq_tbl, nr_cpu_ids), GFP_KERNEL); new_attrs = alloc_workqueue_attrs(); if (!ctx || !new_attrs) goto out_free; /* * If something goes wrong during CPU up/down, we'll fall back to * the default pwq covering whole @attrs->cpumask. Always create * it even if we don't use it immediately. */ copy_workqueue_attrs(new_attrs, attrs); wqattrs_actualize_cpumask(new_attrs, unbound_cpumask); cpumask_copy(new_attrs->__pod_cpumask, new_attrs->cpumask); ctx->dfl_pwq = alloc_unbound_pwq(wq, new_attrs); if (!ctx->dfl_pwq) goto out_free; for_each_possible_cpu(cpu) { if (new_attrs->ordered) { ctx->dfl_pwq->refcnt++; ctx->pwq_tbl[cpu] = ctx->dfl_pwq; } else { wq_calc_pod_cpumask(new_attrs, cpu); ctx->pwq_tbl[cpu] = alloc_unbound_pwq(wq, new_attrs); if (!ctx->pwq_tbl[cpu]) goto out_free; } } /* save the user configured attrs and sanitize it. */ copy_workqueue_attrs(new_attrs, attrs); cpumask_and(new_attrs->cpumask, new_attrs->cpumask, cpu_possible_mask); cpumask_copy(new_attrs->__pod_cpumask, new_attrs->cpumask); ctx->attrs = new_attrs; /* * For initialized ordered workqueues, there should only be one pwq * (dfl_pwq). Set the plugged flag of ctx->dfl_pwq to suspend execution * of newly queued work items until execution of older work items in * the old pwq's have completed. */ if ((wq->flags & __WQ_ORDERED) && !list_empty(&wq->pwqs)) ctx->dfl_pwq->plugged = true; ctx->wq = wq; return ctx; out_free: free_workqueue_attrs(new_attrs); apply_wqattrs_cleanup(ctx); return ERR_PTR(-ENOMEM); } /* set attrs and install prepared pwqs, @ctx points to old pwqs on return */ static void apply_wqattrs_commit(struct apply_wqattrs_ctx *ctx) { int cpu; /* all pwqs have been created successfully, let's install'em */ mutex_lock(&ctx->wq->mutex); copy_workqueue_attrs(ctx->wq->unbound_attrs, ctx->attrs); /* save the previous pwqs and install the new ones */ for_each_possible_cpu(cpu) ctx->pwq_tbl[cpu] = install_unbound_pwq(ctx->wq, cpu, ctx->pwq_tbl[cpu]); ctx->dfl_pwq = install_unbound_pwq(ctx->wq, -1, ctx->dfl_pwq); /* update node_nr_active->max */ wq_update_node_max_active(ctx->wq, -1); /* rescuer needs to respect wq cpumask changes */ if (ctx->wq->rescuer) set_cpus_allowed_ptr(ctx->wq->rescuer->task, unbound_effective_cpumask(ctx->wq)); mutex_unlock(&ctx->wq->mutex); } static int apply_workqueue_attrs_locked(struct workqueue_struct *wq, const struct workqueue_attrs *attrs) { struct apply_wqattrs_ctx *ctx; /* only unbound workqueues can change attributes */ if (WARN_ON(!(wq->flags & WQ_UNBOUND))) return -EINVAL; ctx = apply_wqattrs_prepare(wq, attrs, wq_unbound_cpumask); if (IS_ERR(ctx)) return PTR_ERR(ctx); /* the ctx has been prepared successfully, let's commit it */ apply_wqattrs_commit(ctx); apply_wqattrs_cleanup(ctx); return 0; } /** * apply_workqueue_attrs - apply new workqueue_attrs to an unbound workqueue * @wq: the target workqueue * @attrs: the workqueue_attrs to apply, allocated with alloc_workqueue_attrs() * * Apply @attrs to an unbound workqueue @wq. Unless disabled, this function maps * a separate pwq to each CPU pod with possibles CPUs in @attrs->cpumask so that * work items are affine to the pod it was issued on. Older pwqs are released as * in-flight work items finish. Note that a work item which repeatedly requeues * itself back-to-back will stay on its current pwq. * * Performs GFP_KERNEL allocations. * * Return: 0 on success and -errno on failure. */ int apply_workqueue_attrs(struct workqueue_struct *wq, const struct workqueue_attrs *attrs) { int ret; mutex_lock(&wq_pool_mutex); ret = apply_workqueue_attrs_locked(wq, attrs); mutex_unlock(&wq_pool_mutex); return ret; } /** * unbound_wq_update_pwq - update a pwq slot for CPU hot[un]plug * @wq: the target workqueue * @cpu: the CPU to update the pwq slot for * * This function is to be called from %CPU_DOWN_PREPARE, %CPU_ONLINE and * %CPU_DOWN_FAILED. @cpu is in the same pod of the CPU being hot[un]plugged. * * * If pod affinity can't be adjusted due to memory allocation failure, it falls * back to @wq->dfl_pwq which may not be optimal but is always correct. * * Note that when the last allowed CPU of a pod goes offline for a workqueue * with a cpumask spanning multiple pods, the workers which were already * executing the work items for the workqueue will lose their CPU affinity and * may execute on any CPU. This is similar to how per-cpu workqueues behave on * CPU_DOWN. If a workqueue user wants strict affinity, it's the user's * responsibility to flush the work item from CPU_DOWN_PREPARE. */ static void unbound_wq_update_pwq(struct workqueue_struct *wq, int cpu) { struct pool_workqueue *old_pwq = NULL, *pwq; struct workqueue_attrs *target_attrs; lockdep_assert_held(&wq_pool_mutex); if (!(wq->flags & WQ_UNBOUND) || wq->unbound_attrs->ordered) return; /* * We don't wanna alloc/free wq_attrs for each wq for each CPU. * Let's use a preallocated one. The following buf is protected by * CPU hotplug exclusion. */ target_attrs = unbound_wq_update_pwq_attrs_buf; copy_workqueue_attrs(target_attrs, wq->unbound_attrs); wqattrs_actualize_cpumask(target_attrs, wq_unbound_cpumask); /* nothing to do if the target cpumask matches the current pwq */ wq_calc_pod_cpumask(target_attrs, cpu); if (wqattrs_equal(target_attrs, unbound_pwq(wq, cpu)->pool->attrs)) return; /* create a new pwq */ pwq = alloc_unbound_pwq(wq, target_attrs); if (!pwq) { pr_warn("workqueue: allocation failed while updating CPU pod affinity of \"%s\"\n", wq->name); goto use_dfl_pwq; } /* Install the new pwq. */ mutex_lock(&wq->mutex); old_pwq = install_unbound_pwq(wq, cpu, pwq); goto out_unlock; use_dfl_pwq: mutex_lock(&wq->mutex); pwq = unbound_pwq(wq, -1); raw_spin_lock_irq(&pwq->pool->lock); get_pwq(pwq); raw_spin_unlock_irq(&pwq->pool->lock); old_pwq = install_unbound_pwq(wq, cpu, pwq); out_unlock: mutex_unlock(&wq->mutex); put_pwq_unlocked(old_pwq); } static int alloc_and_link_pwqs(struct workqueue_struct *wq) { bool highpri = wq->flags & WQ_HIGHPRI; int cpu, ret; lockdep_assert_held(&wq_pool_mutex); wq->cpu_pwq = alloc_percpu(struct pool_workqueue *); if (!wq->cpu_pwq) goto enomem; if (!(wq->flags & WQ_UNBOUND)) { struct worker_pool __percpu *pools; if (wq->flags & WQ_BH) pools = bh_worker_pools; else pools = cpu_worker_pools; for_each_possible_cpu(cpu) { struct pool_workqueue **pwq_p; struct worker_pool *pool; pool = &(per_cpu_ptr(pools, cpu)[highpri]); pwq_p = per_cpu_ptr(wq->cpu_pwq, cpu); *pwq_p = kmem_cache_alloc_node(pwq_cache, GFP_KERNEL, pool->node); if (!*pwq_p) goto enomem; init_pwq(*pwq_p, wq, pool); mutex_lock(&wq->mutex); link_pwq(*pwq_p); mutex_unlock(&wq->mutex); } return 0; } if (wq->flags & __WQ_ORDERED) { struct pool_workqueue *dfl_pwq; ret = apply_workqueue_attrs_locked(wq, ordered_wq_attrs[highpri]); /* there should only be single pwq for ordering guarantee */ dfl_pwq = rcu_access_pointer(wq->dfl_pwq); WARN(!ret && (wq->pwqs.next != &dfl_pwq->pwqs_node || wq->pwqs.prev != &dfl_pwq->pwqs_node), "ordering guarantee broken for workqueue %s\n", wq->name); } else { ret = apply_workqueue_attrs_locked(wq, unbound_std_wq_attrs[highpri]); } return ret; enomem: if (wq->cpu_pwq) { for_each_possible_cpu(cpu) { struct pool_workqueue *pwq = *per_cpu_ptr(wq->cpu_pwq, cpu); if (pwq) kmem_cache_free(pwq_cache, pwq); } free_percpu(wq->cpu_pwq); wq->cpu_pwq = NULL; } return -ENOMEM; } static int wq_clamp_max_active(int max_active, unsigned int flags, const char *name) { if (max_active < 1 || max_active > WQ_MAX_ACTIVE) pr_warn("workqueue: max_active %d requested for %s is out of range, clamping between %d and %d\n", max_active, name, 1, WQ_MAX_ACTIVE); return clamp_val(max_active, 1, WQ_MAX_ACTIVE); } /* * Workqueues which may be used during memory reclaim should have a rescuer * to guarantee forward progress. */ static int init_rescuer(struct workqueue_struct *wq) { struct worker *rescuer; char id_buf[WORKER_ID_LEN]; int ret; lockdep_assert_held(&wq_pool_mutex); if (!(wq->flags & WQ_MEM_RECLAIM)) return 0; rescuer = alloc_worker(NUMA_NO_NODE); if (!rescuer) { pr_err("workqueue: Failed to allocate a rescuer for wq \"%s\"\n", wq->name); return -ENOMEM; } rescuer->rescue_wq = wq; format_worker_id(id_buf, sizeof(id_buf), rescuer, NULL); rescuer->task = kthread_create(rescuer_thread, rescuer, "%s", id_buf); if (IS_ERR(rescuer->task)) { ret = PTR_ERR(rescuer->task); pr_err("workqueue: Failed to create a rescuer kthread for wq \"%s\": %pe", wq->name, ERR_PTR(ret)); kfree(rescuer); return ret; } wq->rescuer = rescuer; if (wq->flags & WQ_UNBOUND) kthread_bind_mask(rescuer->task, unbound_effective_cpumask(wq)); else kthread_bind_mask(rescuer->task, cpu_possible_mask); wake_up_process(rescuer->task); return 0; } /** * wq_adjust_max_active - update a wq's max_active to the current setting * @wq: target workqueue * * If @wq isn't freezing, set @wq->max_active to the saved_max_active and * activate inactive work items accordingly. If @wq is freezing, clear * @wq->max_active to zero. */ static void wq_adjust_max_active(struct workqueue_struct *wq) { bool activated; int new_max, new_min; lockdep_assert_held(&wq->mutex); if ((wq->flags & WQ_FREEZABLE) && workqueue_freezing) { new_max = 0; new_min = 0; } else { new_max = wq->saved_max_active; new_min = wq->saved_min_active; } if (wq->max_active == new_max && wq->min_active == new_min) return; /* * Update @wq->max/min_active and then kick inactive work items if more * active work items are allowed. This doesn't break work item ordering * because new work items are always queued behind existing inactive * work items if there are any. */ WRITE_ONCE(wq->max_active, new_max); WRITE_ONCE(wq->min_active, new_min); if (wq->flags & WQ_UNBOUND) wq_update_node_max_active(wq, -1); if (new_max == 0) return; /* * Round-robin through pwq's activating the first inactive work item * until max_active is filled. */ do { struct pool_workqueue *pwq; activated = false; for_each_pwq(pwq, wq) { unsigned long irq_flags; /* can be called during early boot w/ irq disabled */ raw_spin_lock_irqsave(&pwq->pool->lock, irq_flags); if (pwq_activate_first_inactive(pwq, true)) { activated = true; kick_pool(pwq->pool); } raw_spin_unlock_irqrestore(&pwq->pool->lock, irq_flags); } } while (activated); } __printf(1, 0) static struct workqueue_struct *__alloc_workqueue(const char *fmt, unsigned int flags, int max_active, va_list args) { struct workqueue_struct *wq; size_t wq_size; int name_len; if (flags & WQ_BH) { if (WARN_ON_ONCE(flags & ~__WQ_BH_ALLOWS)) return NULL; if (WARN_ON_ONCE(max_active)) return NULL; } /* see the comment above the definition of WQ_POWER_EFFICIENT */ if ((flags & WQ_POWER_EFFICIENT) && wq_power_efficient) flags |= WQ_UNBOUND; /* allocate wq and format name */ if (flags & WQ_UNBOUND) wq_size = struct_size(wq, node_nr_active, nr_node_ids + 1); else wq_size = sizeof(*wq); wq = kzalloc_noprof(wq_size, GFP_KERNEL); if (!wq) return NULL; if (flags & WQ_UNBOUND) { wq->unbound_attrs = alloc_workqueue_attrs_noprof(); if (!wq->unbound_attrs) goto err_free_wq; } name_len = vsnprintf(wq->name, sizeof(wq->name), fmt, args); if (name_len >= WQ_NAME_LEN) pr_warn_once("workqueue: name exceeds WQ_NAME_LEN. Truncating to: %s\n", wq->name); if (flags & WQ_BH) { /* * BH workqueues always share a single execution context per CPU * and don't impose any max_active limit. */ max_active = INT_MAX; } else { max_active = max_active ?: WQ_DFL_ACTIVE; max_active = wq_clamp_max_active(max_active, flags, wq->name); } /* init wq */ wq->flags = flags; wq->max_active = max_active; wq->min_active = min(max_active, WQ_DFL_MIN_ACTIVE); wq->saved_max_active = wq->max_active; wq->saved_min_active = wq->min_active; mutex_init(&wq->mutex); atomic_set(&wq->nr_pwqs_to_flush, 0); INIT_LIST_HEAD(&wq->pwqs); INIT_LIST_HEAD(&wq->flusher_queue); INIT_LIST_HEAD(&wq->flusher_overflow); INIT_LIST_HEAD(&wq->maydays); INIT_LIST_HEAD(&wq->list); if (flags & WQ_UNBOUND) { if (alloc_node_nr_active(wq->node_nr_active) < 0) goto err_free_wq; } /* * wq_pool_mutex protects the workqueues list, allocations of PWQs, * and the global freeze state. */ apply_wqattrs_lock(); if (alloc_and_link_pwqs(wq) < 0) goto err_unlock_free_node_nr_active; mutex_lock(&wq->mutex); wq_adjust_max_active(wq); mutex_unlock(&wq->mutex); list_add_tail_rcu(&wq->list, &workqueues); if (wq_online && init_rescuer(wq) < 0) goto err_unlock_destroy; apply_wqattrs_unlock(); if ((wq->flags & WQ_SYSFS) && workqueue_sysfs_register(wq)) goto err_destroy; return wq; err_unlock_free_node_nr_active: apply_wqattrs_unlock(); /* * Failed alloc_and_link_pwqs() may leave pending pwq->release_work, * flushing the pwq_release_worker ensures that the pwq_release_workfn() * completes before calling kfree(wq). */ if (wq->flags & WQ_UNBOUND) { kthread_flush_worker(pwq_release_worker); free_node_nr_active(wq->node_nr_active); } err_free_wq: free_workqueue_attrs(wq->unbound_attrs); kfree(wq); return NULL; err_unlock_destroy: apply_wqattrs_unlock(); err_destroy: destroy_workqueue(wq); return NULL; } __printf(1, 4) struct workqueue_struct *alloc_workqueue_noprof(const char *fmt, unsigned int flags, int max_active, ...) { struct workqueue_struct *wq; va_list args; va_start(args, max_active); wq = __alloc_workqueue(fmt, flags, max_active, args); va_end(args); if (!wq) return NULL; wq_init_lockdep(wq); return wq; } EXPORT_SYMBOL_GPL(alloc_workqueue_noprof); #ifdef CONFIG_LOCKDEP __printf(1, 5) struct workqueue_struct * alloc_workqueue_lockdep_map(const char *fmt, unsigned int flags, int max_active, struct lockdep_map *lockdep_map, ...) { struct workqueue_struct *wq; va_list args; va_start(args, lockdep_map); wq = __alloc_workqueue(fmt, flags, max_active, args); va_end(args); if (!wq) return NULL; wq->lockdep_map = lockdep_map; return wq; } EXPORT_SYMBOL_GPL(alloc_workqueue_lockdep_map); #endif static bool pwq_busy(struct pool_workqueue *pwq) { int i; for (i = 0; i < WORK_NR_COLORS; i++) if (pwq->nr_in_flight[i]) return true; if ((pwq != rcu_access_pointer(pwq->wq->dfl_pwq)) && (pwq->refcnt > 1)) return true; if (!pwq_is_empty(pwq)) return true; return false; } /** * destroy_workqueue - safely terminate a workqueue * @wq: target workqueue * * Safely destroy a workqueue. All work currently pending will be done first. * * This function does NOT guarantee that non-pending work that has been * submitted with queue_delayed_work() and similar functions will be done * before destroying the workqueue. The fundamental problem is that, currently, * the workqueue has no way of accessing non-pending delayed_work. delayed_work * is only linked on the timer-side. All delayed_work must, therefore, be * canceled before calling this function. * * TODO: It would be better if the problem described above wouldn't exist and * destroy_workqueue() would cleanly cancel all pending and non-pending * delayed_work. */ void destroy_workqueue(struct workqueue_struct *wq) { struct pool_workqueue *pwq; int cpu; /* * Remove it from sysfs first so that sanity check failure doesn't * lead to sysfs name conflicts. */ workqueue_sysfs_unregister(wq); /* mark the workqueue destruction is in progress */ mutex_lock(&wq->mutex); wq->flags |= __WQ_DESTROYING; mutex_unlock(&wq->mutex); /* drain it before proceeding with destruction */ drain_workqueue(wq); /* kill rescuer, if sanity checks fail, leave it w/o rescuer */ if (wq->rescuer) { struct worker *rescuer = wq->rescuer; /* this prevents new queueing */ raw_spin_lock_irq(&wq_mayday_lock); wq->rescuer = NULL; raw_spin_unlock_irq(&wq_mayday_lock); /* rescuer will empty maydays list before exiting */ kthread_stop(rescuer->task); kfree(rescuer); } /* * Sanity checks - grab all the locks so that we wait for all * in-flight operations which may do put_pwq(). */ mutex_lock(&wq_pool_mutex); mutex_lock(&wq->mutex); for_each_pwq(pwq, wq) { raw_spin_lock_irq(&pwq->pool->lock); if (WARN_ON(pwq_busy(pwq))) { pr_warn("%s: %s has the following busy pwq\n", __func__, wq->name); show_pwq(pwq); raw_spin_unlock_irq(&pwq->pool->lock); mutex_unlock(&wq->mutex); mutex_unlock(&wq_pool_mutex); show_one_workqueue(wq); return; } raw_spin_unlock_irq(&pwq->pool->lock); } mutex_unlock(&wq->mutex); /* * wq list is used to freeze wq, remove from list after * flushing is complete in case freeze races us. */ list_del_rcu(&wq->list); mutex_unlock(&wq_pool_mutex); /* * We're the sole accessor of @wq. Directly access cpu_pwq and dfl_pwq * to put the base refs. @wq will be auto-destroyed from the last * pwq_put. RCU read lock prevents @wq from going away from under us. */ rcu_read_lock(); for_each_possible_cpu(cpu) { put_pwq_unlocked(unbound_pwq(wq, cpu)); RCU_INIT_POINTER(*unbound_pwq_slot(wq, cpu), NULL); } put_pwq_unlocked(unbound_pwq(wq, -1)); RCU_INIT_POINTER(*unbound_pwq_slot(wq, -1), NULL); rcu_read_unlock(); } EXPORT_SYMBOL_GPL(destroy_workqueue); /** * workqueue_set_max_active - adjust max_active of a workqueue * @wq: target workqueue * @max_active: new max_active value. * * Set max_active of @wq to @max_active. See the alloc_workqueue() function * comment. * * CONTEXT: * Don't call from IRQ context. */ void workqueue_set_max_active(struct workqueue_struct *wq, int max_active) { /* max_active doesn't mean anything for BH workqueues */ if (WARN_ON(wq->flags & WQ_BH)) return; /* disallow meddling with max_active for ordered workqueues */ if (WARN_ON(wq->flags & __WQ_ORDERED)) return; max_active = wq_clamp_max_active(max_active, wq->flags, wq->name); mutex_lock(&wq->mutex); wq->saved_max_active = max_active; if (wq->flags & WQ_UNBOUND) wq->saved_min_active = min(wq->saved_min_active, max_active); wq_adjust_max_active(wq); mutex_unlock(&wq->mutex); } EXPORT_SYMBOL_GPL(workqueue_set_max_active); /** * workqueue_set_min_active - adjust min_active of an unbound workqueue * @wq: target unbound workqueue * @min_active: new min_active value * * Set min_active of an unbound workqueue. Unlike other types of workqueues, an * unbound workqueue is not guaranteed to be able to process max_active * interdependent work items. Instead, an unbound workqueue is guaranteed to be * able to process min_active number of interdependent work items which is * %WQ_DFL_MIN_ACTIVE by default. * * Use this function to adjust the min_active value between 0 and the current * max_active. */ void workqueue_set_min_active(struct workqueue_struct *wq, int min_active) { /* min_active is only meaningful for non-ordered unbound workqueues */ if (WARN_ON((wq->flags & (WQ_BH | WQ_UNBOUND | __WQ_ORDERED)) != WQ_UNBOUND)) return; mutex_lock(&wq->mutex); wq->saved_min_active = clamp(min_active, 0, wq->saved_max_active); wq_adjust_max_active(wq); mutex_unlock(&wq->mutex); } /** * current_work - retrieve %current task's work struct * * Determine if %current task is a workqueue worker and what it's working on. * Useful to find out the context that the %current task is running in. * * Return: work struct if %current task is a workqueue worker, %NULL otherwise. */ struct work_struct *current_work(void) { struct worker *worker = current_wq_worker(); return worker ? worker->current_work : NULL; } EXPORT_SYMBOL(current_work); /** * current_is_workqueue_rescuer - is %current workqueue rescuer? * * Determine whether %current is a workqueue rescuer. Can be used from * work functions to determine whether it's being run off the rescuer task. * * Return: %true if %current is a workqueue rescuer. %false otherwise. */ bool current_is_workqueue_rescuer(void) { struct worker *worker = current_wq_worker(); return worker && worker->rescue_wq; } /** * workqueue_congested - test whether a workqueue is congested * @cpu: CPU in question * @wq: target workqueue * * Test whether @wq's cpu workqueue for @cpu is congested. There is * no synchronization around this function and the test result is * unreliable and only useful as advisory hints or for debugging. * * If @cpu is WORK_CPU_UNBOUND, the test is performed on the local CPU. * * With the exception of ordered workqueues, all workqueues have per-cpu * pool_workqueues, each with its own congested state. A workqueue being * congested on one CPU doesn't mean that the workqueue is contested on any * other CPUs. * * Return: * %true if congested, %false otherwise. */ bool workqueue_congested(int cpu, struct workqueue_struct *wq) { struct pool_workqueue *pwq; bool ret; rcu_read_lock(); preempt_disable(); if (cpu == WORK_CPU_UNBOUND) cpu = smp_processor_id(); pwq = *per_cpu_ptr(wq->cpu_pwq, cpu); ret = !list_empty(&pwq->inactive_works); preempt_enable(); rcu_read_unlock(); return ret; } EXPORT_SYMBOL_GPL(workqueue_congested); /** * work_busy - test whether a work is currently pending or running * @work: the work to be tested * * Test whether @work is currently pending or running. There is no * synchronization around this function and the test result is * unreliable and only useful as advisory hints or for debugging. * * Return: * OR'd bitmask of WORK_BUSY_* bits. */ unsigned int work_busy(struct work_struct *work) { struct worker_pool *pool; unsigned long irq_flags; unsigned int ret = 0; if (work_pending(work)) ret |= WORK_BUSY_PENDING; rcu_read_lock(); pool = get_work_pool(work); if (pool) { raw_spin_lock_irqsave(&pool->lock, irq_flags); if (find_worker_executing_work(pool, work)) ret |= WORK_BUSY_RUNNING; raw_spin_unlock_irqrestore(&pool->lock, irq_flags); } rcu_read_unlock(); return ret; } EXPORT_SYMBOL_GPL(work_busy); /** * set_worker_desc - set description for the current work item * @fmt: printf-style format string * @...: arguments for the format string * * This function can be called by a running work function to describe what * the work item is about. If the worker task gets dumped, this * information will be printed out together to help debugging. The * description can be at most WORKER_DESC_LEN including the trailing '\0'. */ void set_worker_desc(const char *fmt, ...) { struct worker *worker = current_wq_worker(); va_list args; if (worker) { va_start(args, fmt); vsnprintf(worker->desc, sizeof(worker->desc), fmt, args); va_end(args); } } EXPORT_SYMBOL_GPL(set_worker_desc); /** * print_worker_info - print out worker information and description * @log_lvl: the log level to use when printing * @task: target task * * If @task is a worker and currently executing a work item, print out the * name of the workqueue being serviced and worker description set with * set_worker_desc() by the currently executing work item. * * This function can be safely called on any task as long as the * task_struct itself is accessible. While safe, this function isn't * synchronized and may print out mixups or garbages of limited length. */ void print_worker_info(const char *log_lvl, struct task_struct *task) { work_func_t *fn = NULL; char name[WQ_NAME_LEN] = { }; char desc[WORKER_DESC_LEN] = { }; struct pool_workqueue *pwq = NULL; struct workqueue_struct *wq = NULL; struct worker *worker; if (!(task->flags & PF_WQ_WORKER)) return; /* * This function is called without any synchronization and @task * could be in any state. Be careful with dereferences. */ worker = kthread_probe_data(task); /* * Carefully copy the associated workqueue's workfn, name and desc. * Keep the original last '\0' in case the original is garbage. */ copy_from_kernel_nofault(&fn, &worker->current_func, sizeof(fn)); copy_from_kernel_nofault(&pwq, &worker->current_pwq, sizeof(pwq)); copy_from_kernel_nofault(&wq, &pwq->wq, sizeof(wq)); copy_from_kernel_nofault(name, wq->name, sizeof(name) - 1); copy_from_kernel_nofault(desc, worker->desc, sizeof(desc) - 1); if (fn || name[0] || desc[0]) { printk("%sWorkqueue: %s %ps", log_lvl, name, fn); if (strcmp(name, desc)) pr_cont(" (%s)", desc); pr_cont("\n"); } } static void pr_cont_pool_info(struct worker_pool *pool) { pr_cont(" cpus=%*pbl", nr_cpumask_bits, pool->attrs->cpumask); if (pool->node != NUMA_NO_NODE) pr_cont(" node=%d", pool->node); pr_cont(" flags=0x%x", pool->flags); if (pool->flags & POOL_BH) pr_cont(" bh%s", pool->attrs->nice == HIGHPRI_NICE_LEVEL ? "-hi" : ""); else pr_cont(" nice=%d", pool->attrs->nice); } static void pr_cont_worker_id(struct worker *worker) { struct worker_pool *pool = worker->pool; if (pool->flags & WQ_BH) pr_cont("bh%s", pool->attrs->nice == HIGHPRI_NICE_LEVEL ? "-hi" : ""); else pr_cont("%d%s", task_pid_nr(worker->task), worker->rescue_wq ? "(RESCUER)" : ""); } struct pr_cont_work_struct { bool comma; work_func_t func; long ctr; }; static void pr_cont_work_flush(bool comma, work_func_t func, struct pr_cont_work_struct *pcwsp) { if (!pcwsp->ctr) goto out_record; if (func == pcwsp->func) { pcwsp->ctr++; return; } if (pcwsp->ctr == 1) pr_cont("%s %ps", pcwsp->comma ? "," : "", pcwsp->func); else pr_cont("%s %ld*%ps", pcwsp->comma ? "," : "", pcwsp->ctr, pcwsp->func); pcwsp->ctr = 0; out_record: if ((long)func == -1L) return; pcwsp->comma = comma; pcwsp->func = func; pcwsp->ctr = 1; } static void pr_cont_work(bool comma, struct work_struct *work, struct pr_cont_work_struct *pcwsp) { if (work->func == wq_barrier_func) { struct wq_barrier *barr; barr = container_of(work, struct wq_barrier, work); pr_cont_work_flush(comma, (work_func_t)-1, pcwsp); pr_cont("%s BAR(%d)", comma ? "," : "", task_pid_nr(barr->task)); } else { if (!comma) pr_cont_work_flush(comma, (work_func_t)-1, pcwsp); pr_cont_work_flush(comma, work->func, pcwsp); } } static void show_pwq(struct pool_workqueue *pwq) { struct pr_cont_work_struct pcws = { .ctr = 0, }; struct worker_pool *pool = pwq->pool; struct work_struct *work; struct worker *worker; bool has_in_flight = false, has_pending = false; int bkt; pr_info(" pwq %d:", pool->id); pr_cont_pool_info(pool); pr_cont(" active=%d refcnt=%d%s\n", pwq->nr_active, pwq->refcnt, !list_empty(&pwq->mayday_node) ? " MAYDAY" : ""); hash_for_each(pool->busy_hash, bkt, worker, hentry) { if (worker->current_pwq == pwq) { has_in_flight = true; break; } } if (has_in_flight) { bool comma = false; pr_info(" in-flight:"); hash_for_each(pool->busy_hash, bkt, worker, hentry) { if (worker->current_pwq != pwq) continue; pr_cont(" %s", comma ? "," : ""); pr_cont_worker_id(worker); pr_cont(":%ps", worker->current_func); list_for_each_entry(work, &worker->scheduled, entry) pr_cont_work(false, work, &pcws); pr_cont_work_flush(comma, (work_func_t)-1L, &pcws); comma = true; } pr_cont("\n"); } list_for_each_entry(work, &pool->worklist, entry) { if (get_work_pwq(work) == pwq) { has_pending = true; break; } } if (has_pending) { bool comma = false; pr_info(" pending:"); list_for_each_entry(work, &pool->worklist, entry) { if (get_work_pwq(work) != pwq) continue; pr_cont_work(comma, work, &pcws); comma = !(*work_data_bits(work) & WORK_STRUCT_LINKED); } pr_cont_work_flush(comma, (work_func_t)-1L, &pcws); pr_cont("\n"); } if (!list_empty(&pwq->inactive_works)) { bool comma = false; pr_info(" inactive:"); list_for_each_entry(work, &pwq->inactive_works, entry) { pr_cont_work(comma, work, &pcws); comma = !(*work_data_bits(work) & WORK_STRUCT_LINKED); } pr_cont_work_flush(comma, (work_func_t)-1L, &pcws); pr_cont("\n"); } } /** * show_one_workqueue - dump state of specified workqueue * @wq: workqueue whose state will be printed */ void show_one_workqueue(struct workqueue_struct *wq) { struct pool_workqueue *pwq; bool idle = true; unsigned long irq_flags; for_each_pwq(pwq, wq) { if (!pwq_is_empty(pwq)) { idle = false; break; } } if (idle) /* Nothing to print for idle workqueue */ return; pr_info("workqueue %s: flags=0x%x\n", wq->name, wq->flags); for_each_pwq(pwq, wq) { raw_spin_lock_irqsave(&pwq->pool->lock, irq_flags); if (!pwq_is_empty(pwq)) { /* * Defer printing to avoid deadlocks in console * drivers that queue work while holding locks * also taken in their write paths. */ printk_deferred_enter(); show_pwq(pwq); printk_deferred_exit(); } raw_spin_unlock_irqrestore(&pwq->pool->lock, irq_flags); /* * We could be printing a lot from atomic context, e.g. * sysrq-t -> show_all_workqueues(). Avoid triggering * hard lockup. */ touch_nmi_watchdog(); } } /** * show_one_worker_pool - dump state of specified worker pool * @pool: worker pool whose state will be printed */ static void show_one_worker_pool(struct worker_pool *pool) { struct worker *worker; bool first = true; unsigned long irq_flags; unsigned long hung = 0; raw_spin_lock_irqsave(&pool->lock, irq_flags); if (pool->nr_workers == pool->nr_idle) goto next_pool; /* How long the first pending work is waiting for a worker. */ if (!list_empty(&pool->worklist)) hung = jiffies_to_msecs(jiffies - pool->watchdog_ts) / 1000; /* * Defer printing to avoid deadlocks in console drivers that * queue work while holding locks also taken in their write * paths. */ printk_deferred_enter(); pr_info("pool %d:", pool->id); pr_cont_pool_info(pool); pr_cont(" hung=%lus workers=%d", hung, pool->nr_workers); if (pool->manager) pr_cont(" manager: %d", task_pid_nr(pool->manager->task)); list_for_each_entry(worker, &pool->idle_list, entry) { pr_cont(" %s", first ? "idle: " : ""); pr_cont_worker_id(worker); first = false; } pr_cont("\n"); printk_deferred_exit(); next_pool: raw_spin_unlock_irqrestore(&pool->lock, irq_flags); /* * We could be printing a lot from atomic context, e.g. * sysrq-t -> show_all_workqueues(). Avoid triggering * hard lockup. */ touch_nmi_watchdog(); } /** * show_all_workqueues - dump workqueue state * * Called from a sysrq handler and prints out all busy workqueues and pools. */ void show_all_workqueues(void) { struct workqueue_struct *wq; struct worker_pool *pool; int pi; rcu_read_lock(); pr_info("Showing busy workqueues and worker pools:\n"); list_for_each_entry_rcu(wq, &workqueues, list) show_one_workqueue(wq); for_each_pool(pool, pi) show_one_worker_pool(pool); rcu_read_unlock(); } /** * show_freezable_workqueues - dump freezable workqueue state * * Called from try_to_freeze_tasks() and prints out all freezable workqueues * still busy. */ void show_freezable_workqueues(void) { struct workqueue_struct *wq; rcu_read_lock(); pr_info("Showing freezable workqueues that are still busy:\n"); list_for_each_entry_rcu(wq, &workqueues, list) { if (!(wq->flags & WQ_FREEZABLE)) continue; show_one_workqueue(wq); } rcu_read_unlock(); } /* used to show worker information through /proc/PID/{comm,stat,status} */ void wq_worker_comm(char *buf, size_t size, struct task_struct *task) { /* stabilize PF_WQ_WORKER and worker pool association */ mutex_lock(&wq_pool_attach_mutex); if (task->flags & PF_WQ_WORKER) { struct worker *worker = kthread_data(task); struct worker_pool *pool = worker->pool; int off; off = format_worker_id(buf, size, worker, pool); if (pool) { raw_spin_lock_irq(&pool->lock); /* * ->desc tracks information (wq name or * set_worker_desc()) for the latest execution. If * current, prepend '+', otherwise '-'. */ if (worker->desc[0] != '\0') { if (worker->current_work) scnprintf(buf + off, size - off, "+%s", worker->desc); else scnprintf(buf + off, size - off, "-%s", worker->desc); } raw_spin_unlock_irq(&pool->lock); } } else { strscpy(buf, task->comm, size); } mutex_unlock(&wq_pool_attach_mutex); } #ifdef CONFIG_SMP /* * CPU hotplug. * * There are two challenges in supporting CPU hotplug. Firstly, there * are a lot of assumptions on strong associations among work, pwq and * pool which make migrating pending and scheduled works very * difficult to implement without impacting hot paths. Secondly, * worker pools serve mix of short, long and very long running works making * blocked draining impractical. * * This is solved by allowing the pools to be disassociated from the CPU * running as an unbound one and allowing it to be reattached later if the * cpu comes back online. */ static void unbind_workers(int cpu) { struct worker_pool *pool; struct worker *worker; for_each_cpu_worker_pool(pool, cpu) { mutex_lock(&wq_pool_attach_mutex); raw_spin_lock_irq(&pool->lock); /* * We've blocked all attach/detach operations. Make all workers * unbound and set DISASSOCIATED. Before this, all workers * must be on the cpu. After this, they may become diasporas. * And the preemption disabled section in their sched callbacks * are guaranteed to see WORKER_UNBOUND since the code here * is on the same cpu. */ for_each_pool_worker(worker, pool) worker->flags |= WORKER_UNBOUND; pool->flags |= POOL_DISASSOCIATED; /* * The handling of nr_running in sched callbacks are disabled * now. Zap nr_running. After this, nr_running stays zero and * need_more_worker() and keep_working() are always true as * long as the worklist is not empty. This pool now behaves as * an unbound (in terms of concurrency management) pool which * are served by workers tied to the pool. */ pool->nr_running = 0; /* * With concurrency management just turned off, a busy * worker blocking could lead to lengthy stalls. Kick off * unbound chain execution of currently pending work items. */ kick_pool(pool); raw_spin_unlock_irq(&pool->lock); for_each_pool_worker(worker, pool) unbind_worker(worker); mutex_unlock(&wq_pool_attach_mutex); } } /** * rebind_workers - rebind all workers of a pool to the associated CPU * @pool: pool of interest * * @pool->cpu is coming online. Rebind all workers to the CPU. */ static void rebind_workers(struct worker_pool *pool) { struct worker *worker; lockdep_assert_held(&wq_pool_attach_mutex); /* * Restore CPU affinity of all workers. As all idle workers should * be on the run-queue of the associated CPU before any local * wake-ups for concurrency management happen, restore CPU affinity * of all workers first and then clear UNBOUND. As we're called * from CPU_ONLINE, the following shouldn't fail. */ for_each_pool_worker(worker, pool) { kthread_set_per_cpu(worker->task, pool->cpu); WARN_ON_ONCE(set_cpus_allowed_ptr(worker->task, pool_allowed_cpus(pool)) < 0); } raw_spin_lock_irq(&pool->lock); pool->flags &= ~POOL_DISASSOCIATED; for_each_pool_worker(worker, pool) { unsigned int worker_flags = worker->flags; /* * We want to clear UNBOUND but can't directly call * worker_clr_flags() or adjust nr_running. Atomically * replace UNBOUND with another NOT_RUNNING flag REBOUND. * @worker will clear REBOUND using worker_clr_flags() when * it initiates the next execution cycle thus restoring * concurrency management. Note that when or whether * @worker clears REBOUND doesn't affect correctness. * * WRITE_ONCE() is necessary because @worker->flags may be * tested without holding any lock in * wq_worker_running(). Without it, NOT_RUNNING test may * fail incorrectly leading to premature concurrency * management operations. */ WARN_ON_ONCE(!(worker_flags & WORKER_UNBOUND)); worker_flags |= WORKER_REBOUND; worker_flags &= ~WORKER_UNBOUND; WRITE_ONCE(worker->flags, worker_flags); } raw_spin_unlock_irq(&pool->lock); } /** * restore_unbound_workers_cpumask - restore cpumask of unbound workers * @pool: unbound pool of interest * @cpu: the CPU which is coming up * * An unbound pool may end up with a cpumask which doesn't have any online * CPUs. When a worker of such pool get scheduled, the scheduler resets * its cpus_allowed. If @cpu is in @pool's cpumask which didn't have any * online CPU before, cpus_allowed of all its workers should be restored. */ static void restore_unbound_workers_cpumask(struct worker_pool *pool, int cpu) { static cpumask_t cpumask; struct worker *worker; lockdep_assert_held(&wq_pool_attach_mutex); /* is @cpu allowed for @pool? */ if (!cpumask_test_cpu(cpu, pool->attrs->cpumask)) return; cpumask_and(&cpumask, pool->attrs->cpumask, cpu_online_mask); /* as we're called from CPU_ONLINE, the following shouldn't fail */ for_each_pool_worker(worker, pool) WARN_ON_ONCE(set_cpus_allowed_ptr(worker->task, &cpumask) < 0); } int workqueue_prepare_cpu(unsigned int cpu) { struct worker_pool *pool; for_each_cpu_worker_pool(pool, cpu) { if (pool->nr_workers) continue; if (!create_worker(pool)) return -ENOMEM; } return 0; } int workqueue_online_cpu(unsigned int cpu) { struct worker_pool *pool; struct workqueue_struct *wq; int pi; mutex_lock(&wq_pool_mutex); cpumask_set_cpu(cpu, wq_online_cpumask); for_each_pool(pool, pi) { /* BH pools aren't affected by hotplug */ if (pool->flags & POOL_BH) continue; mutex_lock(&wq_pool_attach_mutex); if (pool->cpu == cpu) rebind_workers(pool); else if (pool->cpu < 0) restore_unbound_workers_cpumask(pool, cpu); mutex_unlock(&wq_pool_attach_mutex); } /* update pod affinity of unbound workqueues */ list_for_each_entry(wq, &workqueues, list) { struct workqueue_attrs *attrs = wq->unbound_attrs; if (attrs) { const struct wq_pod_type *pt = wqattrs_pod_type(attrs); int tcpu; for_each_cpu(tcpu, pt->pod_cpus[pt->cpu_pod[cpu]]) unbound_wq_update_pwq(wq, tcpu); mutex_lock(&wq->mutex); wq_update_node_max_active(wq, -1); mutex_unlock(&wq->mutex); } } mutex_unlock(&wq_pool_mutex); return 0; } int workqueue_offline_cpu(unsigned int cpu) { struct workqueue_struct *wq; /* unbinding per-cpu workers should happen on the local CPU */ if (WARN_ON(cpu != smp_processor_id())) return -1; unbind_workers(cpu); /* update pod affinity of unbound workqueues */ mutex_lock(&wq_pool_mutex); cpumask_clear_cpu(cpu, wq_online_cpumask); list_for_each_entry(wq, &workqueues, list) { struct workqueue_attrs *attrs = wq->unbound_attrs; if (attrs) { const struct wq_pod_type *pt = wqattrs_pod_type(attrs); int tcpu; for_each_cpu(tcpu, pt->pod_cpus[pt->cpu_pod[cpu]]) unbound_wq_update_pwq(wq, tcpu); mutex_lock(&wq->mutex); wq_update_node_max_active(wq, cpu); mutex_unlock(&wq->mutex); } } mutex_unlock(&wq_pool_mutex); return 0; } struct work_for_cpu { struct work_struct work; long (*fn)(void *); void *arg; long ret; }; static void work_for_cpu_fn(struct work_struct *work) { struct work_for_cpu *wfc = container_of(work, struct work_for_cpu, work); wfc->ret = wfc->fn(wfc->arg); } /** * work_on_cpu_key - run a function in thread context on a particular cpu * @cpu: the cpu to run on * @fn: the function to run * @arg: the function arg * @key: The lock class key for lock debugging purposes * * It is up to the caller to ensure that the cpu doesn't go offline. * The caller must not hold any locks which would prevent @fn from completing. * * Return: The value @fn returns. */ long work_on_cpu_key(int cpu, long (*fn)(void *), void *arg, struct lock_class_key *key) { struct work_for_cpu wfc = { .fn = fn, .arg = arg }; INIT_WORK_ONSTACK_KEY(&wfc.work, work_for_cpu_fn, key); schedule_work_on(cpu, &wfc.work); flush_work(&wfc.work); destroy_work_on_stack(&wfc.work); return wfc.ret; } EXPORT_SYMBOL_GPL(work_on_cpu_key); #endif /* CONFIG_SMP */ #ifdef CONFIG_FREEZER /** * freeze_workqueues_begin - begin freezing workqueues * * Start freezing workqueues. After this function returns, all freezable * workqueues will queue new works to their inactive_works list instead of * pool->worklist. * * CONTEXT: * Grabs and releases wq_pool_mutex, wq->mutex and pool->lock's. */ void freeze_workqueues_begin(void) { struct workqueue_struct *wq; mutex_lock(&wq_pool_mutex); WARN_ON_ONCE(workqueue_freezing); workqueue_freezing = true; list_for_each_entry(wq, &workqueues, list) { mutex_lock(&wq->mutex); wq_adjust_max_active(wq); mutex_unlock(&wq->mutex); } mutex_unlock(&wq_pool_mutex); } /** * freeze_workqueues_busy - are freezable workqueues still busy? * * Check whether freezing is complete. This function must be called * between freeze_workqueues_begin() and thaw_workqueues(). * * CONTEXT: * Grabs and releases wq_pool_mutex. * * Return: * %true if some freezable workqueues are still busy. %false if freezing * is complete. */ bool freeze_workqueues_busy(void) { bool busy = false; struct workqueue_struct *wq; struct pool_workqueue *pwq; mutex_lock(&wq_pool_mutex); WARN_ON_ONCE(!workqueue_freezing); list_for_each_entry(wq, &workqueues, list) { if (!(wq->flags & WQ_FREEZABLE)) continue; /* * nr_active is monotonically decreasing. It's safe * to peek without lock. */ rcu_read_lock(); for_each_pwq(pwq, wq) { WARN_ON_ONCE(pwq->nr_active < 0); if (pwq->nr_active) { busy = true; rcu_read_unlock(); goto out_unlock; } } rcu_read_unlock(); } out_unlock: mutex_unlock(&wq_pool_mutex); return busy; } /** * thaw_workqueues - thaw workqueues * * Thaw workqueues. Normal queueing is restored and all collected * frozen works are transferred to their respective pool worklists. * * CONTEXT: * Grabs and releases wq_pool_mutex, wq->mutex and pool->lock's. */ void thaw_workqueues(void) { struct workqueue_struct *wq; mutex_lock(&wq_pool_mutex); if (!workqueue_freezing) goto out_unlock; workqueue_freezing = false; /* restore max_active and repopulate worklist */ list_for_each_entry(wq, &workqueues, list) { mutex_lock(&wq->mutex); wq_adjust_max_active(wq); mutex_unlock(&wq->mutex); } out_unlock: mutex_unlock(&wq_pool_mutex); } #endif /* CONFIG_FREEZER */ static int workqueue_apply_unbound_cpumask(const cpumask_var_t unbound_cpumask) { LIST_HEAD(ctxs); int ret = 0; struct workqueue_struct *wq; struct apply_wqattrs_ctx *ctx, *n; lockdep_assert_held(&wq_pool_mutex); list_for_each_entry(wq, &workqueues, list) { if (!(wq->flags & WQ_UNBOUND) || (wq->flags & __WQ_DESTROYING)) continue; ctx = apply_wqattrs_prepare(wq, wq->unbound_attrs, unbound_cpumask); if (IS_ERR(ctx)) { ret = PTR_ERR(ctx); break; } list_add_tail(&ctx->list, &ctxs); } list_for_each_entry_safe(ctx, n, &ctxs, list) { if (!ret) apply_wqattrs_commit(ctx); apply_wqattrs_cleanup(ctx); } if (!ret) { mutex_lock(&wq_pool_attach_mutex); cpumask_copy(wq_unbound_cpumask, unbound_cpumask); mutex_unlock(&wq_pool_attach_mutex); } return ret; } /** * workqueue_unbound_exclude_cpumask - Exclude given CPUs from unbound cpumask * @exclude_cpumask: the cpumask to be excluded from wq_unbound_cpumask * * This function can be called from cpuset code to provide a set of isolated * CPUs that should be excluded from wq_unbound_cpumask. */ int workqueue_unbound_exclude_cpumask(cpumask_var_t exclude_cpumask) { cpumask_var_t cpumask; int ret = 0; if (!zalloc_cpumask_var(&cpumask, GFP_KERNEL)) return -ENOMEM; mutex_lock(&wq_pool_mutex); /* * If the operation fails, it will fall back to * wq_requested_unbound_cpumask which is initially set to * (HK_TYPE_WQ ∩ HK_TYPE_DOMAIN) house keeping mask and rewritten * by any subsequent write to workqueue/cpumask sysfs file. */ if (!cpumask_andnot(cpumask, wq_requested_unbound_cpumask, exclude_cpumask)) cpumask_copy(cpumask, wq_requested_unbound_cpumask); if (!cpumask_equal(cpumask, wq_unbound_cpumask)) ret = workqueue_apply_unbound_cpumask(cpumask); /* Save the current isolated cpumask & export it via sysfs */ if (!ret) cpumask_copy(wq_isolated_cpumask, exclude_cpumask); mutex_unlock(&wq_pool_mutex); free_cpumask_var(cpumask); return ret; } static int parse_affn_scope(const char *val) { int i; for (i = 0; i < ARRAY_SIZE(wq_affn_names); i++) { if (!strncasecmp(val, wq_affn_names[i], strlen(wq_affn_names[i]))) return i; } return -EINVAL; } static int wq_affn_dfl_set(const char *val, const struct kernel_param *kp) { struct workqueue_struct *wq; int affn, cpu; affn = parse_affn_scope(val); if (affn < 0) return affn; if (affn == WQ_AFFN_DFL) return -EINVAL; cpus_read_lock(); mutex_lock(&wq_pool_mutex); wq_affn_dfl = affn; list_for_each_entry(wq, &workqueues, list) { for_each_online_cpu(cpu) unbound_wq_update_pwq(wq, cpu); } mutex_unlock(&wq_pool_mutex); cpus_read_unlock(); return 0; } static int wq_affn_dfl_get(char *buffer, const struct kernel_param *kp) { return scnprintf(buffer, PAGE_SIZE, "%s\n", wq_affn_names[wq_affn_dfl]); } static const struct kernel_param_ops wq_affn_dfl_ops = { .set = wq_affn_dfl_set, .get = wq_affn_dfl_get, }; module_param_cb(default_affinity_scope, &wq_affn_dfl_ops, NULL, 0644); #ifdef CONFIG_SYSFS /* * Workqueues with WQ_SYSFS flag set is visible to userland via * /sys/bus/workqueue/devices/WQ_NAME. All visible workqueues have the * following attributes. * * per_cpu RO bool : whether the workqueue is per-cpu or unbound * max_active RW int : maximum number of in-flight work items * * Unbound workqueues have the following extra attributes. * * nice RW int : nice value of the workers * cpumask RW mask : bitmask of allowed CPUs for the workers * affinity_scope RW str : worker CPU affinity scope (cache, numa, none) * affinity_strict RW bool : worker CPU affinity is strict */ struct wq_device { struct workqueue_struct *wq; struct device dev; }; static struct workqueue_struct *dev_to_wq(struct device *dev) { struct wq_device *wq_dev = container_of(dev, struct wq_device, dev); return wq_dev->wq; } static ssize_t per_cpu_show(struct device *dev, struct device_attribute *attr, char *buf) { struct workqueue_struct *wq = dev_to_wq(dev); return scnprintf(buf, PAGE_SIZE, "%d\n", (bool)!(wq->flags & WQ_UNBOUND)); } static DEVICE_ATTR_RO(per_cpu); static ssize_t max_active_show(struct device *dev, struct device_attribute *attr, char *buf) { struct workqueue_struct *wq = dev_to_wq(dev); return scnprintf(buf, PAGE_SIZE, "%d\n", wq->saved_max_active); } static ssize_t max_active_store(struct device *dev, struct device_attribute *attr, const char *buf, size_t count) { struct workqueue_struct *wq = dev_to_wq(dev); int val; if (sscanf(buf, "%d", &val) != 1 || val <= 0) return -EINVAL; workqueue_set_max_active(wq, val); return count; } static DEVICE_ATTR_RW(max_active); static struct attribute *wq_sysfs_attrs[] = { &dev_attr_per_cpu.attr, &dev_attr_max_active.attr, NULL, }; ATTRIBUTE_GROUPS(wq_sysfs); static ssize_t wq_nice_show(struct device *dev, struct device_attribute *attr, char *buf) { struct workqueue_struct *wq = dev_to_wq(dev); int written; mutex_lock(&wq->mutex); written = scnprintf(buf, PAGE_SIZE, "%d\n", wq->unbound_attrs->nice); mutex_unlock(&wq->mutex); return written; } /* prepare workqueue_attrs for sysfs store operations */ static struct workqueue_attrs *wq_sysfs_prep_attrs(struct workqueue_struct *wq) { struct workqueue_attrs *attrs; lockdep_assert_held(&wq_pool_mutex); attrs = alloc_workqueue_attrs(); if (!attrs) return NULL; copy_workqueue_attrs(attrs, wq->unbound_attrs); return attrs; } static ssize_t wq_nice_store(struct device *dev, struct device_attribute *attr, const char *buf, size_t count) { struct workqueue_struct *wq = dev_to_wq(dev); struct workqueue_attrs *attrs; int ret = -ENOMEM; apply_wqattrs_lock(); attrs = wq_sysfs_prep_attrs(wq); if (!attrs) goto out_unlock; if (sscanf(buf, "%d", &attrs->nice) == 1 && attrs->nice >= MIN_NICE && attrs->nice <= MAX_NICE) ret = apply_workqueue_attrs_locked(wq, attrs); else ret = -EINVAL; out_unlock: apply_wqattrs_unlock(); free_workqueue_attrs(attrs); return ret ?: count; } static ssize_t wq_cpumask_show(struct device *dev, struct device_attribute *attr, char *buf) { struct workqueue_struct *wq = dev_to_wq(dev); int written; mutex_lock(&wq->mutex); written = scnprintf(buf, PAGE_SIZE, "%*pb\n", cpumask_pr_args(wq->unbound_attrs->cpumask)); mutex_unlock(&wq->mutex); return written; } static ssize_t wq_cpumask_store(struct device *dev, struct device_attribute *attr, const char *buf, size_t count) { struct workqueue_struct *wq = dev_to_wq(dev); struct workqueue_attrs *attrs; int ret = -ENOMEM; apply_wqattrs_lock(); attrs = wq_sysfs_prep_attrs(wq); if (!attrs) goto out_unlock; ret = cpumask_parse(buf, attrs->cpumask); if (!ret) ret = apply_workqueue_attrs_locked(wq, attrs); out_unlock: apply_wqattrs_unlock(); free_workqueue_attrs(attrs); return ret ?: count; } static ssize_t wq_affn_scope_show(struct device *dev, struct device_attribute *attr, char *buf) { struct workqueue_struct *wq = dev_to_wq(dev); int written; mutex_lock(&wq->mutex); if (wq->unbound_attrs->affn_scope == WQ_AFFN_DFL) written = scnprintf(buf, PAGE_SIZE, "%s (%s)\n", wq_affn_names[WQ_AFFN_DFL], wq_affn_names[wq_affn_dfl]); else written = scnprintf(buf, PAGE_SIZE, "%s\n", wq_affn_names[wq->unbound_attrs->affn_scope]); mutex_unlock(&wq->mutex); return written; } static ssize_t wq_affn_scope_store(struct device *dev, struct device_attribute *attr, const char *buf, size_t count) { struct workqueue_struct *wq = dev_to_wq(dev); struct workqueue_attrs *attrs; int affn, ret = -ENOMEM; affn = parse_affn_scope(buf); if (affn < 0) return affn; apply_wqattrs_lock(); attrs = wq_sysfs_prep_attrs(wq); if (attrs) { attrs->affn_scope = affn; ret = apply_workqueue_attrs_locked(wq, attrs); } apply_wqattrs_unlock(); free_workqueue_attrs(attrs); return ret ?: count; } static ssize_t wq_affinity_strict_show(struct device *dev, struct device_attribute *attr, char *buf) { struct workqueue_struct *wq = dev_to_wq(dev); return scnprintf(buf, PAGE_SIZE, "%d\n", wq->unbound_attrs->affn_strict); } static ssize_t wq_affinity_strict_store(struct device *dev, struct device_attribute *attr, const char *buf, size_t count) { struct workqueue_struct *wq = dev_to_wq(dev); struct workqueue_attrs *attrs; int v, ret = -ENOMEM; if (sscanf(buf, "%d", &v) != 1) return -EINVAL; apply_wqattrs_lock(); attrs = wq_sysfs_prep_attrs(wq); if (attrs) { attrs->affn_strict = (bool)v; ret = apply_workqueue_attrs_locked(wq, attrs); } apply_wqattrs_unlock(); free_workqueue_attrs(attrs); return ret ?: count; } static struct device_attribute wq_sysfs_unbound_attrs[] = { __ATTR(nice, 0644, wq_nice_show, wq_nice_store), __ATTR(cpumask, 0644, wq_cpumask_show, wq_cpumask_store), __ATTR(affinity_scope, 0644, wq_affn_scope_show, wq_affn_scope_store), __ATTR(affinity_strict, 0644, wq_affinity_strict_show, wq_affinity_strict_store), __ATTR_NULL, }; static const struct bus_type wq_subsys = { .name = "workqueue", .dev_groups = wq_sysfs_groups, }; /** * workqueue_set_unbound_cpumask - Set the low-level unbound cpumask * @cpumask: the cpumask to set * * The low-level workqueues cpumask is a global cpumask that limits * the affinity of all unbound workqueues. This function check the @cpumask * and apply it to all unbound workqueues and updates all pwqs of them. * * Return: 0 - Success * -EINVAL - Invalid @cpumask * -ENOMEM - Failed to allocate memory for attrs or pwqs. */ static int workqueue_set_unbound_cpumask(cpumask_var_t cpumask) { int ret = -EINVAL; /* * Not excluding isolated cpus on purpose. * If the user wishes to include them, we allow that. */ cpumask_and(cpumask, cpumask, cpu_possible_mask); if (!cpumask_empty(cpumask)) { ret = 0; apply_wqattrs_lock(); if (!cpumask_equal(cpumask, wq_unbound_cpumask)) ret = workqueue_apply_unbound_cpumask(cpumask); if (!ret) cpumask_copy(wq_requested_unbound_cpumask, cpumask); apply_wqattrs_unlock(); } return ret; } static ssize_t __wq_cpumask_show(struct device *dev, struct device_attribute *attr, char *buf, cpumask_var_t mask) { int written; mutex_lock(&wq_pool_mutex); written = scnprintf(buf, PAGE_SIZE, "%*pb\n", cpumask_pr_args(mask)); mutex_unlock(&wq_pool_mutex); return written; } static ssize_t cpumask_requested_show(struct device *dev, struct device_attribute *attr, char *buf) { return __wq_cpumask_show(dev, attr, buf, wq_requested_unbound_cpumask); } static DEVICE_ATTR_RO(cpumask_requested); static ssize_t cpumask_isolated_show(struct device *dev, struct device_attribute *attr, char *buf) { return __wq_cpumask_show(dev, attr, buf, wq_isolated_cpumask); } static DEVICE_ATTR_RO(cpumask_isolated); static ssize_t cpumask_show(struct device *dev, struct device_attribute *attr, char *buf) { return __wq_cpumask_show(dev, attr, buf, wq_unbound_cpumask); } static ssize_t cpumask_store(struct device *dev, struct device_attribute *attr, const char *buf, size_t count) { cpumask_var_t cpumask; int ret; if (!zalloc_cpumask_var(&cpumask, GFP_KERNEL)) return -ENOMEM; ret = cpumask_parse(buf, cpumask); if (!ret) ret = workqueue_set_unbound_cpumask(cpumask); free_cpumask_var(cpumask); return ret ? ret : count; } static DEVICE_ATTR_RW(cpumask); static struct attribute *wq_sysfs_cpumask_attrs[] = { &dev_attr_cpumask.attr, &dev_attr_cpumask_requested.attr, &dev_attr_cpumask_isolated.attr, NULL, }; ATTRIBUTE_GROUPS(wq_sysfs_cpumask); static int __init wq_sysfs_init(void) { return subsys_virtual_register(&wq_subsys, wq_sysfs_cpumask_groups); } core_initcall(wq_sysfs_init); static void wq_device_release(struct device *dev) { struct wq_device *wq_dev = container_of(dev, struct wq_device, dev); kfree(wq_dev); } /** * workqueue_sysfs_register - make a workqueue visible in sysfs * @wq: the workqueue to register * * Expose @wq in sysfs under /sys/bus/workqueue/devices. * alloc_workqueue*() automatically calls this function if WQ_SYSFS is set * which is the preferred method. * * Workqueue user should use this function directly iff it wants to apply * workqueue_attrs before making the workqueue visible in sysfs; otherwise, * apply_workqueue_attrs() may race against userland updating the * attributes. * * Return: 0 on success, -errno on failure. */ int workqueue_sysfs_register(struct workqueue_struct *wq) { struct wq_device *wq_dev; int ret; /* * Adjusting max_active breaks ordering guarantee. Disallow exposing * ordered workqueues. */ if (WARN_ON(wq->flags & __WQ_ORDERED)) return -EINVAL; wq->wq_dev = wq_dev = kzalloc(sizeof(*wq_dev), GFP_KERNEL); if (!wq_dev) return -ENOMEM; wq_dev->wq = wq; wq_dev->dev.bus = &wq_subsys; wq_dev->dev.release = wq_device_release; dev_set_name(&wq_dev->dev, "%s", wq->name); /* * unbound_attrs are created separately. Suppress uevent until * everything is ready. */ dev_set_uevent_suppress(&wq_dev->dev, true); ret = device_register(&wq_dev->dev); if (ret) { put_device(&wq_dev->dev); wq->wq_dev = NULL; return ret; } if (wq->flags & WQ_UNBOUND) { struct device_attribute *attr; for (attr = wq_sysfs_unbound_attrs; attr->attr.name; attr++) { ret = device_create_file(&wq_dev->dev, attr); if (ret) { device_unregister(&wq_dev->dev); wq->wq_dev = NULL; return ret; } } } dev_set_uevent_suppress(&wq_dev->dev, false); kobject_uevent(&wq_dev->dev.kobj, KOBJ_ADD); return 0; } /** * workqueue_sysfs_unregister - undo workqueue_sysfs_register() * @wq: the workqueue to unregister * * If @wq is registered to sysfs by workqueue_sysfs_register(), unregister. */ static void workqueue_sysfs_unregister(struct workqueue_struct *wq) { struct wq_device *wq_dev = wq->wq_dev; if (!wq->wq_dev) return; wq->wq_dev = NULL; device_unregister(&wq_dev->dev); } #else /* CONFIG_SYSFS */ static void workqueue_sysfs_unregister(struct workqueue_struct *wq) { } #endif /* CONFIG_SYSFS */ /* * Workqueue watchdog. * * Stall may be caused by various bugs - missing WQ_MEM_RECLAIM, illegal * flush dependency, a concurrency managed work item which stays RUNNING * indefinitely. Workqueue stalls can be very difficult to debug as the * usual warning mechanisms don't trigger and internal workqueue state is * largely opaque. * * Workqueue watchdog monitors all worker pools periodically and dumps * state if some pools failed to make forward progress for a while where * forward progress is defined as the first item on ->worklist changing. * * This mechanism is controlled through the kernel parameter * "workqueue.watchdog_thresh" which can be updated at runtime through the * corresponding sysfs parameter file. */ #ifdef CONFIG_WQ_WATCHDOG static unsigned long wq_watchdog_thresh = 30; static struct timer_list wq_watchdog_timer; static unsigned long wq_watchdog_touched = INITIAL_JIFFIES; static DEFINE_PER_CPU(unsigned long, wq_watchdog_touched_cpu) = INITIAL_JIFFIES; static unsigned int wq_panic_on_stall; module_param_named(panic_on_stall, wq_panic_on_stall, uint, 0644); /* * Show workers that might prevent the processing of pending work items. * The only candidates are CPU-bound workers in the running state. * Pending work items should be handled by another idle worker * in all other situations. */ static void show_cpu_pool_hog(struct worker_pool *pool) { struct worker *worker; unsigned long irq_flags; int bkt; raw_spin_lock_irqsave(&pool->lock, irq_flags); hash_for_each(pool->busy_hash, bkt, worker, hentry) { if (task_is_running(worker->task)) { /* * Defer printing to avoid deadlocks in console * drivers that queue work while holding locks * also taken in their write paths. */ printk_deferred_enter(); pr_info("pool %d:\n", pool->id); sched_show_task(worker->task); printk_deferred_exit(); } } raw_spin_unlock_irqrestore(&pool->lock, irq_flags); } static void show_cpu_pools_hogs(void) { struct worker_pool *pool; int pi; pr_info("Showing backtraces of running workers in stalled CPU-bound worker pools:\n"); rcu_read_lock(); for_each_pool(pool, pi) { if (pool->cpu_stall) show_cpu_pool_hog(pool); } rcu_read_unlock(); } static void panic_on_wq_watchdog(void) { static unsigned int wq_stall; if (wq_panic_on_stall) { wq_stall++; BUG_ON(wq_stall >= wq_panic_on_stall); } } static void wq_watchdog_reset_touched(void) { int cpu; wq_watchdog_touched = jiffies; for_each_possible_cpu(cpu) per_cpu(wq_watchdog_touched_cpu, cpu) = jiffies; } static void wq_watchdog_timer_fn(struct timer_list *unused) { unsigned long thresh = READ_ONCE(wq_watchdog_thresh) * HZ; bool lockup_detected = false; bool cpu_pool_stall = false; unsigned long now = jiffies; struct worker_pool *pool; int pi; if (!thresh) return; rcu_read_lock(); for_each_pool(pool, pi) { unsigned long pool_ts, touched, ts; pool->cpu_stall = false; if (list_empty(&pool->worklist)) continue; /* * If a virtual machine is stopped by the host it can look to * the watchdog like a stall. */ kvm_check_and_clear_guest_paused(); /* get the latest of pool and touched timestamps */ if (pool->cpu >= 0) touched = READ_ONCE(per_cpu(wq_watchdog_touched_cpu, pool->cpu)); else touched = READ_ONCE(wq_watchdog_touched); pool_ts = READ_ONCE(pool->watchdog_ts); if (time_after(pool_ts, touched)) ts = pool_ts; else ts = touched; /* did we stall? */ if (time_after(now, ts + thresh)) { lockup_detected = true; if (pool->cpu >= 0 && !(pool->flags & POOL_BH)) { pool->cpu_stall = true; cpu_pool_stall = true; } pr_emerg("BUG: workqueue lockup - pool"); pr_cont_pool_info(pool); pr_cont(" stuck for %us!\n", jiffies_to_msecs(now - pool_ts) / 1000); } } rcu_read_unlock(); if (lockup_detected) show_all_workqueues(); if (cpu_pool_stall) show_cpu_pools_hogs(); if (lockup_detected) panic_on_wq_watchdog(); wq_watchdog_reset_touched(); mod_timer(&wq_watchdog_timer, jiffies + thresh); } notrace void wq_watchdog_touch(int cpu) { unsigned long thresh = READ_ONCE(wq_watchdog_thresh) * HZ; unsigned long touch_ts = READ_ONCE(wq_watchdog_touched); unsigned long now = jiffies; if (cpu >= 0) per_cpu(wq_watchdog_touched_cpu, cpu) = now; else WARN_ONCE(1, "%s should be called with valid CPU", __func__); /* Don't unnecessarily store to global cacheline */ if (time_after(now, touch_ts + thresh / 4)) WRITE_ONCE(wq_watchdog_touched, jiffies); } static void wq_watchdog_set_thresh(unsigned long thresh) { wq_watchdog_thresh = 0; timer_delete_sync(&wq_watchdog_timer); if (thresh) { wq_watchdog_thresh = thresh; wq_watchdog_reset_touched(); mod_timer(&wq_watchdog_timer, jiffies + thresh * HZ); } } static int wq_watchdog_param_set_thresh(const char *val, const struct kernel_param *kp) { unsigned long thresh; int ret; ret = kstrtoul(val, 0, &thresh); if (ret) return ret; if (system_wq) wq_watchdog_set_thresh(thresh); else wq_watchdog_thresh = thresh; return 0; } static const struct kernel_param_ops wq_watchdog_thresh_ops = { .set = wq_watchdog_param_set_thresh, .get = param_get_ulong, }; module_param_cb(watchdog_thresh, &wq_watchdog_thresh_ops, &wq_watchdog_thresh, 0644); static void wq_watchdog_init(void) { timer_setup(&wq_watchdog_timer, wq_watchdog_timer_fn, TIMER_DEFERRABLE); wq_watchdog_set_thresh(wq_watchdog_thresh); } #else /* CONFIG_WQ_WATCHDOG */ static inline void wq_watchdog_init(void) { } #endif /* CONFIG_WQ_WATCHDOG */ static void bh_pool_kick_normal(struct irq_work *irq_work) { raise_softirq_irqoff(TASKLET_SOFTIRQ); } static void bh_pool_kick_highpri(struct irq_work *irq_work) { raise_softirq_irqoff(HI_SOFTIRQ); } static void __init restrict_unbound_cpumask(const char *name, const struct cpumask *mask) { if (!cpumask_intersects(wq_unbound_cpumask, mask)) { pr_warn("workqueue: Restricting unbound_cpumask (%*pb) with %s (%*pb) leaves no CPU, ignoring\n", cpumask_pr_args(wq_unbound_cpumask), name, cpumask_pr_args(mask)); return; } cpumask_and(wq_unbound_cpumask, wq_unbound_cpumask, mask); } static void __init init_cpu_worker_pool(struct worker_pool *pool, int cpu, int nice) { BUG_ON(init_worker_pool(pool)); pool->cpu = cpu; cpumask_copy(pool->attrs->cpumask, cpumask_of(cpu)); cpumask_copy(pool->attrs->__pod_cpumask, cpumask_of(cpu)); pool->attrs->nice = nice; pool->attrs->affn_strict = true; pool->node = cpu_to_node(cpu); /* alloc pool ID */ mutex_lock(&wq_pool_mutex); BUG_ON(worker_pool_assign_id(pool)); mutex_unlock(&wq_pool_mutex); } /** * workqueue_init_early - early init for workqueue subsystem * * This is the first step of three-staged workqueue subsystem initialization and * invoked as soon as the bare basics - memory allocation, cpumasks and idr are * up. It sets up all the data structures and system workqueues and allows early * boot code to create workqueues and queue/cancel work items. Actual work item * execution starts only after kthreads can be created and scheduled right * before early initcalls. */ void __init workqueue_init_early(void) { struct wq_pod_type *pt = &wq_pod_types[WQ_AFFN_SYSTEM]; int std_nice[NR_STD_WORKER_POOLS] = { 0, HIGHPRI_NICE_LEVEL }; void (*irq_work_fns[2])(struct irq_work *) = { bh_pool_kick_normal, bh_pool_kick_highpri }; int i, cpu; BUILD_BUG_ON(__alignof__(struct pool_workqueue) < __alignof__(long long)); BUG_ON(!alloc_cpumask_var(&wq_online_cpumask, GFP_KERNEL)); BUG_ON(!alloc_cpumask_var(&wq_unbound_cpumask, GFP_KERNEL)); BUG_ON(!alloc_cpumask_var(&wq_requested_unbound_cpumask, GFP_KERNEL)); BUG_ON(!zalloc_cpumask_var(&wq_isolated_cpumask, GFP_KERNEL)); cpumask_copy(wq_online_cpumask, cpu_online_mask); cpumask_copy(wq_unbound_cpumask, cpu_possible_mask); restrict_unbound_cpumask("HK_TYPE_WQ", housekeeping_cpumask(HK_TYPE_WQ)); restrict_unbound_cpumask("HK_TYPE_DOMAIN", housekeeping_cpumask(HK_TYPE_DOMAIN)); if (!cpumask_empty(&wq_cmdline_cpumask)) restrict_unbound_cpumask("workqueue.unbound_cpus", &wq_cmdline_cpumask); cpumask_copy(wq_requested_unbound_cpumask, wq_unbound_cpumask); cpumask_andnot(wq_isolated_cpumask, cpu_possible_mask, housekeeping_cpumask(HK_TYPE_DOMAIN)); pwq_cache = KMEM_CACHE(pool_workqueue, SLAB_PANIC); unbound_wq_update_pwq_attrs_buf = alloc_workqueue_attrs(); BUG_ON(!unbound_wq_update_pwq_attrs_buf); /* * If nohz_full is enabled, set power efficient workqueue as unbound. * This allows workqueue items to be moved to HK CPUs. */ if (housekeeping_enabled(HK_TYPE_TICK)) wq_power_efficient = true; /* initialize WQ_AFFN_SYSTEM pods */ pt->pod_cpus = kcalloc(1, sizeof(pt->pod_cpus[0]), GFP_KERNEL); pt->pod_node = kcalloc(1, sizeof(pt->pod_node[0]), GFP_KERNEL); pt->cpu_pod = kcalloc(nr_cpu_ids, sizeof(pt->cpu_pod[0]), GFP_KERNEL); BUG_ON(!pt->pod_cpus || !pt->pod_node || !pt->cpu_pod); BUG_ON(!zalloc_cpumask_var_node(&pt->pod_cpus[0], GFP_KERNEL, NUMA_NO_NODE)); pt->nr_pods = 1; cpumask_copy(pt->pod_cpus[0], cpu_possible_mask); pt->pod_node[0] = NUMA_NO_NODE; pt->cpu_pod[0] = 0; /* initialize BH and CPU pools */ for_each_possible_cpu(cpu) { struct worker_pool *pool; i = 0; for_each_bh_worker_pool(pool, cpu) { init_cpu_worker_pool(pool, cpu, std_nice[i]); pool->flags |= POOL_BH; init_irq_work(bh_pool_irq_work(pool), irq_work_fns[i]); i++; } i = 0; for_each_cpu_worker_pool(pool, cpu) init_cpu_worker_pool(pool, cpu, std_nice[i++]); } /* create default unbound and ordered wq attrs */ for (i = 0; i < NR_STD_WORKER_POOLS; i++) { struct workqueue_attrs *attrs; BUG_ON(!(attrs = alloc_workqueue_attrs())); attrs->nice = std_nice[i]; unbound_std_wq_attrs[i] = attrs; /* * An ordered wq should have only one pwq as ordering is * guaranteed by max_active which is enforced by pwqs. */ BUG_ON(!(attrs = alloc_workqueue_attrs())); attrs->nice = std_nice[i]; attrs->ordered = true; ordered_wq_attrs[i] = attrs; } system_wq = alloc_workqueue("events", 0, 0); system_percpu_wq = alloc_workqueue("events", 0, 0); system_highpri_wq = alloc_workqueue("events_highpri", WQ_HIGHPRI, 0); system_long_wq = alloc_workqueue("events_long", 0, 0); system_unbound_wq = alloc_workqueue("events_unbound", WQ_UNBOUND, WQ_MAX_ACTIVE); system_dfl_wq = alloc_workqueue("events_unbound", WQ_UNBOUND, WQ_MAX_ACTIVE); system_freezable_wq = alloc_workqueue("events_freezable", WQ_FREEZABLE, 0); system_power_efficient_wq = alloc_workqueue("events_power_efficient", WQ_POWER_EFFICIENT, 0); system_freezable_power_efficient_wq = alloc_workqueue("events_freezable_pwr_efficient", WQ_FREEZABLE | WQ_POWER_EFFICIENT, 0); system_bh_wq = alloc_workqueue("events_bh", WQ_BH, 0); system_bh_highpri_wq = alloc_workqueue("events_bh_highpri", WQ_BH | WQ_HIGHPRI, 0); BUG_ON(!system_wq || !system_percpu_wq|| !system_highpri_wq || !system_long_wq || !system_unbound_wq || !system_freezable_wq || !system_dfl_wq || !system_power_efficient_wq || !system_freezable_power_efficient_wq || !system_bh_wq || !system_bh_highpri_wq); } static void __init wq_cpu_intensive_thresh_init(void) { unsigned long thresh; unsigned long bogo; pwq_release_worker = kthread_run_worker(0, "pool_workqueue_release"); BUG_ON(IS_ERR(pwq_release_worker)); /* if the user set it to a specific value, keep it */ if (wq_cpu_intensive_thresh_us != ULONG_MAX) return; /* * The default of 10ms is derived from the fact that most modern (as of * 2023) processors can do a lot in 10ms and that it's just below what * most consider human-perceivable. However, the kernel also runs on a * lot slower CPUs including microcontrollers where the threshold is way * too low. * * Let's scale up the threshold upto 1 second if BogoMips is below 4000. * This is by no means accurate but it doesn't have to be. The mechanism * is still useful even when the threshold is fully scaled up. Also, as * the reports would usually be applicable to everyone, some machines * operating on longer thresholds won't significantly diminish their * usefulness. */ thresh = 10 * USEC_PER_MSEC; /* see init/calibrate.c for lpj -> BogoMIPS calculation */ bogo = max_t(unsigned long, loops_per_jiffy / 500000 * HZ, 1); if (bogo < 4000) thresh = min_t(unsigned long, thresh * 4000 / bogo, USEC_PER_SEC); pr_debug("wq_cpu_intensive_thresh: lpj=%lu BogoMIPS=%lu thresh_us=%lu\n", loops_per_jiffy, bogo, thresh); wq_cpu_intensive_thresh_us = thresh; } /** * workqueue_init - bring workqueue subsystem fully online * * This is the second step of three-staged workqueue subsystem initialization * and invoked as soon as kthreads can be created and scheduled. Workqueues have * been created and work items queued on them, but there are no kworkers * executing the work items yet. Populate the worker pools with the initial * workers and enable future kworker creations. */ void __init workqueue_init(void) { struct workqueue_struct *wq; struct worker_pool *pool; int cpu, bkt; wq_cpu_intensive_thresh_init(); mutex_lock(&wq_pool_mutex); /* * Per-cpu pools created earlier could be missing node hint. Fix them * up. Also, create a rescuer for workqueues that requested it. */ for_each_possible_cpu(cpu) { for_each_bh_worker_pool(pool, cpu) pool->node = cpu_to_node(cpu); for_each_cpu_worker_pool(pool, cpu) pool->node = cpu_to_node(cpu); } list_for_each_entry(wq, &workqueues, list) { WARN(init_rescuer(wq), "workqueue: failed to create early rescuer for %s", wq->name); } mutex_unlock(&wq_pool_mutex); /* * Create the initial workers. A BH pool has one pseudo worker that * represents the shared BH execution context and thus doesn't get * affected by hotplug events. Create the BH pseudo workers for all * possible CPUs here. */ for_each_possible_cpu(cpu) for_each_bh_worker_pool(pool, cpu) BUG_ON(!create_worker(pool)); for_each_online_cpu(cpu) { for_each_cpu_worker_pool(pool, cpu) { pool->flags &= ~POOL_DISASSOCIATED; BUG_ON(!create_worker(pool)); } } hash_for_each(unbound_pool_hash, bkt, pool, hash_node) BUG_ON(!create_worker(pool)); wq_online = true; wq_watchdog_init(); } /* * Initialize @pt by first initializing @pt->cpu_pod[] with pod IDs according to * @cpu_shares_pod(). Each subset of CPUs that share a pod is assigned a unique * and consecutive pod ID. The rest of @pt is initialized accordingly. */ static void __init init_pod_type(struct wq_pod_type *pt, bool (*cpus_share_pod)(int, int)) { int cur, pre, cpu, pod; pt->nr_pods = 0; /* init @pt->cpu_pod[] according to @cpus_share_pod() */ pt->cpu_pod = kcalloc(nr_cpu_ids, sizeof(pt->cpu_pod[0]), GFP_KERNEL); BUG_ON(!pt->cpu_pod); for_each_possible_cpu(cur) { for_each_possible_cpu(pre) { if (pre >= cur) { pt->cpu_pod[cur] = pt->nr_pods++; break; } if (cpus_share_pod(cur, pre)) { pt->cpu_pod[cur] = pt->cpu_pod[pre]; break; } } } /* init the rest to match @pt->cpu_pod[] */ pt->pod_cpus = kcalloc(pt->nr_pods, sizeof(pt->pod_cpus[0]), GFP_KERNEL); pt->pod_node = kcalloc(pt->nr_pods, sizeof(pt->pod_node[0]), GFP_KERNEL); BUG_ON(!pt->pod_cpus || !pt->pod_node); for (pod = 0; pod < pt->nr_pods; pod++) BUG_ON(!zalloc_cpumask_var(&pt->pod_cpus[pod], GFP_KERNEL)); for_each_possible_cpu(cpu) { cpumask_set_cpu(cpu, pt->pod_cpus[pt->cpu_pod[cpu]]); pt->pod_node[pt->cpu_pod[cpu]] = cpu_to_node(cpu); } } static bool __init cpus_dont_share(int cpu0, int cpu1) { return false; } static bool __init cpus_share_smt(int cpu0, int cpu1) { #ifdef CONFIG_SCHED_SMT return cpumask_test_cpu(cpu0, cpu_smt_mask(cpu1)); #else return false; #endif } static bool __init cpus_share_numa(int cpu0, int cpu1) { return cpu_to_node(cpu0) == cpu_to_node(cpu1); } /** * workqueue_init_topology - initialize CPU pods for unbound workqueues * * This is the third step of three-staged workqueue subsystem initialization and * invoked after SMP and topology information are fully initialized. It * initializes the unbound CPU pods accordingly. */ void __init workqueue_init_topology(void) { struct workqueue_struct *wq; int cpu; init_pod_type(&wq_pod_types[WQ_AFFN_CPU], cpus_dont_share); init_pod_type(&wq_pod_types[WQ_AFFN_SMT], cpus_share_smt); init_pod_type(&wq_pod_types[WQ_AFFN_CACHE], cpus_share_cache); init_pod_type(&wq_pod_types[WQ_AFFN_NUMA], cpus_share_numa); wq_topo_initialized = true; mutex_lock(&wq_pool_mutex); /* * Workqueues allocated earlier would have all CPUs sharing the default * worker pool. Explicitly call unbound_wq_update_pwq() on all workqueue * and CPU combinations to apply per-pod sharing. */ list_for_each_entry(wq, &workqueues, list) { for_each_online_cpu(cpu) unbound_wq_update_pwq(wq, cpu); if (wq->flags & WQ_UNBOUND) { mutex_lock(&wq->mutex); wq_update_node_max_active(wq, -1); mutex_unlock(&wq->mutex); } } mutex_unlock(&wq_pool_mutex); } void __warn_flushing_systemwide_wq(void) { pr_warn("WARNING: Flushing system-wide workqueues will be prohibited in near future.\n"); dump_stack(); } EXPORT_SYMBOL(__warn_flushing_systemwide_wq); static int __init workqueue_unbound_cpus_setup(char *str) { if (cpulist_parse(str, &wq_cmdline_cpumask) < 0) { cpumask_clear(&wq_cmdline_cpumask); pr_warn("workqueue.unbound_cpus: incorrect CPU range, using default\n"); } return 1; } __setup("workqueue.unbound_cpus=", workqueue_unbound_cpus_setup);
211 212 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 1517 1518 1519 1520 1521 1522 1523 1524 1525 1526 1527 1528 1529 1530 1531 1532 1533 1534 1535 1536 1537 1538 1539 1540 1541 1542 1543 1544 1545 1546 1547 1548 1549 1550 1551 1552 1553 1554 1555 1556 1557 1558 1559 1560 1561 1562 1563 1564 1565 1566 1567 1568 1569 1570 1571 1572 1573 1574 1575 1576 1577 1578 1579 1580 1581 1582 1583 1584 1585 1586 1587 1588 1589 1590 1591 1592 1593 1594 1595 1596 1597 1598 1599 1600 1601 1602 1603 1604 1605 1606 1607 1608 1609 1610 1611 1612 1613 1614 1615 1616 1617 1618 1619 1620 1621 1622 1623 1624 1625 1626 1627 1628 1629 1630 1631 1632 1633 1634 1635 1636 1637 1638 1639 1640 1641 1642 1643 1644 1645 1646 1647 1648 1649 1650 1651 1652 1653 1654 1655 1656 1657 1658 1659 1660 1661 1662 1663 1664 1665 1666 1667 1668 1669 1670 1671 1672 1673 1674 1675 1676 1677 1678 1679 1680 1681 1682 1683 1684 1685 1686 1687 1688 1689 1690 1691 1692 1693 1694 1695 1696 1697 1698 1699 1700 1701 1702 1703 1704 1705 1706 1707 1708 1709 1710 1711 1712 1713 1714 1715 1716 1717 1718 1719 1720 1721 1722 1723 1724 1725 1726 1727 1728 1729 1730 1731 1732 1733 1734 1735 1736 1737 1738 1739 1740 1741 1742 1743 1744 1745 1746 1747 1748 1749 1750 1751 1752 1753 1754 1755 1756 1757 1758 1759 1760 1761 1762 1763 1764 1765 1766 1767 1768 1769 1770 1771 1772 1773 1774 1775 1776 1777 1778 1779 1780 1781 1782 1783 1784 1785 1786 1787 1788 1789 1790 1791 1792 1793 1794 1795 1796 1797 1798 1799 1800 1801 1802 1803 1804 1805 1806 1807 1808 1809 1810 1811 1812 1813 1814 1815 1816 1817 1818 1819 1820 1821 1822 1823 1824 1825 1826 1827 1828 1829 1830 1831 1832 1833 1834 1835 1836 1837 1838 1839 1840 1841 1842 1843 1844 1845 1846 1847 1848 1849 1850 1851 1852 1853 1854 1855 1856 1857 1858 1859 1860 1861 1862 1863 1864 1865 1866 1867 1868 1869 1870 1871 1872 1873 1874 1875 1876 1877 1878 1879 1880 1881 1882 1883 1884 1885 1886 1887 1888 1889 1890 1891 1892 1893 1894 1895 1896 1897 1898 1899 1900 1901 1902 1903 1904 1905 1906 1907 1908 1909 1910 1911 1912 1913 1914 1915 1916 1917 1918 1919 1920 1921 1922 1923 1924 1925 1926 1927 1928 1929 1930 1931 1932 1933 1934 1935 1936 1937 1938 1939 1940 1941 1942 1943 1944 1945 1946 1947 1948 1949 1950 1951 1952 1953 1954 1955 1956 1957 1958 1959 1960 1961 1962 1963 1964 1965 1966 1967 1968 1969 1970 1971 1972 1973 1974 1975 1976 1977 1978 1979 1980 1981 1982 1983 1984 1985 1986 1987 1988 1989 1990 1991 1992 1993 1994 1995 1996 1997 1998 1999 2000 2001 2002 2003 2004 2005 2006 2007 2008 2009 2010 2011 2012 2013 2014 2015 2016 2017 2018 2019 2020 2021 2022 2023 2024 2025 2026 2027 2028 2029 2030 2031 2032 2033 2034 2035 2036 2037 2038 2039 2040 2041 2042 2043 2044 2045 2046 2047 2048 2049 2050 2051 2052 2053 2054 2055 2056 2057 2058 2059 2060 2061 2062 2063 2064 2065 2066 2067 2068 2069 2070 2071 2072 2073 2074 2075 2076 2077 2078 2079 2080 2081 2082 2083 2084 2085 2086 2087 2088 2089 2090 2091 2092 2093 2094 2095 2096 2097 2098 2099 2100 2101 2102 2103 2104 2105 2106 2107 2108 2109 2110 2111 2112 2113 2114 2115 2116 2117 2118 2119 2120 2121 2122 2123 2124 2125 2126 2127 2128 2129 2130 2131 2132 2133 2134 2135 2136 2137 2138 2139 2140 2141 2142 2143 2144 2145 2146 2147 2148 2149 2150 2151 2152 2153 2154 2155 2156 2157 2158 2159 2160 2161 2162 2163 2164 2165 2166 2167 2168 2169 2170 2171 2172 2173 2174 2175 2176 2177 2178 2179 2180 2181 2182 2183 2184 2185 2186 2187 2188 2189 2190 2191 2192 2193 2194 2195 2196 2197 2198 2199 2200 2201 2202 2203 2204 2205 2206 2207 2208 2209 2210 2211 2212 2213 2214 2215 2216 2217 2218 2219 2220 2221 2222 2223 2224 2225 2226 2227 2228 2229 2230 2231 2232 2233 2234 2235 2236 2237 2238 2239 2240 2241 2242 2243 2244 2245 2246 2247 2248 2249 2250 2251 2252 2253 2254 2255 2256 2257 2258 2259 2260 2261 2262 2263 2264 2265 2266 2267 2268 2269 2270 2271 2272 2273 2274 2275 2276 2277 2278 2279 2280 2281 2282 2283 2284 2285 2286 2287 2288 2289 2290 2291 2292 2293 2294 2295 2296 2297 2298 2299 2300 2301 2302 2303 2304 2305 2306 2307 2308 2309 2310 2311 2312 2313 2314 2315 2316 2317 2318 2319 2320 2321 2322 2323 2324 2325 2326 2327 2328 2329 2330 2331 2332 2333 2334 2335 2336 2337 2338 2339 2340 2341 2342 2343 2344 2345 2346 2347 2348 2349 2350 2351 2352 2353 2354 2355 2356 2357 2358 2359 2360 2361 2362 2363 2364 2365 2366 2367 2368 2369 2370 2371 2372 2373 2374 2375 2376 2377 2378 2379 2380 2381 2382 2383 2384 2385 2386 2387 2388 2389 2390 2391 2392 2393 2394 2395 2396 2397 2398 2399 2400 2401 2402 2403 2404 2405 2406 2407 2408 2409 2410 2411 2412 2413 2414 2415 2416 2417 2418 2419 2420 2421 2422 2423 2424 2425 2426 2427 2428 2429 2430 2431 2432 2433 2434 2435 2436 2437 2438 2439 2440 2441 2442 2443 2444 2445 2446 2447 2448 2449 2450 2451 2452 2453 2454 2455 2456 2457 2458 2459 2460 2461 2462 2463 2464 2465 2466 2467 2468 2469 2470 2471 2472 2473 2474 2475 2476 2477 2478 2479 2480 2481 2482 2483 2484 2485 2486 2487 2488 2489 2490 2491 2492 2493 2494 2495 2496 2497 2498 2499 2500 2501 2502 2503 2504 2505 2506 2507 2508 2509 2510 2511 2512 2513 2514 2515 2516 2517 2518 2519 2520 2521 2522 2523 2524 2525 2526 2527 2528 2529 2530 2531 2532 2533 2534 2535 2536 2537 2538 2539 2540 2541 2542 2543 2544 2545 2546 2547 2548 2549 2550 2551 2552 2553 2554 2555 2556 2557 2558 2559 2560 2561 2562 2563 2564 2565 2566 2567 2568 2569 2570 2571 2572 2573 2574 2575 2576 2577 2578 2579 2580 2581 2582 2583 2584 2585 2586 2587 2588 2589 2590 2591 2592 2593 2594 2595 2596 2597 2598 2599 2600 2601 2602 2603 2604 2605 2606 2607 2608 2609 2610 2611 2612 2613 2614 2615 2616 2617 2618 2619 2620 2621 2622 2623 2624 2625 2626 2627 2628 2629 2630 2631 2632 2633 2634 2635 2636 2637 2638 2639 2640 2641 2642 2643 2644 2645 2646 2647 2648 2649 2650 2651 2652 2653 2654 2655 2656 2657 2658 2659 2660 2661 2662 2663 2664 2665 2666 2667 2668 2669 2670 2671 2672 2673 2674 2675 2676 2677 2678 2679 2680 2681 2682 2683 2684 2685 2686 2687 2688 2689 2690 2691 2692 2693 2694 2695 2696 2697 2698 2699 2700 2701 2702 2703 2704 2705 2706 2707 2708 2709 2710 2711 2712 2713 2714 2715 2716 2717 2718 2719 2720 2721 2722 2723 2724 2725 2726 2727 2728 2729 2730 2731 2732 2733 2734 2735 2736 2737 2738 2739 2740 2741 2742 2743 2744 2745 2746 2747 2748 2749 2750 2751 2752 2753 2754 2755 2756 2757 2758 2759 2760 2761 2762 2763 2764 2765 2766 2767 2768 2769 2770 2771 2772 2773 2774 2775 2776 2777 2778 2779 2780 2781 2782 2783 2784 2785 2786 2787 2788 2789 2790 2791 2792 2793 2794 2795 2796 2797 2798 2799 2800 2801 2802 2803 2804 2805 2806 2807 2808 2809 2810 2811 2812 2813 2814 2815 2816 2817 2818 2819 2820 2821 2822 2823 2824 2825 2826 2827 2828 2829 2830 2831 2832 2833 2834 2835 2836 2837 2838 2839 2840 2841 2842 2843 2844 2845 2846 2847 2848 2849 2850 2851 2852 2853 2854 2855 2856 2857 2858 2859 2860 2861 2862 2863 2864 2865 2866 2867 2868 2869 2870 2871 2872 2873 2874 2875 2876 2877 2878 2879 2880 2881 2882 2883 2884 2885 2886 2887 2888 2889 2890 2891 2892 2893 2894 2895 2896 2897 2898 2899 2900 2901 2902 2903 2904 2905 2906 2907 2908 2909 2910 2911 2912 2913 2914 2915 2916 2917 2918 2919 2920 2921 2922 2923 2924 2925 2926 2927 2928 2929 2930 2931 2932 2933 2934 2935 2936 2937 2938 2939 2940 2941 2942 2943 2944 2945 2946 2947 2948 2949 2950 2951 2952 2953 2954 2955 2956 2957 2958 2959 2960 2961 2962 2963 2964 2965 2966 2967 2968 2969 2970 2971 2972 2973 2974 2975 2976 2977 2978 2979 2980 2981 2982 2983 2984 2985 2986 2987 2988 2989 2990 2991 2992 2993 2994 2995 2996 2997 2998 2999 3000 3001 3002 3003 3004 3005 3006 3007 3008 3009 3010 3011 3012 3013 3014 3015 3016 3017 3018 3019 3020 3021 3022 3023 3024 3025 3026 3027 3028 3029 3030 3031 3032 3033 3034 3035 3036 3037 3038 3039 3040 3041 3042 3043 3044 3045 3046 3047 3048 3049 3050 3051 3052 3053 3054 3055 3056 3057 3058 3059 3060 3061 3062 3063 3064 3065 3066 3067 3068 3069 3070 3071 3072 3073 3074 3075 3076 3077 3078 3079 3080 3081 3082 3083 3084 3085 3086 3087 3088 3089 3090 3091 3092 3093 3094 3095 3096 3097 3098 3099 3100 3101 3102 3103 3104 3105 3106 3107 3108 3109 3110 3111 3112 3113 3114 3115 3116 3117 3118 3119 3120 3121 3122 3123 3124 3125 3126 3127 3128 3129 3130 3131 3132 3133 3134 3135 3136 3137 3138 3139 3140 3141 3142 3143 3144 3145 3146 3147 3148 3149 3150 3151 3152 3153 3154 3155 3156 3157 3158 3159 3160 3161 3162 3163 3164 3165 3166 3167 3168 3169 3170 3171 3172 3173 3174 3175 3176 3177 3178 3179 3180 3181 3182 3183 3184 3185 3186 3187 3188 3189 3190 3191 3192 3193 3194 3195 3196 3197 3198 3199 3200 3201 3202 3203 3204 3205 3206 3207 3208 3209 3210 3211 3212 3213 3214 3215 3216 3217 3218 3219 3220 3221 3222 3223 3224 3225 3226 3227 3228 3229 3230 3231 3232 3233 3234 3235 3236 3237 3238 3239 3240 3241 3242 3243 3244 3245 3246 3247 3248 3249 3250 3251 3252 3253 3254 3255 3256 3257 3258 3259 3260 3261 3262 3263 3264 3265 3266 3267 3268 3269 3270 3271 3272 3273 3274 3275 3276 3277 3278 3279 3280 3281 3282 3283 3284 3285 3286 3287 3288 3289 3290 3291 3292 3293 3294 3295 3296 3297 3298 3299 3300 3301 3302 3303 3304 3305 3306 3307 3308 3309 3310 3311 3312 3313 3314 3315 3316 3317 3318 3319 3320 3321 3322 3323 3324 3325 3326 3327 3328 3329 3330 3331 3332 3333 3334 3335 3336 3337 3338 3339 3340 3341 3342 3343 3344 3345 3346 3347 3348 3349 3350 3351 3352 3353 3354 3355 3356 3357 3358 3359 3360 3361 3362 3363 3364 3365 3366 3367 3368 3369 3370 3371 3372 3373 3374 3375 3376 3377 3378 3379 3380 3381 3382 3383 3384 3385 3386 3387 3388 3389 3390 3391 3392 3393 3394 3395 3396 3397 3398 3399 3400 3401 3402 3403 3404 3405 3406 3407 3408 3409 3410 3411 3412 3413 3414 3415 3416 3417 3418 3419 3420 3421 3422 3423 3424 3425 3426 3427 3428 3429 3430 3431 3432 3433 3434 3435 3436 3437 3438 3439 3440 3441 3442 3443 3444 3445 3446 3447 3448 3449 3450 3451 3452 3453 3454 3455 3456 3457 3458 3459 3460 3461 3462 3463 3464 3465 3466 3467 3468 3469 3470 3471 3472 3473 3474 3475 3476 3477 3478 3479 3480 3481 3482 3483 3484 3485 3486 3487 3488 3489 3490 3491 3492 3493 3494 3495 3496 3497 3498 3499 3500 3501 3502 3503 3504 3505 3506 3507 3508 3509 3510 3511 3512 3513 3514 3515 3516 3517 3518 3519 3520 3521 3522 3523 3524 3525 3526 3527 3528 3529 3530 3531 3532 3533 3534 3535 3536 3537 3538 3539 3540 3541 3542 3543 3544 3545 3546 3547 3548 3549 3550 3551 3552 3553 3554 3555 3556 3557 3558 3559 3560 3561 3562 3563 3564 3565 3566 3567 3568 3569 3570 3571 3572 3573 3574 3575 3576 3577 3578 3579 3580 3581 3582 3583 3584 3585 3586 3587 3588 3589 3590 3591 3592 3593 3594 3595 3596 3597 3598 3599 3600 3601 3602 3603 3604 3605 3606 3607 3608 3609 3610 3611 3612 3613 3614 3615 3616 3617 3618 3619 3620 3621 3622 3623 3624 3625 3626 3627 3628 3629 3630 3631 3632 3633 3634 3635 3636 3637 3638 3639 3640 3641 3642 3643 3644 3645 3646 3647 3648 3649 3650 3651 3652 3653 3654 3655 3656 3657 3658 3659 3660 3661 3662 3663 3664 3665 3666 3667 3668 3669 3670 3671 3672 3673 3674 3675 3676 3677 3678 3679 3680 3681 3682 3683 3684 3685 3686 3687 3688 3689 3690 3691 3692 3693 3694 3695 3696 3697 3698 3699 3700 3701 3702 3703 3704 3705 3706 3707 3708 3709 3710 3711 3712 3713 3714 3715 3716 3717 3718 3719 3720 3721 3722 3723 3724 3725 3726 3727 3728 3729 3730 3731 3732 3733 3734 3735 3736 3737 3738 3739 3740 3741 3742 3743 3744 3745 3746 3747 3748 3749 3750 3751 3752 3753 3754 3755 3756 3757 3758 3759 3760 3761 3762 3763 3764 3765 3766 3767 3768 3769 3770 3771 3772 3773 3774 3775 3776 3777 3778 3779 3780 3781 3782 3783 3784 3785 3786 3787 3788 3789 3790 3791 3792 3793 3794 3795 3796 3797 3798 3799 3800 3801 3802 3803 3804 3805 3806 3807 3808 3809 3810 3811 3812 3813 3814 3815 3816 3817 3818 3819 3820 3821 3822 3823 3824 3825 3826 3827 3828 3829 3830 3831 3832 3833 3834 3835 3836 3837 3838 3839 3840 3841 3842 3843 3844 3845 3846 3847 3848 3849 3850 3851 3852 3853 3854 3855 3856 3857 3858 3859 3860 3861 3862 3863 3864 3865 3866 3867 3868 3869 3870 3871 3872 3873 3874 3875 3876 3877 3878 3879 3880 3881 3882 3883 3884 // SPDX-License-Identifier: GPL-2.0-only /* * Memory merging support. * * This code enables dynamic sharing of identical pages found in different * memory areas, even if they are not shared by fork() * * Copyright (C) 2008-2009 Red Hat, Inc. * Authors: * Izik Eidus * Andrea Arcangeli * Chris Wright * Hugh Dickins */ #include <linux/errno.h> #include <linux/mm.h> #include <linux/mm_inline.h> #include <linux/fs.h> #include <linux/mman.h> #include <linux/sched.h> #include <linux/sched/mm.h> #include <linux/sched/cputime.h> #include <linux/rwsem.h> #include <linux/pagemap.h> #include <linux/rmap.h> #include <linux/spinlock.h> #include <linux/xxhash.h> #include <linux/delay.h> #include <linux/kthread.h> #include <linux/wait.h> #include <linux/slab.h> #include <linux/rbtree.h> #include <linux/memory.h> #include <linux/mmu_notifier.h> #include <linux/swap.h> #include <linux/ksm.h> #include <linux/hashtable.h> #include <linux/freezer.h> #include <linux/oom.h> #include <linux/numa.h> #include <linux/pagewalk.h> #include <asm/tlbflush.h> #include "internal.h" #include "mm_slot.h" #define CREATE_TRACE_POINTS #include <trace/events/ksm.h> #ifdef CONFIG_NUMA #define NUMA(x) (x) #define DO_NUMA(x) do { (x); } while (0) #else #define NUMA(x) (0) #define DO_NUMA(x) do { } while (0) #endif typedef u8 rmap_age_t; /** * DOC: Overview * * A few notes about the KSM scanning process, * to make it easier to understand the data structures below: * * In order to reduce excessive scanning, KSM sorts the memory pages by their * contents into a data structure that holds pointers to the pages' locations. * * Since the contents of the pages may change at any moment, KSM cannot just * insert the pages into a normal sorted tree and expect it to find anything. * Therefore KSM uses two data structures - the stable and the unstable tree. * * The stable tree holds pointers to all the merged pages (ksm pages), sorted * by their contents. Because each such page is write-protected, searching on * this tree is fully assured to be working (except when pages are unmapped), * and therefore this tree is called the stable tree. * * The stable tree node includes information required for reverse * mapping from a KSM page to virtual addresses that map this page. * * In order to avoid large latencies of the rmap walks on KSM pages, * KSM maintains two types of nodes in the stable tree: * * * the regular nodes that keep the reverse mapping structures in a * linked list * * the "chains" that link nodes ("dups") that represent the same * write protected memory content, but each "dup" corresponds to a * different KSM page copy of that content * * Internally, the regular nodes, "dups" and "chains" are represented * using the same struct ksm_stable_node structure. * * In addition to the stable tree, KSM uses a second data structure called the * unstable tree: this tree holds pointers to pages which have been found to * be "unchanged for a period of time". The unstable tree sorts these pages * by their contents, but since they are not write-protected, KSM cannot rely * upon the unstable tree to work correctly - the unstable tree is liable to * be corrupted as its contents are modified, and so it is called unstable. * * KSM solves this problem by several techniques: * * 1) The unstable tree is flushed every time KSM completes scanning all * memory areas, and then the tree is rebuilt again from the beginning. * 2) KSM will only insert into the unstable tree, pages whose hash value * has not changed since the previous scan of all memory areas. * 3) The unstable tree is a RedBlack Tree - so its balancing is based on the * colors of the nodes and not on their contents, assuring that even when * the tree gets "corrupted" it won't get out of balance, so scanning time * remains the same (also, searching and inserting nodes in an rbtree uses * the same algorithm, so we have no overhead when we flush and rebuild). * 4) KSM never flushes the stable tree, which means that even if it were to * take 10 attempts to find a page in the unstable tree, once it is found, * it is secured in the stable tree. (When we scan a new page, we first * compare it against the stable tree, and then against the unstable tree.) * * If the merge_across_nodes tunable is unset, then KSM maintains multiple * stable trees and multiple unstable trees: one of each for each NUMA node. */ /** * struct ksm_mm_slot - ksm information per mm that is being scanned * @slot: hash lookup from mm to mm_slot * @rmap_list: head for this mm_slot's singly-linked list of rmap_items */ struct ksm_mm_slot { struct mm_slot slot; struct ksm_rmap_item *rmap_list; }; /** * struct ksm_scan - cursor for scanning * @mm_slot: the current mm_slot we are scanning * @address: the next address inside that to be scanned * @rmap_list: link to the next rmap to be scanned in the rmap_list * @seqnr: count of completed full scans (needed when removing unstable node) * * There is only the one ksm_scan instance of this cursor structure. */ struct ksm_scan { struct ksm_mm_slot *mm_slot; unsigned long address; struct ksm_rmap_item **rmap_list; unsigned long seqnr; }; /** * struct ksm_stable_node - node of the stable rbtree * @node: rb node of this ksm page in the stable tree * @head: (overlaying parent) &migrate_nodes indicates temporarily on that list * @hlist_dup: linked into the stable_node->hlist with a stable_node chain * @list: linked into migrate_nodes, pending placement in the proper node tree * @hlist: hlist head of rmap_items using this ksm page * @kpfn: page frame number of this ksm page (perhaps temporarily on wrong nid) * @chain_prune_time: time of the last full garbage collection * @rmap_hlist_len: number of rmap_item entries in hlist or STABLE_NODE_CHAIN * @nid: NUMA node id of stable tree in which linked (may not match kpfn) */ struct ksm_stable_node { union { struct rb_node node; /* when node of stable tree */ struct { /* when listed for migration */ struct list_head *head; struct { struct hlist_node hlist_dup; struct list_head list; }; }; }; struct hlist_head hlist; union { unsigned long kpfn; unsigned long chain_prune_time; }; /* * STABLE_NODE_CHAIN can be any negative number in * rmap_hlist_len negative range, but better not -1 to be able * to reliably detect underflows. */ #define STABLE_NODE_CHAIN -1024 int rmap_hlist_len; #ifdef CONFIG_NUMA int nid; #endif }; /** * struct ksm_rmap_item - reverse mapping item for virtual addresses * @rmap_list: next rmap_item in mm_slot's singly-linked rmap_list * @anon_vma: pointer to anon_vma for this mm,address, when in stable tree * @nid: NUMA node id of unstable tree in which linked (may not match page) * @mm: the memory structure this rmap_item is pointing into * @address: the virtual address this rmap_item tracks (+ flags in low bits) * @oldchecksum: previous checksum of the page at that virtual address * @node: rb node of this rmap_item in the unstable tree * @head: pointer to stable_node heading this list in the stable tree * @hlist: link into hlist of rmap_items hanging off that stable_node * @age: number of scan iterations since creation * @remaining_skips: how many scans to skip */ struct ksm_rmap_item { struct ksm_rmap_item *rmap_list; union { struct anon_vma *anon_vma; /* when stable */ #ifdef CONFIG_NUMA int nid; /* when node of unstable tree */ #endif }; struct mm_struct *mm; unsigned long address; /* + low bits used for flags below */ unsigned int oldchecksum; /* when unstable */ rmap_age_t age; rmap_age_t remaining_skips; union { struct rb_node node; /* when node of unstable tree */ struct { /* when listed from stable tree */ struct ksm_stable_node *head; struct hlist_node hlist; }; }; }; #define SEQNR_MASK 0x0ff /* low bits of unstable tree seqnr */ #define UNSTABLE_FLAG 0x100 /* is a node of the unstable tree */ #define STABLE_FLAG 0x200 /* is listed from the stable tree */ /* The stable and unstable tree heads */ static struct rb_root one_stable_tree[1] = { RB_ROOT }; static struct rb_root one_unstable_tree[1] = { RB_ROOT }; static struct rb_root *root_stable_tree = one_stable_tree; static struct rb_root *root_unstable_tree = one_unstable_tree; /* Recently migrated nodes of stable tree, pending proper placement */ static LIST_HEAD(migrate_nodes); #define STABLE_NODE_DUP_HEAD ((struct list_head *)&migrate_nodes.prev) #define MM_SLOTS_HASH_BITS 10 static DEFINE_HASHTABLE(mm_slots_hash, MM_SLOTS_HASH_BITS); static struct ksm_mm_slot ksm_mm_head = { .slot.mm_node = LIST_HEAD_INIT(ksm_mm_head.slot.mm_node), }; static struct ksm_scan ksm_scan = { .mm_slot = &ksm_mm_head, }; static struct kmem_cache *rmap_item_cache; static struct kmem_cache *stable_node_cache; static struct kmem_cache *mm_slot_cache; /* Default number of pages to scan per batch */ #define DEFAULT_PAGES_TO_SCAN 100 /* The number of pages scanned */ static unsigned long ksm_pages_scanned; /* The number of nodes in the stable tree */ static unsigned long ksm_pages_shared; /* The number of page slots additionally sharing those nodes */ static unsigned long ksm_pages_sharing; /* The number of nodes in the unstable tree */ static unsigned long ksm_pages_unshared; /* The number of rmap_items in use: to calculate pages_volatile */ static unsigned long ksm_rmap_items; /* The number of stable_node chains */ static unsigned long ksm_stable_node_chains; /* The number of stable_node dups linked to the stable_node chains */ static unsigned long ksm_stable_node_dups; /* Delay in pruning stale stable_node_dups in the stable_node_chains */ static unsigned int ksm_stable_node_chains_prune_millisecs = 2000; /* Maximum number of page slots sharing a stable node */ static int ksm_max_page_sharing = 256; /* Number of pages ksmd should scan in one batch */ static unsigned int ksm_thread_pages_to_scan = DEFAULT_PAGES_TO_SCAN; /* Milliseconds ksmd should sleep between batches */ static unsigned int ksm_thread_sleep_millisecs = 20; /* Checksum of an empty (zeroed) page */ static unsigned int zero_checksum __read_mostly; /* Whether to merge empty (zeroed) pages with actual zero pages */ static bool ksm_use_zero_pages __read_mostly; /* Skip pages that couldn't be de-duplicated previously */ /* Default to true at least temporarily, for testing */ static bool ksm_smart_scan = true; /* The number of zero pages which is placed by KSM */ atomic_long_t ksm_zero_pages = ATOMIC_LONG_INIT(0); /* The number of pages that have been skipped due to "smart scanning" */ static unsigned long ksm_pages_skipped; /* Don't scan more than max pages per batch. */ static unsigned long ksm_advisor_max_pages_to_scan = 30000; /* Min CPU for scanning pages per scan */ #define KSM_ADVISOR_MIN_CPU 10 /* Max CPU for scanning pages per scan */ static unsigned int ksm_advisor_max_cpu = 70; /* Target scan time in seconds to analyze all KSM candidate pages. */ static unsigned long ksm_advisor_target_scan_time = 200; /* Exponentially weighted moving average. */ #define EWMA_WEIGHT 30 /** * struct advisor_ctx - metadata for KSM advisor * @start_scan: start time of the current scan * @scan_time: scan time of previous scan * @change: change in percent to pages_to_scan parameter * @cpu_time: cpu time consumed by the ksmd thread in the previous scan */ struct advisor_ctx { ktime_t start_scan; unsigned long scan_time; unsigned long change; unsigned long long cpu_time; }; static struct advisor_ctx advisor_ctx; /* Define different advisor's */ enum ksm_advisor_type { KSM_ADVISOR_NONE, KSM_ADVISOR_SCAN_TIME, }; static enum ksm_advisor_type ksm_advisor; #ifdef CONFIG_SYSFS /* * Only called through the sysfs control interface: */ /* At least scan this many pages per batch. */ static unsigned long ksm_advisor_min_pages_to_scan = 500; static void set_advisor_defaults(void) { if (ksm_advisor == KSM_ADVISOR_NONE) { ksm_thread_pages_to_scan = DEFAULT_PAGES_TO_SCAN; } else if (ksm_advisor == KSM_ADVISOR_SCAN_TIME) { advisor_ctx = (const struct advisor_ctx){ 0 }; ksm_thread_pages_to_scan = ksm_advisor_min_pages_to_scan; } } #endif /* CONFIG_SYSFS */ static inline void advisor_start_scan(void) { if (ksm_advisor == KSM_ADVISOR_SCAN_TIME) advisor_ctx.start_scan = ktime_get(); } /* * Use previous scan time if available, otherwise use current scan time as an * approximation for the previous scan time. */ static inline unsigned long prev_scan_time(struct advisor_ctx *ctx, unsigned long scan_time) { return ctx->scan_time ? ctx->scan_time : scan_time; } /* Calculate exponential weighted moving average */ static unsigned long ewma(unsigned long prev, unsigned long curr) { return ((100 - EWMA_WEIGHT) * prev + EWMA_WEIGHT * curr) / 100; } /* * The scan time advisor is based on the current scan rate and the target * scan rate. * * new_pages_to_scan = pages_to_scan * (scan_time / target_scan_time) * * To avoid perturbations it calculates a change factor of previous changes. * A new change factor is calculated for each iteration and it uses an * exponentially weighted moving average. The new pages_to_scan value is * multiplied with that change factor: * * new_pages_to_scan *= change facor * * The new_pages_to_scan value is limited by the cpu min and max values. It * calculates the cpu percent for the last scan and calculates the new * estimated cpu percent cost for the next scan. That value is capped by the * cpu min and max setting. * * In addition the new pages_to_scan value is capped by the max and min * limits. */ static void scan_time_advisor(void) { unsigned int cpu_percent; unsigned long cpu_time; unsigned long cpu_time_diff; unsigned long cpu_time_diff_ms; unsigned long pages; unsigned long per_page_cost; unsigned long factor; unsigned long change; unsigned long last_scan_time; unsigned long scan_time; /* Convert scan time to seconds */ scan_time = div_s64(ktime_ms_delta(ktime_get(), advisor_ctx.start_scan), MSEC_PER_SEC); scan_time = scan_time ? scan_time : 1; /* Calculate CPU consumption of ksmd background thread */ cpu_time = task_sched_runtime(current); cpu_time_diff = cpu_time - advisor_ctx.cpu_time; cpu_time_diff_ms = cpu_time_diff / 1000 / 1000; cpu_percent = (cpu_time_diff_ms * 100) / (scan_time * 1000); cpu_percent = cpu_percent ? cpu_percent : 1; last_scan_time = prev_scan_time(&advisor_ctx, scan_time); /* Calculate scan time as percentage of target scan time */ factor = ksm_advisor_target_scan_time * 100 / scan_time; factor = factor ? factor : 1; /* * Calculate scan time as percentage of last scan time and use * exponentially weighted average to smooth it */ change = scan_time * 100 / last_scan_time; change = change ? change : 1; change = ewma(advisor_ctx.change, change); /* Calculate new scan rate based on target scan rate. */ pages = ksm_thread_pages_to_scan * 100 / factor; /* Update pages_to_scan by weighted change percentage. */ pages = pages * change / 100; /* Cap new pages_to_scan value */ per_page_cost = ksm_thread_pages_to_scan / cpu_percent; per_page_cost = per_page_cost ? per_page_cost : 1; pages = min(pages, per_page_cost * ksm_advisor_max_cpu); pages = max(pages, per_page_cost * KSM_ADVISOR_MIN_CPU); pages = min(pages, ksm_advisor_max_pages_to_scan); /* Update advisor context */ advisor_ctx.change = change; advisor_ctx.scan_time = scan_time; advisor_ctx.cpu_time = cpu_time; ksm_thread_pages_to_scan = pages; trace_ksm_advisor(scan_time, pages, cpu_percent); } static void advisor_stop_scan(void) { if (ksm_advisor == KSM_ADVISOR_SCAN_TIME) scan_time_advisor(); } #ifdef CONFIG_NUMA /* Zeroed when merging across nodes is not allowed */ static unsigned int ksm_merge_across_nodes = 1; static int ksm_nr_node_ids = 1; #else #define ksm_merge_across_nodes 1U #define ksm_nr_node_ids 1 #endif #define KSM_RUN_STOP 0 #define KSM_RUN_MERGE 1 #define KSM_RUN_UNMERGE 2 #define KSM_RUN_OFFLINE 4 static unsigned long ksm_run = KSM_RUN_STOP; static void wait_while_offlining(void); static DECLARE_WAIT_QUEUE_HEAD(ksm_thread_wait); static DECLARE_WAIT_QUEUE_HEAD(ksm_iter_wait); static DEFINE_MUTEX(ksm_thread_mutex); static DEFINE_SPINLOCK(ksm_mmlist_lock); static int __init ksm_slab_init(void) { rmap_item_cache = KMEM_CACHE(ksm_rmap_item, 0); if (!rmap_item_cache) goto out; stable_node_cache = KMEM_CACHE(ksm_stable_node, 0); if (!stable_node_cache) goto out_free1; mm_slot_cache = KMEM_CACHE(ksm_mm_slot, 0); if (!mm_slot_cache) goto out_free2; return 0; out_free2: kmem_cache_destroy(stable_node_cache); out_free1: kmem_cache_destroy(rmap_item_cache); out: return -ENOMEM; } static void __init ksm_slab_free(void) { kmem_cache_destroy(mm_slot_cache); kmem_cache_destroy(stable_node_cache); kmem_cache_destroy(rmap_item_cache); mm_slot_cache = NULL; } static __always_inline bool is_stable_node_chain(struct ksm_stable_node *chain) { return chain->rmap_hlist_len == STABLE_NODE_CHAIN; } static __always_inline bool is_stable_node_dup(struct ksm_stable_node *dup) { return dup->head == STABLE_NODE_DUP_HEAD; } static inline void stable_node_chain_add_dup(struct ksm_stable_node *dup, struct ksm_stable_node *chain) { VM_BUG_ON(is_stable_node_dup(dup)); dup->head = STABLE_NODE_DUP_HEAD; VM_BUG_ON(!is_stable_node_chain(chain)); hlist_add_head(&dup->hlist_dup, &chain->hlist); ksm_stable_node_dups++; } static inline void __stable_node_dup_del(struct ksm_stable_node *dup) { VM_BUG_ON(!is_stable_node_dup(dup)); hlist_del(&dup->hlist_dup); ksm_stable_node_dups--; } static inline void stable_node_dup_del(struct ksm_stable_node *dup) { VM_BUG_ON(is_stable_node_chain(dup)); if (is_stable_node_dup(dup)) __stable_node_dup_del(dup); else rb_erase(&dup->node, root_stable_tree + NUMA(dup->nid)); #ifdef CONFIG_DEBUG_VM dup->head = NULL; #endif } static inline struct ksm_rmap_item *alloc_rmap_item(void) { struct ksm_rmap_item *rmap_item; rmap_item = kmem_cache_zalloc(rmap_item_cache, GFP_KERNEL | __GFP_NORETRY | __GFP_NOWARN); if (rmap_item) ksm_rmap_items++; return rmap_item; } static inline void free_rmap_item(struct ksm_rmap_item *rmap_item) { ksm_rmap_items--; rmap_item->mm->ksm_rmap_items--; rmap_item->mm = NULL; /* debug safety */ kmem_cache_free(rmap_item_cache, rmap_item); } static inline struct ksm_stable_node *alloc_stable_node(void) { /* * The allocation can take too long with GFP_KERNEL when memory is under * pressure, which may lead to hung task warnings. Adding __GFP_HIGH * grants access to memory reserves, helping to avoid this problem. */ return kmem_cache_alloc(stable_node_cache, GFP_KERNEL | __GFP_HIGH); } static inline void free_stable_node(struct ksm_stable_node *stable_node) { VM_BUG_ON(stable_node->rmap_hlist_len && !is_stable_node_chain(stable_node)); kmem_cache_free(stable_node_cache, stable_node); } /* * ksmd, and unmerge_and_remove_all_rmap_items(), must not touch an mm's * page tables after it has passed through ksm_exit() - which, if necessary, * takes mmap_lock briefly to serialize against them. ksm_exit() does not set * a special flag: they can just back out as soon as mm_users goes to zero. * ksm_test_exit() is used throughout to make this test for exit: in some * places for correctness, in some places just to avoid unnecessary work. */ static inline bool ksm_test_exit(struct mm_struct *mm) { return atomic_read(&mm->mm_users) == 0; } /* * We use break_ksm to break COW on a ksm page by triggering unsharing, * such that the ksm page will get replaced by an exclusive anonymous page. * * We take great care only to touch a ksm page, in a VM_MERGEABLE vma, * in case the application has unmapped and remapped mm,addr meanwhile. * Could a ksm page appear anywhere else? Actually yes, in a VM_PFNMAP * mmap of /dev/mem, where we would not want to touch it. * * FAULT_FLAG_REMOTE/FOLL_REMOTE are because we do this outside the context * of the process that owns 'vma'. We also do not want to enforce * protection keys here anyway. */ static int break_ksm(struct vm_area_struct *vma, unsigned long addr, bool lock_vma) { vm_fault_t ret = 0; if (lock_vma) vma_start_write(vma); do { bool ksm_page = false; struct folio_walk fw; struct folio *folio; cond_resched(); folio = folio_walk_start(&fw, vma, addr, FW_MIGRATION | FW_ZEROPAGE); if (folio) { /* Small folio implies FW_LEVEL_PTE. */ if (!folio_test_large(folio) && (folio_test_ksm(folio) || is_ksm_zero_pte(fw.pte))) ksm_page = true; folio_walk_end(&fw, vma); } if (!ksm_page) return 0; ret = handle_mm_fault(vma, addr, FAULT_FLAG_UNSHARE | FAULT_FLAG_REMOTE, NULL); } while (!(ret & (VM_FAULT_SIGBUS | VM_FAULT_SIGSEGV | VM_FAULT_OOM))); /* * We must loop until we no longer find a KSM page because * handle_mm_fault() may back out if there's any difficulty e.g. if * pte accessed bit gets updated concurrently. * * VM_FAULT_SIGBUS could occur if we race with truncation of the * backing file, which also invalidates anonymous pages: that's * okay, that truncation will have unmapped the KSM page for us. * * VM_FAULT_OOM: at the time of writing (late July 2009), setting * aside mem_cgroup limits, VM_FAULT_OOM would only be set if the * current task has TIF_MEMDIE set, and will be OOM killed on return * to user; and ksmd, having no mm, would never be chosen for that. * * But if the mm is in a limited mem_cgroup, then the fault may fail * with VM_FAULT_OOM even if the current task is not TIF_MEMDIE; and * even ksmd can fail in this way - though it's usually breaking ksm * just to undo a merge it made a moment before, so unlikely to oom. * * That's a pity: we might therefore have more kernel pages allocated * than we're counting as nodes in the stable tree; but ksm_do_scan * will retry to break_cow on each pass, so should recover the page * in due course. The important thing is to not let VM_MERGEABLE * be cleared while any such pages might remain in the area. */ return (ret & VM_FAULT_OOM) ? -ENOMEM : 0; } static bool ksm_compatible(const struct file *file, vm_flags_t vm_flags) { if (vm_flags & (VM_SHARED | VM_MAYSHARE | VM_SPECIAL | VM_HUGETLB | VM_DROPPABLE)) return false; /* just ignore the advice */ if (file_is_dax(file)) return false; #ifdef VM_SAO if (vm_flags & VM_SAO) return false; #endif #ifdef VM_SPARC_ADI if (vm_flags & VM_SPARC_ADI) return false; #endif return true; } static bool vma_ksm_compatible(struct vm_area_struct *vma) { return ksm_compatible(vma->vm_file, vma->vm_flags); } static struct vm_area_struct *find_mergeable_vma(struct mm_struct *mm, unsigned long addr) { struct vm_area_struct *vma; if (ksm_test_exit(mm)) return NULL; vma = vma_lookup(mm, addr); if (!vma || !(vma->vm_flags & VM_MERGEABLE) || !vma->anon_vma) return NULL; return vma; } static void break_cow(struct ksm_rmap_item *rmap_item) { struct mm_struct *mm = rmap_item->mm; unsigned long addr = rmap_item->address; struct vm_area_struct *vma; /* * It is not an accident that whenever we want to break COW * to undo, we also need to drop a reference to the anon_vma. */ put_anon_vma(rmap_item->anon_vma); mmap_read_lock(mm); vma = find_mergeable_vma(mm, addr); if (vma) break_ksm(vma, addr, false); mmap_read_unlock(mm); } static struct page *get_mergeable_page(struct ksm_rmap_item *rmap_item) { struct mm_struct *mm = rmap_item->mm; unsigned long addr = rmap_item->address; struct vm_area_struct *vma; struct page *page = NULL; struct folio_walk fw; struct folio *folio; mmap_read_lock(mm); vma = find_mergeable_vma(mm, addr); if (!vma) goto out; folio = folio_walk_start(&fw, vma, addr, 0); if (folio) { if (!folio_is_zone_device(folio) && folio_test_anon(folio)) { folio_get(folio); page = fw.page; } folio_walk_end(&fw, vma); } out: if (page) { flush_anon_page(vma, page, addr); flush_dcache_page(page); } mmap_read_unlock(mm); return page; } /* * This helper is used for getting right index into array of tree roots. * When merge_across_nodes knob is set to 1, there are only two rb-trees for * stable and unstable pages from all nodes with roots in index 0. Otherwise, * every node has its own stable and unstable tree. */ static inline int get_kpfn_nid(unsigned long kpfn) { return ksm_merge_across_nodes ? 0 : NUMA(pfn_to_nid(kpfn)); } static struct ksm_stable_node *alloc_stable_node_chain(struct ksm_stable_node *dup, struct rb_root *root) { struct ksm_stable_node *chain = alloc_stable_node(); VM_BUG_ON(is_stable_node_chain(dup)); if (likely(chain)) { INIT_HLIST_HEAD(&chain->hlist); chain->chain_prune_time = jiffies; chain->rmap_hlist_len = STABLE_NODE_CHAIN; #if defined (CONFIG_DEBUG_VM) && defined(CONFIG_NUMA) chain->nid = NUMA_NO_NODE; /* debug */ #endif ksm_stable_node_chains++; /* * Put the stable node chain in the first dimension of * the stable tree and at the same time remove the old * stable node. */ rb_replace_node(&dup->node, &chain->node, root); /* * Move the old stable node to the second dimension * queued in the hlist_dup. The invariant is that all * dup stable_nodes in the chain->hlist point to pages * that are write protected and have the exact same * content. */ stable_node_chain_add_dup(dup, chain); } return chain; } static inline void free_stable_node_chain(struct ksm_stable_node *chain, struct rb_root *root) { rb_erase(&chain->node, root); free_stable_node(chain); ksm_stable_node_chains--; } static void remove_node_from_stable_tree(struct ksm_stable_node *stable_node) { struct ksm_rmap_item *rmap_item; /* check it's not STABLE_NODE_CHAIN or negative */ BUG_ON(stable_node->rmap_hlist_len < 0); hlist_for_each_entry(rmap_item, &stable_node->hlist, hlist) { if (rmap_item->hlist.next) { ksm_pages_sharing--; trace_ksm_remove_rmap_item(stable_node->kpfn, rmap_item, rmap_item->mm); } else { ksm_pages_shared--; } rmap_item->mm->ksm_merging_pages--; VM_BUG_ON(stable_node->rmap_hlist_len <= 0); stable_node->rmap_hlist_len--; put_anon_vma(rmap_item->anon_vma); rmap_item->address &= PAGE_MASK; cond_resched(); } /* * We need the second aligned pointer of the migrate_nodes * list_head to stay clear from the rb_parent_color union * (aligned and different than any node) and also different * from &migrate_nodes. This will verify that future list.h changes * don't break STABLE_NODE_DUP_HEAD. Only recent gcc can handle it. */ BUILD_BUG_ON(STABLE_NODE_DUP_HEAD <= &migrate_nodes); BUILD_BUG_ON(STABLE_NODE_DUP_HEAD >= &migrate_nodes + 1); trace_ksm_remove_ksm_page(stable_node->kpfn); if (stable_node->head == &migrate_nodes) list_del(&stable_node->list); else stable_node_dup_del(stable_node); free_stable_node(stable_node); } enum ksm_get_folio_flags { KSM_GET_FOLIO_NOLOCK, KSM_GET_FOLIO_LOCK, KSM_GET_FOLIO_TRYLOCK }; /* * ksm_get_folio: checks if the page indicated by the stable node * is still its ksm page, despite having held no reference to it. * In which case we can trust the content of the page, and it * returns the gotten page; but if the page has now been zapped, * remove the stale node from the stable tree and return NULL. * But beware, the stable node's page might be being migrated. * * You would expect the stable_node to hold a reference to the ksm page. * But if it increments the page's count, swapping out has to wait for * ksmd to come around again before it can free the page, which may take * seconds or even minutes: much too unresponsive. So instead we use a * "keyhole reference": access to the ksm page from the stable node peeps * out through its keyhole to see if that page still holds the right key, * pointing back to this stable node. This relies on freeing a PageAnon * page to reset its page->mapping to NULL, and relies on no other use of * a page to put something that might look like our key in page->mapping. * is on its way to being freed; but it is an anomaly to bear in mind. */ static struct folio *ksm_get_folio(struct ksm_stable_node *stable_node, enum ksm_get_folio_flags flags) { struct folio *folio; void *expected_mapping; unsigned long kpfn; expected_mapping = (void *)((unsigned long)stable_node | FOLIO_MAPPING_KSM); again: kpfn = READ_ONCE(stable_node->kpfn); /* Address dependency. */ folio = pfn_folio(kpfn); if (READ_ONCE(folio->mapping) != expected_mapping) goto stale; /* * We cannot do anything with the page while its refcount is 0. * Usually 0 means free, or tail of a higher-order page: in which * case this node is no longer referenced, and should be freed; * however, it might mean that the page is under page_ref_freeze(). * The __remove_mapping() case is easy, again the node is now stale; * the same is in reuse_ksm_page() case; but if page is swapcache * in folio_migrate_mapping(), it might still be our page, * in which case it's essential to keep the node. */ while (!folio_try_get(folio)) { /* * Another check for folio->mapping != expected_mapping * would work here too. We have chosen to test the * swapcache flag to optimize the common case, when the * folio is or is about to be freed: the swapcache flag * is cleared (under spin_lock_irq) in the ref_freeze * section of __remove_mapping(); but anon folio->mapping * is reset to NULL later, in free_pages_prepare(). */ if (!folio_test_swapcache(folio)) goto stale; cpu_relax(); } if (READ_ONCE(folio->mapping) != expected_mapping) { folio_put(folio); goto stale; } if (flags == KSM_GET_FOLIO_TRYLOCK) { if (!folio_trylock(folio)) { folio_put(folio); return ERR_PTR(-EBUSY); } } else if (flags == KSM_GET_FOLIO_LOCK) folio_lock(folio); if (flags != KSM_GET_FOLIO_NOLOCK) { if (READ_ONCE(folio->mapping) != expected_mapping) { folio_unlock(folio); folio_put(folio); goto stale; } } return folio; stale: /* * We come here from above when folio->mapping or the swapcache flag * suggests that the node is stale; but it might be under migration. * We need smp_rmb(), matching the smp_wmb() in folio_migrate_ksm(), * before checking whether node->kpfn has been changed. */ smp_rmb(); if (READ_ONCE(stable_node->kpfn) != kpfn) goto again; remove_node_from_stable_tree(stable_node); return NULL; } /* * Removing rmap_item from stable or unstable tree. * This function will clean the information from the stable/unstable tree. */ static void remove_rmap_item_from_tree(struct ksm_rmap_item *rmap_item) { if (rmap_item->address & STABLE_FLAG) { struct ksm_stable_node *stable_node; struct folio *folio; stable_node = rmap_item->head; folio = ksm_get_folio(stable_node, KSM_GET_FOLIO_LOCK); if (!folio) goto out; hlist_del(&rmap_item->hlist); folio_unlock(folio); folio_put(folio); if (!hlist_empty(&stable_node->hlist)) ksm_pages_sharing--; else ksm_pages_shared--; rmap_item->mm->ksm_merging_pages--; VM_BUG_ON(stable_node->rmap_hlist_len <= 0); stable_node->rmap_hlist_len--; put_anon_vma(rmap_item->anon_vma); rmap_item->head = NULL; rmap_item->address &= PAGE_MASK; } else if (rmap_item->address & UNSTABLE_FLAG) { unsigned char age; /* * Usually ksmd can and must skip the rb_erase, because * root_unstable_tree was already reset to RB_ROOT. * But be careful when an mm is exiting: do the rb_erase * if this rmap_item was inserted by this scan, rather * than left over from before. */ age = (unsigned char)(ksm_scan.seqnr - rmap_item->address); BUG_ON(age > 1); if (!age) rb_erase(&rmap_item->node, root_unstable_tree + NUMA(rmap_item->nid)); ksm_pages_unshared--; rmap_item->address &= PAGE_MASK; } out: cond_resched(); /* we're called from many long loops */ } static void remove_trailing_rmap_items(struct ksm_rmap_item **rmap_list) { while (*rmap_list) { struct ksm_rmap_item *rmap_item = *rmap_list; *rmap_list = rmap_item->rmap_list; remove_rmap_item_from_tree(rmap_item); free_rmap_item(rmap_item); } } /* * Though it's very tempting to unmerge rmap_items from stable tree rather * than check every pte of a given vma, the locking doesn't quite work for * that - an rmap_item is assigned to the stable tree after inserting ksm * page and upping mmap_lock. Nor does it fit with the way we skip dup'ing * rmap_items from parent to child at fork time (so as not to waste time * if exit comes before the next scan reaches it). * * Similarly, although we'd like to remove rmap_items (so updating counts * and freeing memory) when unmerging an area, it's easier to leave that * to the next pass of ksmd - consider, for example, how ksmd might be * in cmp_and_merge_page on one of the rmap_items we would be removing. */ static int unmerge_ksm_pages(struct vm_area_struct *vma, unsigned long start, unsigned long end, bool lock_vma) { unsigned long addr; int err = 0; for (addr = start; addr < end && !err; addr += PAGE_SIZE) { if (ksm_test_exit(vma->vm_mm)) break; if (signal_pending(current)) err = -ERESTARTSYS; else err = break_ksm(vma, addr, lock_vma); } return err; } static inline struct ksm_stable_node *folio_stable_node(const struct folio *folio) { return folio_test_ksm(folio) ? folio_raw_mapping(folio) : NULL; } static inline struct ksm_stable_node *page_stable_node(struct page *page) { return folio_stable_node(page_folio(page)); } static inline void folio_set_stable_node(struct folio *folio, struct ksm_stable_node *stable_node) { VM_WARN_ON_FOLIO(folio_test_anon(folio) && PageAnonExclusive(&folio->page), folio); folio->mapping = (void *)((unsigned long)stable_node | FOLIO_MAPPING_KSM); } #ifdef CONFIG_SYSFS /* * Only called through the sysfs control interface: */ static int remove_stable_node(struct ksm_stable_node *stable_node) { struct folio *folio; int err; folio = ksm_get_folio(stable_node, KSM_GET_FOLIO_LOCK); if (!folio) { /* * ksm_get_folio did remove_node_from_stable_tree itself. */ return 0; } /* * Page could be still mapped if this races with __mmput() running in * between ksm_exit() and exit_mmap(). Just refuse to let * merge_across_nodes/max_page_sharing be switched. */ err = -EBUSY; if (!folio_mapped(folio)) { /* * The stable node did not yet appear stale to ksm_get_folio(), * since that allows for an unmapped ksm folio to be recognized * right up until it is freed; but the node is safe to remove. * This folio might be in an LRU cache waiting to be freed, * or it might be in the swapcache (perhaps under writeback), * or it might have been removed from swapcache a moment ago. */ folio_set_stable_node(folio, NULL); remove_node_from_stable_tree(stable_node); err = 0; } folio_unlock(folio); folio_put(folio); return err; } static int remove_stable_node_chain(struct ksm_stable_node *stable_node, struct rb_root *root) { struct ksm_stable_node *dup; struct hlist_node *hlist_safe; if (!is_stable_node_chain(stable_node)) { VM_BUG_ON(is_stable_node_dup(stable_node)); if (remove_stable_node(stable_node)) return true; else return false; } hlist_for_each_entry_safe(dup, hlist_safe, &stable_node->hlist, hlist_dup) { VM_BUG_ON(!is_stable_node_dup(dup)); if (remove_stable_node(dup)) return true; } BUG_ON(!hlist_empty(&stable_node->hlist)); free_stable_node_chain(stable_node, root); return false; } static int remove_all_stable_nodes(void) { struct ksm_stable_node *stable_node, *next; int nid; int err = 0; for (nid = 0; nid < ksm_nr_node_ids; nid++) { while (root_stable_tree[nid].rb_node) { stable_node = rb_entry(root_stable_tree[nid].rb_node, struct ksm_stable_node, node); if (remove_stable_node_chain(stable_node, root_stable_tree + nid)) { err = -EBUSY; break; /* proceed to next nid */ } cond_resched(); } } list_for_each_entry_safe(stable_node, next, &migrate_nodes, list) { if (remove_stable_node(stable_node)) err = -EBUSY; cond_resched(); } return err; } static int unmerge_and_remove_all_rmap_items(void) { struct ksm_mm_slot *mm_slot; struct mm_slot *slot; struct mm_struct *mm; struct vm_area_struct *vma; int err = 0; spin_lock(&ksm_mmlist_lock); slot = list_entry(ksm_mm_head.slot.mm_node.next, struct mm_slot, mm_node); ksm_scan.mm_slot = mm_slot_entry(slot, struct ksm_mm_slot, slot); spin_unlock(&ksm_mmlist_lock); for (mm_slot = ksm_scan.mm_slot; mm_slot != &ksm_mm_head; mm_slot = ksm_scan.mm_slot) { VMA_ITERATOR(vmi, mm_slot->slot.mm, 0); mm = mm_slot->slot.mm; mmap_read_lock(mm); /* * Exit right away if mm is exiting to avoid lockdep issue in * the maple tree */ if (ksm_test_exit(mm)) goto mm_exiting; for_each_vma(vmi, vma) { if (!(vma->vm_flags & VM_MERGEABLE) || !vma->anon_vma) continue; err = unmerge_ksm_pages(vma, vma->vm_start, vma->vm_end, false); if (err) goto error; } mm_exiting: remove_trailing_rmap_items(&mm_slot->rmap_list); mmap_read_unlock(mm); spin_lock(&ksm_mmlist_lock); slot = list_entry(mm_slot->slot.mm_node.next, struct mm_slot, mm_node); ksm_scan.mm_slot = mm_slot_entry(slot, struct ksm_mm_slot, slot); if (ksm_test_exit(mm)) { hash_del(&mm_slot->slot.hash); list_del(&mm_slot->slot.mm_node); spin_unlock(&ksm_mmlist_lock); mm_slot_free(mm_slot_cache, mm_slot); clear_bit(MMF_VM_MERGEABLE, &mm->flags); clear_bit(MMF_VM_MERGE_ANY, &mm->flags); mmdrop(mm); } else spin_unlock(&ksm_mmlist_lock); } /* Clean up stable nodes, but don't worry if some are still busy */ remove_all_stable_nodes(); ksm_scan.seqnr = 0; return 0; error: mmap_read_unlock(mm); spin_lock(&ksm_mmlist_lock); ksm_scan.mm_slot = &ksm_mm_head; spin_unlock(&ksm_mmlist_lock); return err; } #endif /* CONFIG_SYSFS */ static u32 calc_checksum(struct page *page) { u32 checksum; void *addr = kmap_local_page(page); checksum = xxhash(addr, PAGE_SIZE, 0); kunmap_local(addr); return checksum; } static int write_protect_page(struct vm_area_struct *vma, struct folio *folio, pte_t *orig_pte) { struct mm_struct *mm = vma->vm_mm; DEFINE_FOLIO_VMA_WALK(pvmw, folio, vma, 0, 0); int swapped; int err = -EFAULT; struct mmu_notifier_range range; bool anon_exclusive; pte_t entry; if (WARN_ON_ONCE(folio_test_large(folio))) return err; pvmw.address = page_address_in_vma(folio, folio_page(folio, 0), vma); if (pvmw.address == -EFAULT) goto out; mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, mm, pvmw.address, pvmw.address + PAGE_SIZE); mmu_notifier_invalidate_range_start(&range); if (!page_vma_mapped_walk(&pvmw)) goto out_mn; if (WARN_ONCE(!pvmw.pte, "Unexpected PMD mapping?")) goto out_unlock; entry = ptep_get(pvmw.pte); /* * Handle PFN swap PTEs, such as device-exclusive ones, that actually * map pages: give up just like the next folio_walk would. */ if (unlikely(!pte_present(entry))) goto out_unlock; anon_exclusive = PageAnonExclusive(&folio->page); if (pte_write(entry) || pte_dirty(entry) || anon_exclusive || mm_tlb_flush_pending(mm)) { swapped = folio_test_swapcache(folio); flush_cache_page(vma, pvmw.address, folio_pfn(folio)); /* * Ok this is tricky, when get_user_pages_fast() run it doesn't * take any lock, therefore the check that we are going to make * with the pagecount against the mapcount is racy and * O_DIRECT can happen right after the check. * So we clear the pte and flush the tlb before the check * this assure us that no O_DIRECT can happen after the check * or in the middle of the check. * * No need to notify as we are downgrading page table to read * only not changing it to point to a new page. * * See Documentation/mm/mmu_notifier.rst */ entry = ptep_clear_flush(vma, pvmw.address, pvmw.pte); /* * Check that no O_DIRECT or similar I/O is in progress on the * page */ if (folio_mapcount(folio) + 1 + swapped != folio_ref_count(folio)) { set_pte_at(mm, pvmw.address, pvmw.pte, entry); goto out_unlock; } /* See folio_try_share_anon_rmap_pte(): clear PTE first. */ if (anon_exclusive && folio_try_share_anon_rmap_pte(folio, &folio->page)) { set_pte_at(mm, pvmw.address, pvmw.pte, entry); goto out_unlock; } if (pte_dirty(entry)) folio_mark_dirty(folio); entry = pte_mkclean(entry); if (pte_write(entry)) entry = pte_wrprotect(entry); set_pte_at(mm, pvmw.address, pvmw.pte, entry); } *orig_pte = entry; err = 0; out_unlock: page_vma_mapped_walk_done(&pvmw); out_mn: mmu_notifier_invalidate_range_end(&range); out: return err; } /** * replace_page - replace page in vma by new ksm page * @vma: vma that holds the pte pointing to page * @page: the page we are replacing by kpage * @kpage: the ksm page we replace page by * @orig_pte: the original value of the pte * * Returns 0 on success, -EFAULT on failure. */ static int replace_page(struct vm_area_struct *vma, struct page *page, struct page *kpage, pte_t orig_pte) { struct folio *kfolio = page_folio(kpage); struct mm_struct *mm = vma->vm_mm; struct folio *folio = page_folio(page); pmd_t *pmd; pmd_t pmde; pte_t *ptep; pte_t newpte; spinlock_t *ptl; unsigned long addr; int err = -EFAULT; struct mmu_notifier_range range; addr = page_address_in_vma(folio, page, vma); if (addr == -EFAULT) goto out; pmd = mm_find_pmd(mm, addr); if (!pmd) goto out; /* * Some THP functions use the sequence pmdp_huge_clear_flush(), set_pmd_at() * without holding anon_vma lock for write. So when looking for a * genuine pmde (in which to find pte), test present and !THP together. */ pmde = pmdp_get_lockless(pmd); if (!pmd_present(pmde) || pmd_trans_huge(pmde)) goto out; mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, mm, addr, addr + PAGE_SIZE); mmu_notifier_invalidate_range_start(&range); ptep = pte_offset_map_lock(mm, pmd, addr, &ptl); if (!ptep) goto out_mn; if (!pte_same(ptep_get(ptep), orig_pte)) { pte_unmap_unlock(ptep, ptl); goto out_mn; } VM_BUG_ON_PAGE(PageAnonExclusive(page), page); VM_BUG_ON_FOLIO(folio_test_anon(kfolio) && PageAnonExclusive(kpage), kfolio); /* * No need to check ksm_use_zero_pages here: we can only have a * zero_page here if ksm_use_zero_pages was enabled already. */ if (!is_zero_pfn(page_to_pfn(kpage))) { folio_get(kfolio); folio_add_anon_rmap_pte(kfolio, kpage, vma, addr, RMAP_NONE); newpte = mk_pte(kpage, vma->vm_page_prot); } else { /* * Use pte_mkdirty to mark the zero page mapped by KSM, and then * we can easily track all KSM-placed zero pages by checking if * the dirty bit in zero page's PTE is set. */ newpte = pte_mkdirty(pte_mkspecial(pfn_pte(page_to_pfn(kpage), vma->vm_page_prot))); ksm_map_zero_page(mm); /* * We're replacing an anonymous page with a zero page, which is * not anonymous. We need to do proper accounting otherwise we * will get wrong values in /proc, and a BUG message in dmesg * when tearing down the mm. */ dec_mm_counter(mm, MM_ANONPAGES); } flush_cache_page(vma, addr, pte_pfn(ptep_get(ptep))); /* * No need to notify as we are replacing a read only page with another * read only page with the same content. * * See Documentation/mm/mmu_notifier.rst */ ptep_clear_flush(vma, addr, ptep); set_pte_at(mm, addr, ptep, newpte); folio_remove_rmap_pte(folio, page, vma); if (!folio_mapped(folio)) folio_free_swap(folio); folio_put(folio); pte_unmap_unlock(ptep, ptl); err = 0; out_mn: mmu_notifier_invalidate_range_end(&range); out: return err; } /* * try_to_merge_one_page - take two pages and merge them into one * @vma: the vma that holds the pte pointing to page * @page: the PageAnon page that we want to replace with kpage * @kpage: the KSM page that we want to map instead of page, * or NULL the first time when we want to use page as kpage. * * This function returns 0 if the pages were merged, -EFAULT otherwise. */ static int try_to_merge_one_page(struct vm_area_struct *vma, struct page *page, struct page *kpage) { struct folio *folio = page_folio(page); pte_t orig_pte = __pte(0); int err = -EFAULT; if (page == kpage) /* ksm page forked */ return 0; if (!folio_test_anon(folio)) goto out; /* * We need the folio lock to read a stable swapcache flag in * write_protect_page(). We trylock because we don't want to wait * here - we prefer to continue scanning and merging different * pages, then come back to this page when it is unlocked. */ if (!folio_trylock(folio)) goto out; if (folio_test_large(folio)) { if (split_huge_page(page)) goto out_unlock; folio = page_folio(page); } /* * If this anonymous page is mapped only here, its pte may need * to be write-protected. If it's mapped elsewhere, all of its * ptes are necessarily already write-protected. But in either * case, we need to lock and check page_count is not raised. */ if (write_protect_page(vma, folio, &orig_pte) == 0) { if (!kpage) { /* * While we hold folio lock, upgrade folio from * anon to a NULL stable_node with the KSM flag set: * stable_tree_insert() will update stable_node. */ folio_set_stable_node(folio, NULL); folio_mark_accessed(folio); /* * Page reclaim just frees a clean folio with no dirty * ptes: make sure that the ksm page would be swapped. */ if (!folio_test_dirty(folio)) folio_mark_dirty(folio); err = 0; } else if (pages_identical(page, kpage)) err = replace_page(vma, page, kpage, orig_pte); } out_unlock: folio_unlock(folio); out: return err; } /* * This function returns 0 if the pages were merged or if they are * no longer merging candidates (e.g., VMA stale), -EFAULT otherwise. */ static int try_to_merge_with_zero_page(struct ksm_rmap_item *rmap_item, struct page *page) { struct mm_struct *mm = rmap_item->mm; int err = -EFAULT; /* * Same checksum as an empty page. We attempt to merge it with the * appropriate zero page if the user enabled this via sysfs. */ if (ksm_use_zero_pages && (rmap_item->oldchecksum == zero_checksum)) { struct vm_area_struct *vma; mmap_read_lock(mm); vma = find_mergeable_vma(mm, rmap_item->address); if (vma) { err = try_to_merge_one_page(vma, page, ZERO_PAGE(rmap_item->address)); trace_ksm_merge_one_page( page_to_pfn(ZERO_PAGE(rmap_item->address)), rmap_item, mm, err); } else { /* * If the vma is out of date, we do not need to * continue. */ err = 0; } mmap_read_unlock(mm); } return err; } /* * try_to_merge_with_ksm_page - like try_to_merge_two_pages, * but no new kernel page is allocated: kpage must already be a ksm page. * * This function returns 0 if the pages were merged, -EFAULT otherwise. */ static int try_to_merge_with_ksm_page(struct ksm_rmap_item *rmap_item, struct page *page, struct page *kpage) { struct mm_struct *mm = rmap_item->mm; struct vm_area_struct *vma; int err = -EFAULT; mmap_read_lock(mm); vma = find_mergeable_vma(mm, rmap_item->address); if (!vma) goto out; err = try_to_merge_one_page(vma, page, kpage); if (err) goto out; /* Unstable nid is in union with stable anon_vma: remove first */ remove_rmap_item_from_tree(rmap_item); /* Must get reference to anon_vma while still holding mmap_lock */ rmap_item->anon_vma = vma->anon_vma; get_anon_vma(vma->anon_vma); out: mmap_read_unlock(mm); trace_ksm_merge_with_ksm_page(kpage, page_to_pfn(kpage ? kpage : page), rmap_item, mm, err); return err; } /* * try_to_merge_two_pages - take two identical pages and prepare them * to be merged into one page. * * This function returns the kpage if we successfully merged two identical * pages into one ksm page, NULL otherwise. * * Note that this function upgrades page to ksm page: if one of the pages * is already a ksm page, try_to_merge_with_ksm_page should be used. */ static struct folio *try_to_merge_two_pages(struct ksm_rmap_item *rmap_item, struct page *page, struct ksm_rmap_item *tree_rmap_item, struct page *tree_page) { int err; err = try_to_merge_with_ksm_page(rmap_item, page, NULL); if (!err) { err = try_to_merge_with_ksm_page(tree_rmap_item, tree_page, page); /* * If that fails, we have a ksm page with only one pte * pointing to it: so break it. */ if (err) break_cow(rmap_item); } return err ? NULL : page_folio(page); } static __always_inline bool __is_page_sharing_candidate(struct ksm_stable_node *stable_node, int offset) { VM_BUG_ON(stable_node->rmap_hlist_len < 0); /* * Check that at least one mapping still exists, otherwise * there's no much point to merge and share with this * stable_node, as the underlying tree_page of the other * sharer is going to be freed soon. */ return stable_node->rmap_hlist_len && stable_node->rmap_hlist_len + offset < ksm_max_page_sharing; } static __always_inline bool is_page_sharing_candidate(struct ksm_stable_node *stable_node) { return __is_page_sharing_candidate(stable_node, 0); } static struct folio *stable_node_dup(struct ksm_stable_node **_stable_node_dup, struct ksm_stable_node **_stable_node, struct rb_root *root, bool prune_stale_stable_nodes) { struct ksm_stable_node *dup, *found = NULL, *stable_node = *_stable_node; struct hlist_node *hlist_safe; struct folio *folio, *tree_folio = NULL; int found_rmap_hlist_len; if (!prune_stale_stable_nodes || time_before(jiffies, stable_node->chain_prune_time + msecs_to_jiffies( ksm_stable_node_chains_prune_millisecs))) prune_stale_stable_nodes = false; else stable_node->chain_prune_time = jiffies; hlist_for_each_entry_safe(dup, hlist_safe, &stable_node->hlist, hlist_dup) { cond_resched(); /* * We must walk all stable_node_dup to prune the stale * stable nodes during lookup. * * ksm_get_folio can drop the nodes from the * stable_node->hlist if they point to freed pages * (that's why we do a _safe walk). The "dup" * stable_node parameter itself will be freed from * under us if it returns NULL. */ folio = ksm_get_folio(dup, KSM_GET_FOLIO_NOLOCK); if (!folio) continue; /* Pick the best candidate if possible. */ if (!found || (is_page_sharing_candidate(dup) && (!is_page_sharing_candidate(found) || dup->rmap_hlist_len > found_rmap_hlist_len))) { if (found) folio_put(tree_folio); found = dup; found_rmap_hlist_len = found->rmap_hlist_len; tree_folio = folio; /* skip put_page for found candidate */ if (!prune_stale_stable_nodes && is_page_sharing_candidate(found)) break; continue; } folio_put(folio); } if (found) { if (hlist_is_singular_node(&found->hlist_dup, &stable_node->hlist)) { /* * If there's not just one entry it would * corrupt memory, better BUG_ON. In KSM * context with no lock held it's not even * fatal. */ BUG_ON(stable_node->hlist.first->next); /* * There's just one entry and it is below the * deduplication limit so drop the chain. */ rb_replace_node(&stable_node->node, &found->node, root); free_stable_node(stable_node); ksm_stable_node_chains--; ksm_stable_node_dups--; /* * NOTE: the caller depends on the stable_node * to be equal to stable_node_dup if the chain * was collapsed. */ *_stable_node = found; /* * Just for robustness, as stable_node is * otherwise left as a stable pointer, the * compiler shall optimize it away at build * time. */ stable_node = NULL; } else if (stable_node->hlist.first != &found->hlist_dup && __is_page_sharing_candidate(found, 1)) { /* * If the found stable_node dup can accept one * more future merge (in addition to the one * that is underway) and is not at the head of * the chain, put it there so next search will * be quicker in the !prune_stale_stable_nodes * case. * * NOTE: it would be inaccurate to use nr > 1 * instead of checking the hlist.first pointer * directly, because in the * prune_stale_stable_nodes case "nr" isn't * the position of the found dup in the chain, * but the total number of dups in the chain. */ hlist_del(&found->hlist_dup); hlist_add_head(&found->hlist_dup, &stable_node->hlist); } } else { /* Its hlist must be empty if no one found. */ free_stable_node_chain(stable_node, root); } *_stable_node_dup = found; return tree_folio; } /* * Like for ksm_get_folio, this function can free the *_stable_node and * *_stable_node_dup if the returned tree_page is NULL. * * It can also free and overwrite *_stable_node with the found * stable_node_dup if the chain is collapsed (in which case * *_stable_node will be equal to *_stable_node_dup like if the chain * never existed). It's up to the caller to verify tree_page is not * NULL before dereferencing *_stable_node or *_stable_node_dup. * * *_stable_node_dup is really a second output parameter of this * function and will be overwritten in all cases, the caller doesn't * need to initialize it. */ static struct folio *__stable_node_chain(struct ksm_stable_node **_stable_node_dup, struct ksm_stable_node **_stable_node, struct rb_root *root, bool prune_stale_stable_nodes) { struct ksm_stable_node *stable_node = *_stable_node; if (!is_stable_node_chain(stable_node)) { *_stable_node_dup = stable_node; return ksm_get_folio(stable_node, KSM_GET_FOLIO_NOLOCK); } return stable_node_dup(_stable_node_dup, _stable_node, root, prune_stale_stable_nodes); } static __always_inline struct folio *chain_prune(struct ksm_stable_node **s_n_d, struct ksm_stable_node **s_n, struct rb_root *root) { return __stable_node_chain(s_n_d, s_n, root, true); } static __always_inline struct folio *chain(struct ksm_stable_node **s_n_d, struct ksm_stable_node **s_n, struct rb_root *root) { return __stable_node_chain(s_n_d, s_n, root, false); } /* * stable_tree_search - search for page inside the stable tree * * This function checks if there is a page inside the stable tree * with identical content to the page that we are scanning right now. * * This function returns the stable tree node of identical content if found, * -EBUSY if the stable node's page is being migrated, NULL otherwise. */ static struct folio *stable_tree_search(struct page *page) { int nid; struct rb_root *root; struct rb_node **new; struct rb_node *parent; struct ksm_stable_node *stable_node, *stable_node_dup; struct ksm_stable_node *page_node; struct folio *folio; folio = page_folio(page); page_node = folio_stable_node(folio); if (page_node && page_node->head != &migrate_nodes) { /* ksm page forked */ folio_get(folio); return folio; } nid = get_kpfn_nid(folio_pfn(folio)); root = root_stable_tree + nid; again: new = &root->rb_node; parent = NULL; while (*new) { struct folio *tree_folio; int ret; cond_resched(); stable_node = rb_entry(*new, struct ksm_stable_node, node); tree_folio = chain_prune(&stable_node_dup, &stable_node, root); if (!tree_folio) { /* * If we walked over a stale stable_node, * ksm_get_folio() will call rb_erase() and it * may rebalance the tree from under us. So * restart the search from scratch. Returning * NULL would be safe too, but we'd generate * false negative insertions just because some * stable_node was stale. */ goto again; } ret = memcmp_pages(page, &tree_folio->page); folio_put(tree_folio); parent = *new; if (ret < 0) new = &parent->rb_left; else if (ret > 0) new = &parent->rb_right; else { if (page_node) { VM_BUG_ON(page_node->head != &migrate_nodes); /* * If the mapcount of our migrated KSM folio is * at most 1, we can merge it with another * KSM folio where we know that we have space * for one more mapping without exceeding the * ksm_max_page_sharing limit: see * chain_prune(). This way, we can avoid adding * this stable node to the chain. */ if (folio_mapcount(folio) > 1) goto chain_append; } if (!is_page_sharing_candidate(stable_node_dup)) { /* * If the stable_node is a chain and * we got a payload match in memcmp * but we cannot merge the scanned * page in any of the existing * stable_node dups because they're * all full, we need to wait the * scanned page to find itself a match * in the unstable tree to create a * brand new KSM page to add later to * the dups of this stable_node. */ return NULL; } /* * Lock and unlock the stable_node's page (which * might already have been migrated) so that page * migration is sure to notice its raised count. * It would be more elegant to return stable_node * than kpage, but that involves more changes. */ tree_folio = ksm_get_folio(stable_node_dup, KSM_GET_FOLIO_TRYLOCK); if (PTR_ERR(tree_folio) == -EBUSY) return ERR_PTR(-EBUSY); if (unlikely(!tree_folio)) /* * The tree may have been rebalanced, * so re-evaluate parent and new. */ goto again; folio_unlock(tree_folio); if (get_kpfn_nid(stable_node_dup->kpfn) != NUMA(stable_node_dup->nid)) { folio_put(tree_folio); goto replace; } return tree_folio; } } if (!page_node) return NULL; list_del(&page_node->list); DO_NUMA(page_node->nid = nid); rb_link_node(&page_node->node, parent, new); rb_insert_color(&page_node->node, root); out: if (is_page_sharing_candidate(page_node)) { folio_get(folio); return folio; } else return NULL; replace: /* * If stable_node was a chain and chain_prune collapsed it, * stable_node has been updated to be the new regular * stable_node. A collapse of the chain is indistinguishable * from the case there was no chain in the stable * rbtree. Otherwise stable_node is the chain and * stable_node_dup is the dup to replace. */ if (stable_node_dup == stable_node) { VM_BUG_ON(is_stable_node_chain(stable_node_dup)); VM_BUG_ON(is_stable_node_dup(stable_node_dup)); /* there is no chain */ if (page_node) { VM_BUG_ON(page_node->head != &migrate_nodes); list_del(&page_node->list); DO_NUMA(page_node->nid = nid); rb_replace_node(&stable_node_dup->node, &page_node->node, root); if (is_page_sharing_candidate(page_node)) folio_get(folio); else folio = NULL; } else { rb_erase(&stable_node_dup->node, root); folio = NULL; } } else { VM_BUG_ON(!is_stable_node_chain(stable_node)); __stable_node_dup_del(stable_node_dup); if (page_node) { VM_BUG_ON(page_node->head != &migrate_nodes); list_del(&page_node->list); DO_NUMA(page_node->nid = nid); stable_node_chain_add_dup(page_node, stable_node); if (is_page_sharing_candidate(page_node)) folio_get(folio); else folio = NULL; } else { folio = NULL; } } stable_node_dup->head = &migrate_nodes; list_add(&stable_node_dup->list, stable_node_dup->head); return folio; chain_append: /* * If stable_node was a chain and chain_prune collapsed it, * stable_node has been updated to be the new regular * stable_node. A collapse of the chain is indistinguishable * from the case there was no chain in the stable * rbtree. Otherwise stable_node is the chain and * stable_node_dup is the dup to replace. */ if (stable_node_dup == stable_node) { VM_BUG_ON(is_stable_node_dup(stable_node_dup)); /* chain is missing so create it */ stable_node = alloc_stable_node_chain(stable_node_dup, root); if (!stable_node) return NULL; } /* * Add this stable_node dup that was * migrated to the stable_node chain * of the current nid for this page * content. */ VM_BUG_ON(!is_stable_node_dup(stable_node_dup)); VM_BUG_ON(page_node->head != &migrate_nodes); list_del(&page_node->list); DO_NUMA(page_node->nid = nid); stable_node_chain_add_dup(page_node, stable_node); goto out; } /* * stable_tree_insert - insert stable tree node pointing to new ksm page * into the stable tree. * * This function returns the stable tree node just allocated on success, * NULL otherwise. */ static struct ksm_stable_node *stable_tree_insert(struct folio *kfolio) { int nid; unsigned long kpfn; struct rb_root *root; struct rb_node **new; struct rb_node *parent; struct ksm_stable_node *stable_node, *stable_node_dup; bool need_chain = false; kpfn = folio_pfn(kfolio); nid = get_kpfn_nid(kpfn); root = root_stable_tree + nid; again: parent = NULL; new = &root->rb_node; while (*new) { struct folio *tree_folio; int ret; cond_resched(); stable_node = rb_entry(*new, struct ksm_stable_node, node); tree_folio = chain(&stable_node_dup, &stable_node, root); if (!tree_folio) { /* * If we walked over a stale stable_node, * ksm_get_folio() will call rb_erase() and it * may rebalance the tree from under us. So * restart the search from scratch. Returning * NULL would be safe too, but we'd generate * false negative insertions just because some * stable_node was stale. */ goto again; } ret = memcmp_pages(&kfolio->page, &tree_folio->page); folio_put(tree_folio); parent = *new; if (ret < 0) new = &parent->rb_left; else if (ret > 0) new = &parent->rb_right; else { need_chain = true; break; } } stable_node_dup = alloc_stable_node(); if (!stable_node_dup) return NULL; INIT_HLIST_HEAD(&stable_node_dup->hlist); stable_node_dup->kpfn = kpfn; stable_node_dup->rmap_hlist_len = 0; DO_NUMA(stable_node_dup->nid = nid); if (!need_chain) { rb_link_node(&stable_node_dup->node, parent, new); rb_insert_color(&stable_node_dup->node, root); } else { if (!is_stable_node_chain(stable_node)) { struct ksm_stable_node *orig = stable_node; /* chain is missing so create it */ stable_node = alloc_stable_node_chain(orig, root); if (!stable_node) { free_stable_node(stable_node_dup); return NULL; } } stable_node_chain_add_dup(stable_node_dup, stable_node); } folio_set_stable_node(kfolio, stable_node_dup); return stable_node_dup; } /* * unstable_tree_search_insert - search for identical page, * else insert rmap_item into the unstable tree. * * This function searches for a page in the unstable tree identical to the * page currently being scanned; and if no identical page is found in the * tree, we insert rmap_item as a new object into the unstable tree. * * This function returns pointer to rmap_item found to be identical * to the currently scanned page, NULL otherwise. * * This function does both searching and inserting, because they share * the same walking algorithm in an rbtree. */ static struct ksm_rmap_item *unstable_tree_search_insert(struct ksm_rmap_item *rmap_item, struct page *page, struct page **tree_pagep) { struct rb_node **new; struct rb_root *root; struct rb_node *parent = NULL; int nid; nid = get_kpfn_nid(page_to_pfn(page)); root = root_unstable_tree + nid; new = &root->rb_node; while (*new) { struct ksm_rmap_item *tree_rmap_item; struct page *tree_page; int ret; cond_resched(); tree_rmap_item = rb_entry(*new, struct ksm_rmap_item, node); tree_page = get_mergeable_page(tree_rmap_item); if (!tree_page) return NULL; /* * Don't substitute a ksm page for a forked page. */ if (page == tree_page) { put_page(tree_page); return NULL; } ret = memcmp_pages(page, tree_page); parent = *new; if (ret < 0) { put_page(tree_page); new = &parent->rb_left; } else if (ret > 0) { put_page(tree_page); new = &parent->rb_right; } else if (!ksm_merge_across_nodes && page_to_nid(tree_page) != nid) { /* * If tree_page has been migrated to another NUMA node, * it will be flushed out and put in the right unstable * tree next time: only merge with it when across_nodes. */ put_page(tree_page); return NULL; } else { *tree_pagep = tree_page; return tree_rmap_item; } } rmap_item->address |= UNSTABLE_FLAG; rmap_item->address |= (ksm_scan.seqnr & SEQNR_MASK); DO_NUMA(rmap_item->nid = nid); rb_link_node(&rmap_item->node, parent, new); rb_insert_color(&rmap_item->node, root); ksm_pages_unshared++; return NULL; } /* * stable_tree_append - add another rmap_item to the linked list of * rmap_items hanging off a given node of the stable tree, all sharing * the same ksm page. */ static void stable_tree_append(struct ksm_rmap_item *rmap_item, struct ksm_stable_node *stable_node, bool max_page_sharing_bypass) { /* * rmap won't find this mapping if we don't insert the * rmap_item in the right stable_node * duplicate. page_migration could break later if rmap breaks, * so we can as well crash here. We really need to check for * rmap_hlist_len == STABLE_NODE_CHAIN, but we can as well check * for other negative values as an underflow if detected here * for the first time (and not when decreasing rmap_hlist_len) * would be sign of memory corruption in the stable_node. */ BUG_ON(stable_node->rmap_hlist_len < 0); stable_node->rmap_hlist_len++; if (!max_page_sharing_bypass) /* possibly non fatal but unexpected overflow, only warn */ WARN_ON_ONCE(stable_node->rmap_hlist_len > ksm_max_page_sharing); rmap_item->head = stable_node; rmap_item->address |= STABLE_FLAG; hlist_add_head(&rmap_item->hlist, &stable_node->hlist); if (rmap_item->hlist.next) ksm_pages_sharing++; else ksm_pages_shared++; rmap_item->mm->ksm_merging_pages++; } /* * cmp_and_merge_page - first see if page can be merged into the stable tree; * if not, compare checksum to previous and if it's the same, see if page can * be inserted into the unstable tree, or merged with a page already there and * both transferred to the stable tree. * * @page: the page that we are searching identical page to. * @rmap_item: the reverse mapping into the virtual address of this page */ static void cmp_and_merge_page(struct page *page, struct ksm_rmap_item *rmap_item) { struct ksm_rmap_item *tree_rmap_item; struct page *tree_page = NULL; struct ksm_stable_node *stable_node; struct folio *kfolio; unsigned int checksum; int err; bool max_page_sharing_bypass = false; stable_node = page_stable_node(page); if (stable_node) { if (stable_node->head != &migrate_nodes && get_kpfn_nid(READ_ONCE(stable_node->kpfn)) != NUMA(stable_node->nid)) { stable_node_dup_del(stable_node); stable_node->head = &migrate_nodes; list_add(&stable_node->list, stable_node->head); } if (stable_node->head != &migrate_nodes && rmap_item->head == stable_node) return; /* * If it's a KSM fork, allow it to go over the sharing limit * without warnings. */ if (!is_page_sharing_candidate(stable_node)) max_page_sharing_bypass = true; } else { remove_rmap_item_from_tree(rmap_item); /* * If the hash value of the page has changed from the last time * we calculated it, this page is changing frequently: therefore we * don't want to insert it in the unstable tree, and we don't want * to waste our time searching for something identical to it there. */ checksum = calc_checksum(page); if (rmap_item->oldchecksum != checksum) { rmap_item->oldchecksum = checksum; return; } if (!try_to_merge_with_zero_page(rmap_item, page)) return; } /* Start by searching for the folio in the stable tree */ kfolio = stable_tree_search(page); if (&kfolio->page == page && rmap_item->head == stable_node) { folio_put(kfolio); return; } remove_rmap_item_from_tree(rmap_item); if (kfolio) { if (kfolio == ERR_PTR(-EBUSY)) return; err = try_to_merge_with_ksm_page(rmap_item, page, &kfolio->page); if (!err) { /* * The page was successfully merged: * add its rmap_item to the stable tree. */ folio_lock(kfolio); stable_tree_append(rmap_item, folio_stable_node(kfolio), max_page_sharing_bypass); folio_unlock(kfolio); } folio_put(kfolio); return; } tree_rmap_item = unstable_tree_search_insert(rmap_item, page, &tree_page); if (tree_rmap_item) { bool split; kfolio = try_to_merge_two_pages(rmap_item, page, tree_rmap_item, tree_page); /* * If both pages we tried to merge belong to the same compound * page, then we actually ended up increasing the reference * count of the same compound page twice, and split_huge_page * failed. * Here we set a flag if that happened, and we use it later to * try split_huge_page again. Since we call put_page right * afterwards, the reference count will be correct and * split_huge_page should succeed. */ split = PageTransCompound(page) && compound_head(page) == compound_head(tree_page); put_page(tree_page); if (kfolio) { /* * The pages were successfully merged: insert new * node in the stable tree and add both rmap_items. */ folio_lock(kfolio); stable_node = stable_tree_insert(kfolio); if (stable_node) { stable_tree_append(tree_rmap_item, stable_node, false); stable_tree_append(rmap_item, stable_node, false); } folio_unlock(kfolio); /* * If we fail to insert the page into the stable tree, * we will have 2 virtual addresses that are pointing * to a ksm page left outside the stable tree, * in which case we need to break_cow on both. */ if (!stable_node) { break_cow(tree_rmap_item); break_cow(rmap_item); } } else if (split) { /* * We are here if we tried to merge two pages and * failed because they both belonged to the same * compound page. We will split the page now, but no * merging will take place. * We do not want to add the cost of a full lock; if * the page is locked, it is better to skip it and * perhaps try again later. */ if (!trylock_page(page)) return; split_huge_page(page); unlock_page(page); } } } static struct ksm_rmap_item *get_next_rmap_item(struct ksm_mm_slot *mm_slot, struct ksm_rmap_item **rmap_list, unsigned long addr) { struct ksm_rmap_item *rmap_item; while (*rmap_list) { rmap_item = *rmap_list; if ((rmap_item->address & PAGE_MASK) == addr) return rmap_item; if (rmap_item->address > addr) break; *rmap_list = rmap_item->rmap_list; remove_rmap_item_from_tree(rmap_item); free_rmap_item(rmap_item); } rmap_item = alloc_rmap_item(); if (rmap_item) { /* It has already been zeroed */ rmap_item->mm = mm_slot->slot.mm; rmap_item->mm->ksm_rmap_items++; rmap_item->address = addr; rmap_item->rmap_list = *rmap_list; *rmap_list = rmap_item; } return rmap_item; } /* * Calculate skip age for the ksm page age. The age determines how often * de-duplicating has already been tried unsuccessfully. If the age is * smaller, the scanning of this page is skipped for less scans. * * @age: rmap_item age of page */ static unsigned int skip_age(rmap_age_t age) { if (age <= 3) return 1; if (age <= 5) return 2; if (age <= 8) return 4; return 8; } /* * Determines if a page should be skipped for the current scan. * * @folio: folio containing the page to check * @rmap_item: associated rmap_item of page */ static bool should_skip_rmap_item(struct folio *folio, struct ksm_rmap_item *rmap_item) { rmap_age_t age; if (!ksm_smart_scan) return false; /* * Never skip pages that are already KSM; pages cmp_and_merge_page() * will essentially ignore them, but we still have to process them * properly. */ if (folio_test_ksm(folio)) return false; age = rmap_item->age; if (age != U8_MAX) rmap_item->age++; /* * Smaller ages are not skipped, they need to get a chance to go * through the different phases of the KSM merging. */ if (age < 3) return false; /* * Are we still allowed to skip? If not, then don't skip it * and determine how much more often we are allowed to skip next. */ if (!rmap_item->remaining_skips) { rmap_item->remaining_skips = skip_age(age); return false; } /* Skip this page */ ksm_pages_skipped++; rmap_item->remaining_skips--; remove_rmap_item_from_tree(rmap_item); return true; } static struct ksm_rmap_item *scan_get_next_rmap_item(struct page **page) { struct mm_struct *mm; struct ksm_mm_slot *mm_slot; struct mm_slot *slot; struct vm_area_struct *vma; struct ksm_rmap_item *rmap_item; struct vma_iterator vmi; int nid; if (list_empty(&ksm_mm_head.slot.mm_node)) return NULL; mm_slot = ksm_scan.mm_slot; if (mm_slot == &ksm_mm_head) { advisor_start_scan(); trace_ksm_start_scan(ksm_scan.seqnr, ksm_rmap_items); /* * A number of pages can hang around indefinitely in per-cpu * LRU cache, raised page count preventing write_protect_page * from merging them. Though it doesn't really matter much, * it is puzzling to see some stuck in pages_volatile until * other activity jostles them out, and they also prevented * LTP's KSM test from succeeding deterministically; so drain * them here (here rather than on entry to ksm_do_scan(), * so we don't IPI too often when pages_to_scan is set low). */ lru_add_drain_all(); /* * Whereas stale stable_nodes on the stable_tree itself * get pruned in the regular course of stable_tree_search(), * those moved out to the migrate_nodes list can accumulate: * so prune them once before each full scan. */ if (!ksm_merge_across_nodes) { struct ksm_stable_node *stable_node, *next; struct folio *folio; list_for_each_entry_safe(stable_node, next, &migrate_nodes, list) { folio = ksm_get_folio(stable_node, KSM_GET_FOLIO_NOLOCK); if (folio) folio_put(folio); cond_resched(); } } for (nid = 0; nid < ksm_nr_node_ids; nid++) root_unstable_tree[nid] = RB_ROOT; spin_lock(&ksm_mmlist_lock); slot = list_entry(mm_slot->slot.mm_node.next, struct mm_slot, mm_node); mm_slot = mm_slot_entry(slot, struct ksm_mm_slot, slot); ksm_scan.mm_slot = mm_slot; spin_unlock(&ksm_mmlist_lock); /* * Although we tested list_empty() above, a racing __ksm_exit * of the last mm on the list may have removed it since then. */ if (mm_slot == &ksm_mm_head) return NULL; next_mm: ksm_scan.address = 0; ksm_scan.rmap_list = &mm_slot->rmap_list; } slot = &mm_slot->slot; mm = slot->mm; vma_iter_init(&vmi, mm, ksm_scan.address); mmap_read_lock(mm); if (ksm_test_exit(mm)) goto no_vmas; for_each_vma(vmi, vma) { if (!(vma->vm_flags & VM_MERGEABLE)) continue; if (ksm_scan.address < vma->vm_start) ksm_scan.address = vma->vm_start; if (!vma->anon_vma) ksm_scan.address = vma->vm_end; while (ksm_scan.address < vma->vm_end) { struct page *tmp_page = NULL; struct folio_walk fw; struct folio *folio; if (ksm_test_exit(mm)) break; folio = folio_walk_start(&fw, vma, ksm_scan.address, 0); if (folio) { if (!folio_is_zone_device(folio) && folio_test_anon(folio)) { folio_get(folio); tmp_page = fw.page; } folio_walk_end(&fw, vma); } if (tmp_page) { flush_anon_page(vma, tmp_page, ksm_scan.address); flush_dcache_page(tmp_page); rmap_item = get_next_rmap_item(mm_slot, ksm_scan.rmap_list, ksm_scan.address); if (rmap_item) { ksm_scan.rmap_list = &rmap_item->rmap_list; if (should_skip_rmap_item(folio, rmap_item)) { folio_put(folio); goto next_page; } ksm_scan.address += PAGE_SIZE; *page = tmp_page; } else { folio_put(folio); } mmap_read_unlock(mm); return rmap_item; } next_page: ksm_scan.address += PAGE_SIZE; cond_resched(); } } if (ksm_test_exit(mm)) { no_vmas: ksm_scan.address = 0; ksm_scan.rmap_list = &mm_slot->rmap_list; } /* * Nuke all the rmap_items that are above this current rmap: * because there were no VM_MERGEABLE vmas with such addresses. */ remove_trailing_rmap_items(ksm_scan.rmap_list); spin_lock(&ksm_mmlist_lock); slot = list_entry(mm_slot->slot.mm_node.next, struct mm_slot, mm_node); ksm_scan.mm_slot = mm_slot_entry(slot, struct ksm_mm_slot, slot); if (ksm_scan.address == 0) { /* * We've completed a full scan of all vmas, holding mmap_lock * throughout, and found no VM_MERGEABLE: so do the same as * __ksm_exit does to remove this mm from all our lists now. * This applies either when cleaning up after __ksm_exit * (but beware: we can reach here even before __ksm_exit), * or when all VM_MERGEABLE areas have been unmapped (and * mmap_lock then protects against race with MADV_MERGEABLE). */ hash_del(&mm_slot->slot.hash); list_del(&mm_slot->slot.mm_node); spin_unlock(&ksm_mmlist_lock); mm_slot_free(mm_slot_cache, mm_slot); clear_bit(MMF_VM_MERGEABLE, &mm->flags); clear_bit(MMF_VM_MERGE_ANY, &mm->flags); mmap_read_unlock(mm); mmdrop(mm); } else { mmap_read_unlock(mm); /* * mmap_read_unlock(mm) first because after * spin_unlock(&ksm_mmlist_lock) run, the "mm" may * already have been freed under us by __ksm_exit() * because the "mm_slot" is still hashed and * ksm_scan.mm_slot doesn't point to it anymore. */ spin_unlock(&ksm_mmlist_lock); } /* Repeat until we've completed scanning the whole list */ mm_slot = ksm_scan.mm_slot; if (mm_slot != &ksm_mm_head) goto next_mm; advisor_stop_scan(); trace_ksm_stop_scan(ksm_scan.seqnr, ksm_rmap_items); ksm_scan.seqnr++; return NULL; } /** * ksm_do_scan - the ksm scanner main worker function. * @scan_npages: number of pages we want to scan before we return. */ static void ksm_do_scan(unsigned int scan_npages) { struct ksm_rmap_item *rmap_item; struct page *page; while (scan_npages-- && likely(!freezing(current))) { cond_resched(); rmap_item = scan_get_next_rmap_item(&page); if (!rmap_item) return; cmp_and_merge_page(page, rmap_item); put_page(page); ksm_pages_scanned++; } } static int ksmd_should_run(void) { return (ksm_run & KSM_RUN_MERGE) && !list_empty(&ksm_mm_head.slot.mm_node); } static int ksm_scan_thread(void *nothing) { unsigned int sleep_ms; set_freezable(); set_user_nice(current, 5); while (!kthread_should_stop()) { mutex_lock(&ksm_thread_mutex); wait_while_offlining(); if (ksmd_should_run()) ksm_do_scan(ksm_thread_pages_to_scan); mutex_unlock(&ksm_thread_mutex); if (ksmd_should_run()) { sleep_ms = READ_ONCE(ksm_thread_sleep_millisecs); wait_event_freezable_timeout(ksm_iter_wait, sleep_ms != READ_ONCE(ksm_thread_sleep_millisecs), msecs_to_jiffies(sleep_ms)); } else { wait_event_freezable(ksm_thread_wait, ksmd_should_run() || kthread_should_stop()); } } return 0; } static bool __ksm_should_add_vma(const struct file *file, vm_flags_t vm_flags) { if (vm_flags & VM_MERGEABLE) return false; return ksm_compatible(file, vm_flags); } static void __ksm_add_vma(struct vm_area_struct *vma) { if (__ksm_should_add_vma(vma->vm_file, vma->vm_flags)) vm_flags_set(vma, VM_MERGEABLE); } static int __ksm_del_vma(struct vm_area_struct *vma) { int err; if (!(vma->vm_flags & VM_MERGEABLE)) return 0; if (vma->anon_vma) { err = unmerge_ksm_pages(vma, vma->vm_start, vma->vm_end, true); if (err) return err; } vm_flags_clear(vma, VM_MERGEABLE); return 0; } /** * ksm_vma_flags - Update VMA flags to mark as mergeable if compatible * * @mm: Proposed VMA's mm_struct * @file: Proposed VMA's file-backed mapping, if any. * @vm_flags: Proposed VMA"s flags. * * Returns: @vm_flags possibly updated to mark mergeable. */ vm_flags_t ksm_vma_flags(const struct mm_struct *mm, const struct file *file, vm_flags_t vm_flags) { if (test_bit(MMF_VM_MERGE_ANY, &mm->flags) && __ksm_should_add_vma(file, vm_flags)) vm_flags |= VM_MERGEABLE; return vm_flags; } static void ksm_add_vmas(struct mm_struct *mm) { struct vm_area_struct *vma; VMA_ITERATOR(vmi, mm, 0); for_each_vma(vmi, vma) __ksm_add_vma(vma); } static int ksm_del_vmas(struct mm_struct *mm) { struct vm_area_struct *vma; int err; VMA_ITERATOR(vmi, mm, 0); for_each_vma(vmi, vma) { err = __ksm_del_vma(vma); if (err) return err; } return 0; } /** * ksm_enable_merge_any - Add mm to mm ksm list and enable merging on all * compatible VMA's * * @mm: Pointer to mm * * Returns 0 on success, otherwise error code */ int ksm_enable_merge_any(struct mm_struct *mm) { int err; if (test_bit(MMF_VM_MERGE_ANY, &mm->flags)) return 0; if (!test_bit(MMF_VM_MERGEABLE, &mm->flags)) { err = __ksm_enter(mm); if (err) return err; } set_bit(MMF_VM_MERGE_ANY, &mm->flags); ksm_add_vmas(mm); return 0; } /** * ksm_disable_merge_any - Disable merging on all compatible VMA's of the mm, * previously enabled via ksm_enable_merge_any(). * * Disabling merging implies unmerging any merged pages, like setting * MADV_UNMERGEABLE would. If unmerging fails, the whole operation fails and * merging on all compatible VMA's remains enabled. * * @mm: Pointer to mm * * Returns 0 on success, otherwise error code */ int ksm_disable_merge_any(struct mm_struct *mm) { int err; if (!test_bit(MMF_VM_MERGE_ANY, &mm->flags)) return 0; err = ksm_del_vmas(mm); if (err) { ksm_add_vmas(mm); return err; } clear_bit(MMF_VM_MERGE_ANY, &mm->flags); return 0; } int ksm_disable(struct mm_struct *mm) { mmap_assert_write_locked(mm); if (!test_bit(MMF_VM_MERGEABLE, &mm->flags)) return 0; if (test_bit(MMF_VM_MERGE_ANY, &mm->flags)) return ksm_disable_merge_any(mm); return ksm_del_vmas(mm); } int ksm_madvise(struct vm_area_struct *vma, unsigned long start, unsigned long end, int advice, vm_flags_t *vm_flags) { struct mm_struct *mm = vma->vm_mm; int err; switch (advice) { case MADV_MERGEABLE: if (vma->vm_flags & VM_MERGEABLE) return 0; if (!vma_ksm_compatible(vma)) return 0; if (!test_bit(MMF_VM_MERGEABLE, &mm->flags)) { err = __ksm_enter(mm); if (err) return err; } *vm_flags |= VM_MERGEABLE; break; case MADV_UNMERGEABLE: if (!(*vm_flags & VM_MERGEABLE)) return 0; /* just ignore the advice */ if (vma->anon_vma) { err = unmerge_ksm_pages(vma, start, end, true); if (err) return err; } *vm_flags &= ~VM_MERGEABLE; break; } return 0; } EXPORT_SYMBOL_GPL(ksm_madvise); int __ksm_enter(struct mm_struct *mm) { struct ksm_mm_slot *mm_slot; struct mm_slot *slot; int needs_wakeup; mm_slot = mm_slot_alloc(mm_slot_cache); if (!mm_slot) return -ENOMEM; slot = &mm_slot->slot; /* Check ksm_run too? Would need tighter locking */ needs_wakeup = list_empty(&ksm_mm_head.slot.mm_node); spin_lock(&ksm_mmlist_lock); mm_slot_insert(mm_slots_hash, mm, slot); /* * When KSM_RUN_MERGE (or KSM_RUN_STOP), * insert just behind the scanning cursor, to let the area settle * down a little; when fork is followed by immediate exec, we don't * want ksmd to waste time setting up and tearing down an rmap_list. * * But when KSM_RUN_UNMERGE, it's important to insert ahead of its * scanning cursor, otherwise KSM pages in newly forked mms will be * missed: then we might as well insert at the end of the list. */ if (ksm_run & KSM_RUN_UNMERGE) list_add_tail(&slot->mm_node, &ksm_mm_head.slot.mm_node); else list_add_tail(&slot->mm_node, &ksm_scan.mm_slot->slot.mm_node); spin_unlock(&ksm_mmlist_lock); set_bit(MMF_VM_MERGEABLE, &mm->flags); mmgrab(mm); if (needs_wakeup) wake_up_interruptible(&ksm_thread_wait); trace_ksm_enter(mm); return 0; } void __ksm_exit(struct mm_struct *mm) { struct ksm_mm_slot *mm_slot; struct mm_slot *slot; int easy_to_free = 0; /* * This process is exiting: if it's straightforward (as is the * case when ksmd was never running), free mm_slot immediately. * But if it's at the cursor or has rmap_items linked to it, use * mmap_lock to synchronize with any break_cows before pagetables * are freed, and leave the mm_slot on the list for ksmd to free. * Beware: ksm may already have noticed it exiting and freed the slot. */ spin_lock(&ksm_mmlist_lock); slot = mm_slot_lookup(mm_slots_hash, mm); mm_slot = mm_slot_entry(slot, struct ksm_mm_slot, slot); if (mm_slot && ksm_scan.mm_slot != mm_slot) { if (!mm_slot->rmap_list) { hash_del(&slot->hash); list_del(&slot->mm_node); easy_to_free = 1; } else { list_move(&slot->mm_node, &ksm_scan.mm_slot->slot.mm_node); } } spin_unlock(&ksm_mmlist_lock); if (easy_to_free) { mm_slot_free(mm_slot_cache, mm_slot); clear_bit(MMF_VM_MERGE_ANY, &mm->flags); clear_bit(MMF_VM_MERGEABLE, &mm->flags); mmdrop(mm); } else if (mm_slot) { mmap_write_lock(mm); mmap_write_unlock(mm); } trace_ksm_exit(mm); } struct folio *ksm_might_need_to_copy(struct folio *folio, struct vm_area_struct *vma, unsigned long addr) { struct page *page = folio_page(folio, 0); struct anon_vma *anon_vma = folio_anon_vma(folio); struct folio *new_folio; if (folio_test_large(folio)) return folio; if (folio_test_ksm(folio)) { if (folio_stable_node(folio) && !(ksm_run & KSM_RUN_UNMERGE)) return folio; /* no need to copy it */ } else if (!anon_vma) { return folio; /* no need to copy it */ } else if (folio->index == linear_page_index(vma, addr) && anon_vma->root == vma->anon_vma->root) { return folio; /* still no need to copy it */ } if (PageHWPoison(page)) return ERR_PTR(-EHWPOISON); if (!folio_test_uptodate(folio)) return folio; /* let do_swap_page report the error */ new_folio = vma_alloc_folio(GFP_HIGHUSER_MOVABLE, 0, vma, addr); if (new_folio && mem_cgroup_charge(new_folio, vma->vm_mm, GFP_KERNEL)) { folio_put(new_folio); new_folio = NULL; } if (new_folio) { if (copy_mc_user_highpage(folio_page(new_folio, 0), page, addr, vma)) { folio_put(new_folio); return ERR_PTR(-EHWPOISON); } folio_set_dirty(new_folio); __folio_mark_uptodate(new_folio); __folio_set_locked(new_folio); #ifdef CONFIG_SWAP count_vm_event(KSM_SWPIN_COPY); #endif } return new_folio; } void rmap_walk_ksm(struct folio *folio, struct rmap_walk_control *rwc) { struct ksm_stable_node *stable_node; struct ksm_rmap_item *rmap_item; int search_new_forks = 0; VM_BUG_ON_FOLIO(!folio_test_ksm(folio), folio); /* * Rely on the page lock to protect against concurrent modifications * to that page's node of the stable tree. */ VM_BUG_ON_FOLIO(!folio_test_locked(folio), folio); stable_node = folio_stable_node(folio); if (!stable_node) return; again: hlist_for_each_entry(rmap_item, &stable_node->hlist, hlist) { struct anon_vma *anon_vma = rmap_item->anon_vma; struct anon_vma_chain *vmac; struct vm_area_struct *vma; cond_resched(); if (!anon_vma_trylock_read(anon_vma)) { if (rwc->try_lock) { rwc->contended = true; return; } anon_vma_lock_read(anon_vma); } anon_vma_interval_tree_foreach(vmac, &anon_vma->rb_root, 0, ULONG_MAX) { unsigned long addr; cond_resched(); vma = vmac->vma; /* Ignore the stable/unstable/sqnr flags */ addr = rmap_item->address & PAGE_MASK; if (addr < vma->vm_start || addr >= vma->vm_end) continue; /* * Initially we examine only the vma which covers this * rmap_item; but later, if there is still work to do, * we examine covering vmas in other mms: in case they * were forked from the original since ksmd passed. */ if ((rmap_item->mm == vma->vm_mm) == search_new_forks) continue; if (rwc->invalid_vma && rwc->invalid_vma(vma, rwc->arg)) continue; if (!rwc->rmap_one(folio, vma, addr, rwc->arg)) { anon_vma_unlock_read(anon_vma); return; } if (rwc->done && rwc->done(folio)) { anon_vma_unlock_read(anon_vma); return; } } anon_vma_unlock_read(anon_vma); } if (!search_new_forks++) goto again; } #ifdef CONFIG_MEMORY_FAILURE /* * Collect processes when the error hit an ksm page. */ void collect_procs_ksm(const struct folio *folio, const struct page *page, struct list_head *to_kill, int force_early) { struct ksm_stable_node *stable_node; struct ksm_rmap_item *rmap_item; struct vm_area_struct *vma; struct task_struct *tsk; stable_node = folio_stable_node(folio); if (!stable_node) return; hlist_for_each_entry(rmap_item, &stable_node->hlist, hlist) { struct anon_vma *av = rmap_item->anon_vma; anon_vma_lock_read(av); rcu_read_lock(); for_each_process(tsk) { struct anon_vma_chain *vmac; unsigned long addr; struct task_struct *t = task_early_kill(tsk, force_early); if (!t) continue; anon_vma_interval_tree_foreach(vmac, &av->rb_root, 0, ULONG_MAX) { vma = vmac->vma; if (vma->vm_mm == t->mm) { addr = rmap_item->address & PAGE_MASK; add_to_kill_ksm(t, page, vma, to_kill, addr); } } } rcu_read_unlock(); anon_vma_unlock_read(av); } } #endif #ifdef CONFIG_MIGRATION void folio_migrate_ksm(struct folio *newfolio, struct folio *folio) { struct ksm_stable_node *stable_node; VM_BUG_ON_FOLIO(!folio_test_locked(folio), folio); VM_BUG_ON_FOLIO(!folio_test_locked(newfolio), newfolio); VM_BUG_ON_FOLIO(newfolio->mapping != folio->mapping, newfolio); stable_node = folio_stable_node(folio); if (stable_node) { VM_BUG_ON_FOLIO(stable_node->kpfn != folio_pfn(folio), folio); stable_node->kpfn = folio_pfn(newfolio); /* * newfolio->mapping was set in advance; now we need smp_wmb() * to make sure that the new stable_node->kpfn is visible * to ksm_get_folio() before it can see that folio->mapping * has gone stale (or that the swapcache flag has been cleared). */ smp_wmb(); folio_set_stable_node(folio, NULL); } } #endif /* CONFIG_MIGRATION */ #ifdef CONFIG_MEMORY_HOTREMOVE static void wait_while_offlining(void) { while (ksm_run & KSM_RUN_OFFLINE) { mutex_unlock(&ksm_thread_mutex); wait_on_bit(&ksm_run, ilog2(KSM_RUN_OFFLINE), TASK_UNINTERRUPTIBLE); mutex_lock(&ksm_thread_mutex); } } static bool stable_node_dup_remove_range(struct ksm_stable_node *stable_node, unsigned long start_pfn, unsigned long end_pfn) { if (stable_node->kpfn >= start_pfn && stable_node->kpfn < end_pfn) { /* * Don't ksm_get_folio, page has already gone: * which is why we keep kpfn instead of page* */ remove_node_from_stable_tree(stable_node); return true; } return false; } static bool stable_node_chain_remove_range(struct ksm_stable_node *stable_node, unsigned long start_pfn, unsigned long end_pfn, struct rb_root *root) { struct ksm_stable_node *dup; struct hlist_node *hlist_safe; if (!is_stable_node_chain(stable_node)) { VM_BUG_ON(is_stable_node_dup(stable_node)); return stable_node_dup_remove_range(stable_node, start_pfn, end_pfn); } hlist_for_each_entry_safe(dup, hlist_safe, &stable_node->hlist, hlist_dup) { VM_BUG_ON(!is_stable_node_dup(dup)); stable_node_dup_remove_range(dup, start_pfn, end_pfn); } if (hlist_empty(&stable_node->hlist)) { free_stable_node_chain(stable_node, root); return true; /* notify caller that tree was rebalanced */ } else return false; } static void ksm_check_stable_tree(unsigned long start_pfn, unsigned long end_pfn) { struct ksm_stable_node *stable_node, *next; struct rb_node *node; int nid; for (nid = 0; nid < ksm_nr_node_ids; nid++) { node = rb_first(root_stable_tree + nid); while (node) { stable_node = rb_entry(node, struct ksm_stable_node, node); if (stable_node_chain_remove_range(stable_node, start_pfn, end_pfn, root_stable_tree + nid)) node = rb_first(root_stable_tree + nid); else node = rb_next(node); cond_resched(); } } list_for_each_entry_safe(stable_node, next, &migrate_nodes, list) { if (stable_node->kpfn >= start_pfn && stable_node->kpfn < end_pfn) remove_node_from_stable_tree(stable_node); cond_resched(); } } static int ksm_memory_callback(struct notifier_block *self, unsigned long action, void *arg) { struct memory_notify *mn = arg; switch (action) { case MEM_GOING_OFFLINE: /* * Prevent ksm_do_scan(), unmerge_and_remove_all_rmap_items() * and remove_all_stable_nodes() while memory is going offline: * it is unsafe for them to touch the stable tree at this time. * But unmerge_ksm_pages(), rmap lookups and other entry points * which do not need the ksm_thread_mutex are all safe. */ mutex_lock(&ksm_thread_mutex); ksm_run |= KSM_RUN_OFFLINE; mutex_unlock(&ksm_thread_mutex); break; case MEM_OFFLINE: /* * Most of the work is done by page migration; but there might * be a few stable_nodes left over, still pointing to struct * pages which have been offlined: prune those from the tree, * otherwise ksm_get_folio() might later try to access a * non-existent struct page. */ ksm_check_stable_tree(mn->start_pfn, mn->start_pfn + mn->nr_pages); fallthrough; case MEM_CANCEL_OFFLINE: mutex_lock(&ksm_thread_mutex); ksm_run &= ~KSM_RUN_OFFLINE; mutex_unlock(&ksm_thread_mutex); smp_mb(); /* wake_up_bit advises this */ wake_up_bit(&ksm_run, ilog2(KSM_RUN_OFFLINE)); break; } return NOTIFY_OK; } #else static void wait_while_offlining(void) { } #endif /* CONFIG_MEMORY_HOTREMOVE */ #ifdef CONFIG_PROC_FS /* * The process is mergeable only if any VMA is currently * applicable to KSM. * * The mmap lock must be held in read mode. */ bool ksm_process_mergeable(struct mm_struct *mm) { struct vm_area_struct *vma; mmap_assert_locked(mm); VMA_ITERATOR(vmi, mm, 0); for_each_vma(vmi, vma) if (vma->vm_flags & VM_MERGEABLE) return true; return false; } long ksm_process_profit(struct mm_struct *mm) { return (long)(mm->ksm_merging_pages + mm_ksm_zero_pages(mm)) * PAGE_SIZE - mm->ksm_rmap_items * sizeof(struct ksm_rmap_item); } #endif /* CONFIG_PROC_FS */ #ifdef CONFIG_SYSFS /* * This all compiles without CONFIG_SYSFS, but is a waste of space. */ #define KSM_ATTR_RO(_name) \ static struct kobj_attribute _name##_attr = __ATTR_RO(_name) #define KSM_ATTR(_name) \ static struct kobj_attribute _name##_attr = __ATTR_RW(_name) static ssize_t sleep_millisecs_show(struct kobject *kobj, struct kobj_attribute *attr, char *buf) { return sysfs_emit(buf, "%u\n", ksm_thread_sleep_millisecs); } static ssize_t sleep_millisecs_store(struct kobject *kobj, struct kobj_attribute *attr, const char *buf, size_t count) { unsigned int msecs; int err; err = kstrtouint(buf, 10, &msecs); if (err) return -EINVAL; ksm_thread_sleep_millisecs = msecs; wake_up_interruptible(&ksm_iter_wait); return count; } KSM_ATTR(sleep_millisecs); static ssize_t pages_to_scan_show(struct kobject *kobj, struct kobj_attribute *attr, char *buf) { return sysfs_emit(buf, "%u\n", ksm_thread_pages_to_scan); } static ssize_t pages_to_scan_store(struct kobject *kobj, struct kobj_attribute *attr, const char *buf, size_t count) { unsigned int nr_pages; int err; if (ksm_advisor != KSM_ADVISOR_NONE) return -EINVAL; err = kstrtouint(buf, 10, &nr_pages); if (err) return -EINVAL; ksm_thread_pages_to_scan = nr_pages; return count; } KSM_ATTR(pages_to_scan); static ssize_t run_show(struct kobject *kobj, struct kobj_attribute *attr, char *buf) { return sysfs_emit(buf, "%lu\n", ksm_run); } static ssize_t run_store(struct kobject *kobj, struct kobj_attribute *attr, const char *buf, size_t count) { unsigned int flags; int err; err = kstrtouint(buf, 10, &flags); if (err) return -EINVAL; if (flags > KSM_RUN_UNMERGE) return -EINVAL; /* * KSM_RUN_MERGE sets ksmd running, and 0 stops it running. * KSM_RUN_UNMERGE stops it running and unmerges all rmap_items, * breaking COW to free the pages_shared (but leaves mm_slots * on the list for when ksmd may be set running again). */ mutex_lock(&ksm_thread_mutex); wait_while_offlining(); if (ksm_run != flags) { ksm_run = flags; if (flags & KSM_RUN_UNMERGE) { set_current_oom_origin(); err = unmerge_and_remove_all_rmap_items(); clear_current_oom_origin(); if (err) { ksm_run = KSM_RUN_STOP; count = err; } } } mutex_unlock(&ksm_thread_mutex); if (flags & KSM_RUN_MERGE) wake_up_interruptible(&ksm_thread_wait); return count; } KSM_ATTR(run); #ifdef CONFIG_NUMA static ssize_t merge_across_nodes_show(struct kobject *kobj, struct kobj_attribute *attr, char *buf) { return sysfs_emit(buf, "%u\n", ksm_merge_across_nodes); } static ssize_t merge_across_nodes_store(struct kobject *kobj, struct kobj_attribute *attr, const char *buf, size_t count) { int err; unsigned long knob; err = kstrtoul(buf, 10, &knob); if (err) return err; if (knob > 1) return -EINVAL; mutex_lock(&ksm_thread_mutex); wait_while_offlining(); if (ksm_merge_across_nodes != knob) { if (ksm_pages_shared || remove_all_stable_nodes()) err = -EBUSY; else if (root_stable_tree == one_stable_tree) { struct rb_root *buf; /* * This is the first time that we switch away from the * default of merging across nodes: must now allocate * a buffer to hold as many roots as may be needed. * Allocate stable and unstable together: * MAXSMP NODES_SHIFT 10 will use 16kB. */ buf = kcalloc(nr_node_ids + nr_node_ids, sizeof(*buf), GFP_KERNEL); /* Let us assume that RB_ROOT is NULL is zero */ if (!buf) err = -ENOMEM; else { root_stable_tree = buf; root_unstable_tree = buf + nr_node_ids; /* Stable tree is empty but not the unstable */ root_unstable_tree[0] = one_unstable_tree[0]; } } if (!err) { ksm_merge_across_nodes = knob; ksm_nr_node_ids = knob ? 1 : nr_node_ids; } } mutex_unlock(&ksm_thread_mutex); return err ? err : count; } KSM_ATTR(merge_across_nodes); #endif static ssize_t use_zero_pages_show(struct kobject *kobj, struct kobj_attribute *attr, char *buf) { return sysfs_emit(buf, "%u\n", ksm_use_zero_pages); } static ssize_t use_zero_pages_store(struct kobject *kobj, struct kobj_attribute *attr, const char *buf, size_t count) { int err; bool value; err = kstrtobool(buf, &value); if (err) return -EINVAL; ksm_use_zero_pages = value; return count; } KSM_ATTR(use_zero_pages); static ssize_t max_page_sharing_show(struct kobject *kobj, struct kobj_attribute *attr, char *buf) { return sysfs_emit(buf, "%u\n", ksm_max_page_sharing); } static ssize_t max_page_sharing_store(struct kobject *kobj, struct kobj_attribute *attr, const char *buf, size_t count) { int err; int knob; err = kstrtoint(buf, 10, &knob); if (err) return err; /* * When a KSM page is created it is shared by 2 mappings. This * being a signed comparison, it implicitly verifies it's not * negative. */ if (knob < 2) return -EINVAL; if (READ_ONCE(ksm_max_page_sharing) == knob) return count; mutex_lock(&ksm_thread_mutex); wait_while_offlining(); if (ksm_max_page_sharing != knob) { if (ksm_pages_shared || remove_all_stable_nodes()) err = -EBUSY; else ksm_max_page_sharing = knob; } mutex_unlock(&ksm_thread_mutex); return err ? err : count; } KSM_ATTR(max_page_sharing); static ssize_t pages_scanned_show(struct kobject *kobj, struct kobj_attribute *attr, char *buf) { return sysfs_emit(buf, "%lu\n", ksm_pages_scanned); } KSM_ATTR_RO(pages_scanned); static ssize_t pages_shared_show(struct kobject *kobj, struct kobj_attribute *attr, char *buf) { return sysfs_emit(buf, "%lu\n", ksm_pages_shared); } KSM_ATTR_RO(pages_shared); static ssize_t pages_sharing_show(struct kobject *kobj, struct kobj_attribute *attr, char *buf) { return sysfs_emit(buf, "%lu\n", ksm_pages_sharing); } KSM_ATTR_RO(pages_sharing); static ssize_t pages_unshared_show(struct kobject *kobj, struct kobj_attribute *attr, char *buf) { return sysfs_emit(buf, "%lu\n", ksm_pages_unshared); } KSM_ATTR_RO(pages_unshared); static ssize_t pages_volatile_show(struct kobject *kobj, struct kobj_attribute *attr, char *buf) { long ksm_pages_volatile; ksm_pages_volatile = ksm_rmap_items - ksm_pages_shared - ksm_pages_sharing - ksm_pages_unshared; /* * It was not worth any locking to calculate that statistic, * but it might therefore sometimes be negative: conceal that. */ if (ksm_pages_volatile < 0) ksm_pages_volatile = 0; return sysfs_emit(buf, "%ld\n", ksm_pages_volatile); } KSM_ATTR_RO(pages_volatile); static ssize_t pages_skipped_show(struct kobject *kobj, struct kobj_attribute *attr, char *buf) { return sysfs_emit(buf, "%lu\n", ksm_pages_skipped); } KSM_ATTR_RO(pages_skipped); static ssize_t ksm_zero_pages_show(struct kobject *kobj, struct kobj_attribute *attr, char *buf) { return sysfs_emit(buf, "%ld\n", atomic_long_read(&ksm_zero_pages)); } KSM_ATTR_RO(ksm_zero_pages); static ssize_t general_profit_show(struct kobject *kobj, struct kobj_attribute *attr, char *buf) { long general_profit; general_profit = (ksm_pages_sharing + atomic_long_read(&ksm_zero_pages)) * PAGE_SIZE - ksm_rmap_items * sizeof(struct ksm_rmap_item); return sysfs_emit(buf, "%ld\n", general_profit); } KSM_ATTR_RO(general_profit); static ssize_t stable_node_dups_show(struct kobject *kobj, struct kobj_attribute *attr, char *buf) { return sysfs_emit(buf, "%lu\n", ksm_stable_node_dups); } KSM_ATTR_RO(stable_node_dups); static ssize_t stable_node_chains_show(struct kobject *kobj, struct kobj_attribute *attr, char *buf) { return sysfs_emit(buf, "%lu\n", ksm_stable_node_chains); } KSM_ATTR_RO(stable_node_chains); static ssize_t stable_node_chains_prune_millisecs_show(struct kobject *kobj, struct kobj_attribute *attr, char *buf) { return sysfs_emit(buf, "%u\n", ksm_stable_node_chains_prune_millisecs); } static ssize_t stable_node_chains_prune_millisecs_store(struct kobject *kobj, struct kobj_attribute *attr, const char *buf, size_t count) { unsigned int msecs; int err; err = kstrtouint(buf, 10, &msecs); if (err) return -EINVAL; ksm_stable_node_chains_prune_millisecs = msecs; return count; } KSM_ATTR(stable_node_chains_prune_millisecs); static ssize_t full_scans_show(struct kobject *kobj, struct kobj_attribute *attr, char *buf) { return sysfs_emit(buf, "%lu\n", ksm_scan.seqnr); } KSM_ATTR_RO(full_scans); static ssize_t smart_scan_show(struct kobject *kobj, struct kobj_attribute *attr, char *buf) { return sysfs_emit(buf, "%u\n", ksm_smart_scan); } static ssize_t smart_scan_store(struct kobject *kobj, struct kobj_attribute *attr, const char *buf, size_t count) { int err; bool value; err = kstrtobool(buf, &value); if (err) return -EINVAL; ksm_smart_scan = value; return count; } KSM_ATTR(smart_scan); static ssize_t advisor_mode_show(struct kobject *kobj, struct kobj_attribute *attr, char *buf) { const char *output; if (ksm_advisor == KSM_ADVISOR_SCAN_TIME) output = "none [scan-time]"; else output = "[none] scan-time"; return sysfs_emit(buf, "%s\n", output); } static ssize_t advisor_mode_store(struct kobject *kobj, struct kobj_attribute *attr, const char *buf, size_t count) { enum ksm_advisor_type curr_advisor = ksm_advisor; if (sysfs_streq("scan-time", buf)) ksm_advisor = KSM_ADVISOR_SCAN_TIME; else if (sysfs_streq("none", buf)) ksm_advisor = KSM_ADVISOR_NONE; else return -EINVAL; /* Set advisor default values */ if (curr_advisor != ksm_advisor) set_advisor_defaults(); return count; } KSM_ATTR(advisor_mode); static ssize_t advisor_max_cpu_show(struct kobject *kobj, struct kobj_attribute *attr, char *buf) { return sysfs_emit(buf, "%u\n", ksm_advisor_max_cpu); } static ssize_t advisor_max_cpu_store(struct kobject *kobj, struct kobj_attribute *attr, const char *buf, size_t count) { int err; unsigned long value; err = kstrtoul(buf, 10, &value); if (err) return -EINVAL; ksm_advisor_max_cpu = value; return count; } KSM_ATTR(advisor_max_cpu); static ssize_t advisor_min_pages_to_scan_show(struct kobject *kobj, struct kobj_attribute *attr, char *buf) { return sysfs_emit(buf, "%lu\n", ksm_advisor_min_pages_to_scan); } static ssize_t advisor_min_pages_to_scan_store(struct kobject *kobj, struct kobj_attribute *attr, const char *buf, size_t count) { int err; unsigned long value; err = kstrtoul(buf, 10, &value); if (err) return -EINVAL; ksm_advisor_min_pages_to_scan = value; return count; } KSM_ATTR(advisor_min_pages_to_scan); static ssize_t advisor_max_pages_to_scan_show(struct kobject *kobj, struct kobj_attribute *attr, char *buf) { return sysfs_emit(buf, "%lu\n", ksm_advisor_max_pages_to_scan); } static ssize_t advisor_max_pages_to_scan_store(struct kobject *kobj, struct kobj_attribute *attr, const char *buf, size_t count) { int err; unsigned long value; err = kstrtoul(buf, 10, &value); if (err) return -EINVAL; ksm_advisor_max_pages_to_scan = value; return count; } KSM_ATTR(advisor_max_pages_to_scan); static ssize_t advisor_target_scan_time_show(struct kobject *kobj, struct kobj_attribute *attr, char *buf) { return sysfs_emit(buf, "%lu\n", ksm_advisor_target_scan_time); } static ssize_t advisor_target_scan_time_store(struct kobject *kobj, struct kobj_attribute *attr, const char *buf, size_t count) { int err; unsigned long value; err = kstrtoul(buf, 10, &value); if (err) return -EINVAL; if (value < 1) return -EINVAL; ksm_advisor_target_scan_time = value; return count; } KSM_ATTR(advisor_target_scan_time); static struct attribute *ksm_attrs[] = { &sleep_millisecs_attr.attr, &pages_to_scan_attr.attr, &run_attr.attr, &pages_scanned_attr.attr, &pages_shared_attr.attr, &pages_sharing_attr.attr, &pages_unshared_attr.attr, &pages_volatile_attr.attr, &pages_skipped_attr.attr, &ksm_zero_pages_attr.attr, &full_scans_attr.attr, #ifdef CONFIG_NUMA &merge_across_nodes_attr.attr, #endif &max_page_sharing_attr.attr, &stable_node_chains_attr.attr, &stable_node_dups_attr.attr, &stable_node_chains_prune_millisecs_attr.attr, &use_zero_pages_attr.attr, &general_profit_attr.attr, &smart_scan_attr.attr, &advisor_mode_attr.attr, &advisor_max_cpu_attr.attr, &advisor_min_pages_to_scan_attr.attr, &advisor_max_pages_to_scan_attr.attr, &advisor_target_scan_time_attr.attr, NULL, }; static const struct attribute_group ksm_attr_group = { .attrs = ksm_attrs, .name = "ksm", }; #endif /* CONFIG_SYSFS */ static int __init ksm_init(void) { struct task_struct *ksm_thread; int err; /* The correct value depends on page size and endianness */ zero_checksum = calc_checksum(ZERO_PAGE(0)); /* Default to false for backwards compatibility */ ksm_use_zero_pages = false; err = ksm_slab_init(); if (err) goto out; ksm_thread = kthread_run(ksm_scan_thread, NULL, "ksmd"); if (IS_ERR(ksm_thread)) { pr_err("ksm: creating kthread failed\n"); err = PTR_ERR(ksm_thread); goto out_free; } #ifdef CONFIG_SYSFS err = sysfs_create_group(mm_kobj, &ksm_attr_group); if (err) { pr_err("ksm: register sysfs failed\n"); kthread_stop(ksm_thread); goto out_free; } #else ksm_run = KSM_RUN_MERGE; /* no way for user to start it */ #endif /* CONFIG_SYSFS */ #ifdef CONFIG_MEMORY_HOTREMOVE /* There is no significance to this priority 100 */ hotplug_memory_notifier(ksm_memory_callback, KSM_CALLBACK_PRI); #endif return 0; out_free: ksm_slab_free(); out: return err; } subsys_initcall(ksm_init);
25 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 /* SPDX-License-Identifier: GPL-2.0 */ #ifndef __LINUX_KSM_H #define __LINUX_KSM_H /* * Memory merging support. * * This code enables dynamic sharing of identical pages found in different * memory areas, even if they are not shared by fork(). */ #include <linux/bitops.h> #include <linux/mm.h> #include <linux/pagemap.h> #include <linux/rmap.h> #include <linux/sched.h> #ifdef CONFIG_KSM int ksm_madvise(struct vm_area_struct *vma, unsigned long start, unsigned long end, int advice, vm_flags_t *vm_flags); vm_flags_t ksm_vma_flags(const struct mm_struct *mm, const struct file *file, vm_flags_t vm_flags); int ksm_enable_merge_any(struct mm_struct *mm); int ksm_disable_merge_any(struct mm_struct *mm); int ksm_disable(struct mm_struct *mm); int __ksm_enter(struct mm_struct *mm); void __ksm_exit(struct mm_struct *mm); /* * To identify zeropages that were mapped by KSM, we reuse the dirty bit * in the PTE. If the PTE is dirty, the zeropage was mapped by KSM when * deduplicating memory. */ #define is_ksm_zero_pte(pte) (is_zero_pfn(pte_pfn(pte)) && pte_dirty(pte)) extern atomic_long_t ksm_zero_pages; static inline void ksm_map_zero_page(struct mm_struct *mm) { atomic_long_inc(&ksm_zero_pages); atomic_long_inc(&mm->ksm_zero_pages); } static inline void ksm_might_unmap_zero_page(struct mm_struct *mm, pte_t pte) { if (is_ksm_zero_pte(pte)) { atomic_long_dec(&ksm_zero_pages); atomic_long_dec(&mm->ksm_zero_pages); } } static inline long mm_ksm_zero_pages(struct mm_struct *mm) { return atomic_long_read(&mm->ksm_zero_pages); } static inline void ksm_fork(struct mm_struct *mm, struct mm_struct *oldmm) { /* Adding mm to ksm is best effort on fork. */ if (test_bit(MMF_VM_MERGEABLE, &oldmm->flags)) __ksm_enter(mm); } static inline int ksm_execve(struct mm_struct *mm) { if (test_bit(MMF_VM_MERGE_ANY, &mm->flags)) return __ksm_enter(mm); return 0; } static inline void ksm_exit(struct mm_struct *mm) { if (test_bit(MMF_VM_MERGEABLE, &mm->flags)) __ksm_exit(mm); } /* * When do_swap_page() first faults in from swap what used to be a KSM page, * no problem, it will be assigned to this vma's anon_vma; but thereafter, * it might be faulted into a different anon_vma (or perhaps to a different * offset in the same anon_vma). do_swap_page() cannot do all the locking * needed to reconstitute a cross-anon_vma KSM page: for now it has to make * a copy, and leave remerging the pages to a later pass of ksmd. * * We'd like to make this conditional on vma->vm_flags & VM_MERGEABLE, * but what if the vma was unmerged while the page was swapped out? */ struct folio *ksm_might_need_to_copy(struct folio *folio, struct vm_area_struct *vma, unsigned long addr); void rmap_walk_ksm(struct folio *folio, struct rmap_walk_control *rwc); void folio_migrate_ksm(struct folio *newfolio, struct folio *folio); void collect_procs_ksm(const struct folio *folio, const struct page *page, struct list_head *to_kill, int force_early); long ksm_process_profit(struct mm_struct *); bool ksm_process_mergeable(struct mm_struct *mm); #else /* !CONFIG_KSM */ static inline vm_flags_t ksm_vma_flags(const struct mm_struct *mm, const struct file *file, vm_flags_t vm_flags) { return vm_flags; } static inline int ksm_disable(struct mm_struct *mm) { return 0; } static inline void ksm_fork(struct mm_struct *mm, struct mm_struct *oldmm) { } static inline int ksm_execve(struct mm_struct *mm) { return 0; } static inline void ksm_exit(struct mm_struct *mm) { } static inline void ksm_might_unmap_zero_page(struct mm_struct *mm, pte_t pte) { } static inline void collect_procs_ksm(const struct folio *folio, const struct page *page, struct list_head *to_kill, int force_early) { } #ifdef CONFIG_MMU static inline int ksm_madvise(struct vm_area_struct *vma, unsigned long start, unsigned long end, int advice, vm_flags_t *vm_flags) { return 0; } static inline struct folio *ksm_might_need_to_copy(struct folio *folio, struct vm_area_struct *vma, unsigned long addr) { return folio; } static inline void rmap_walk_ksm(struct folio *folio, struct rmap_walk_control *rwc) { } static inline void folio_migrate_ksm(struct folio *newfolio, struct folio *old) { } #endif /* CONFIG_MMU */ #endif /* !CONFIG_KSM */ #endif /* __LINUX_KSM_H */
1209 1214 1209 1210 1213 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 // SPDX-License-Identifier: GPL-2.0 /* * security/tomoyo/domain.c * * Copyright (C) 2005-2011 NTT DATA CORPORATION */ #include "common.h" #include <linux/binfmts.h> #include <linux/slab.h> #include <linux/rculist.h> /* Variables definitions.*/ /* The initial domain. */ struct tomoyo_domain_info tomoyo_kernel_domain; /** * tomoyo_update_policy - Update an entry for exception policy. * * @new_entry: Pointer to "struct tomoyo_acl_info". * @size: Size of @new_entry in bytes. * @param: Pointer to "struct tomoyo_acl_param". * @check_duplicate: Callback function to find duplicated entry. * * Returns 0 on success, negative value otherwise. * * Caller holds tomoyo_read_lock(). */ int tomoyo_update_policy(struct tomoyo_acl_head *new_entry, const int size, struct tomoyo_acl_param *param, bool (*check_duplicate)(const struct tomoyo_acl_head *, const struct tomoyo_acl_head *)) { int error = param->is_delete ? -ENOENT : -ENOMEM; struct tomoyo_acl_head *entry; struct list_head *list = param->list; if (mutex_lock_interruptible(&tomoyo_policy_lock)) return -ENOMEM; list_for_each_entry_rcu(entry, list, list, srcu_read_lock_held(&tomoyo_ss)) { if (entry->is_deleted == TOMOYO_GC_IN_PROGRESS) continue; if (!check_duplicate(entry, new_entry)) continue; entry->is_deleted = param->is_delete; error = 0; break; } if (error && !param->is_delete) { entry = tomoyo_commit_ok(new_entry, size); if (entry) { list_add_tail_rcu(&entry->list, list); error = 0; } } mutex_unlock(&tomoyo_policy_lock); return error; } /** * tomoyo_same_acl_head - Check for duplicated "struct tomoyo_acl_info" entry. * * @a: Pointer to "struct tomoyo_acl_info". * @b: Pointer to "struct tomoyo_acl_info". * * Returns true if @a == @b, false otherwise. */ static inline bool tomoyo_same_acl_head(const struct tomoyo_acl_info *a, const struct tomoyo_acl_info *b) { return a->type == b->type && a->cond == b->cond; } /** * tomoyo_update_domain - Update an entry for domain policy. * * @new_entry: Pointer to "struct tomoyo_acl_info". * @size: Size of @new_entry in bytes. * @param: Pointer to "struct tomoyo_acl_param". * @check_duplicate: Callback function to find duplicated entry. * @merge_duplicate: Callback function to merge duplicated entry. * * Returns 0 on success, negative value otherwise. * * Caller holds tomoyo_read_lock(). */ int tomoyo_update_domain(struct tomoyo_acl_info *new_entry, const int size, struct tomoyo_acl_param *param, bool (*check_duplicate)(const struct tomoyo_acl_info *, const struct tomoyo_acl_info *), bool (*merge_duplicate)(struct tomoyo_acl_info *, struct tomoyo_acl_info *, const bool)) { const bool is_delete = param->is_delete; int error = is_delete ? -ENOENT : -ENOMEM; struct tomoyo_acl_info *entry; struct list_head * const list = param->list; if (param->data[0]) { new_entry->cond = tomoyo_get_condition(param); if (!new_entry->cond) return -EINVAL; /* * Domain transition preference is allowed for only * "file execute" entries. */ if (new_entry->cond->transit && !(new_entry->type == TOMOYO_TYPE_PATH_ACL && container_of(new_entry, struct tomoyo_path_acl, head) ->perm == 1 << TOMOYO_TYPE_EXECUTE)) goto out; } if (mutex_lock_interruptible(&tomoyo_policy_lock)) goto out; list_for_each_entry_rcu(entry, list, list, srcu_read_lock_held(&tomoyo_ss)) { if (entry->is_deleted == TOMOYO_GC_IN_PROGRESS) continue; if (!tomoyo_same_acl_head(entry, new_entry) || !check_duplicate(entry, new_entry)) continue; if (merge_duplicate) entry->is_deleted = merge_duplicate(entry, new_entry, is_delete); else entry->is_deleted = is_delete; error = 0; break; } if (error && !is_delete) { entry = tomoyo_commit_ok(new_entry, size); if (entry) { list_add_tail_rcu(&entry->list, list); error = 0; } } mutex_unlock(&tomoyo_policy_lock); out: tomoyo_put_condition(new_entry->cond); return error; } /** * tomoyo_check_acl - Do permission check. * * @r: Pointer to "struct tomoyo_request_info". * @check_entry: Callback function to check type specific parameters. * * Returns 0 on success, negative value otherwise. * * Caller holds tomoyo_read_lock(). */ void tomoyo_check_acl(struct tomoyo_request_info *r, bool (*check_entry)(struct tomoyo_request_info *, const struct tomoyo_acl_info *)) { const struct tomoyo_domain_info *domain = r->domain; struct tomoyo_acl_info *ptr; const struct list_head *list = &domain->acl_info_list; u16 i = 0; retry: list_for_each_entry_rcu(ptr, list, list, srcu_read_lock_held(&tomoyo_ss)) { if (ptr->is_deleted || ptr->type != r->param_type) continue; if (!check_entry(r, ptr)) continue; if (!tomoyo_condition(r, ptr->cond)) continue; r->matched_acl = ptr; r->granted = true; return; } for (; i < TOMOYO_MAX_ACL_GROUPS; i++) { if (!test_bit(i, domain->group)) continue; list = &domain->ns->acl_group[i++]; goto retry; } r->granted = false; } /* The list for "struct tomoyo_domain_info". */ LIST_HEAD(tomoyo_domain_list); /** * tomoyo_last_word - Get last component of a domainname. * * @name: Domainname to check. * * Returns the last word of @domainname. */ static const char *tomoyo_last_word(const char *name) { const char *cp = strrchr(name, ' '); if (cp) return cp + 1; return name; } /** * tomoyo_same_transition_control - Check for duplicated "struct tomoyo_transition_control" entry. * * @a: Pointer to "struct tomoyo_acl_head". * @b: Pointer to "struct tomoyo_acl_head". * * Returns true if @a == @b, false otherwise. */ static bool tomoyo_same_transition_control(const struct tomoyo_acl_head *a, const struct tomoyo_acl_head *b) { const struct tomoyo_transition_control *p1 = container_of(a, typeof(*p1), head); const struct tomoyo_transition_control *p2 = container_of(b, typeof(*p2), head); return p1->type == p2->type && p1->is_last_name == p2->is_last_name && p1->domainname == p2->domainname && p1->program == p2->program; } /** * tomoyo_write_transition_control - Write "struct tomoyo_transition_control" list. * * @param: Pointer to "struct tomoyo_acl_param". * @type: Type of this entry. * * Returns 0 on success, negative value otherwise. */ int tomoyo_write_transition_control(struct tomoyo_acl_param *param, const u8 type) { struct tomoyo_transition_control e = { .type = type }; int error = param->is_delete ? -ENOENT : -ENOMEM; char *program = param->data; char *domainname = strstr(program, " from "); if (domainname) { *domainname = '\0'; domainname += 6; } else if (type == TOMOYO_TRANSITION_CONTROL_NO_KEEP || type == TOMOYO_TRANSITION_CONTROL_KEEP) { domainname = program; program = NULL; } if (program && strcmp(program, "any")) { if (!tomoyo_correct_path(program)) return -EINVAL; e.program = tomoyo_get_name(program); if (!e.program) goto out; } if (domainname && strcmp(domainname, "any")) { if (!tomoyo_correct_domain(domainname)) { if (!tomoyo_correct_path(domainname)) goto out; e.is_last_name = true; } e.domainname = tomoyo_get_name(domainname); if (!e.domainname) goto out; } param->list = &param->ns->policy_list[TOMOYO_ID_TRANSITION_CONTROL]; error = tomoyo_update_policy(&e.head, sizeof(e), param, tomoyo_same_transition_control); out: tomoyo_put_name(e.domainname); tomoyo_put_name(e.program); return error; } /** * tomoyo_scan_transition - Try to find specific domain transition type. * * @list: Pointer to "struct list_head". * @domainname: The name of current domain. * @program: The name of requested program. * @last_name: The last component of @domainname. * @type: One of values in "enum tomoyo_transition_type". * * Returns true if found one, false otherwise. * * Caller holds tomoyo_read_lock(). */ static inline bool tomoyo_scan_transition (const struct list_head *list, const struct tomoyo_path_info *domainname, const struct tomoyo_path_info *program, const char *last_name, const enum tomoyo_transition_type type) { const struct tomoyo_transition_control *ptr; list_for_each_entry_rcu(ptr, list, head.list, srcu_read_lock_held(&tomoyo_ss)) { if (ptr->head.is_deleted || ptr->type != type) continue; if (ptr->domainname) { if (!ptr->is_last_name) { if (ptr->domainname != domainname) continue; } else { /* * Use direct strcmp() since this is * unlikely used. */ if (strcmp(ptr->domainname->name, last_name)) continue; } } if (ptr->program && tomoyo_pathcmp(ptr->program, program)) continue; return true; } return false; } /** * tomoyo_transition_type - Get domain transition type. * * @ns: Pointer to "struct tomoyo_policy_namespace". * @domainname: The name of current domain. * @program: The name of requested program. * * Returns TOMOYO_TRANSITION_CONTROL_TRANSIT if executing @program causes * domain transition across namespaces, TOMOYO_TRANSITION_CONTROL_INITIALIZE if * executing @program reinitializes domain transition within that namespace, * TOMOYO_TRANSITION_CONTROL_KEEP if executing @program stays at @domainname , * others otherwise. * * Caller holds tomoyo_read_lock(). */ static enum tomoyo_transition_type tomoyo_transition_type (const struct tomoyo_policy_namespace *ns, const struct tomoyo_path_info *domainname, const struct tomoyo_path_info *program) { const char *last_name = tomoyo_last_word(domainname->name); enum tomoyo_transition_type type = TOMOYO_TRANSITION_CONTROL_NO_RESET; while (type < TOMOYO_MAX_TRANSITION_TYPE) { const struct list_head * const list = &ns->policy_list[TOMOYO_ID_TRANSITION_CONTROL]; if (!tomoyo_scan_transition(list, domainname, program, last_name, type)) { type++; continue; } if (type != TOMOYO_TRANSITION_CONTROL_NO_RESET && type != TOMOYO_TRANSITION_CONTROL_NO_INITIALIZE) break; /* * Do not check for reset_domain if no_reset_domain matched. * Do not check for initialize_domain if no_initialize_domain * matched. */ type++; type++; } return type; } /** * tomoyo_same_aggregator - Check for duplicated "struct tomoyo_aggregator" entry. * * @a: Pointer to "struct tomoyo_acl_head". * @b: Pointer to "struct tomoyo_acl_head". * * Returns true if @a == @b, false otherwise. */ static bool tomoyo_same_aggregator(const struct tomoyo_acl_head *a, const struct tomoyo_acl_head *b) { const struct tomoyo_aggregator *p1 = container_of(a, typeof(*p1), head); const struct tomoyo_aggregator *p2 = container_of(b, typeof(*p2), head); return p1->original_name == p2->original_name && p1->aggregated_name == p2->aggregated_name; } /** * tomoyo_write_aggregator - Write "struct tomoyo_aggregator" list. * * @param: Pointer to "struct tomoyo_acl_param". * * Returns 0 on success, negative value otherwise. * * Caller holds tomoyo_read_lock(). */ int tomoyo_write_aggregator(struct tomoyo_acl_param *param) { struct tomoyo_aggregator e = { }; int error = param->is_delete ? -ENOENT : -ENOMEM; const char *original_name = tomoyo_read_token(param); const char *aggregated_name = tomoyo_read_token(param); if (!tomoyo_correct_word(original_name) || !tomoyo_correct_path(aggregated_name)) return -EINVAL; e.original_name = tomoyo_get_name(original_name); e.aggregated_name = tomoyo_get_name(aggregated_name); if (!e.original_name || !e.aggregated_name || e.aggregated_name->is_patterned) /* No patterns allowed. */ goto out; param->list = &param->ns->policy_list[TOMOYO_ID_AGGREGATOR]; error = tomoyo_update_policy(&e.head, sizeof(e), param, tomoyo_same_aggregator); out: tomoyo_put_name(e.original_name); tomoyo_put_name(e.aggregated_name); return error; } /** * tomoyo_find_namespace - Find specified namespace. * * @name: Name of namespace to find. * @len: Length of @name. * * Returns pointer to "struct tomoyo_policy_namespace" if found, * NULL otherwise. * * Caller holds tomoyo_read_lock(). */ static struct tomoyo_policy_namespace *tomoyo_find_namespace (const char *name, const unsigned int len) { struct tomoyo_policy_namespace *ns; list_for_each_entry(ns, &tomoyo_namespace_list, namespace_list) { if (strncmp(name, ns->name, len) || (name[len] && name[len] != ' ')) continue; return ns; } return NULL; } /** * tomoyo_assign_namespace - Create a new namespace. * * @domainname: Name of namespace to create. * * Returns pointer to "struct tomoyo_policy_namespace" on success, * NULL otherwise. * * Caller holds tomoyo_read_lock(). */ struct tomoyo_policy_namespace *tomoyo_assign_namespace(const char *domainname) { struct tomoyo_policy_namespace *ptr; struct tomoyo_policy_namespace *entry; const char *cp = domainname; unsigned int len = 0; while (*cp && *cp++ != ' ') len++; ptr = tomoyo_find_namespace(domainname, len); if (ptr) return ptr; if (len >= TOMOYO_EXEC_TMPSIZE - 10 || !tomoyo_domain_def(domainname)) return NULL; entry = kzalloc(sizeof(*entry) + len + 1, GFP_NOFS | __GFP_NOWARN); if (mutex_lock_interruptible(&tomoyo_policy_lock)) goto out; ptr = tomoyo_find_namespace(domainname, len); if (!ptr && tomoyo_memory_ok(entry)) { char *name = (char *) (entry + 1); ptr = entry; memmove(name, domainname, len); name[len] = '\0'; entry->name = name; tomoyo_init_policy_namespace(entry); entry = NULL; } mutex_unlock(&tomoyo_policy_lock); out: kfree(entry); return ptr; } /** * tomoyo_namespace_jump - Check for namespace jump. * * @domainname: Name of domain. * * Returns true if namespace differs, false otherwise. */ static bool tomoyo_namespace_jump(const char *domainname) { const char *namespace = tomoyo_current_namespace()->name; const int len = strlen(namespace); return strncmp(domainname, namespace, len) || (domainname[len] && domainname[len] != ' '); } /** * tomoyo_assign_domain - Create a domain or a namespace. * * @domainname: The name of domain. * @transit: True if transit to domain found or created. * * Returns pointer to "struct tomoyo_domain_info" on success, NULL otherwise. * * Caller holds tomoyo_read_lock(). */ struct tomoyo_domain_info *tomoyo_assign_domain(const char *domainname, const bool transit) { struct tomoyo_domain_info e = { }; struct tomoyo_domain_info *entry = tomoyo_find_domain(domainname); bool created = false; if (entry) { if (transit) { /* * Since namespace is created at runtime, profiles may * not be created by the moment the process transits to * that domain. Do not perform domain transition if * profile for that domain is not yet created. */ if (tomoyo_policy_loaded && !entry->ns->profile_ptr[entry->profile]) return NULL; } return entry; } /* Requested domain does not exist. */ /* Don't create requested domain if domainname is invalid. */ if (strlen(domainname) >= TOMOYO_EXEC_TMPSIZE - 10 || !tomoyo_correct_domain(domainname)) return NULL; /* * Since definition of profiles and acl_groups may differ across * namespaces, do not inherit "use_profile" and "use_group" settings * by automatically creating requested domain upon domain transition. */ if (transit && tomoyo_namespace_jump(domainname)) return NULL; e.ns = tomoyo_assign_namespace(domainname); if (!e.ns) return NULL; /* * "use_profile" and "use_group" settings for automatically created * domains are inherited from current domain. These are 0 for manually * created domains. */ if (transit) { const struct tomoyo_domain_info *domain = tomoyo_domain(); e.profile = domain->profile; memcpy(e.group, domain->group, sizeof(e.group)); } e.domainname = tomoyo_get_name(domainname); if (!e.domainname) return NULL; if (mutex_lock_interruptible(&tomoyo_policy_lock)) goto out; entry = tomoyo_find_domain(domainname); if (!entry) { entry = tomoyo_commit_ok(&e, sizeof(e)); if (entry) { INIT_LIST_HEAD(&entry->acl_info_list); list_add_tail_rcu(&entry->list, &tomoyo_domain_list); created = true; } } mutex_unlock(&tomoyo_policy_lock); out: tomoyo_put_name(e.domainname); if (entry && transit) { if (created) { struct tomoyo_request_info r; int i; tomoyo_init_request_info(&r, entry, TOMOYO_MAC_FILE_EXECUTE); r.granted = false; tomoyo_write_log(&r, "use_profile %u\n", entry->profile); for (i = 0; i < TOMOYO_MAX_ACL_GROUPS; i++) if (test_bit(i, entry->group)) tomoyo_write_log(&r, "use_group %u\n", i); tomoyo_update_stat(TOMOYO_STAT_POLICY_UPDATES); } } return entry; } /** * tomoyo_environ - Check permission for environment variable names. * * @ee: Pointer to "struct tomoyo_execve". * * Returns 0 on success, negative value otherwise. */ static int tomoyo_environ(struct tomoyo_execve *ee) { struct tomoyo_request_info *r = &ee->r; struct linux_binprm *bprm = ee->bprm; /* env_page.data is allocated by tomoyo_dump_page(). */ struct tomoyo_page_dump env_page = { }; char *arg_ptr; /* Size is TOMOYO_EXEC_TMPSIZE bytes */ int arg_len = 0; unsigned long pos = bprm->p; int offset = pos % PAGE_SIZE; int argv_count = bprm->argc; int envp_count = bprm->envc; int error = -ENOMEM; ee->r.type = TOMOYO_MAC_ENVIRON; ee->r.profile = r->domain->profile; ee->r.mode = tomoyo_get_mode(r->domain->ns, ee->r.profile, TOMOYO_MAC_ENVIRON); if (!r->mode || !envp_count) return 0; arg_ptr = kzalloc(TOMOYO_EXEC_TMPSIZE, GFP_NOFS); if (!arg_ptr) goto out; while (error == -ENOMEM) { if (!tomoyo_dump_page(bprm, pos, &env_page)) goto out; pos += PAGE_SIZE - offset; /* Read. */ while (argv_count && offset < PAGE_SIZE) { if (!env_page.data[offset++]) argv_count--; } if (argv_count) { offset = 0; continue; } while (offset < PAGE_SIZE) { const unsigned char c = env_page.data[offset++]; if (c && arg_len < TOMOYO_EXEC_TMPSIZE - 10) { if (c == '=') { arg_ptr[arg_len++] = '\0'; } else if (c == '\\') { arg_ptr[arg_len++] = '\\'; arg_ptr[arg_len++] = '\\'; } else if (c > ' ' && c < 127) { arg_ptr[arg_len++] = c; } else { arg_ptr[arg_len++] = '\\'; arg_ptr[arg_len++] = (c >> 6) + '0'; arg_ptr[arg_len++] = ((c >> 3) & 7) + '0'; arg_ptr[arg_len++] = (c & 7) + '0'; } } else { arg_ptr[arg_len] = '\0'; } if (c) continue; if (tomoyo_env_perm(r, arg_ptr)) { error = -EPERM; break; } if (!--envp_count) { error = 0; break; } arg_len = 0; } offset = 0; } out: if (r->mode != TOMOYO_CONFIG_ENFORCING) error = 0; kfree(env_page.data); kfree(arg_ptr); return error; } /** * tomoyo_find_next_domain - Find a domain. * * @bprm: Pointer to "struct linux_binprm". * * Returns 0 on success, negative value otherwise. * * Caller holds tomoyo_read_lock(). */ int tomoyo_find_next_domain(struct linux_binprm *bprm) { struct tomoyo_domain_info *old_domain = tomoyo_domain(); struct tomoyo_domain_info *domain = NULL; const char *original_name = bprm->filename; int retval = -ENOMEM; bool reject_on_transition_failure = false; const struct tomoyo_path_info *candidate; struct tomoyo_path_info exename; struct tomoyo_execve *ee = kzalloc(sizeof(*ee), GFP_NOFS); if (!ee) return -ENOMEM; ee->tmp = kzalloc(TOMOYO_EXEC_TMPSIZE, GFP_NOFS); if (!ee->tmp) { kfree(ee); return -ENOMEM; } /* ee->dump->data is allocated by tomoyo_dump_page(). */ tomoyo_init_request_info(&ee->r, NULL, TOMOYO_MAC_FILE_EXECUTE); ee->r.ee = ee; ee->bprm = bprm; ee->r.obj = &ee->obj; ee->obj.path1 = bprm->file->f_path; /* * Get symlink's pathname of program, but fallback to realpath if * symlink's pathname does not exist or symlink's pathname refers * to proc filesystem (e.g. /dev/fd/<num> or /proc/self/fd/<num> ). */ exename.name = tomoyo_realpath_nofollow(original_name); if (exename.name && !strncmp(exename.name, "proc:/", 6)) { kfree(exename.name); exename.name = NULL; } if (!exename.name) { exename.name = tomoyo_realpath_from_path(&bprm->file->f_path); if (!exename.name) goto out; } tomoyo_fill_path_info(&exename); retry: /* Check 'aggregator' directive. */ { struct tomoyo_aggregator *ptr; struct list_head *list = &old_domain->ns->policy_list[TOMOYO_ID_AGGREGATOR]; /* Check 'aggregator' directive. */ candidate = &exename; list_for_each_entry_rcu(ptr, list, head.list, srcu_read_lock_held(&tomoyo_ss)) { if (ptr->head.is_deleted || !tomoyo_path_matches_pattern(&exename, ptr->original_name)) continue; candidate = ptr->aggregated_name; break; } } /* Check execute permission. */ retval = tomoyo_execute_permission(&ee->r, candidate); if (retval == TOMOYO_RETRY_REQUEST) goto retry; if (retval < 0) goto out; /* * To be able to specify domainnames with wildcards, use the * pathname specified in the policy (which may contain * wildcard) rather than the pathname passed to execve() * (which never contains wildcard). */ if (ee->r.param.path.matched_path) candidate = ee->r.param.path.matched_path; /* * Check for domain transition preference if "file execute" matched. * If preference is given, make execve() fail if domain transition * has failed, for domain transition preference should be used with * destination domain defined. */ if (ee->transition) { const char *domainname = ee->transition->name; reject_on_transition_failure = true; if (!strcmp(domainname, "keep")) goto force_keep_domain; if (!strcmp(domainname, "child")) goto force_child_domain; if (!strcmp(domainname, "reset")) goto force_reset_domain; if (!strcmp(domainname, "initialize")) goto force_initialize_domain; if (!strcmp(domainname, "parent")) { char *cp; strscpy(ee->tmp, old_domain->domainname->name, TOMOYO_EXEC_TMPSIZE); cp = strrchr(ee->tmp, ' '); if (cp) *cp = '\0'; } else if (*domainname == '<') strscpy(ee->tmp, domainname, TOMOYO_EXEC_TMPSIZE); else snprintf(ee->tmp, TOMOYO_EXEC_TMPSIZE - 1, "%s %s", old_domain->domainname->name, domainname); goto force_jump_domain; } /* * No domain transition preference specified. * Calculate domain to transit to. */ switch (tomoyo_transition_type(old_domain->ns, old_domain->domainname, candidate)) { case TOMOYO_TRANSITION_CONTROL_RESET: force_reset_domain: /* Transit to the root of specified namespace. */ snprintf(ee->tmp, TOMOYO_EXEC_TMPSIZE - 1, "<%s>", candidate->name); /* * Make execve() fail if domain transition across namespaces * has failed. */ reject_on_transition_failure = true; break; case TOMOYO_TRANSITION_CONTROL_INITIALIZE: force_initialize_domain: /* Transit to the child of current namespace's root. */ snprintf(ee->tmp, TOMOYO_EXEC_TMPSIZE - 1, "%s %s", old_domain->ns->name, candidate->name); break; case TOMOYO_TRANSITION_CONTROL_KEEP: force_keep_domain: /* Keep current domain. */ domain = old_domain; break; default: if (old_domain == &tomoyo_kernel_domain && !tomoyo_policy_loaded) { /* * Needn't to transit from kernel domain before * starting /sbin/init. But transit from kernel domain * if executing initializers because they might start * before /sbin/init. */ domain = old_domain; break; } force_child_domain: /* Normal domain transition. */ snprintf(ee->tmp, TOMOYO_EXEC_TMPSIZE - 1, "%s %s", old_domain->domainname->name, candidate->name); break; } force_jump_domain: if (!domain) domain = tomoyo_assign_domain(ee->tmp, true); if (domain) retval = 0; else if (reject_on_transition_failure) { pr_warn("ERROR: Domain '%s' not ready.\n", ee->tmp); retval = -ENOMEM; } else if (ee->r.mode == TOMOYO_CONFIG_ENFORCING) retval = -ENOMEM; else { retval = 0; if (!old_domain->flags[TOMOYO_DIF_TRANSITION_FAILED]) { old_domain->flags[TOMOYO_DIF_TRANSITION_FAILED] = true; ee->r.granted = false; tomoyo_write_log(&ee->r, "%s", tomoyo_dif [TOMOYO_DIF_TRANSITION_FAILED]); pr_warn("ERROR: Domain '%s' not defined.\n", ee->tmp); } } out: if (!domain) domain = old_domain; /* Update reference count on "struct tomoyo_domain_info". */ { struct tomoyo_task *s = tomoyo_task(current); s->old_domain_info = s->domain_info; s->domain_info = domain; atomic_inc(&domain->users); } kfree(exename.name); if (!retval) { ee->r.domain = domain; retval = tomoyo_environ(ee); } kfree(ee->tmp); kfree(ee->dump.data); kfree(ee); return retval; } /** * tomoyo_dump_page - Dump a page to buffer. * * @bprm: Pointer to "struct linux_binprm". * @pos: Location to dump. * @dump: Pointer to "struct tomoyo_page_dump". * * Returns true on success, false otherwise. */ bool tomoyo_dump_page(struct linux_binprm *bprm, unsigned long pos, struct tomoyo_page_dump *dump) { struct page *page; #ifdef CONFIG_MMU int ret; #endif /* dump->data is released by tomoyo_find_next_domain(). */ if (!dump->data) { dump->data = kzalloc(PAGE_SIZE, GFP_NOFS); if (!dump->data) return false; } /* Same with get_arg_page(bprm, pos, 0) in fs/exec.c */ #ifdef CONFIG_MMU /* * This is called at execve() time in order to dig around * in the argv/environment of the new process * (represented by bprm). */ mmap_read_lock(bprm->mm); ret = get_user_pages_remote(bprm->mm, pos, 1, FOLL_FORCE, &page, NULL); mmap_read_unlock(bprm->mm); if (ret <= 0) return false; #else page = bprm->page[pos / PAGE_SIZE]; #endif if (page != dump->page) { const unsigned int offset = pos % PAGE_SIZE; /* * Maybe kmap()/kunmap() should be used here. * But remove_arg_zero() uses kmap_atomic()/kunmap_atomic(). * So do I. */ char *kaddr = kmap_atomic(page); dump->page = page; memcpy(dump->data + offset, kaddr + offset, PAGE_SIZE - offset); kunmap_atomic(kaddr); } /* Same with put_arg_page(page) in fs/exec.c */ #ifdef CONFIG_MMU put_page(page); #endif return true; }
7 7 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 // SPDX-License-Identifier: GPL-2.0 /* * Hyp portion of the (not much of an) Emulation layer for 32bit guests. * * Copyright (C) 2012,2013 - ARM Ltd * Author: Marc Zyngier <marc.zyngier@arm.com> * * based on arch/arm/kvm/emulate.c * Copyright (C) 2012 - Virtual Open Systems and Columbia University * Author: Christoffer Dall <c.dall@virtualopensystems.com> */ #include <linux/kvm_host.h> #include <asm/kvm_emulate.h> #include <asm/kvm_hyp.h> /* * stolen from arch/arm/kernel/opcodes.c * * condition code lookup table * index into the table is test code: EQ, NE, ... LT, GT, AL, NV * * bit position in short is condition code: NZCV */ static const unsigned short cc_map[16] = { 0xF0F0, /* EQ == Z set */ 0x0F0F, /* NE */ 0xCCCC, /* CS == C set */ 0x3333, /* CC */ 0xFF00, /* MI == N set */ 0x00FF, /* PL */ 0xAAAA, /* VS == V set */ 0x5555, /* VC */ 0x0C0C, /* HI == C set && Z clear */ 0xF3F3, /* LS == C clear || Z set */ 0xAA55, /* GE == (N==V) */ 0x55AA, /* LT == (N!=V) */ 0x0A05, /* GT == (!Z && (N==V)) */ 0xF5FA, /* LE == (Z || (N!=V)) */ 0xFFFF, /* AL always */ 0 /* NV */ }; /* * Check if a trapped instruction should have been executed or not. */ bool kvm_condition_valid32(const struct kvm_vcpu *vcpu) { unsigned long cpsr; u32 cpsr_cond; int cond; /* * These are the exception classes that could fire with a * conditional instruction. */ switch (kvm_vcpu_trap_get_class(vcpu)) { case ESR_ELx_EC_CP15_32: case ESR_ELx_EC_CP15_64: case ESR_ELx_EC_CP14_MR: case ESR_ELx_EC_CP14_LS: case ESR_ELx_EC_FP_ASIMD: case ESR_ELx_EC_CP10_ID: case ESR_ELx_EC_CP14_64: case ESR_ELx_EC_SVC32: break; default: return true; } /* Is condition field valid? */ cond = kvm_vcpu_get_condition(vcpu); if (cond == 0xE) return true; cpsr = *vcpu_cpsr(vcpu); if (cond < 0) { /* This can happen in Thumb mode: examine IT state. */ unsigned long it; it = ((cpsr >> 8) & 0xFC) | ((cpsr >> 25) & 0x3); /* it == 0 => unconditional. */ if (it == 0) return true; /* The cond for this insn works out as the top 4 bits. */ cond = (it >> 4); } cpsr_cond = cpsr >> 28; if (!((cc_map[cond] >> cpsr_cond) & 1)) return false; return true; } /** * kvm_adjust_itstate - adjust ITSTATE when emulating instructions in IT-block * @vcpu: The VCPU pointer * * When exceptions occur while instructions are executed in Thumb IF-THEN * blocks, the ITSTATE field of the CPSR is not advanced (updated), so we have * to do this little bit of work manually. The fields map like this: * * IT[7:0] -> CPSR[26:25],CPSR[15:10] */ static void kvm_adjust_itstate(struct kvm_vcpu *vcpu) { unsigned long itbits, cond; unsigned long cpsr = *vcpu_cpsr(vcpu); bool is_arm = !(cpsr & PSR_AA32_T_BIT); if (is_arm || !(cpsr & PSR_AA32_IT_MASK)) return; cond = (cpsr & 0xe000) >> 13; itbits = (cpsr & 0x1c00) >> (10 - 2); itbits |= (cpsr & (0x3 << 25)) >> 25; /* Perform ITAdvance (see page A2-52 in ARM DDI 0406C) */ if ((itbits & 0x7) == 0) itbits = cond = 0; else itbits = (itbits << 1) & 0x1f; cpsr &= ~PSR_AA32_IT_MASK; cpsr |= cond << 13; cpsr |= (itbits & 0x1c) << (10 - 2); cpsr |= (itbits & 0x3) << 25; *vcpu_cpsr(vcpu) = cpsr; } /** * kvm_skip_instr32 - skip a trapped instruction and proceed to the next * @vcpu: The vcpu pointer */ void kvm_skip_instr32(struct kvm_vcpu *vcpu) { u32 pc = *vcpu_pc(vcpu); bool is_thumb; is_thumb = !!(*vcpu_cpsr(vcpu) & PSR_AA32_T_BIT); if (is_thumb && !kvm_vcpu_trap_il_is32bit(vcpu)) pc += 2; else pc += 4; *vcpu_pc(vcpu) = pc; kvm_adjust_itstate(vcpu); }
2 2 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 /* SPDX-License-Identifier: GPL-2.0-only */ /* * Copyright (C) 2012 ARM Ltd. */ #ifndef __ASM_HW_BREAKPOINT_H #define __ASM_HW_BREAKPOINT_H #include <asm/cputype.h> #include <asm/cpufeature.h> #include <asm/sysreg.h> #include <asm/virt.h> struct arch_hw_breakpoint_ctrl { u32 __reserved : 19, len : 8, type : 2, privilege : 2, enabled : 1; }; struct arch_hw_breakpoint { u64 address; u64 trigger; struct arch_hw_breakpoint_ctrl ctrl; }; /* Privilege Levels */ #define AARCH64_BREAKPOINT_EL1 1 #define AARCH64_BREAKPOINT_EL0 2 #define DBG_HMC_HYP (1 << 13) static inline u32 encode_ctrl_reg(struct arch_hw_breakpoint_ctrl ctrl) { u32 val = (ctrl.len << 5) | (ctrl.type << 3) | (ctrl.privilege << 1) | ctrl.enabled; if (is_kernel_in_hyp_mode() && ctrl.privilege == AARCH64_BREAKPOINT_EL1) val |= DBG_HMC_HYP; return val; } static inline void decode_ctrl_reg(u32 reg, struct arch_hw_breakpoint_ctrl *ctrl) { ctrl->enabled = reg & 0x1; reg >>= 1; ctrl->privilege = reg & 0x3; reg >>= 2; ctrl->type = reg & 0x3; reg >>= 2; ctrl->len = reg & 0xff; } /* Breakpoint */ #define ARM_BREAKPOINT_EXECUTE 0 /* Watchpoints */ #define ARM_BREAKPOINT_LOAD 1 #define ARM_BREAKPOINT_STORE 2 /* Lengths */ #define ARM_BREAKPOINT_LEN_1 0x1 #define ARM_BREAKPOINT_LEN_2 0x3 #define ARM_BREAKPOINT_LEN_3 0x7 #define ARM_BREAKPOINT_LEN_4 0xf #define ARM_BREAKPOINT_LEN_5 0x1f #define ARM_BREAKPOINT_LEN_6 0x3f #define ARM_BREAKPOINT_LEN_7 0x7f #define ARM_BREAKPOINT_LEN_8 0xff /* Kernel stepping */ #define ARM_KERNEL_STEP_NONE 0 #define ARM_KERNEL_STEP_ACTIVE 1 #define ARM_KERNEL_STEP_SUSPEND 2 /* * Limits. * Changing these will require modifications to the register accessors. */ #define ARM_MAX_BRP 16 #define ARM_MAX_WRP 16 /* Virtual debug register bases. */ #define AARCH64_DBG_REG_BVR 0 #define AARCH64_DBG_REG_BCR (AARCH64_DBG_REG_BVR + ARM_MAX_BRP) #define AARCH64_DBG_REG_WVR (AARCH64_DBG_REG_BCR + ARM_MAX_BRP) #define AARCH64_DBG_REG_WCR (AARCH64_DBG_REG_WVR + ARM_MAX_WRP) /* Debug register names. */ #define AARCH64_DBG_REG_NAME_BVR bvr #define AARCH64_DBG_REG_NAME_BCR bcr #define AARCH64_DBG_REG_NAME_WVR wvr #define AARCH64_DBG_REG_NAME_WCR wcr /* Accessor macros for the debug registers. */ #define AARCH64_DBG_READ(N, REG, VAL) do {\ VAL = read_sysreg(dbg##REG##N##_el1);\ } while (0) #define AARCH64_DBG_WRITE(N, REG, VAL) do {\ write_sysreg(VAL, dbg##REG##N##_el1);\ } while (0) struct task_struct; struct notifier_block; struct perf_event_attr; struct perf_event; struct pmu; extern int arch_bp_generic_fields(struct arch_hw_breakpoint_ctrl ctrl, int *gen_len, int *gen_type, int *offset); extern int arch_check_bp_in_kernelspace(struct arch_hw_breakpoint *hw); extern int hw_breakpoint_arch_parse(struct perf_event *bp, const struct perf_event_attr *attr, struct arch_hw_breakpoint *hw); extern int hw_breakpoint_exceptions_notify(struct notifier_block *unused, unsigned long val, void *data); extern int arch_install_hw_breakpoint(struct perf_event *bp); extern void arch_uninstall_hw_breakpoint(struct perf_event *bp); extern void hw_breakpoint_pmu_read(struct perf_event *bp); extern int hw_breakpoint_slots(int type); #ifdef CONFIG_HAVE_HW_BREAKPOINT extern void hw_breakpoint_thread_switch(struct task_struct *next); extern void ptrace_hw_copy_thread(struct task_struct *task); #else static inline void hw_breakpoint_thread_switch(struct task_struct *next) { } static inline void ptrace_hw_copy_thread(struct task_struct *task) { } #endif /* Determine number of BRP registers available. */ static inline int get_num_brps(void) { u64 dfr0 = read_sanitised_ftr_reg(SYS_ID_AA64DFR0_EL1); return 1 + cpuid_feature_extract_unsigned_field(dfr0, ID_AA64DFR0_EL1_BRPs_SHIFT); } /* Determine number of WRP registers available. */ static inline int get_num_wrps(void) { u64 dfr0 = read_sanitised_ftr_reg(SYS_ID_AA64DFR0_EL1); return 1 + cpuid_feature_extract_unsigned_field(dfr0, ID_AA64DFR0_EL1_WRPs_SHIFT); } #ifdef CONFIG_CPU_PM extern void cpu_suspend_set_dbg_restorer(int (*hw_bp_restore)(unsigned int)); #else static inline void cpu_suspend_set_dbg_restorer(int (*hw_bp_restore)(unsigned int)) { } #endif #endif /* __ASM_BREAKPOINT_H */
9 9 9 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 /* SPDX-License-Identifier: GPL-2.0 */ #undef TRACE_SYSTEM #define TRACE_SYSTEM net #if !defined(_TRACE_NET_H) || defined(TRACE_HEADER_MULTI_READ) #define _TRACE_NET_H #include <linux/skbuff.h> #include <linux/netdevice.h> #include <linux/if_vlan.h> #include <linux/ip.h> #include <linux/tracepoint.h> TRACE_EVENT(net_dev_start_xmit, TP_PROTO(const struct sk_buff *skb, const struct net_device *dev), TP_ARGS(skb, dev), TP_STRUCT__entry( __string( name, dev->name ) __field( u16, queue_mapping ) __field( const void *, skbaddr ) __field( bool, vlan_tagged ) __field( u16, vlan_proto ) __field( u16, vlan_tci ) __field( u16, protocol ) __field( u8, ip_summed ) __field( unsigned int, len ) __field( unsigned int, data_len ) __field( int, network_offset ) __field( bool, transport_offset_valid) __field( int, transport_offset) __field( u8, tx_flags ) __field( u16, gso_size ) __field( u16, gso_segs ) __field( u16, gso_type ) ), TP_fast_assign( __assign_str(name); __entry->queue_mapping = skb->queue_mapping; __entry->skbaddr = skb; __entry->vlan_tagged = skb_vlan_tag_present(skb); __entry->vlan_proto = ntohs(skb->vlan_proto); __entry->vlan_tci = skb_vlan_tag_get(skb); __entry->protocol = ntohs(skb->protocol); __entry->ip_summed = skb->ip_summed; __entry->len = skb->len; __entry->data_len = skb->data_len; __entry->network_offset = skb_network_offset(skb); __entry->transport_offset_valid = skb_transport_header_was_set(skb); __entry->transport_offset = skb_transport_header_was_set(skb) ? skb_transport_offset(skb) : 0; __entry->tx_flags = skb_shinfo(skb)->tx_flags; __entry->gso_size = skb_shinfo(skb)->gso_size; __entry->gso_segs = skb_shinfo(skb)->gso_segs; __entry->gso_type = skb_shinfo(skb)->gso_type; ), TP_printk("dev=%s queue_mapping=%u skbaddr=%p vlan_tagged=%d vlan_proto=0x%04x vlan_tci=0x%04x protocol=0x%04x ip_summed=%d len=%u data_len=%u network_offset=%d transport_offset_valid=%d transport_offset=%d tx_flags=%d gso_size=%d gso_segs=%d gso_type=%#x", __get_str(name), __entry->queue_mapping, __entry->skbaddr, __entry->vlan_tagged, __entry->vlan_proto, __entry->vlan_tci, __entry->protocol, __entry->ip_summed, __entry->len, __entry->data_len, __entry->network_offset, __entry->transport_offset_valid, __entry->transport_offset, __entry->tx_flags, __entry->gso_size, __entry->gso_segs, __entry->gso_type) ); TRACE_EVENT(net_dev_xmit, TP_PROTO(struct sk_buff *skb, int rc, struct net_device *dev, unsigned int skb_len), TP_ARGS(skb, rc, dev, skb_len), TP_STRUCT__entry( __field( void *, skbaddr ) __field( unsigned int, len ) __field( int, rc ) __string( name, dev->name ) ), TP_fast_assign( __entry->skbaddr = skb; __entry->len = skb_len; __entry->rc = rc; __assign_str(name); ), TP_printk("dev=%s skbaddr=%p len=%u rc=%d", __get_str(name), __entry->skbaddr, __entry->len, __entry->rc) ); TRACE_EVENT(net_dev_xmit_timeout, TP_PROTO(struct net_device *dev, int queue_index), TP_ARGS(dev, queue_index), TP_STRUCT__entry( __string( name, dev->name ) __string( driver, netdev_drivername(dev)) __field( int, queue_index ) ), TP_fast_assign( __assign_str(name); __assign_str(driver); __entry->queue_index = queue_index; ), TP_printk("dev=%s driver=%s queue=%d", __get_str(name), __get_str(driver), __entry->queue_index) ); DECLARE_EVENT_CLASS(net_dev_template, TP_PROTO(struct sk_buff *skb), TP_ARGS(skb), TP_STRUCT__entry( __field( void *, skbaddr ) __field( unsigned int, len ) __string( name, skb->dev->name ) ), TP_fast_assign( __entry->skbaddr = skb; __entry->len = skb->len; __assign_str(name); ), TP_printk("dev=%s skbaddr=%p len=%u", __get_str(name), __entry->skbaddr, __entry->len) ) DEFINE_EVENT(net_dev_template, net_dev_queue, TP_PROTO(struct sk_buff *skb), TP_ARGS(skb) ); DEFINE_EVENT(net_dev_template, netif_receive_skb, TP_PROTO(struct sk_buff *skb), TP_ARGS(skb) ); DEFINE_EVENT(net_dev_template, netif_rx, TP_PROTO(struct sk_buff *skb), TP_ARGS(skb) ); DECLARE_EVENT_CLASS(net_dev_rx_verbose_template, TP_PROTO(const struct sk_buff *skb), TP_ARGS(skb), TP_STRUCT__entry( __string( name, skb->dev->name ) __field( unsigned int, napi_id ) __field( u16, queue_mapping ) __field( const void *, skbaddr ) __field( bool, vlan_tagged ) __field( u16, vlan_proto ) __field( u16, vlan_tci ) __field( u16, protocol ) __field( u8, ip_summed ) __field( u32, hash ) __field( bool, l4_hash ) __field( unsigned int, len ) __field( unsigned int, data_len ) __field( unsigned int, truesize ) __field( bool, mac_header_valid) __field( int, mac_header ) __field( unsigned char, nr_frags ) __field( u16, gso_size ) __field( u16, gso_type ) ), TP_fast_assign( __assign_str(name); #ifdef CONFIG_NET_RX_BUSY_POLL __entry->napi_id = skb->napi_id; #else __entry->napi_id = 0; #endif __entry->queue_mapping = skb->queue_mapping; __entry->skbaddr = skb; __entry->vlan_tagged = skb_vlan_tag_present(skb); __entry->vlan_proto = ntohs(skb->vlan_proto); __entry->vlan_tci = skb_vlan_tag_get(skb); __entry->protocol = ntohs(skb->protocol); __entry->ip_summed = skb->ip_summed; __entry->hash = skb->hash; __entry->l4_hash = skb->l4_hash; __entry->len = skb->len; __entry->data_len = skb->data_len; __entry->truesize = skb->truesize; __entry->mac_header_valid = skb_mac_header_was_set(skb); __entry->mac_header = skb_mac_header(skb) - skb->data; __entry->nr_frags = skb_shinfo(skb)->nr_frags; __entry->gso_size = skb_shinfo(skb)->gso_size; __entry->gso_type = skb_shinfo(skb)->gso_type; ), TP_printk("dev=%s napi_id=%#x queue_mapping=%u skbaddr=%p vlan_tagged=%d vlan_proto=0x%04x vlan_tci=0x%04x protocol=0x%04x ip_summed=%d hash=0x%08x l4_hash=%d len=%u data_len=%u truesize=%u mac_header_valid=%d mac_header=%d nr_frags=%d gso_size=%d gso_type=%#x", __get_str(name), __entry->napi_id, __entry->queue_mapping, __entry->skbaddr, __entry->vlan_tagged, __entry->vlan_proto, __entry->vlan_tci, __entry->protocol, __entry->ip_summed, __entry->hash, __entry->l4_hash, __entry->len, __entry->data_len, __entry->truesize, __entry->mac_header_valid, __entry->mac_header, __entry->nr_frags, __entry->gso_size, __entry->gso_type) ); DEFINE_EVENT(net_dev_rx_verbose_template, napi_gro_frags_entry, TP_PROTO(const struct sk_buff *skb), TP_ARGS(skb) ); DEFINE_EVENT(net_dev_rx_verbose_template, napi_gro_receive_entry, TP_PROTO(const struct sk_buff *skb), TP_ARGS(skb) ); DEFINE_EVENT(net_dev_rx_verbose_template, netif_receive_skb_entry, TP_PROTO(const struct sk_buff *skb), TP_ARGS(skb) ); DEFINE_EVENT(net_dev_rx_verbose_template, netif_receive_skb_list_entry, TP_PROTO(const struct sk_buff *skb), TP_ARGS(skb) ); DEFINE_EVENT(net_dev_rx_verbose_template, netif_rx_entry, TP_PROTO(const struct sk_buff *skb), TP_ARGS(skb) ); DECLARE_EVENT_CLASS(net_dev_rx_exit_template, TP_PROTO(int ret), TP_ARGS(ret), TP_STRUCT__entry( __field(int, ret) ), TP_fast_assign( __entry->ret = ret; ), TP_printk("ret=%d", __entry->ret) ); DEFINE_EVENT(net_dev_rx_exit_template, napi_gro_frags_exit, TP_PROTO(int ret), TP_ARGS(ret) ); DEFINE_EVENT(net_dev_rx_exit_template, napi_gro_receive_exit, TP_PROTO(int ret), TP_ARGS(ret) ); DEFINE_EVENT(net_dev_rx_exit_template, netif_receive_skb_exit, TP_PROTO(int ret), TP_ARGS(ret) ); DEFINE_EVENT(net_dev_rx_exit_template, netif_rx_exit, TP_PROTO(int ret), TP_ARGS(ret) ); DEFINE_EVENT(net_dev_rx_exit_template, netif_receive_skb_list_exit, TP_PROTO(int ret), TP_ARGS(ret) ); #endif /* _TRACE_NET_H */ /* This part must be outside protection */ #include <trace/define_trace.h>
69 63 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 /* SPDX-License-Identifier: GPL-2.0-only */ #ifndef LINUX_RESUME_USER_MODE_H #define LINUX_RESUME_USER_MODE_H #include <linux/sched.h> #include <linux/task_work.h> #include <linux/memcontrol.h> #include <linux/rseq.h> #include <linux/blk-cgroup.h> /** * set_notify_resume - cause resume_user_mode_work() to be called * @task: task that will call resume_user_mode_work() * * Calling this arranges that @task will call resume_user_mode_work() * before returning to user mode. If it's already running in user mode, * it will enter the kernel and call resume_user_mode_work() soon. * If it's blocked, it will not be woken. */ static inline void set_notify_resume(struct task_struct *task) { if (!test_and_set_tsk_thread_flag(task, TIF_NOTIFY_RESUME)) kick_process(task); } /** * resume_user_mode_work - Perform work before returning to user mode * @regs: user-mode registers of @current task * * This is called when %TIF_NOTIFY_RESUME has been set. Now we are * about to return to user mode, and the user state in @regs can be * inspected or adjusted. The caller in arch code has cleared * %TIF_NOTIFY_RESUME before the call. If the flag gets set again * asynchronously, this will be called again before we return to * user mode. * * Called without locks. */ static inline void resume_user_mode_work(struct pt_regs *regs) { clear_thread_flag(TIF_NOTIFY_RESUME); /* * This barrier pairs with task_work_add()->set_notify_resume() after * hlist_add_head(task->task_works); */ smp_mb__after_atomic(); if (unlikely(task_work_pending(current))) task_work_run(); #ifdef CONFIG_KEYS_REQUEST_CACHE if (unlikely(current->cached_requested_key)) { key_put(current->cached_requested_key); current->cached_requested_key = NULL; } #endif mem_cgroup_handle_over_high(GFP_KERNEL); blkcg_maybe_throttle_current(); rseq_handle_notify_resume(NULL, regs); } #endif /* LINUX_RESUME_USER_MODE_H */
246 409 211 217 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 /* SPDX-License-Identifier: GPL-2.0 */ /* * NUMA memory policies for Linux. * Copyright 2003,2004 Andi Kleen SuSE Labs */ #ifndef _LINUX_MEMPOLICY_H #define _LINUX_MEMPOLICY_H 1 #include <linux/sched.h> #include <linux/mmzone.h> #include <linux/slab.h> #include <linux/rbtree.h> #include <linux/spinlock.h> #include <linux/node.h> #include <linux/nodemask.h> #include <linux/pagemap.h> #include <uapi/linux/mempolicy.h> struct mm_struct; #define NO_INTERLEAVE_INDEX (-1UL) /* use task il_prev for interleaving */ #ifdef CONFIG_NUMA /* * Describe a memory policy. * * A mempolicy can be either associated with a process or with a VMA. * For VMA related allocations the VMA policy is preferred, otherwise * the process policy is used. Interrupts ignore the memory policy * of the current process. * * Locking policy for interleave: * In process context there is no locking because only the process accesses * its own state. All vma manipulation is somewhat protected by a down_read on * mmap_lock. * * Freeing policy: * Mempolicy objects are reference counted. A mempolicy will be freed when * mpol_put() decrements the reference count to zero. * * Duplicating policy objects: * mpol_dup() allocates a new mempolicy and copies the specified mempolicy * to the new storage. The reference count of the new object is initialized * to 1, representing the caller of mpol_dup(). */ struct mempolicy { atomic_t refcnt; unsigned short mode; /* See MPOL_* above */ unsigned short flags; /* See set_mempolicy() MPOL_F_* above */ nodemask_t nodes; /* interleave/bind/preferred/etc */ int home_node; /* Home node to use for MPOL_BIND and MPOL_PREFERRED_MANY */ union { nodemask_t cpuset_mems_allowed; /* relative to these nodes */ nodemask_t user_nodemask; /* nodemask passed by user */ } w; }; /* * Support for managing mempolicy data objects (clone, copy, destroy) * The default fast path of a NULL MPOL_DEFAULT policy is always inlined. */ extern void __mpol_put(struct mempolicy *pol); static inline void mpol_put(struct mempolicy *pol) { if (pol) __mpol_put(pol); } /* * Does mempolicy pol need explicit unref after use? * Currently only needed for shared policies. */ static inline int mpol_needs_cond_ref(struct mempolicy *pol) { return (pol && (pol->flags & MPOL_F_SHARED)); } static inline void mpol_cond_put(struct mempolicy *pol) { if (mpol_needs_cond_ref(pol)) __mpol_put(pol); } extern struct mempolicy *__mpol_dup(struct mempolicy *pol); static inline struct mempolicy *mpol_dup(struct mempolicy *pol) { if (pol) pol = __mpol_dup(pol); return pol; } static inline void mpol_get(struct mempolicy *pol) { if (pol) atomic_inc(&pol->refcnt); } extern bool __mpol_equal(struct mempolicy *a, struct mempolicy *b); static inline bool mpol_equal(struct mempolicy *a, struct mempolicy *b) { if (a == b) return true; return __mpol_equal(a, b); } /* * Tree of shared policies for a shared memory region. */ struct shared_policy { struct rb_root root; rwlock_t lock; }; struct sp_node { struct rb_node nd; pgoff_t start, end; struct mempolicy *policy; }; int vma_dup_policy(struct vm_area_struct *src, struct vm_area_struct *dst); void mpol_shared_policy_init(struct shared_policy *sp, struct mempolicy *mpol); int mpol_set_shared_policy(struct shared_policy *sp, struct vm_area_struct *vma, struct mempolicy *mpol); void mpol_free_shared_policy(struct shared_policy *sp); struct mempolicy *mpol_shared_policy_lookup(struct shared_policy *sp, pgoff_t idx); struct mempolicy *get_task_policy(struct task_struct *p); struct mempolicy *__get_vma_policy(struct vm_area_struct *vma, unsigned long addr, pgoff_t *ilx); struct mempolicy *get_vma_policy(struct vm_area_struct *vma, unsigned long addr, int order, pgoff_t *ilx); bool vma_policy_mof(struct vm_area_struct *vma); extern void numa_default_policy(void); extern void numa_policy_init(void); extern void mpol_rebind_task(struct task_struct *tsk, const nodemask_t *new); extern void mpol_rebind_mm(struct mm_struct *mm, nodemask_t *new); extern int huge_node(struct vm_area_struct *vma, unsigned long addr, gfp_t gfp_flags, struct mempolicy **mpol, nodemask_t **nodemask); extern bool init_nodemask_of_mempolicy(nodemask_t *mask); extern bool mempolicy_in_oom_domain(struct task_struct *tsk, const nodemask_t *mask); extern unsigned int mempolicy_slab_node(void); extern enum zone_type policy_zone; static inline void check_highest_zone(enum zone_type k) { if (k > policy_zone && k != ZONE_MOVABLE) policy_zone = k; } int do_migrate_pages(struct mm_struct *mm, const nodemask_t *from, const nodemask_t *to, int flags); #ifdef CONFIG_TMPFS extern int mpol_parse_str(char *str, struct mempolicy **mpol); #endif extern void mpol_to_str(char *buffer, int maxlen, struct mempolicy *pol); /* Check if a vma is migratable */ extern bool vma_migratable(struct vm_area_struct *vma); int mpol_misplaced(struct folio *folio, struct vm_fault *vmf, unsigned long addr); extern void mpol_put_task_policy(struct task_struct *); static inline bool mpol_is_preferred_many(struct mempolicy *pol) { return (pol->mode == MPOL_PREFERRED_MANY); } extern bool apply_policy_zone(struct mempolicy *policy, enum zone_type zone); extern int mempolicy_set_node_perf(unsigned int node, struct access_coordinate *coords); #else struct mempolicy {}; static inline struct mempolicy *get_task_policy(struct task_struct *p) { return NULL; } static inline bool mpol_equal(struct mempolicy *a, struct mempolicy *b) { return true; } static inline void mpol_put(struct mempolicy *pol) { } static inline void mpol_cond_put(struct mempolicy *pol) { } static inline void mpol_get(struct mempolicy *pol) { } struct shared_policy {}; static inline void mpol_shared_policy_init(struct shared_policy *sp, struct mempolicy *mpol) { } static inline void mpol_free_shared_policy(struct shared_policy *sp) { } static inline struct mempolicy * mpol_shared_policy_lookup(struct shared_policy *sp, pgoff_t idx) { return NULL; } static inline struct mempolicy *get_vma_policy(struct vm_area_struct *vma, unsigned long addr, int order, pgoff_t *ilx) { *ilx = 0; return NULL; } static inline int vma_dup_policy(struct vm_area_struct *src, struct vm_area_struct *dst) { return 0; } static inline void numa_policy_init(void) { } static inline void numa_default_policy(void) { } static inline void mpol_rebind_task(struct task_struct *tsk, const nodemask_t *new) { } static inline void mpol_rebind_mm(struct mm_struct *mm, nodemask_t *new) { } static inline int huge_node(struct vm_area_struct *vma, unsigned long addr, gfp_t gfp_flags, struct mempolicy **mpol, nodemask_t **nodemask) { *mpol = NULL; *nodemask = NULL; return 0; } static inline bool init_nodemask_of_mempolicy(nodemask_t *m) { return false; } static inline int do_migrate_pages(struct mm_struct *mm, const nodemask_t *from, const nodemask_t *to, int flags) { return 0; } static inline void check_highest_zone(int k) { } #ifdef CONFIG_TMPFS static inline int mpol_parse_str(char *str, struct mempolicy **mpol) { return 1; /* error */ } #endif static inline int mpol_misplaced(struct folio *folio, struct vm_fault *vmf, unsigned long address) { return -1; /* no node preference */ } static inline void mpol_put_task_policy(struct task_struct *task) { } static inline bool mpol_is_preferred_many(struct mempolicy *pol) { return false; } #endif /* CONFIG_NUMA */ #endif
971 971 12 237 236 237 971 971 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 /* SPDX-License-Identifier: GPL-2.0 */ #ifndef __LINUX_CPUMASK_H #define __LINUX_CPUMASK_H /* * Cpumasks provide a bitmap suitable for representing the * set of CPUs in a system, one bit position per CPU number. In general, * only nr_cpu_ids (<= NR_CPUS) bits are valid. */ #include <linux/cleanup.h> #include <linux/kernel.h> #include <linux/bitmap.h> #include <linux/cpumask_types.h> #include <linux/atomic.h> #include <linux/bug.h> #include <linux/gfp_types.h> #include <linux/numa.h> /** * cpumask_pr_args - printf args to output a cpumask * @maskp: cpumask to be printed * * Can be used to provide arguments for '%*pb[l]' when printing a cpumask. */ #define cpumask_pr_args(maskp) nr_cpu_ids, cpumask_bits(maskp) #if (NR_CPUS == 1) || defined(CONFIG_FORCE_NR_CPUS) #define nr_cpu_ids ((unsigned int)NR_CPUS) #else extern unsigned int nr_cpu_ids; #endif static __always_inline void set_nr_cpu_ids(unsigned int nr) { #if (NR_CPUS == 1) || defined(CONFIG_FORCE_NR_CPUS) WARN_ON(nr != nr_cpu_ids); #else nr_cpu_ids = nr; #endif } /* * We have several different "preferred sizes" for the cpumask * operations, depending on operation. * * For example, the bitmap scanning and operating operations have * optimized routines that work for the single-word case, but only when * the size is constant. So if NR_CPUS fits in one single word, we are * better off using that small constant, in order to trigger the * optimized bit finding. That is 'small_cpumask_size'. * * The clearing and copying operations will similarly perform better * with a constant size, but we limit that size arbitrarily to four * words. We call this 'large_cpumask_size'. * * Finally, some operations just want the exact limit, either because * they set bits or just don't have any faster fixed-sized versions. We * call this just 'nr_cpumask_bits'. * * Note that these optional constants are always guaranteed to be at * least as big as 'nr_cpu_ids' itself is, and all our cpumask * allocations are at least that size (see cpumask_size()). The * optimization comes from being able to potentially use a compile-time * constant instead of a run-time generated exact number of CPUs. */ #if NR_CPUS <= BITS_PER_LONG #define small_cpumask_bits ((unsigned int)NR_CPUS) #define large_cpumask_bits ((unsigned int)NR_CPUS) #elif NR_CPUS <= 4*BITS_PER_LONG #define small_cpumask_bits nr_cpu_ids #define large_cpumask_bits ((unsigned int)NR_CPUS) #else #define small_cpumask_bits nr_cpu_ids #define large_cpumask_bits nr_cpu_ids #endif #define nr_cpumask_bits nr_cpu_ids /* * The following particular system cpumasks and operations manage * possible, present, active and online cpus. * * cpu_possible_mask- has bit 'cpu' set iff cpu is populatable * cpu_present_mask - has bit 'cpu' set iff cpu is populated * cpu_enabled_mask - has bit 'cpu' set iff cpu can be brought online * cpu_online_mask - has bit 'cpu' set iff cpu available to scheduler * cpu_active_mask - has bit 'cpu' set iff cpu available to migration * * If !CONFIG_HOTPLUG_CPU, present == possible, and active == online. * * The cpu_possible_mask is fixed at boot time, as the set of CPU IDs * that it is possible might ever be plugged in at anytime during the * life of that system boot. The cpu_present_mask is dynamic(*), * representing which CPUs are currently plugged in. And * cpu_online_mask is the dynamic subset of cpu_present_mask, * indicating those CPUs available for scheduling. * * If HOTPLUG is enabled, then cpu_present_mask varies dynamically, * depending on what ACPI reports as currently plugged in, otherwise * cpu_present_mask is just a copy of cpu_possible_mask. * * (*) Well, cpu_present_mask is dynamic in the hotplug case. If not * hotplug, it's a copy of cpu_possible_mask, hence fixed at boot. * * Subtleties: * 1) UP ARCHes (NR_CPUS == 1, CONFIG_SMP not defined) hardcode * assumption that their single CPU is online. The UP * cpu_{online,possible,present}_masks are placebos. Changing them * will have no useful affect on the following num_*_cpus() * and cpu_*() macros in the UP case. This ugliness is a UP * optimization - don't waste any instructions or memory references * asking if you're online or how many CPUs there are if there is * only one CPU. */ extern struct cpumask __cpu_possible_mask; extern struct cpumask __cpu_online_mask; extern struct cpumask __cpu_enabled_mask; extern struct cpumask __cpu_present_mask; extern struct cpumask __cpu_active_mask; extern struct cpumask __cpu_dying_mask; #define cpu_possible_mask ((const struct cpumask *)&__cpu_possible_mask) #define cpu_online_mask ((const struct cpumask *)&__cpu_online_mask) #define cpu_enabled_mask ((const struct cpumask *)&__cpu_enabled_mask) #define cpu_present_mask ((const struct cpumask *)&__cpu_present_mask) #define cpu_active_mask ((const struct cpumask *)&__cpu_active_mask) #define cpu_dying_mask ((const struct cpumask *)&__cpu_dying_mask) extern atomic_t __num_online_cpus; extern cpumask_t cpus_booted_once_mask; static __always_inline void cpu_max_bits_warn(unsigned int cpu, unsigned int bits) { #ifdef CONFIG_DEBUG_PER_CPU_MAPS WARN_ON_ONCE(cpu >= bits); #endif /* CONFIG_DEBUG_PER_CPU_MAPS */ } /* verify cpu argument to cpumask_* operators */ static __always_inline unsigned int cpumask_check(unsigned int cpu) { cpu_max_bits_warn(cpu, small_cpumask_bits); return cpu; } /** * cpumask_first - get the first cpu in a cpumask * @srcp: the cpumask pointer * * Return: >= nr_cpu_ids if no cpus set. */ static __always_inline unsigned int cpumask_first(const struct cpumask *srcp) { return find_first_bit(cpumask_bits(srcp), small_cpumask_bits); } /** * cpumask_first_zero - get the first unset cpu in a cpumask * @srcp: the cpumask pointer * * Return: >= nr_cpu_ids if all cpus are set. */ static __always_inline unsigned int cpumask_first_zero(const struct cpumask *srcp) { return find_first_zero_bit(cpumask_bits(srcp), small_cpumask_bits); } /** * cpumask_first_and - return the first cpu from *srcp1 & *srcp2 * @srcp1: the first input * @srcp2: the second input * * Return: >= nr_cpu_ids if no cpus set in both. See also cpumask_next_and(). */ static __always_inline unsigned int cpumask_first_and(const struct cpumask *srcp1, const struct cpumask *srcp2) { return find_first_and_bit(cpumask_bits(srcp1), cpumask_bits(srcp2), small_cpumask_bits); } /** * cpumask_first_andnot - return the first cpu from *srcp1 & ~*srcp2 * @srcp1: the first input * @srcp2: the second input * * Return: >= nr_cpu_ids if no such cpu found. */ static __always_inline unsigned int cpumask_first_andnot(const struct cpumask *srcp1, const struct cpumask *srcp2) { return find_first_andnot_bit(cpumask_bits(srcp1), cpumask_bits(srcp2), small_cpumask_bits); } /** * cpumask_first_and_and - return the first cpu from *srcp1 & *srcp2 & *srcp3 * @srcp1: the first input * @srcp2: the second input * @srcp3: the third input * * Return: >= nr_cpu_ids if no cpus set in all. */ static __always_inline unsigned int cpumask_first_and_and(const struct cpumask *srcp1, const struct cpumask *srcp2, const struct cpumask *srcp3) { return find_first_and_and_bit(cpumask_bits(srcp1), cpumask_bits(srcp2), cpumask_bits(srcp3), small_cpumask_bits); } /** * cpumask_last - get the last CPU in a cpumask * @srcp: - the cpumask pointer * * Return: >= nr_cpumask_bits if no CPUs set. */ static __always_inline unsigned int cpumask_last(const struct cpumask *srcp) { return find_last_bit(cpumask_bits(srcp), small_cpumask_bits); } /** * cpumask_next - get the next cpu in a cpumask * @n: the cpu prior to the place to search (i.e. return will be > @n) * @srcp: the cpumask pointer * * Return: >= nr_cpu_ids if no further cpus set. */ static __always_inline unsigned int cpumask_next(int n, const struct cpumask *srcp) { /* -1 is a legal arg here. */ if (n != -1) cpumask_check(n); return find_next_bit(cpumask_bits(srcp), small_cpumask_bits, n + 1); } /** * cpumask_next_zero - get the next unset cpu in a cpumask * @n: the cpu prior to the place to search (i.e. return will be > @n) * @srcp: the cpumask pointer * * Return: >= nr_cpu_ids if no further cpus unset. */ static __always_inline unsigned int cpumask_next_zero(int n, const struct cpumask *srcp) { /* -1 is a legal arg here. */ if (n != -1) cpumask_check(n); return find_next_zero_bit(cpumask_bits(srcp), small_cpumask_bits, n+1); } #if NR_CPUS == 1 /* Uniprocessor: there is only one valid CPU */ static __always_inline unsigned int cpumask_local_spread(unsigned int i, int node) { return 0; } static __always_inline unsigned int cpumask_any_and_distribute(const struct cpumask *src1p, const struct cpumask *src2p) { return cpumask_first_and(src1p, src2p); } static __always_inline unsigned int cpumask_any_distribute(const struct cpumask *srcp) { return cpumask_first(srcp); } #else unsigned int cpumask_local_spread(unsigned int i, int node); unsigned int cpumask_any_and_distribute(const struct cpumask *src1p, const struct cpumask *src2p); unsigned int cpumask_any_distribute(const struct cpumask *srcp); #endif /* NR_CPUS */ /** * cpumask_next_and - get the next cpu in *src1p & *src2p * @n: the cpu prior to the place to search (i.e. return will be > @n) * @src1p: the first cpumask pointer * @src2p: the second cpumask pointer * * Return: >= nr_cpu_ids if no further cpus set in both. */ static __always_inline unsigned int cpumask_next_and(int n, const struct cpumask *src1p, const struct cpumask *src2p) { /* -1 is a legal arg here. */ if (n != -1) cpumask_check(n); return find_next_and_bit(cpumask_bits(src1p), cpumask_bits(src2p), small_cpumask_bits, n + 1); } /** * cpumask_next_andnot - get the next cpu in *src1p & ~*src2p * @n: the cpu prior to the place to search (i.e. return will be > @n) * @src1p: the first cpumask pointer * @src2p: the second cpumask pointer * * Return: >= nr_cpu_ids if no further cpus set in both. */ static __always_inline unsigned int cpumask_next_andnot(int n, const struct cpumask *src1p, const struct cpumask *src2p) { /* -1 is a legal arg here. */ if (n != -1) cpumask_check(n); return find_next_andnot_bit(cpumask_bits(src1p), cpumask_bits(src2p), small_cpumask_bits, n + 1); } /** * cpumask_next_and_wrap - get the next cpu in *src1p & *src2p, starting from * @n+1. If nothing found, wrap around and start from * the beginning * @n: the cpu prior to the place to search (i.e. search starts from @n+1) * @src1p: the first cpumask pointer * @src2p: the second cpumask pointer * * Return: next set bit, wrapped if needed, or >= nr_cpu_ids if @src1p & @src2p is empty. */ static __always_inline unsigned int cpumask_next_and_wrap(int n, const struct cpumask *src1p, const struct cpumask *src2p) { /* -1 is a legal arg here. */ if (n != -1) cpumask_check(n); return find_next_and_bit_wrap(cpumask_bits(src1p), cpumask_bits(src2p), small_cpumask_bits, n + 1); } /** * cpumask_next_wrap - get the next cpu in *src, starting from @n+1. If nothing * found, wrap around and start from the beginning * @n: the cpu prior to the place to search (i.e. search starts from @n+1) * @src: cpumask pointer * * Return: next set bit, wrapped if needed, or >= nr_cpu_ids if @src is empty. */ static __always_inline unsigned int cpumask_next_wrap(int n, const struct cpumask *src) { /* -1 is a legal arg here. */ if (n != -1) cpumask_check(n); return find_next_bit_wrap(cpumask_bits(src), small_cpumask_bits, n + 1); } /** * cpumask_random - get random cpu in *src. * @src: cpumask pointer * * Return: random set bit, or >= nr_cpu_ids if @src is empty. */ static __always_inline unsigned int cpumask_random(const struct cpumask *src) { return find_random_bit(cpumask_bits(src), nr_cpu_ids); } /** * for_each_cpu - iterate over every cpu in a mask * @cpu: the (optionally unsigned) integer iterator * @mask: the cpumask pointer * * After the loop, cpu is >= nr_cpu_ids. */ #define for_each_cpu(cpu, mask) \ for_each_set_bit(cpu, cpumask_bits(mask), small_cpumask_bits) /** * for_each_cpu_wrap - iterate over every cpu in a mask, starting at a specified location * @cpu: the (optionally unsigned) integer iterator * @mask: the cpumask pointer * @start: the start location * * The implementation does not assume any bit in @mask is set (including @start). * * After the loop, cpu is >= nr_cpu_ids. */ #define for_each_cpu_wrap(cpu, mask, start) \ for_each_set_bit_wrap(cpu, cpumask_bits(mask), small_cpumask_bits, start) /** * for_each_cpu_and - iterate over every cpu in both masks * @cpu: the (optionally unsigned) integer iterator * @mask1: the first cpumask pointer * @mask2: the second cpumask pointer * * This saves a temporary CPU mask in many places. It is equivalent to: * struct cpumask tmp; * cpumask_and(&tmp, &mask1, &mask2); * for_each_cpu(cpu, &tmp) * ... * * After the loop, cpu is >= nr_cpu_ids. */ #define for_each_cpu_and(cpu, mask1, mask2) \ for_each_and_bit(cpu, cpumask_bits(mask1), cpumask_bits(mask2), small_cpumask_bits) /** * for_each_cpu_andnot - iterate over every cpu present in one mask, excluding * those present in another. * @cpu: the (optionally unsigned) integer iterator * @mask1: the first cpumask pointer * @mask2: the second cpumask pointer * * This saves a temporary CPU mask in many places. It is equivalent to: * struct cpumask tmp; * cpumask_andnot(&tmp, &mask1, &mask2); * for_each_cpu(cpu, &tmp) * ... * * After the loop, cpu is >= nr_cpu_ids. */ #define for_each_cpu_andnot(cpu, mask1, mask2) \ for_each_andnot_bit(cpu, cpumask_bits(mask1), cpumask_bits(mask2), small_cpumask_bits) /** * for_each_cpu_or - iterate over every cpu present in either mask * @cpu: the (optionally unsigned) integer iterator * @mask1: the first cpumask pointer * @mask2: the second cpumask pointer * * This saves a temporary CPU mask in many places. It is equivalent to: * struct cpumask tmp; * cpumask_or(&tmp, &mask1, &mask2); * for_each_cpu(cpu, &tmp) * ... * * After the loop, cpu is >= nr_cpu_ids. */ #define for_each_cpu_or(cpu, mask1, mask2) \ for_each_or_bit(cpu, cpumask_bits(mask1), cpumask_bits(mask2), small_cpumask_bits) /** * for_each_cpu_from - iterate over CPUs present in @mask, from @cpu to the end of @mask. * @cpu: the (optionally unsigned) integer iterator * @mask: the cpumask pointer * * After the loop, cpu is >= nr_cpu_ids. */ #define for_each_cpu_from(cpu, mask) \ for_each_set_bit_from(cpu, cpumask_bits(mask), small_cpumask_bits) /** * cpumask_any_but - return an arbitrary cpu in a cpumask, but not this one. * @mask: the cpumask to search * @cpu: the cpu to ignore. * * Often used to find any cpu but smp_processor_id() in a mask. * If @cpu == -1, the function is equivalent to cpumask_any(). * Return: >= nr_cpu_ids if no cpus set. */ static __always_inline unsigned int cpumask_any_but(const struct cpumask *mask, int cpu) { unsigned int i; /* -1 is a legal arg here. */ if (cpu != -1) cpumask_check(cpu); for_each_cpu(i, mask) if (i != cpu) break; return i; } /** * cpumask_any_and_but - pick an arbitrary cpu from *mask1 & *mask2, but not this one. * @mask1: the first input cpumask * @mask2: the second input cpumask * @cpu: the cpu to ignore * * If @cpu == -1, the function is equivalent to cpumask_any_and(). * Returns >= nr_cpu_ids if no cpus set. */ static __always_inline unsigned int cpumask_any_and_but(const struct cpumask *mask1, const struct cpumask *mask2, int cpu) { unsigned int i; /* -1 is a legal arg here. */ if (cpu != -1) cpumask_check(cpu); i = cpumask_first_and(mask1, mask2); if (i != cpu) return i; return cpumask_next_and(cpu, mask1, mask2); } /** * cpumask_any_andnot_but - pick an arbitrary cpu from *mask1 & ~*mask2, but not this one. * @mask1: the first input cpumask * @mask2: the second input cpumask * @cpu: the cpu to ignore * * If @cpu == -1, the function returns the first matching cpu. * Returns >= nr_cpu_ids if no cpus set. */ static __always_inline unsigned int cpumask_any_andnot_but(const struct cpumask *mask1, const struct cpumask *mask2, int cpu) { unsigned int i; /* -1 is a legal arg here. */ if (cpu != -1) cpumask_check(cpu); i = cpumask_first_andnot(mask1, mask2); if (i != cpu) return i; return cpumask_next_andnot(cpu, mask1, mask2); } /** * cpumask_nth - get the Nth cpu in a cpumask * @srcp: the cpumask pointer * @cpu: the Nth cpu to find, starting from 0 * * Return: >= nr_cpu_ids if such cpu doesn't exist. */ static __always_inline unsigned int cpumask_nth(unsigned int cpu, const struct cpumask *srcp) { return find_nth_bit(cpumask_bits(srcp), small_cpumask_bits, cpumask_check(cpu)); } /** * cpumask_nth_and - get the Nth cpu in 2 cpumasks * @srcp1: the cpumask pointer * @srcp2: the cpumask pointer * @cpu: the Nth cpu to find, starting from 0 * * Return: >= nr_cpu_ids if such cpu doesn't exist. */ static __always_inline unsigned int cpumask_nth_and(unsigned int cpu, const struct cpumask *srcp1, const struct cpumask *srcp2) { return find_nth_and_bit(cpumask_bits(srcp1), cpumask_bits(srcp2), small_cpumask_bits, cpumask_check(cpu)); } /** * cpumask_nth_and_andnot - get the Nth cpu set in 1st and 2nd cpumask, and clear in 3rd. * @srcp1: the cpumask pointer * @srcp2: the cpumask pointer * @srcp3: the cpumask pointer * @cpu: the Nth cpu to find, starting from 0 * * Return: >= nr_cpu_ids if such cpu doesn't exist. */ static __always_inline unsigned int cpumask_nth_and_andnot(unsigned int cpu, const struct cpumask *srcp1, const struct cpumask *srcp2, const struct cpumask *srcp3) { return find_nth_and_andnot_bit(cpumask_bits(srcp1), cpumask_bits(srcp2), cpumask_bits(srcp3), small_cpumask_bits, cpumask_check(cpu)); } #define CPU_BITS_NONE \ { \ [0 ... BITS_TO_LONGS(NR_CPUS)-1] = 0UL \ } #define CPU_BITS_CPU0 \ { \ [0] = 1UL \ } /** * cpumask_set_cpu - set a cpu in a cpumask * @cpu: cpu number (< nr_cpu_ids) * @dstp: the cpumask pointer */ static __always_inline void cpumask_set_cpu(unsigned int cpu, struct cpumask *dstp) { set_bit(cpumask_check(cpu), cpumask_bits(dstp)); } static __always_inline void __cpumask_set_cpu(unsigned int cpu, struct cpumask *dstp) { __set_bit(cpumask_check(cpu), cpumask_bits(dstp)); } /** * cpumask_clear_cpus - clear cpus in a cpumask * @dstp: the cpumask pointer * @cpu: cpu number (< nr_cpu_ids) * @ncpus: number of cpus to clear (< nr_cpu_ids) */ static __always_inline void cpumask_clear_cpus(struct cpumask *dstp, unsigned int cpu, unsigned int ncpus) { cpumask_check(cpu + ncpus - 1); bitmap_clear(cpumask_bits(dstp), cpumask_check(cpu), ncpus); } /** * cpumask_clear_cpu - clear a cpu in a cpumask * @cpu: cpu number (< nr_cpu_ids) * @dstp: the cpumask pointer */ static __always_inline void cpumask_clear_cpu(int cpu, struct cpumask *dstp) { clear_bit(cpumask_check(cpu), cpumask_bits(dstp)); } static __always_inline void __cpumask_clear_cpu(int cpu, struct cpumask *dstp) { __clear_bit(cpumask_check(cpu), cpumask_bits(dstp)); } /** * cpumask_test_cpu - test for a cpu in a cpumask * @cpu: cpu number (< nr_cpu_ids) * @cpumask: the cpumask pointer * * Return: true if @cpu is set in @cpumask, else returns false */ static __always_inline bool cpumask_test_cpu(int cpu, const struct cpumask *cpumask) { return test_bit(cpumask_check(cpu), cpumask_bits((cpumask))); } /** * cpumask_test_and_set_cpu - atomically test and set a cpu in a cpumask * @cpu: cpu number (< nr_cpu_ids) * @cpumask: the cpumask pointer * * test_and_set_bit wrapper for cpumasks. * * Return: true if @cpu is set in old bitmap of @cpumask, else returns false */ static __always_inline bool cpumask_test_and_set_cpu(int cpu, struct cpumask *cpumask) { return test_and_set_bit(cpumask_check(cpu), cpumask_bits(cpumask)); } /** * cpumask_test_and_clear_cpu - atomically test and clear a cpu in a cpumask * @cpu: cpu number (< nr_cpu_ids) * @cpumask: the cpumask pointer * * test_and_clear_bit wrapper for cpumasks. * * Return: true if @cpu is set in old bitmap of @cpumask, else returns false */ static __always_inline bool cpumask_test_and_clear_cpu(int cpu, struct cpumask *cpumask) { return test_and_clear_bit(cpumask_check(cpu), cpumask_bits(cpumask)); } /** * cpumask_setall - set all cpus (< nr_cpu_ids) in a cpumask * @dstp: the cpumask pointer */ static __always_inline void cpumask_setall(struct cpumask *dstp) { if (small_const_nbits(small_cpumask_bits)) { cpumask_bits(dstp)[0] = BITMAP_LAST_WORD_MASK(nr_cpumask_bits); return; } bitmap_fill(cpumask_bits(dstp), nr_cpumask_bits); } /** * cpumask_clear - clear all cpus (< nr_cpu_ids) in a cpumask * @dstp: the cpumask pointer */ static __always_inline void cpumask_clear(struct cpumask *dstp) { bitmap_zero(cpumask_bits(dstp), large_cpumask_bits); } /** * cpumask_and - *dstp = *src1p & *src2p * @dstp: the cpumask result * @src1p: the first input * @src2p: the second input * * Return: false if *@dstp is empty, else returns true */ static __always_inline bool cpumask_and(struct cpumask *dstp, const struct cpumask *src1p, const struct cpumask *src2p) { return bitmap_and(cpumask_bits(dstp), cpumask_bits(src1p), cpumask_bits(src2p), small_cpumask_bits); } /** * cpumask_or - *dstp = *src1p | *src2p * @dstp: the cpumask result * @src1p: the first input * @src2p: the second input */ static __always_inline void cpumask_or(struct cpumask *dstp, const struct cpumask *src1p, const struct cpumask *src2p) { bitmap_or(cpumask_bits(dstp), cpumask_bits(src1p), cpumask_bits(src2p), small_cpumask_bits); } /** * cpumask_xor - *dstp = *src1p ^ *src2p * @dstp: the cpumask result * @src1p: the first input * @src2p: the second input */ static __always_inline void cpumask_xor(struct cpumask *dstp, const struct cpumask *src1p, const struct cpumask *src2p) { bitmap_xor(cpumask_bits(dstp), cpumask_bits(src1p), cpumask_bits(src2p), small_cpumask_bits); } /** * cpumask_andnot - *dstp = *src1p & ~*src2p * @dstp: the cpumask result * @src1p: the first input * @src2p: the second input * * Return: false if *@dstp is empty, else returns true */ static __always_inline bool cpumask_andnot(struct cpumask *dstp, const struct cpumask *src1p, const struct cpumask *src2p) { return bitmap_andnot(cpumask_bits(dstp), cpumask_bits(src1p), cpumask_bits(src2p), small_cpumask_bits); } /** * cpumask_equal - *src1p == *src2p * @src1p: the first input * @src2p: the second input * * Return: true if the cpumasks are equal, false if not */ static __always_inline bool cpumask_equal(const struct cpumask *src1p, const struct cpumask *src2p) { return bitmap_equal(cpumask_bits(src1p), cpumask_bits(src2p), small_cpumask_bits); } /** * cpumask_or_equal - *src1p | *src2p == *src3p * @src1p: the first input * @src2p: the second input * @src3p: the third input * * Return: true if first cpumask ORed with second cpumask == third cpumask, * otherwise false */ static __always_inline bool cpumask_or_equal(const struct cpumask *src1p, const struct cpumask *src2p, const struct cpumask *src3p) { return bitmap_or_equal(cpumask_bits(src1p), cpumask_bits(src2p), cpumask_bits(src3p), small_cpumask_bits); } /** * cpumask_intersects - (*src1p & *src2p) != 0 * @src1p: the first input * @src2p: the second input * * Return: true if first cpumask ANDed with second cpumask is non-empty, * otherwise false */ static __always_inline bool cpumask_intersects(const struct cpumask *src1p, const struct cpumask *src2p) { return bitmap_intersects(cpumask_bits(src1p), cpumask_bits(src2p), small_cpumask_bits); } /** * cpumask_subset - (*src1p & ~*src2p) == 0 * @src1p: the first input * @src2p: the second input * * Return: true if *@src1p is a subset of *@src2p, else returns false */ static __always_inline bool cpumask_subset(const struct cpumask *src1p, const struct cpumask *src2p) { return bitmap_subset(cpumask_bits(src1p), cpumask_bits(src2p), small_cpumask_bits); } /** * cpumask_empty - *srcp == 0 * @srcp: the cpumask to that all cpus < nr_cpu_ids are clear. * * Return: true if srcp is empty (has no bits set), else false */ static __always_inline bool cpumask_empty(const struct cpumask *srcp) { return bitmap_empty(cpumask_bits(srcp), small_cpumask_bits); } /** * cpumask_full - *srcp == 0xFFFFFFFF... * @srcp: the cpumask to that all cpus < nr_cpu_ids are set. * * Return: true if srcp is full (has all bits set), else false */ static __always_inline bool cpumask_full(const struct cpumask *srcp) { return bitmap_full(cpumask_bits(srcp), nr_cpumask_bits); } /** * cpumask_weight - Count of bits in *srcp * @srcp: the cpumask to count bits (< nr_cpu_ids) in. * * Return: count of bits set in *srcp */ static __always_inline unsigned int cpumask_weight(const struct cpumask *srcp) { return bitmap_weight(cpumask_bits(srcp), small_cpumask_bits); } /** * cpumask_weight_and - Count of bits in (*srcp1 & *srcp2) * @srcp1: the cpumask to count bits (< nr_cpu_ids) in. * @srcp2: the cpumask to count bits (< nr_cpu_ids) in. * * Return: count of bits set in both *srcp1 and *srcp2 */ static __always_inline unsigned int cpumask_weight_and(const struct cpumask *srcp1, const struct cpumask *srcp2) { return bitmap_weight_and(cpumask_bits(srcp1), cpumask_bits(srcp2), small_cpumask_bits); } /** * cpumask_weight_andnot - Count of bits in (*srcp1 & ~*srcp2) * @srcp1: the cpumask to count bits (< nr_cpu_ids) in. * @srcp2: the cpumask to count bits (< nr_cpu_ids) in. * * Return: count of bits set in both *srcp1 and *srcp2 */ static __always_inline unsigned int cpumask_weight_andnot(const struct cpumask *srcp1, const struct cpumask *srcp2) { return bitmap_weight_andnot(cpumask_bits(srcp1), cpumask_bits(srcp2), small_cpumask_bits); } /** * cpumask_shift_right - *dstp = *srcp >> n * @dstp: the cpumask result * @srcp: the input to shift * @n: the number of bits to shift by */ static __always_inline void cpumask_shift_right(struct cpumask *dstp, const struct cpumask *srcp, int n) { bitmap_shift_right(cpumask_bits(dstp), cpumask_bits(srcp), n, small_cpumask_bits); } /** * cpumask_shift_left - *dstp = *srcp << n * @dstp: the cpumask result * @srcp: the input to shift * @n: the number of bits to shift by */ static __always_inline void cpumask_shift_left(struct cpumask *dstp, const struct cpumask *srcp, int n) { bitmap_shift_left(cpumask_bits(dstp), cpumask_bits(srcp), n, nr_cpumask_bits); } /** * cpumask_copy - *dstp = *srcp * @dstp: the result * @srcp: the input cpumask */ static __always_inline void cpumask_copy(struct cpumask *dstp, const struct cpumask *srcp) { bitmap_copy(cpumask_bits(dstp), cpumask_bits(srcp), large_cpumask_bits); } /** * cpumask_any - pick an arbitrary cpu from *srcp * @srcp: the input cpumask * * Return: >= nr_cpu_ids if no cpus set. */ #define cpumask_any(srcp) cpumask_first(srcp) /** * cpumask_any_and - pick an arbitrary cpu from *mask1 & *mask2 * @mask1: the first input cpumask * @mask2: the second input cpumask * * Return: >= nr_cpu_ids if no cpus set. */ #define cpumask_any_and(mask1, mask2) cpumask_first_and((mask1), (mask2)) /** * cpumask_of - the cpumask containing just a given cpu * @cpu: the cpu (<= nr_cpu_ids) */ #define cpumask_of(cpu) (get_cpu_mask(cpu)) /** * cpumask_parse_user - extract a cpumask from a user string * @buf: the buffer to extract from * @len: the length of the buffer * @dstp: the cpumask to set. * * Return: -errno, or 0 for success. */ static __always_inline int cpumask_parse_user(const char __user *buf, int len, struct cpumask *dstp) { return bitmap_parse_user(buf, len, cpumask_bits(dstp), nr_cpumask_bits); } /** * cpumask_parselist_user - extract a cpumask from a user string * @buf: the buffer to extract from * @len: the length of the buffer * @dstp: the cpumask to set. * * Return: -errno, or 0 for success. */ static __always_inline int cpumask_parselist_user(const char __user *buf, int len, struct cpumask *dstp) { return bitmap_parselist_user(buf, len, cpumask_bits(dstp), nr_cpumask_bits); } /** * cpumask_parse - extract a cpumask from a string * @buf: the buffer to extract from * @dstp: the cpumask to set. * * Return: -errno, or 0 for success. */ static __always_inline int cpumask_parse(const char *buf, struct cpumask *dstp) { return bitmap_parse(buf, UINT_MAX, cpumask_bits(dstp), nr_cpumask_bits); } /** * cpulist_parse - extract a cpumask from a user string of ranges * @buf: the buffer to extract from * @dstp: the cpumask to set. * * Return: -errno, or 0 for success. */ static __always_inline int cpulist_parse(const char *buf, struct cpumask *dstp) { return bitmap_parselist(buf, cpumask_bits(dstp), nr_cpumask_bits); } /** * cpumask_size - calculate size to allocate for a 'struct cpumask' in bytes * * Return: size to allocate for a &struct cpumask in bytes */ static __always_inline unsigned int cpumask_size(void) { return bitmap_size(large_cpumask_bits); } #ifdef CONFIG_CPUMASK_OFFSTACK #define this_cpu_cpumask_var_ptr(x) this_cpu_read(x) #define __cpumask_var_read_mostly __read_mostly bool alloc_cpumask_var_node(cpumask_var_t *mask, gfp_t flags, int node); static __always_inline bool zalloc_cpumask_var_node(cpumask_var_t *mask, gfp_t flags, int node) { return alloc_cpumask_var_node(mask, flags | __GFP_ZERO, node); } /** * alloc_cpumask_var - allocate a struct cpumask * @mask: pointer to cpumask_var_t where the cpumask is returned * @flags: GFP_ flags * * Only defined when CONFIG_CPUMASK_OFFSTACK=y, otherwise is * a nop returning a constant 1 (in <linux/cpumask.h>). * * See alloc_cpumask_var_node. * * Return: %true if allocation succeeded, %false if not */ static __always_inline bool alloc_cpumask_var(cpumask_var_t *mask, gfp_t flags) { return alloc_cpumask_var_node(mask, flags, NUMA_NO_NODE); } static __always_inline bool zalloc_cpumask_var(cpumask_var_t *mask, gfp_t flags) { return alloc_cpumask_var(mask, flags | __GFP_ZERO); } void alloc_bootmem_cpumask_var(cpumask_var_t *mask); void free_cpumask_var(cpumask_var_t mask); void free_bootmem_cpumask_var(cpumask_var_t mask); static __always_inline bool cpumask_available(cpumask_var_t mask) { return mask != NULL; } #else #define this_cpu_cpumask_var_ptr(x) this_cpu_ptr(x) #define __cpumask_var_read_mostly static __always_inline bool alloc_cpumask_var(cpumask_var_t *mask, gfp_t flags) { return true; } static __always_inline bool alloc_cpumask_var_node(cpumask_var_t *mask, gfp_t flags, int node) { return true; } static __always_inline bool zalloc_cpumask_var(cpumask_var_t *mask, gfp_t flags) { cpumask_clear(*mask); return true; } static __always_inline bool zalloc_cpumask_var_node(cpumask_var_t *mask, gfp_t flags, int node) { cpumask_clear(*mask); return true; } static __always_inline void alloc_bootmem_cpumask_var(cpumask_var_t *mask) { } static __always_inline void free_cpumask_var(cpumask_var_t mask) { } static __always_inline void free_bootmem_cpumask_var(cpumask_var_t mask) { } static __always_inline bool cpumask_available(cpumask_var_t mask) { return true; } #endif /* CONFIG_CPUMASK_OFFSTACK */ DEFINE_FREE(free_cpumask_var, struct cpumask *, if (_T) free_cpumask_var(_T)); /* It's common to want to use cpu_all_mask in struct member initializers, * so it has to refer to an address rather than a pointer. */ extern const DECLARE_BITMAP(cpu_all_bits, NR_CPUS); #define cpu_all_mask to_cpumask(cpu_all_bits) /* First bits of cpu_bit_bitmap are in fact unset. */ #define cpu_none_mask to_cpumask(cpu_bit_bitmap[0]) #if NR_CPUS == 1 /* Uniprocessor: the possible/online/present masks are always "1" */ #define for_each_possible_cpu(cpu) for ((cpu) = 0; (cpu) < 1; (cpu)++) #define for_each_online_cpu(cpu) for ((cpu) = 0; (cpu) < 1; (cpu)++) #define for_each_present_cpu(cpu) for ((cpu) = 0; (cpu) < 1; (cpu)++) #define for_each_possible_cpu_wrap(cpu, start) \ for ((void)(start), (cpu) = 0; (cpu) < 1; (cpu)++) #define for_each_online_cpu_wrap(cpu, start) \ for ((void)(start), (cpu) = 0; (cpu) < 1; (cpu)++) #else #define for_each_possible_cpu(cpu) for_each_cpu((cpu), cpu_possible_mask) #define for_each_online_cpu(cpu) for_each_cpu((cpu), cpu_online_mask) #define for_each_enabled_cpu(cpu) for_each_cpu((cpu), cpu_enabled_mask) #define for_each_present_cpu(cpu) for_each_cpu((cpu), cpu_present_mask) #define for_each_possible_cpu_wrap(cpu, start) \ for_each_cpu_wrap((cpu), cpu_possible_mask, (start)) #define for_each_online_cpu_wrap(cpu, start) \ for_each_cpu_wrap((cpu), cpu_online_mask, (start)) #endif /* Wrappers for arch boot code to manipulate normally-constant masks */ void init_cpu_present(const struct cpumask *src); void init_cpu_possible(const struct cpumask *src); #define assign_cpu(cpu, mask, val) \ assign_bit(cpumask_check(cpu), cpumask_bits(mask), (val)) #define __assign_cpu(cpu, mask, val) \ __assign_bit(cpumask_check(cpu), cpumask_bits(mask), (val)) #define set_cpu_possible(cpu, possible) assign_cpu((cpu), &__cpu_possible_mask, (possible)) #define set_cpu_enabled(cpu, enabled) assign_cpu((cpu), &__cpu_enabled_mask, (enabled)) #define set_cpu_present(cpu, present) assign_cpu((cpu), &__cpu_present_mask, (present)) #define set_cpu_active(cpu, active) assign_cpu((cpu), &__cpu_active_mask, (active)) #define set_cpu_dying(cpu, dying) assign_cpu((cpu), &__cpu_dying_mask, (dying)) void set_cpu_online(unsigned int cpu, bool online); /** * to_cpumask - convert a NR_CPUS bitmap to a struct cpumask * * @bitmap: the bitmap * * There are a few places where cpumask_var_t isn't appropriate and * static cpumasks must be used (eg. very early boot), yet we don't * expose the definition of 'struct cpumask'. * * This does the conversion, and can be used as a constant initializer. */ #define to_cpumask(bitmap) \ ((struct cpumask *)(1 ? (bitmap) \ : (void *)sizeof(__check_is_bitmap(bitmap)))) static __always_inline int __check_is_bitmap(const unsigned long *bitmap) { return 1; } /* * Special-case data structure for "single bit set only" constant CPU masks. * * We pre-generate all the 64 (or 32) possible bit positions, with enough * padding to the left and the right, and return the constant pointer * appropriately offset. */ extern const unsigned long cpu_bit_bitmap[BITS_PER_LONG+1][BITS_TO_LONGS(NR_CPUS)]; static __always_inline const struct cpumask *get_cpu_mask(unsigned int cpu) { const unsigned long *p = cpu_bit_bitmap[1 + cpu % BITS_PER_LONG]; p -= cpu / BITS_PER_LONG; return to_cpumask(p); } #if NR_CPUS > 1 /** * num_online_cpus() - Read the number of online CPUs * * Despite the fact that __num_online_cpus is of type atomic_t, this * interface gives only a momentary snapshot and is not protected against * concurrent CPU hotplug operations unless invoked from a cpuhp_lock held * region. * * Return: momentary snapshot of the number of online CPUs */ static __always_inline unsigned int num_online_cpus(void) { return raw_atomic_read(&__num_online_cpus); } #define num_possible_cpus() cpumask_weight(cpu_possible_mask) #define num_enabled_cpus() cpumask_weight(cpu_enabled_mask) #define num_present_cpus() cpumask_weight(cpu_present_mask) #define num_active_cpus() cpumask_weight(cpu_active_mask) static __always_inline bool cpu_online(unsigned int cpu) { return cpumask_test_cpu(cpu, cpu_online_mask); } static __always_inline bool cpu_enabled(unsigned int cpu) { return cpumask_test_cpu(cpu, cpu_enabled_mask); } static __always_inline bool cpu_possible(unsigned int cpu) { return cpumask_test_cpu(cpu, cpu_possible_mask); } static __always_inline bool cpu_present(unsigned int cpu) { return cpumask_test_cpu(cpu, cpu_present_mask); } static __always_inline bool cpu_active(unsigned int cpu) { return cpumask_test_cpu(cpu, cpu_active_mask); } static __always_inline bool cpu_dying(unsigned int cpu) { return cpumask_test_cpu(cpu, cpu_dying_mask); } #else #define num_online_cpus() 1U #define num_possible_cpus() 1U #define num_enabled_cpus() 1U #define num_present_cpus() 1U #define num_active_cpus() 1U static __always_inline bool cpu_online(unsigned int cpu) { return cpu == 0; } static __always_inline bool cpu_possible(unsigned int cpu) { return cpu == 0; } static __always_inline bool cpu_enabled(unsigned int cpu) { return cpu == 0; } static __always_inline bool cpu_present(unsigned int cpu) { return cpu == 0; } static __always_inline bool cpu_active(unsigned int cpu) { return cpu == 0; } static __always_inline bool cpu_dying(unsigned int cpu) { return false; } #endif /* NR_CPUS > 1 */ #define cpu_is_offline(cpu) unlikely(!cpu_online(cpu)) #if NR_CPUS <= BITS_PER_LONG #define CPU_BITS_ALL \ { \ [BITS_TO_LONGS(NR_CPUS)-1] = BITMAP_LAST_WORD_MASK(NR_CPUS) \ } #else /* NR_CPUS > BITS_PER_LONG */ #define CPU_BITS_ALL \ { \ [0 ... BITS_TO_LONGS(NR_CPUS)-2] = ~0UL, \ [BITS_TO_LONGS(NR_CPUS)-1] = BITMAP_LAST_WORD_MASK(NR_CPUS) \ } #endif /* NR_CPUS > BITS_PER_LONG */ /** * cpumap_print_to_pagebuf - copies the cpumask into the buffer either * as comma-separated list of cpus or hex values of cpumask * @list: indicates whether the cpumap must be list * @mask: the cpumask to copy * @buf: the buffer to copy into * * Return: the length of the (null-terminated) @buf string, zero if * nothing is copied. */ static __always_inline ssize_t cpumap_print_to_pagebuf(bool list, char *buf, const struct cpumask *mask) { return bitmap_print_to_pagebuf(list, buf, cpumask_bits(mask), nr_cpu_ids); } /** * cpumap_print_bitmask_to_buf - copies the cpumask into the buffer as * hex values of cpumask * * @buf: the buffer to copy into * @mask: the cpumask to copy * @off: in the string from which we are copying, we copy to @buf * @count: the maximum number of bytes to print * * The function prints the cpumask into the buffer as hex values of * cpumask; Typically used by bin_attribute to export cpumask bitmask * ABI. * * Return: the length of how many bytes have been copied, excluding * terminating '\0'. */ static __always_inline ssize_t cpumap_print_bitmask_to_buf(char *buf, const struct cpumask *mask, loff_t off, size_t count) { return bitmap_print_bitmask_to_buf(buf, cpumask_bits(mask), nr_cpu_ids, off, count) - 1; } /** * cpumap_print_list_to_buf - copies the cpumask into the buffer as * comma-separated list of cpus * @buf: the buffer to copy into * @mask: the cpumask to copy * @off: in the string from which we are copying, we copy to @buf * @count: the maximum number of bytes to print * * Everything is same with the above cpumap_print_bitmask_to_buf() * except the print format. * * Return: the length of how many bytes have been copied, excluding * terminating '\0'. */ static __always_inline ssize_t cpumap_print_list_to_buf(char *buf, const struct cpumask *mask, loff_t off, size_t count) { return bitmap_print_list_to_buf(buf, cpumask_bits(mask), nr_cpu_ids, off, count) - 1; } #if NR_CPUS <= BITS_PER_LONG #define CPU_MASK_ALL \ (cpumask_t) { { \ [BITS_TO_LONGS(NR_CPUS)-1] = BITMAP_LAST_WORD_MASK(NR_CPUS) \ } } #else #define CPU_MASK_ALL \ (cpumask_t) { { \ [0 ... BITS_TO_LONGS(NR_CPUS)-2] = ~0UL, \ [BITS_TO_LONGS(NR_CPUS)-1] = BITMAP_LAST_WORD_MASK(NR_CPUS) \ } } #endif /* NR_CPUS > BITS_PER_LONG */ #define CPU_MASK_NONE \ (cpumask_t) { { \ [0 ... BITS_TO_LONGS(NR_CPUS)-1] = 0UL \ } } #define CPU_MASK_CPU0 \ (cpumask_t) { { \ [0] = 1UL \ } } /* * Provide a valid theoretical max size for cpumap and cpulist sysfs files * to avoid breaking userspace which may allocate a buffer based on the size * reported by e.g. fstat. * * for cpumap NR_CPUS * 9/32 - 1 should be an exact length. * * For cpulist 7 is (ceil(log10(NR_CPUS)) + 1) allowing for NR_CPUS to be up * to 2 orders of magnitude larger than 8192. And then we divide by 2 to * cover a worst-case of every other cpu being on one of two nodes for a * very large NR_CPUS. * * Use PAGE_SIZE as a minimum for smaller configurations while avoiding * unsigned comparison to -1. */ #define CPUMAP_FILE_MAX_BYTES (((NR_CPUS * 9)/32 > PAGE_SIZE) \ ? (NR_CPUS * 9)/32 - 1 : PAGE_SIZE) #define CPULIST_FILE_MAX_BYTES (((NR_CPUS * 7)/2 > PAGE_SIZE) ? (NR_CPUS * 7)/2 : PAGE_SIZE) #endif /* __LINUX_CPUMASK_H */
241 275 240 1 1 241 11 11 241 11 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 1517 1518 1519 1520 1521 1522 1523 1524 1525 1526 1527 1528 1529 1530 1531 1532 1533 1534 1535 1536 1537 1538 1539 1540 1541 1542 1543 1544 1545 1546 1547 1548 1549 1550 1551 1552 1553 1554 1555 1556 1557 1558 1559 1560 1561 1562 1563 1564 1565 1566 1567 1568 1569 1570 1571 1572 1573 1574 1575 1576 1577 1578 1579 1580 1581 1582 1583 1584 1585 1586 1587 1588 1589 1590 1591 1592 1593 1594 1595 1596 1597 1598 1599 1600 1601 1602 1603 1604 1605 1606 1607 1608 1609 1610 1611 1612 1613 1614 1615 1616 1617 1618 1619 1620 1621 1622 1623 1624 1625 1626 1627 1628 1629 1630 1631 1632 1633 1634 1635 1636 1637 1638 1639 1640 1641 1642 1643 1644 1645 1646 1647 1648 1649 1650 1651 1652 1653 1654 1655 1656 1657 1658 1659 1660 1661 1662 1663 1664 1665 1666 1667 1668 1669 1670 1671 1672 1673 1674 1675 1676 1677 1678 1679 1680 1681 1682 1683 1684 1685 1686 1687 1688 1689 1690 1691 1692 1693 1694 1695 1696 1697 1698 1699 1700 1701 1702 1703 1704 1705 1706 1707 1708 1709 1710 1711 1712 1713 1714 1715 1716 1717 1718 1719 1720 1721 1722 1723 1724 1725 1726 1727 1728 1729 1730 1731 1732 1733 1734 1735 1736 1737 1738 1739 1740 1741 1742 1743 1744 1745 1746 1747 1748 1749 1750 1751 1752 1753 1754 1755 1756 1757 1758 1759 1760 1761 1762 1763 1764 1765 1766 1767 1768 1769 1770 1771 1772 1773 1774 1775 1776 1777 1778 1779 1780 1781 1782 1783 1784 1785 1786 1787 1788 1789 1790 1791 1792 1793 1794 1795 1796 1797 1798 1799 1800 1801 1802 1803 1804 1805 1806 1807 1808 1809 1810 1811 1812 1813 1814 1815 1816 1817 1818 1819 1820 1821 1822 1823 1824 1825 1826 1827 1828 1829 1830 1831 1832 1833 1834 1835 1836 1837 1838 1839 1840 1841 1842 1843 1844 1845 1846 1847 1848 1849 1850 1851 1852 1853 1854 1855 1856 1857 1858 1859 1860 1861 1862 1863 1864 1865 1866 1867 1868 1869 1870 1871 1872 1873 1874 1875 1876 1877 1878 1879 1880 1881 1882 1883 1884 1885 1886 1887 1888 1889 1890 1891 1892 1893 1894 1895 1896 1897 1898 1899 1900 1901 1902 1903 1904 1905 1906 1907 1908 1909 1910 1911 1912 1913 1914 1915 1916 1917 1918 1919 1920 1921 1922 1923 1924 1925 1926 1927 1928 1929 1930 1931 1932 1933 1934 1935 1936 1937 1938 1939 1940 1941 1942 1943 1944 1945 1946 1947 1948 1949 1950 1951 1952 1953 1954 1955 1956 1957 1958 1959 1960 1961 1962 1963 1964 1965 1966 1967 1968 1969 1970 1971 1972 1973 1974 1975 1976 1977 1978 1979 1980 1981 1982 1983 1984 1985 1986 1987 1988 1989 1990 1991 1992 1993 1994 1995 1996 1997 1998 1999 2000 2001 2002 2003 2004 2005 2006 2007 2008 2009 2010 2011 2012 2013 2014 2015 2016 2017 2018 2019 2020 2021 2022 2023 2024 2025 2026 2027 2028 2029 2030 2031 2032 2033 2034 2035 2036 2037 2038 2039 2040 2041 2042 2043 2044 2045 2046 2047 2048 2049 2050 2051 2052 2053 2054 2055 2056 2057 2058 2059 2060 2061 2062 2063 2064 2065 2066 2067 2068 2069 2070 2071 2072 2073 2074 2075 2076 2077 2078 2079 2080 2081 2082 2083 2084 2085 2086 2087 2088 2089 2090 2091 2092 2093 2094 2095 2096 2097 2098 2099 2100 2101 2102 2103 2104 2105 2106 2107 2108 2109 2110 2111 2112 2113 2114 2115 2116 2117 2118 2119 2120 2121 2122 2123 2124 2125 2126 2127 2128 2129 2130 2131 2132 2133 2134 2135 2136 2137 2138 2139 2140 2141 2142 2143 2144 2145 2146 2147 2148 2149 2150 2151 2152 2153 2154 2155 2156 2157 2158 2159 2160 2161 2162 2163 2164 2165 2166 2167 2168 2169 2170 2171 2172 2173 2174 2175 2176 2177 2178 2179 2180 2181 2182 2183 2184 2185 2186 2187 2188 2189 2190 2191 2192 2193 2194 2195 2196 2197 2198 2199 2200 2201 2202 2203 2204 2205 2206 2207 2208 2209 2210 2211 2212 2213 2214 2215 2216 2217 2218 2219 2220 2221 2222 2223 2224 2225 2226 2227 2228 2229 2230 2231 2232 2233 2234 2235 2236 2237 2238 2239 2240 2241 2242 2243 2244 2245 2246 2247 2248 2249 2250 2251 2252 2253 2254 2255 2256 2257 2258 2259 2260 2261 2262 2263 2264 2265 2266 2267 2268 2269 2270 2271 2272 2273 2274 2275 2276 2277 2278 2279 2280 2281 2282 2283 2284 2285 2286 2287 2288 2289 2290 2291 2292 2293 2294 2295 2296 2297 2298 2299 2300 2301 2302 2303 2304 2305 2306 2307 2308 2309 2310 2311 2312 2313 2314 2315 2316 2317 2318 2319 2320 2321 2322 2323 2324 2325 2326 2327 2328 2329 2330 2331 2332 2333 2334 2335 2336 2337 2338 2339 2340 2341 2342 2343 2344 2345 2346 2347 2348 2349 2350 2351 2352 2353 2354 2355 2356 2357 2358 2359 2360 2361 2362 2363 2364 2365 2366 2367 2368 2369 2370 2371 2372 2373 2374 2375 2376 2377 2378 2379 2380 2381 2382 2383 2384 2385 2386 2387 2388 2389 2390 2391 2392 2393 2394 2395 2396 2397 2398 2399 2400 2401 2402 2403 2404 2405 2406 2407 2408 2409 2410 2411 2412 2413 2414 2415 2416 2417 2418 2419 2420 2421 2422 2423 2424 2425 2426 2427 2428 2429 2430 2431 2432 2433 2434 2435 2436 2437 2438 2439 2440 2441 2442 2443 2444 2445 2446 2447 2448 2449 2450 2451 2452 2453 2454 2455 2456 2457 2458 2459 2460 2461 2462 2463 2464 2465 2466 2467 2468 2469 2470 2471 2472 2473 2474 2475 2476 2477 2478 2479 2480 2481 2482 2483 2484 2485 2486 2487 2488 2489 2490 2491 2492 2493 2494 2495 2496 2497 2498 2499 2500 2501 2502 2503 2504 2505 2506 2507 2508 2509 2510 2511 2512 2513 2514 2515 2516 2517 2518 2519 2520 2521 2522 2523 2524 2525 2526 2527 2528 2529 2530 2531 2532 2533 2534 2535 2536 2537 2538 2539 2540 2541 2542 2543 2544 2545 2546 2547 2548 2549 2550 2551 2552 2553 2554 2555 2556 2557 2558 2559 2560 2561 2562 2563 2564 2565 2566 2567 2568 2569 2570 2571 2572 2573 2574 2575 2576 2577 2578 2579 2580 2581 2582 2583 2584 2585 2586 2587 2588 2589 2590 2591 2592 2593 2594 2595 2596 2597 2598 2599 2600 2601 2602 2603 2604 2605 2606 2607 2608 2609 2610 2611 2612 2613 2614 2615 2616 2617 2618 2619 2620 2621 2622 2623 2624 2625 2626 2627 2628 2629 2630 2631 2632 2633 2634 2635 2636 2637 2638 2639 2640 2641 2642 2643 2644 2645 2646 2647 2648 2649 2650 2651 2652 2653 2654 2655 2656 2657 2658 2659 2660 2661 2662 2663 2664 2665 2666 2667 2668 2669 2670 2671 2672 2673 2674 2675 2676 2677 2678 2679 2680 2681 2682 2683 2684 2685 2686 2687 2688 2689 2690 2691 2692 2693 2694 2695 2696 2697 2698 2699 2700 2701 2702 2703 2704 2705 2706 2707 2708 2709 2710 2711 2712 2713 2714 2715 2716 2717 2718 2719 2720 2721 2722 2723 2724 2725 2726 2727 2728 2729 2730 2731 2732 2733 2734 2735 2736 2737 2738 2739 2740 2741 2742 2743 2744 2745 2746 2747 2748 2749 2750 2751 2752 2753 2754 2755 2756 2757 2758 2759 2760 2761 2762 2763 2764 2765 2766 2767 2768 2769 2770 2771 2772 2773 2774 2775 2776 2777 2778 2779 2780 2781 2782 2783 2784 2785 2786 2787 2788 2789 2790 2791 2792 2793 2794 2795 2796 2797 2798 2799 2800 2801 2802 2803 2804 2805 2806 2807 2808 2809 2810 2811 2812 2813 2814 2815 2816 2817 2818 2819 2820 2821 2822 2823 2824 2825 2826 2827 2828 2829 2830 2831 2832 2833 2834 2835 2836 2837 2838 2839 2840 2841 2842 2843 2844 2845 2846 2847 2848 2849 2850 2851 2852 2853 2854 2855 2856 2857 2858 2859 2860 2861 2862 2863 2864 2865 2866 2867 2868 2869 2870 2871 2872 2873 2874 2875 2876 2877 2878 2879 2880 2881 2882 2883 2884 2885 2886 2887 2888 2889 2890 2891 2892 2893 2894 2895 2896 2897 2898 2899 2900 2901 2902 2903 2904 2905 2906 2907 2908 2909 2910 2911 2912 2913 2914 2915 2916 2917 2918 2919 2920 2921 2922 2923 2924 2925 2926 2927 2928 2929 2930 2931 2932 2933 2934 2935 2936 2937 2938 2939 2940 2941 2942 2943 2944 2945 2946 2947 2948 2949 2950 2951 2952 2953 2954 2955 2956 2957 2958 2959 2960 2961 2962 2963 2964 2965 2966 2967 2968 2969 2970 2971 2972 2973 2974 2975 2976 2977 2978 2979 2980 2981 2982 2983 2984 2985 2986 2987 2988 2989 2990 2991 2992 2993 2994 2995 2996 2997 2998 2999 3000 3001 3002 3003 3004 3005 3006 3007 3008 3009 3010 3011 3012 3013 3014 3015 3016 3017 3018 3019 3020 3021 3022 3023 3024 3025 3026 3027 3028 3029 /* SPDX-License-Identifier: GPL-2.0-or-later */ /* * INET An implementation of the TCP/IP protocol suite for the LINUX * operating system. INET is implemented using the BSD Socket * interface as the means of communication with the user level. * * Definitions for the AF_INET socket handler. * * Version: @(#)sock.h 1.0.4 05/13/93 * * Authors: Ross Biro * Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG> * Corey Minyard <wf-rch!minyard@relay.EU.net> * Florian La Roche <flla@stud.uni-sb.de> * * Fixes: * Alan Cox : Volatiles in skbuff pointers. See * skbuff comments. May be overdone, * better to prove they can be removed * than the reverse. * Alan Cox : Added a zapped field for tcp to note * a socket is reset and must stay shut up * Alan Cox : New fields for options * Pauline Middelink : identd support * Alan Cox : Eliminate low level recv/recvfrom * David S. Miller : New socket lookup architecture. * Steve Whitehouse: Default routines for sock_ops * Arnaldo C. Melo : removed net_pinfo, tp_pinfo and made * protinfo be just a void pointer, as the * protocol specific parts were moved to * respective headers and ipv4/v6, etc now * use private slabcaches for its socks * Pedro Hortas : New flags field for socket options */ #ifndef _SOCK_H #define _SOCK_H #include <linux/hardirq.h> #include <linux/kernel.h> #include <linux/list.h> #include <linux/list_nulls.h> #include <linux/timer.h> #include <linux/cache.h> #include <linux/bitops.h> #include <linux/lockdep.h> #include <linux/netdevice.h> #include <linux/skbuff.h> /* struct sk_buff */ #include <linux/mm.h> #include <linux/security.h> #include <linux/slab.h> #include <linux/uaccess.h> #include <linux/page_counter.h> #include <linux/memcontrol.h> #include <linux/static_key.h> #include <linux/sched.h> #include <linux/wait.h> #include <linux/cgroup-defs.h> #include <linux/rbtree.h> #include <linux/rculist_nulls.h> #include <linux/poll.h> #include <linux/sockptr.h> #include <linux/indirect_call_wrapper.h> #include <linux/atomic.h> #include <linux/refcount.h> #include <linux/llist.h> #include <net/dst.h> #include <net/checksum.h> #include <net/tcp_states.h> #include <linux/net_tstamp.h> #include <net/l3mdev.h> #include <uapi/linux/socket.h> /* * This structure really needs to be cleaned up. * Most of it is for TCP, and not used by any of * the other protocols. */ /* This is the per-socket lock. The spinlock provides a synchronization * between user contexts and software interrupt processing, whereas the * mini-semaphore synchronizes multiple users amongst themselves. */ typedef struct { spinlock_t slock; int owned; wait_queue_head_t wq; /* * We express the mutex-alike socket_lock semantics * to the lock validator by explicitly managing * the slock as a lock variant (in addition to * the slock itself): */ #ifdef CONFIG_DEBUG_LOCK_ALLOC struct lockdep_map dep_map; #endif } socket_lock_t; struct sock; struct proto; struct net; typedef __u32 __bitwise __portpair; typedef __u64 __bitwise __addrpair; /** * struct sock_common - minimal network layer representation of sockets * @skc_daddr: Foreign IPv4 addr * @skc_rcv_saddr: Bound local IPv4 addr * @skc_addrpair: 8-byte-aligned __u64 union of @skc_daddr & @skc_rcv_saddr * @skc_hash: hash value used with various protocol lookup tables * @skc_u16hashes: two u16 hash values used by UDP lookup tables * @skc_dport: placeholder for inet_dport/tw_dport * @skc_num: placeholder for inet_num/tw_num * @skc_portpair: __u32 union of @skc_dport & @skc_num * @skc_family: network address family * @skc_state: Connection state * @skc_reuse: %SO_REUSEADDR setting * @skc_reuseport: %SO_REUSEPORT setting * @skc_ipv6only: socket is IPV6 only * @skc_net_refcnt: socket is using net ref counting * @skc_bound_dev_if: bound device index if != 0 * @skc_bind_node: bind hash linkage for various protocol lookup tables * @skc_portaddr_node: second hash linkage for UDP/UDP-Lite protocol * @skc_prot: protocol handlers inside a network family * @skc_net: reference to the network namespace of this socket * @skc_v6_daddr: IPV6 destination address * @skc_v6_rcv_saddr: IPV6 source address * @skc_cookie: socket's cookie value * @skc_node: main hash linkage for various protocol lookup tables * @skc_nulls_node: main hash linkage for TCP/UDP/UDP-Lite protocol * @skc_tx_queue_mapping: tx queue number for this connection * @skc_rx_queue_mapping: rx queue number for this connection * @skc_flags: place holder for sk_flags * %SO_LINGER (l_onoff), %SO_BROADCAST, %SO_KEEPALIVE, * %SO_OOBINLINE settings, %SO_TIMESTAMPING settings * @skc_listener: connection request listener socket (aka rsk_listener) * [union with @skc_flags] * @skc_tw_dr: (aka tw_dr) ptr to &struct inet_timewait_death_row * [union with @skc_flags] * @skc_incoming_cpu: record/match cpu processing incoming packets * @skc_rcv_wnd: (aka rsk_rcv_wnd) TCP receive window size (possibly scaled) * [union with @skc_incoming_cpu] * @skc_tw_rcv_nxt: (aka tw_rcv_nxt) TCP window next expected seq number * [union with @skc_incoming_cpu] * @skc_refcnt: reference count * * This is the minimal network layer representation of sockets, the header * for struct sock and struct inet_timewait_sock. */ struct sock_common { union { __addrpair skc_addrpair; struct { __be32 skc_daddr; __be32 skc_rcv_saddr; }; }; union { unsigned int skc_hash; __u16 skc_u16hashes[2]; }; /* skc_dport && skc_num must be grouped as well */ union { __portpair skc_portpair; struct { __be16 skc_dport; __u16 skc_num; }; }; unsigned short skc_family; volatile unsigned char skc_state; unsigned char skc_reuse:4; unsigned char skc_reuseport:1; unsigned char skc_ipv6only:1; unsigned char skc_net_refcnt:1; int skc_bound_dev_if; union { struct hlist_node skc_bind_node; struct hlist_node skc_portaddr_node; }; struct proto *skc_prot; possible_net_t skc_net; #if IS_ENABLED(CONFIG_IPV6) struct in6_addr skc_v6_daddr; struct in6_addr skc_v6_rcv_saddr; #endif atomic64_t skc_cookie; /* following fields are padding to force * offset(struct sock, sk_refcnt) == 128 on 64bit arches * assuming IPV6 is enabled. We use this padding differently * for different kind of 'sockets' */ union { unsigned long skc_flags; struct sock *skc_listener; /* request_sock */ struct inet_timewait_death_row *skc_tw_dr; /* inet_timewait_sock */ }; /* * fields between dontcopy_begin/dontcopy_end * are not copied in sock_copy() */ /* private: */ int skc_dontcopy_begin[0]; /* public: */ union { struct hlist_node skc_node; struct hlist_nulls_node skc_nulls_node; }; unsigned short skc_tx_queue_mapping; #ifdef CONFIG_SOCK_RX_QUEUE_MAPPING unsigned short skc_rx_queue_mapping; #endif union { int skc_incoming_cpu; u32 skc_rcv_wnd; u32 skc_tw_rcv_nxt; /* struct tcp_timewait_sock */ }; refcount_t skc_refcnt; /* private: */ int skc_dontcopy_end[0]; union { u32 skc_rxhash; u32 skc_window_clamp; u32 skc_tw_snd_nxt; /* struct tcp_timewait_sock */ }; /* public: */ }; struct bpf_local_storage; struct sk_filter; /** * struct sock - network layer representation of sockets * @__sk_common: shared layout with inet_timewait_sock * @sk_shutdown: mask of %SEND_SHUTDOWN and/or %RCV_SHUTDOWN * @sk_userlocks: %SO_SNDBUF and %SO_RCVBUF settings * @sk_lock: synchronizer * @sk_kern_sock: True if sock is using kernel lock classes * @sk_rcvbuf: size of receive buffer in bytes * @sk_wq: sock wait queue and async head * @sk_rx_dst: receive input route used by early demux * @sk_rx_dst_ifindex: ifindex for @sk_rx_dst * @sk_rx_dst_cookie: cookie for @sk_rx_dst * @sk_dst_cache: destination cache * @sk_dst_pending_confirm: need to confirm neighbour * @sk_policy: flow policy * @sk_receive_queue: incoming packets * @sk_wmem_alloc: transmit queue bytes committed * @sk_tsq_flags: TCP Small Queues flags * @sk_write_queue: Packet sending queue * @sk_omem_alloc: "o" is "option" or "other" * @sk_wmem_queued: persistent queue size * @sk_forward_alloc: space allocated forward * @sk_reserved_mem: space reserved and non-reclaimable for the socket * @sk_napi_id: id of the last napi context to receive data for sk * @sk_ll_usec: usecs to busypoll when there is no data * @sk_allocation: allocation mode * @sk_pacing_rate: Pacing rate (if supported by transport/packet scheduler) * @sk_pacing_status: Pacing status (requested, handled by sch_fq) * @sk_max_pacing_rate: Maximum pacing rate (%SO_MAX_PACING_RATE) * @sk_sndbuf: size of send buffer in bytes * @sk_no_check_tx: %SO_NO_CHECK setting, set checksum in TX packets * @sk_no_check_rx: allow zero checksum in RX packets * @sk_route_caps: route capabilities (e.g. %NETIF_F_TSO) * @sk_gso_disabled: if set, NETIF_F_GSO_MASK is forbidden. * @sk_gso_type: GSO type (e.g. %SKB_GSO_TCPV4) * @sk_gso_max_size: Maximum GSO segment size to build * @sk_gso_max_segs: Maximum number of GSO segments * @sk_pacing_shift: scaling factor for TCP Small Queues * @sk_lingertime: %SO_LINGER l_linger setting * @sk_backlog: always used with the per-socket spinlock held * @sk_callback_lock: used with the callbacks in the end of this struct * @sk_error_queue: rarely used * @sk_prot_creator: sk_prot of original sock creator (see ipv6_setsockopt, * IPV6_ADDRFORM for instance) * @sk_err: last error * @sk_err_soft: errors that don't cause failure but are the cause of a * persistent failure not just 'timed out' * @sk_drops: raw/udp drops counter * @sk_ack_backlog: current listen backlog * @sk_max_ack_backlog: listen backlog set in listen() * @sk_uid: user id of owner * @sk_prefer_busy_poll: prefer busypolling over softirq processing * @sk_busy_poll_budget: napi processing budget when busypolling * @sk_priority: %SO_PRIORITY setting * @sk_type: socket type (%SOCK_STREAM, etc) * @sk_protocol: which protocol this socket belongs in this network family * @sk_peer_lock: lock protecting @sk_peer_pid and @sk_peer_cred * @sk_peer_pid: &struct pid for this socket's peer * @sk_peer_cred: %SO_PEERCRED setting * @sk_rcvlowat: %SO_RCVLOWAT setting * @sk_rcvtimeo: %SO_RCVTIMEO setting * @sk_sndtimeo: %SO_SNDTIMEO setting * @sk_txhash: computed flow hash for use on transmit * @sk_txrehash: enable TX hash rethink * @sk_filter: socket filtering instructions * @sk_timer: sock cleanup timer * @sk_stamp: time stamp of last packet received * @sk_stamp_seq: lock for accessing sk_stamp on 32 bit architectures only * @sk_tsflags: SO_TIMESTAMPING flags * @sk_bpf_cb_flags: used in bpf_setsockopt() * @sk_use_task_frag: allow sk_page_frag() to use current->task_frag. * Sockets that can be used under memory reclaim should * set this to false. * @sk_bind_phc: SO_TIMESTAMPING bind PHC index of PTP virtual clock * for timestamping * @sk_tskey: counter to disambiguate concurrent tstamp requests * @sk_zckey: counter to order MSG_ZEROCOPY notifications * @sk_socket: Identd and reporting IO signals * @sk_user_data: RPC layer private data. Write-protected by @sk_callback_lock. * @sk_frag: cached page frag * @sk_peek_off: current peek_offset value * @sk_send_head: front of stuff to transmit * @tcp_rtx_queue: TCP re-transmit queue [union with @sk_send_head] * @sk_security: used by security modules * @sk_mark: generic packet mark * @sk_cgrp_data: cgroup data for this cgroup * @sk_memcg: this socket's memory cgroup association * @sk_write_pending: a write to stream socket waits to start * @sk_disconnects: number of disconnect operations performed on this sock * @sk_state_change: callback to indicate change in the state of the sock * @sk_data_ready: callback to indicate there is data to be processed * @sk_write_space: callback to indicate there is bf sending space available * @sk_error_report: callback to indicate errors (e.g. %MSG_ERRQUEUE) * @sk_backlog_rcv: callback to process the backlog * @sk_validate_xmit_skb: ptr to an optional validate function * @sk_destruct: called at sock freeing time, i.e. when all refcnt == 0 * @sk_reuseport_cb: reuseport group container * @sk_bpf_storage: ptr to cache and control for bpf_sk_storage * @sk_rcu: used during RCU grace period * @sk_clockid: clockid used by time-based scheduling (SO_TXTIME) * @sk_txtime_deadline_mode: set deadline mode for SO_TXTIME * @sk_txtime_report_errors: set report errors mode for SO_TXTIME * @sk_txtime_unused: unused txtime flags * @sk_scm_recv_flags: all flags used by scm_recv() * @sk_scm_credentials: flagged by SO_PASSCRED to recv SCM_CREDENTIALS * @sk_scm_security: flagged by SO_PASSSEC to recv SCM_SECURITY * @sk_scm_pidfd: flagged by SO_PASSPIDFD to recv SCM_PIDFD * @sk_scm_rights: flagged by SO_PASSRIGHTS to recv SCM_RIGHTS * @sk_scm_unused: unused flags for scm_recv() * @ns_tracker: tracker for netns reference * @sk_user_frags: xarray of pages the user is holding a reference on. * @sk_owner: reference to the real owner of the socket that calls * sock_lock_init_class_and_name(). */ struct sock { /* * Now struct inet_timewait_sock also uses sock_common, so please just * don't add nothing before this first member (__sk_common) --acme */ struct sock_common __sk_common; #define sk_node __sk_common.skc_node #define sk_nulls_node __sk_common.skc_nulls_node #define sk_refcnt __sk_common.skc_refcnt #define sk_tx_queue_mapping __sk_common.skc_tx_queue_mapping #ifdef CONFIG_SOCK_RX_QUEUE_MAPPING #define sk_rx_queue_mapping __sk_common.skc_rx_queue_mapping #endif #define sk_dontcopy_begin __sk_common.skc_dontcopy_begin #define sk_dontcopy_end __sk_common.skc_dontcopy_end #define sk_hash __sk_common.skc_hash #define sk_portpair __sk_common.skc_portpair #define sk_num __sk_common.skc_num #define sk_dport __sk_common.skc_dport #define sk_addrpair __sk_common.skc_addrpair #define sk_daddr __sk_common.skc_daddr #define sk_rcv_saddr __sk_common.skc_rcv_saddr #define sk_family __sk_common.skc_family #define sk_state __sk_common.skc_state #define sk_reuse __sk_common.skc_reuse #define sk_reuseport __sk_common.skc_reuseport #define sk_ipv6only __sk_common.skc_ipv6only #define sk_net_refcnt __sk_common.skc_net_refcnt #define sk_bound_dev_if __sk_common.skc_bound_dev_if #define sk_bind_node __sk_common.skc_bind_node #define sk_prot __sk_common.skc_prot #define sk_net __sk_common.skc_net #define sk_v6_daddr __sk_common.skc_v6_daddr #define sk_v6_rcv_saddr __sk_common.skc_v6_rcv_saddr #define sk_cookie __sk_common.skc_cookie #define sk_incoming_cpu __sk_common.skc_incoming_cpu #define sk_flags __sk_common.skc_flags #define sk_rxhash __sk_common.skc_rxhash __cacheline_group_begin(sock_write_rx); atomic_t sk_drops; __s32 sk_peek_off; struct sk_buff_head sk_error_queue; struct sk_buff_head sk_receive_queue; /* * The backlog queue is special, it is always used with * the per-socket spinlock held and requires low latency * access. Therefore we special case it's implementation. * Note : rmem_alloc is in this structure to fill a hole * on 64bit arches, not because its logically part of * backlog. */ struct { atomic_t rmem_alloc; int len; struct sk_buff *head; struct sk_buff *tail; } sk_backlog; #define sk_rmem_alloc sk_backlog.rmem_alloc __cacheline_group_end(sock_write_rx); __cacheline_group_begin(sock_read_rx); /* early demux fields */ struct dst_entry __rcu *sk_rx_dst; int sk_rx_dst_ifindex; u32 sk_rx_dst_cookie; #ifdef CONFIG_NET_RX_BUSY_POLL unsigned int sk_ll_usec; unsigned int sk_napi_id; u16 sk_busy_poll_budget; u8 sk_prefer_busy_poll; #endif u8 sk_userlocks; int sk_rcvbuf; struct sk_filter __rcu *sk_filter; union { struct socket_wq __rcu *sk_wq; /* private: */ struct socket_wq *sk_wq_raw; /* public: */ }; void (*sk_data_ready)(struct sock *sk); long sk_rcvtimeo; int sk_rcvlowat; __cacheline_group_end(sock_read_rx); __cacheline_group_begin(sock_read_rxtx); int sk_err; struct socket *sk_socket; struct mem_cgroup *sk_memcg; #ifdef CONFIG_XFRM struct xfrm_policy __rcu *sk_policy[2]; #endif __cacheline_group_end(sock_read_rxtx); __cacheline_group_begin(sock_write_rxtx); socket_lock_t sk_lock; u32 sk_reserved_mem; int sk_forward_alloc; u32 sk_tsflags; __cacheline_group_end(sock_write_rxtx); __cacheline_group_begin(sock_write_tx); int sk_write_pending; atomic_t sk_omem_alloc; int sk_sndbuf; int sk_wmem_queued; refcount_t sk_wmem_alloc; unsigned long sk_tsq_flags; union { struct sk_buff *sk_send_head; struct rb_root tcp_rtx_queue; }; struct sk_buff_head sk_write_queue; u32 sk_dst_pending_confirm; u32 sk_pacing_status; /* see enum sk_pacing */ struct page_frag sk_frag; struct timer_list sk_timer; unsigned long sk_pacing_rate; /* bytes per second */ atomic_t sk_zckey; atomic_t sk_tskey; __cacheline_group_end(sock_write_tx); __cacheline_group_begin(sock_read_tx); unsigned long sk_max_pacing_rate; long sk_sndtimeo; u32 sk_priority; u32 sk_mark; struct dst_entry __rcu *sk_dst_cache; netdev_features_t sk_route_caps; #ifdef CONFIG_SOCK_VALIDATE_XMIT struct sk_buff* (*sk_validate_xmit_skb)(struct sock *sk, struct net_device *dev, struct sk_buff *skb); #endif u16 sk_gso_type; u16 sk_gso_max_segs; unsigned int sk_gso_max_size; gfp_t sk_allocation; u32 sk_txhash; u8 sk_pacing_shift; bool sk_use_task_frag; __cacheline_group_end(sock_read_tx); /* * Because of non atomicity rules, all * changes are protected by socket lock. */ u8 sk_gso_disabled : 1, sk_kern_sock : 1, sk_no_check_tx : 1, sk_no_check_rx : 1; u8 sk_shutdown; u16 sk_type; u16 sk_protocol; unsigned long sk_lingertime; struct proto *sk_prot_creator; rwlock_t sk_callback_lock; int sk_err_soft; u32 sk_ack_backlog; u32 sk_max_ack_backlog; kuid_t sk_uid; spinlock_t sk_peer_lock; int sk_bind_phc; struct pid *sk_peer_pid; const struct cred *sk_peer_cred; ktime_t sk_stamp; #if BITS_PER_LONG==32 seqlock_t sk_stamp_seq; #endif int sk_disconnects; union { u8 sk_txrehash; u8 sk_scm_recv_flags; struct { u8 sk_scm_credentials : 1, sk_scm_security : 1, sk_scm_pidfd : 1, sk_scm_rights : 1, sk_scm_unused : 4; }; }; u8 sk_clockid; u8 sk_txtime_deadline_mode : 1, sk_txtime_report_errors : 1, sk_txtime_unused : 6; #define SK_BPF_CB_FLAG_TEST(SK, FLAG) ((SK)->sk_bpf_cb_flags & (FLAG)) u8 sk_bpf_cb_flags; void *sk_user_data; #ifdef CONFIG_SECURITY void *sk_security; #endif struct sock_cgroup_data sk_cgrp_data; void (*sk_state_change)(struct sock *sk); void (*sk_write_space)(struct sock *sk); void (*sk_error_report)(struct sock *sk); int (*sk_backlog_rcv)(struct sock *sk, struct sk_buff *skb); void (*sk_destruct)(struct sock *sk); struct sock_reuseport __rcu *sk_reuseport_cb; #ifdef CONFIG_BPF_SYSCALL struct bpf_local_storage __rcu *sk_bpf_storage; #endif struct rcu_head sk_rcu; netns_tracker ns_tracker; struct xarray sk_user_frags; #if IS_ENABLED(CONFIG_PROVE_LOCKING) && IS_ENABLED(CONFIG_MODULES) struct module *sk_owner; #endif }; struct sock_bh_locked { struct sock *sock; local_lock_t bh_lock; }; enum sk_pacing { SK_PACING_NONE = 0, SK_PACING_NEEDED = 1, SK_PACING_FQ = 2, }; /* flag bits in sk_user_data * * - SK_USER_DATA_NOCOPY: Pointer stored in sk_user_data might * not be suitable for copying when cloning the socket. For instance, * it can point to a reference counted object. sk_user_data bottom * bit is set if pointer must not be copied. * * - SK_USER_DATA_BPF: Mark whether sk_user_data field is * managed/owned by a BPF reuseport array. This bit should be set * when sk_user_data's sk is added to the bpf's reuseport_array. * * - SK_USER_DATA_PSOCK: Mark whether pointer stored in * sk_user_data points to psock type. This bit should be set * when sk_user_data is assigned to a psock object. */ #define SK_USER_DATA_NOCOPY 1UL #define SK_USER_DATA_BPF 2UL #define SK_USER_DATA_PSOCK 4UL #define SK_USER_DATA_PTRMASK ~(SK_USER_DATA_NOCOPY | SK_USER_DATA_BPF |\ SK_USER_DATA_PSOCK) /** * sk_user_data_is_nocopy - Test if sk_user_data pointer must not be copied * @sk: socket */ static inline bool sk_user_data_is_nocopy(const struct sock *sk) { return ((uintptr_t)sk->sk_user_data & SK_USER_DATA_NOCOPY); } #define __sk_user_data(sk) ((*((void __rcu **)&(sk)->sk_user_data))) /** * __locked_read_sk_user_data_with_flags - return the pointer * only if argument flags all has been set in sk_user_data. Otherwise * return NULL * * @sk: socket * @flags: flag bits * * The caller must be holding sk->sk_callback_lock. */ static inline void * __locked_read_sk_user_data_with_flags(const struct sock *sk, uintptr_t flags) { uintptr_t sk_user_data = (uintptr_t)rcu_dereference_check(__sk_user_data(sk), lockdep_is_held(&sk->sk_callback_lock)); WARN_ON_ONCE(flags & SK_USER_DATA_PTRMASK); if ((sk_user_data & flags) == flags) return (void *)(sk_user_data & SK_USER_DATA_PTRMASK); return NULL; } /** * __rcu_dereference_sk_user_data_with_flags - return the pointer * only if argument flags all has been set in sk_user_data. Otherwise * return NULL * * @sk: socket * @flags: flag bits */ static inline void * __rcu_dereference_sk_user_data_with_flags(const struct sock *sk, uintptr_t flags) { uintptr_t sk_user_data = (uintptr_t)rcu_dereference(__sk_user_data(sk)); WARN_ON_ONCE(flags & SK_USER_DATA_PTRMASK); if ((sk_user_data & flags) == flags) return (void *)(sk_user_data & SK_USER_DATA_PTRMASK); return NULL; } #define rcu_dereference_sk_user_data(sk) \ __rcu_dereference_sk_user_data_with_flags(sk, 0) #define __rcu_assign_sk_user_data_with_flags(sk, ptr, flags) \ ({ \ uintptr_t __tmp1 = (uintptr_t)(ptr), \ __tmp2 = (uintptr_t)(flags); \ WARN_ON_ONCE(__tmp1 & ~SK_USER_DATA_PTRMASK); \ WARN_ON_ONCE(__tmp2 & SK_USER_DATA_PTRMASK); \ rcu_assign_pointer(__sk_user_data((sk)), \ __tmp1 | __tmp2); \ }) #define rcu_assign_sk_user_data(sk, ptr) \ __rcu_assign_sk_user_data_with_flags(sk, ptr, 0) static inline struct net *sock_net(const struct sock *sk) { return read_pnet(&sk->sk_net); } static inline void sock_net_set(struct sock *sk, struct net *net) { write_pnet(&sk->sk_net, net); } /* * SK_CAN_REUSE and SK_NO_REUSE on a socket mean that the socket is OK * or not whether his port will be reused by someone else. SK_FORCE_REUSE * on a socket means that the socket will reuse everybody else's port * without looking at the other's sk_reuse value. */ #define SK_NO_REUSE 0 #define SK_CAN_REUSE 1 #define SK_FORCE_REUSE 2 int sk_set_peek_off(struct sock *sk, int val); static inline int sk_peek_offset(const struct sock *sk, int flags) { if (unlikely(flags & MSG_PEEK)) { return READ_ONCE(sk->sk_peek_off); } return 0; } static inline void sk_peek_offset_bwd(struct sock *sk, int val) { s32 off = READ_ONCE(sk->sk_peek_off); if (unlikely(off >= 0)) { off = max_t(s32, off - val, 0); WRITE_ONCE(sk->sk_peek_off, off); } } static inline void sk_peek_offset_fwd(struct sock *sk, int val) { sk_peek_offset_bwd(sk, -val); } /* * Hashed lists helper routines */ static inline struct sock *sk_entry(const struct hlist_node *node) { return hlist_entry(node, struct sock, sk_node); } static inline struct sock *__sk_head(const struct hlist_head *head) { return hlist_entry(head->first, struct sock, sk_node); } static inline struct sock *sk_head(const struct hlist_head *head) { return hlist_empty(head) ? NULL : __sk_head(head); } static inline struct sock *__sk_nulls_head(const struct hlist_nulls_head *head) { return hlist_nulls_entry(head->first, struct sock, sk_nulls_node); } static inline struct sock *sk_nulls_head(const struct hlist_nulls_head *head) { return hlist_nulls_empty(head) ? NULL : __sk_nulls_head(head); } static inline struct sock *sk_next(const struct sock *sk) { return hlist_entry_safe(sk->sk_node.next, struct sock, sk_node); } static inline struct sock *sk_nulls_next(const struct sock *sk) { return (!is_a_nulls(sk->sk_nulls_node.next)) ? hlist_nulls_entry(sk->sk_nulls_node.next, struct sock, sk_nulls_node) : NULL; } static inline bool sk_unhashed(const struct sock *sk) { return hlist_unhashed(&sk->sk_node); } static inline bool sk_hashed(const struct sock *sk) { return !sk_unhashed(sk); } static inline void sk_node_init(struct hlist_node *node) { node->pprev = NULL; } static inline void __sk_del_node(struct sock *sk) { __hlist_del(&sk->sk_node); } /* NB: equivalent to hlist_del_init_rcu */ static inline bool __sk_del_node_init(struct sock *sk) { if (sk_hashed(sk)) { __sk_del_node(sk); sk_node_init(&sk->sk_node); return true; } return false; } /* Grab socket reference count. This operation is valid only when sk is ALREADY grabbed f.e. it is found in hash table or a list and the lookup is made under lock preventing hash table modifications. */ static __always_inline void sock_hold(struct sock *sk) { refcount_inc(&sk->sk_refcnt); } /* Ungrab socket in the context, which assumes that socket refcnt cannot hit zero, f.e. it is true in context of any socketcall. */ static __always_inline void __sock_put(struct sock *sk) { refcount_dec(&sk->sk_refcnt); } static inline bool sk_del_node_init(struct sock *sk) { bool rc = __sk_del_node_init(sk); if (rc) { /* paranoid for a while -acme */ WARN_ON(refcount_read(&sk->sk_refcnt) == 1); __sock_put(sk); } return rc; } #define sk_del_node_init_rcu(sk) sk_del_node_init(sk) static inline bool __sk_nulls_del_node_init_rcu(struct sock *sk) { if (sk_hashed(sk)) { hlist_nulls_del_init_rcu(&sk->sk_nulls_node); return true; } return false; } static inline bool sk_nulls_del_node_init_rcu(struct sock *sk) { bool rc = __sk_nulls_del_node_init_rcu(sk); if (rc) { /* paranoid for a while -acme */ WARN_ON(refcount_read(&sk->sk_refcnt) == 1); __sock_put(sk); } return rc; } static inline void __sk_add_node(struct sock *sk, struct hlist_head *list) { hlist_add_head(&sk->sk_node, list); } static inline void sk_add_node(struct sock *sk, struct hlist_head *list) { sock_hold(sk); __sk_add_node(sk, list); } static inline void sk_add_node_rcu(struct sock *sk, struct hlist_head *list) { sock_hold(sk); if (IS_ENABLED(CONFIG_IPV6) && sk->sk_reuseport && sk->sk_family == AF_INET6) hlist_add_tail_rcu(&sk->sk_node, list); else hlist_add_head_rcu(&sk->sk_node, list); } static inline void sk_add_node_tail_rcu(struct sock *sk, struct hlist_head *list) { sock_hold(sk); hlist_add_tail_rcu(&sk->sk_node, list); } static inline void __sk_nulls_add_node_rcu(struct sock *sk, struct hlist_nulls_head *list) { hlist_nulls_add_head_rcu(&sk->sk_nulls_node, list); } static inline void __sk_nulls_add_node_tail_rcu(struct sock *sk, struct hlist_nulls_head *list) { hlist_nulls_add_tail_rcu(&sk->sk_nulls_node, list); } static inline void sk_nulls_add_node_rcu(struct sock *sk, struct hlist_nulls_head *list) { sock_hold(sk); __sk_nulls_add_node_rcu(sk, list); } static inline void __sk_del_bind_node(struct sock *sk) { __hlist_del(&sk->sk_bind_node); } static inline void sk_add_bind_node(struct sock *sk, struct hlist_head *list) { hlist_add_head(&sk->sk_bind_node, list); } #define sk_for_each(__sk, list) \ hlist_for_each_entry(__sk, list, sk_node) #define sk_for_each_rcu(__sk, list) \ hlist_for_each_entry_rcu(__sk, list, sk_node) #define sk_nulls_for_each(__sk, node, list) \ hlist_nulls_for_each_entry(__sk, node, list, sk_nulls_node) #define sk_nulls_for_each_rcu(__sk, node, list) \ hlist_nulls_for_each_entry_rcu(__sk, node, list, sk_nulls_node) #define sk_for_each_from(__sk) \ hlist_for_each_entry_from(__sk, sk_node) #define sk_nulls_for_each_from(__sk, node) \ if (__sk && ({ node = &(__sk)->sk_nulls_node; 1; })) \ hlist_nulls_for_each_entry_from(__sk, node, sk_nulls_node) #define sk_for_each_safe(__sk, tmp, list) \ hlist_for_each_entry_safe(__sk, tmp, list, sk_node) #define sk_for_each_bound(__sk, list) \ hlist_for_each_entry(__sk, list, sk_bind_node) #define sk_for_each_bound_safe(__sk, tmp, list) \ hlist_for_each_entry_safe(__sk, tmp, list, sk_bind_node) /** * sk_for_each_entry_offset_rcu - iterate over a list at a given struct offset * @tpos: the type * to use as a loop cursor. * @pos: the &struct hlist_node to use as a loop cursor. * @head: the head for your list. * @offset: offset of hlist_node within the struct. * */ #define sk_for_each_entry_offset_rcu(tpos, pos, head, offset) \ for (pos = rcu_dereference(hlist_first_rcu(head)); \ pos != NULL && \ ({ tpos = (typeof(*tpos) *)((void *)pos - offset); 1;}); \ pos = rcu_dereference(hlist_next_rcu(pos))) static inline struct user_namespace *sk_user_ns(const struct sock *sk) { /* Careful only use this in a context where these parameters * can not change and must all be valid, such as recvmsg from * userspace. */ return sk->sk_socket->file->f_cred->user_ns; } /* Sock flags */ enum sock_flags { SOCK_DEAD, SOCK_DONE, SOCK_URGINLINE, SOCK_KEEPOPEN, SOCK_LINGER, SOCK_DESTROY, SOCK_BROADCAST, SOCK_TIMESTAMP, SOCK_ZAPPED, SOCK_USE_WRITE_QUEUE, /* whether to call sk->sk_write_space in sock_wfree */ SOCK_DBG, /* %SO_DEBUG setting */ SOCK_RCVTSTAMP, /* %SO_TIMESTAMP setting */ SOCK_RCVTSTAMPNS, /* %SO_TIMESTAMPNS setting */ SOCK_LOCALROUTE, /* route locally only, %SO_DONTROUTE setting */ SOCK_MEMALLOC, /* VM depends on this socket for swapping */ SOCK_TIMESTAMPING_RX_SOFTWARE, /* %SOF_TIMESTAMPING_RX_SOFTWARE */ SOCK_FASYNC, /* fasync() active */ SOCK_RXQ_OVFL, SOCK_ZEROCOPY, /* buffers from userspace */ SOCK_WIFI_STATUS, /* push wifi status to userspace */ SOCK_NOFCS, /* Tell NIC not to do the Ethernet FCS. * Will use last 4 bytes of packet sent from * user-space instead. */ SOCK_FILTER_LOCKED, /* Filter cannot be changed anymore */ SOCK_SELECT_ERR_QUEUE, /* Wake select on error queue */ SOCK_RCU_FREE, /* wait rcu grace period in sk_destruct() */ SOCK_TXTIME, SOCK_XDP, /* XDP is attached */ SOCK_TSTAMP_NEW, /* Indicates 64 bit timestamps always */ SOCK_RCVMARK, /* Receive SO_MARK ancillary data with packet */ SOCK_RCVPRIORITY, /* Receive SO_PRIORITY ancillary data with packet */ SOCK_TIMESTAMPING_ANY, /* Copy of sk_tsflags & TSFLAGS_ANY */ }; #define SK_FLAGS_TIMESTAMP ((1UL << SOCK_TIMESTAMP) | (1UL << SOCK_TIMESTAMPING_RX_SOFTWARE)) /* * The highest bit of sk_tsflags is reserved for kernel-internal * SOCKCM_FLAG_TS_OPT_ID. There is a check in core/sock.c to control that * SOF_TIMESTAMPING* values do not reach this reserved area */ #define SOCKCM_FLAG_TS_OPT_ID BIT(31) static inline void sock_copy_flags(struct sock *nsk, const struct sock *osk) { nsk->sk_flags = osk->sk_flags; } static inline void sock_set_flag(struct sock *sk, enum sock_flags flag) { __set_bit(flag, &sk->sk_flags); } static inline void sock_reset_flag(struct sock *sk, enum sock_flags flag) { __clear_bit(flag, &sk->sk_flags); } static inline void sock_valbool_flag(struct sock *sk, enum sock_flags bit, int valbool) { if (valbool) sock_set_flag(sk, bit); else sock_reset_flag(sk, bit); } static inline bool sock_flag(const struct sock *sk, enum sock_flags flag) { return test_bit(flag, &sk->sk_flags); } #ifdef CONFIG_NET DECLARE_STATIC_KEY_FALSE(memalloc_socks_key); static inline int sk_memalloc_socks(void) { return static_branch_unlikely(&memalloc_socks_key); } void __receive_sock(struct file *file); #else static inline int sk_memalloc_socks(void) { return 0; } static inline void __receive_sock(struct file *file) { } #endif static inline gfp_t sk_gfp_mask(const struct sock *sk, gfp_t gfp_mask) { return gfp_mask | (sk->sk_allocation & __GFP_MEMALLOC); } static inline void sk_acceptq_removed(struct sock *sk) { WRITE_ONCE(sk->sk_ack_backlog, sk->sk_ack_backlog - 1); } static inline void sk_acceptq_added(struct sock *sk) { WRITE_ONCE(sk->sk_ack_backlog, sk->sk_ack_backlog + 1); } /* Note: If you think the test should be: * return READ_ONCE(sk->sk_ack_backlog) >= READ_ONCE(sk->sk_max_ack_backlog); * Then please take a look at commit 64a146513f8f ("[NET]: Revert incorrect accept queue backlog changes.") */ static inline bool sk_acceptq_is_full(const struct sock *sk) { return READ_ONCE(sk->sk_ack_backlog) > READ_ONCE(sk->sk_max_ack_backlog); } /* * Compute minimal free write space needed to queue new packets. */ static inline int sk_stream_min_wspace(const struct sock *sk) { return READ_ONCE(sk->sk_wmem_queued) >> 1; } static inline int sk_stream_wspace(const struct sock *sk) { return READ_ONCE(sk->sk_sndbuf) - READ_ONCE(sk->sk_wmem_queued); } static inline void sk_wmem_queued_add(struct sock *sk, int val) { WRITE_ONCE(sk->sk_wmem_queued, sk->sk_wmem_queued + val); } static inline void sk_forward_alloc_add(struct sock *sk, int val) { /* Paired with lockless reads of sk->sk_forward_alloc */ WRITE_ONCE(sk->sk_forward_alloc, sk->sk_forward_alloc + val); } void sk_stream_write_space(struct sock *sk); /* OOB backlog add */ static inline void __sk_add_backlog(struct sock *sk, struct sk_buff *skb) { /* dont let skb dst not refcounted, we are going to leave rcu lock */ skb_dst_force(skb); if (!sk->sk_backlog.tail) WRITE_ONCE(sk->sk_backlog.head, skb); else sk->sk_backlog.tail->next = skb; WRITE_ONCE(sk->sk_backlog.tail, skb); skb->next = NULL; } /* * Take into account size of receive queue and backlog queue * Do not take into account this skb truesize, * to allow even a single big packet to come. */ static inline bool sk_rcvqueues_full(const struct sock *sk, unsigned int limit) { unsigned int qsize = sk->sk_backlog.len + atomic_read(&sk->sk_rmem_alloc); return qsize > limit; } /* The per-socket spinlock must be held here. */ static inline __must_check int sk_add_backlog(struct sock *sk, struct sk_buff *skb, unsigned int limit) { if (sk_rcvqueues_full(sk, limit)) return -ENOBUFS; /* * If the skb was allocated from pfmemalloc reserves, only * allow SOCK_MEMALLOC sockets to use it as this socket is * helping free memory */ if (skb_pfmemalloc(skb) && !sock_flag(sk, SOCK_MEMALLOC)) return -ENOMEM; __sk_add_backlog(sk, skb); sk->sk_backlog.len += skb->truesize; return 0; } int __sk_backlog_rcv(struct sock *sk, struct sk_buff *skb); INDIRECT_CALLABLE_DECLARE(int tcp_v4_do_rcv(struct sock *sk, struct sk_buff *skb)); INDIRECT_CALLABLE_DECLARE(int tcp_v6_do_rcv(struct sock *sk, struct sk_buff *skb)); static inline int sk_backlog_rcv(struct sock *sk, struct sk_buff *skb) { if (sk_memalloc_socks() && skb_pfmemalloc(skb)) return __sk_backlog_rcv(sk, skb); return INDIRECT_CALL_INET(sk->sk_backlog_rcv, tcp_v6_do_rcv, tcp_v4_do_rcv, sk, skb); } static inline void sk_incoming_cpu_update(struct sock *sk) { int cpu = raw_smp_processor_id(); if (unlikely(READ_ONCE(sk->sk_incoming_cpu) != cpu)) WRITE_ONCE(sk->sk_incoming_cpu, cpu); } static inline void sock_rps_save_rxhash(struct sock *sk, const struct sk_buff *skb) { #ifdef CONFIG_RPS /* The following WRITE_ONCE() is paired with the READ_ONCE() * here, and another one in sock_rps_record_flow(). */ if (unlikely(READ_ONCE(sk->sk_rxhash) != skb->hash)) WRITE_ONCE(sk->sk_rxhash, skb->hash); #endif } static inline void sock_rps_reset_rxhash(struct sock *sk) { #ifdef CONFIG_RPS /* Paired with READ_ONCE() in sock_rps_record_flow() */ WRITE_ONCE(sk->sk_rxhash, 0); #endif } #define sk_wait_event(__sk, __timeo, __condition, __wait) \ ({ int __rc, __dis = __sk->sk_disconnects; \ release_sock(__sk); \ __rc = __condition; \ if (!__rc) { \ *(__timeo) = wait_woken(__wait, \ TASK_INTERRUPTIBLE, \ *(__timeo)); \ } \ sched_annotate_sleep(); \ lock_sock(__sk); \ __rc = __dis == __sk->sk_disconnects ? __condition : -EPIPE; \ __rc; \ }) int sk_stream_wait_connect(struct sock *sk, long *timeo_p); int sk_stream_wait_memory(struct sock *sk, long *timeo_p); void sk_stream_wait_close(struct sock *sk, long timeo_p); int sk_stream_error(struct sock *sk, int flags, int err); void sk_stream_kill_queues(struct sock *sk); void sk_set_memalloc(struct sock *sk); void sk_clear_memalloc(struct sock *sk); void __sk_flush_backlog(struct sock *sk); static inline bool sk_flush_backlog(struct sock *sk) { if (unlikely(READ_ONCE(sk->sk_backlog.tail))) { __sk_flush_backlog(sk); return true; } return false; } int sk_wait_data(struct sock *sk, long *timeo, const struct sk_buff *skb); struct request_sock_ops; struct timewait_sock_ops; struct inet_hashinfo; struct raw_hashinfo; struct smc_hashinfo; struct module; struct sk_psock; /* * caches using SLAB_TYPESAFE_BY_RCU should let .next pointer from nulls nodes * un-modified. Special care is taken when initializing object to zero. */ static inline void sk_prot_clear_nulls(struct sock *sk, int size) { if (offsetof(struct sock, sk_node.next) != 0) memset(sk, 0, offsetof(struct sock, sk_node.next)); memset(&sk->sk_node.pprev, 0, size - offsetof(struct sock, sk_node.pprev)); } struct proto_accept_arg { int flags; int err; int is_empty; bool kern; }; /* Networking protocol blocks we attach to sockets. * socket layer -> transport layer interface */ struct proto { void (*close)(struct sock *sk, long timeout); int (*pre_connect)(struct sock *sk, struct sockaddr *uaddr, int addr_len); int (*connect)(struct sock *sk, struct sockaddr *uaddr, int addr_len); int (*disconnect)(struct sock *sk, int flags); struct sock * (*accept)(struct sock *sk, struct proto_accept_arg *arg); int (*ioctl)(struct sock *sk, int cmd, int *karg); int (*init)(struct sock *sk); void (*destroy)(struct sock *sk); void (*shutdown)(struct sock *sk, int how); int (*setsockopt)(struct sock *sk, int level, int optname, sockptr_t optval, unsigned int optlen); int (*getsockopt)(struct sock *sk, int level, int optname, char __user *optval, int __user *option); void (*keepalive)(struct sock *sk, int valbool); #ifdef CONFIG_COMPAT int (*compat_ioctl)(struct sock *sk, unsigned int cmd, unsigned long arg); #endif int (*sendmsg)(struct sock *sk, struct msghdr *msg, size_t len); int (*recvmsg)(struct sock *sk, struct msghdr *msg, size_t len, int flags, int *addr_len); void (*splice_eof)(struct socket *sock); int (*bind)(struct sock *sk, struct sockaddr *addr, int addr_len); int (*bind_add)(struct sock *sk, struct sockaddr *addr, int addr_len); int (*backlog_rcv) (struct sock *sk, struct sk_buff *skb); bool (*bpf_bypass_getsockopt)(int level, int optname); void (*release_cb)(struct sock *sk); /* Keeping track of sk's, looking them up, and port selection methods. */ int (*hash)(struct sock *sk); void (*unhash)(struct sock *sk); void (*rehash)(struct sock *sk); int (*get_port)(struct sock *sk, unsigned short snum); void (*put_port)(struct sock *sk); #ifdef CONFIG_BPF_SYSCALL int (*psock_update_sk_prot)(struct sock *sk, struct sk_psock *psock, bool restore); #endif /* Keeping track of sockets in use */ #ifdef CONFIG_PROC_FS unsigned int inuse_idx; #endif bool (*stream_memory_free)(const struct sock *sk, int wake); bool (*sock_is_readable)(struct sock *sk); /* Memory pressure */ void (*enter_memory_pressure)(struct sock *sk); void (*leave_memory_pressure)(struct sock *sk); atomic_long_t *memory_allocated; /* Current allocated memory. */ int __percpu *per_cpu_fw_alloc; struct percpu_counter *sockets_allocated; /* Current number of sockets. */ /* * Pressure flag: try to collapse. * Technical note: it is used by multiple contexts non atomically. * Make sure to use READ_ONCE()/WRITE_ONCE() for all reads/writes. * All the __sk_mem_schedule() is of this nature: accounting * is strict, actions are advisory and have some latency. */ unsigned long *memory_pressure; long *sysctl_mem; int *sysctl_wmem; int *sysctl_rmem; u32 sysctl_wmem_offset; u32 sysctl_rmem_offset; int max_header; bool no_autobind; struct kmem_cache *slab; unsigned int obj_size; unsigned int ipv6_pinfo_offset; slab_flags_t slab_flags; unsigned int useroffset; /* Usercopy region offset */ unsigned int usersize; /* Usercopy region size */ unsigned int __percpu *orphan_count; struct request_sock_ops *rsk_prot; struct timewait_sock_ops *twsk_prot; union { struct inet_hashinfo *hashinfo; struct udp_table *udp_table; struct raw_hashinfo *raw_hash; struct smc_hashinfo *smc_hash; } h; struct module *owner; char name[32]; struct list_head node; int (*diag_destroy)(struct sock *sk, int err); } __randomize_layout; int proto_register(struct proto *prot, int alloc_slab); void proto_unregister(struct proto *prot); int sock_load_diag_module(int family, int protocol); INDIRECT_CALLABLE_DECLARE(bool tcp_stream_memory_free(const struct sock *sk, int wake)); static inline bool __sk_stream_memory_free(const struct sock *sk, int wake) { if (READ_ONCE(sk->sk_wmem_queued) >= READ_ONCE(sk->sk_sndbuf)) return false; return sk->sk_prot->stream_memory_free ? INDIRECT_CALL_INET_1(sk->sk_prot->stream_memory_free, tcp_stream_memory_free, sk, wake) : true; } static inline bool sk_stream_memory_free(const struct sock *sk) { return __sk_stream_memory_free(sk, 0); } static inline bool __sk_stream_is_writeable(const struct sock *sk, int wake) { return sk_stream_wspace(sk) >= sk_stream_min_wspace(sk) && __sk_stream_memory_free(sk, wake); } static inline bool sk_stream_is_writeable(const struct sock *sk) { return __sk_stream_is_writeable(sk, 0); } static inline int sk_under_cgroup_hierarchy(struct sock *sk, struct cgroup *ancestor) { #ifdef CONFIG_SOCK_CGROUP_DATA return cgroup_is_descendant(sock_cgroup_ptr(&sk->sk_cgrp_data), ancestor); #else return -ENOTSUPP; #endif } #define SK_ALLOC_PERCPU_COUNTER_BATCH 16 static inline void sk_sockets_allocated_dec(struct sock *sk) { percpu_counter_add_batch(sk->sk_prot->sockets_allocated, -1, SK_ALLOC_PERCPU_COUNTER_BATCH); } static inline void sk_sockets_allocated_inc(struct sock *sk) { percpu_counter_add_batch(sk->sk_prot->sockets_allocated, 1, SK_ALLOC_PERCPU_COUNTER_BATCH); } static inline u64 sk_sockets_allocated_read_positive(struct sock *sk) { return percpu_counter_read_positive(sk->sk_prot->sockets_allocated); } static inline int proto_sockets_allocated_sum_positive(struct proto *prot) { return percpu_counter_sum_positive(prot->sockets_allocated); } #ifdef CONFIG_PROC_FS #define PROTO_INUSE_NR 64 /* should be enough for the first time */ struct prot_inuse { int all; int val[PROTO_INUSE_NR]; }; static inline void sock_prot_inuse_add(const struct net *net, const struct proto *prot, int val) { this_cpu_add(net->core.prot_inuse->val[prot->inuse_idx], val); } static inline void sock_inuse_add(const struct net *net, int val) { this_cpu_add(net->core.prot_inuse->all, val); } int sock_prot_inuse_get(struct net *net, struct proto *proto); int sock_inuse_get(struct net *net); #else static inline void sock_prot_inuse_add(const struct net *net, const struct proto *prot, int val) { } static inline void sock_inuse_add(const struct net *net, int val) { } #endif /* With per-bucket locks this operation is not-atomic, so that * this version is not worse. */ static inline int __sk_prot_rehash(struct sock *sk) { sk->sk_prot->unhash(sk); return sk->sk_prot->hash(sk); } /* About 10 seconds */ #define SOCK_DESTROY_TIME (10*HZ) /* Sockets 0-1023 can't be bound to unless you are superuser */ #define PROT_SOCK 1024 #define SHUTDOWN_MASK 3 #define RCV_SHUTDOWN 1 #define SEND_SHUTDOWN 2 #define SOCK_BINDADDR_LOCK 4 #define SOCK_BINDPORT_LOCK 8 struct socket_alloc { struct socket socket; struct inode vfs_inode; }; static inline struct socket *SOCKET_I(struct inode *inode) { return &container_of(inode, struct socket_alloc, vfs_inode)->socket; } static inline struct inode *SOCK_INODE(struct socket *socket) { return &container_of(socket, struct socket_alloc, socket)->vfs_inode; } /* * Functions for memory accounting */ int __sk_mem_raise_allocated(struct sock *sk, int size, int amt, int kind); int __sk_mem_schedule(struct sock *sk, int size, int kind); void __sk_mem_reduce_allocated(struct sock *sk, int amount); void __sk_mem_reclaim(struct sock *sk, int amount); #define SK_MEM_SEND 0 #define SK_MEM_RECV 1 /* sysctl_mem values are in pages */ static inline long sk_prot_mem_limits(const struct sock *sk, int index) { return READ_ONCE(sk->sk_prot->sysctl_mem[index]); } static inline int sk_mem_pages(int amt) { return (amt + PAGE_SIZE - 1) >> PAGE_SHIFT; } static inline bool sk_has_account(struct sock *sk) { /* return true if protocol supports memory accounting */ return !!sk->sk_prot->memory_allocated; } static inline bool sk_wmem_schedule(struct sock *sk, int size) { int delta; if (!sk_has_account(sk)) return true; delta = size - sk->sk_forward_alloc; return delta <= 0 || __sk_mem_schedule(sk, delta, SK_MEM_SEND); } static inline bool __sk_rmem_schedule(struct sock *sk, int size, bool pfmemalloc) { int delta; if (!sk_has_account(sk)) return true; delta = size - sk->sk_forward_alloc; return delta <= 0 || __sk_mem_schedule(sk, delta, SK_MEM_RECV) || pfmemalloc; } static inline bool sk_rmem_schedule(struct sock *sk, const struct sk_buff *skb, int size) { return __sk_rmem_schedule(sk, size, skb_pfmemalloc(skb)); } static inline int sk_unused_reserved_mem(const struct sock *sk) { int unused_mem; if (likely(!sk->sk_reserved_mem)) return 0; unused_mem = sk->sk_reserved_mem - sk->sk_wmem_queued - atomic_read(&sk->sk_rmem_alloc); return unused_mem > 0 ? unused_mem : 0; } static inline void sk_mem_reclaim(struct sock *sk) { int reclaimable; if (!sk_has_account(sk)) return; reclaimable = sk->sk_forward_alloc - sk_unused_reserved_mem(sk); if (reclaimable >= (int)PAGE_SIZE) __sk_mem_reclaim(sk, reclaimable); } static inline void sk_mem_reclaim_final(struct sock *sk) { sk->sk_reserved_mem = 0; sk_mem_reclaim(sk); } static inline void sk_mem_charge(struct sock *sk, int size) { if (!sk_has_account(sk)) return; sk_forward_alloc_add(sk, -size); } static inline void sk_mem_uncharge(struct sock *sk, int size) { if (!sk_has_account(sk)) return; sk_forward_alloc_add(sk, size); sk_mem_reclaim(sk); } #if IS_ENABLED(CONFIG_PROVE_LOCKING) && IS_ENABLED(CONFIG_MODULES) static inline void sk_owner_set(struct sock *sk, struct module *owner) { __module_get(owner); sk->sk_owner = owner; } static inline void sk_owner_clear(struct sock *sk) { sk->sk_owner = NULL; } static inline void sk_owner_put(struct sock *sk) { module_put(sk->sk_owner); } #else static inline void sk_owner_set(struct sock *sk, struct module *owner) { } static inline void sk_owner_clear(struct sock *sk) { } static inline void sk_owner_put(struct sock *sk) { } #endif /* * Macro so as to not evaluate some arguments when * lockdep is not enabled. * * Mark both the sk_lock and the sk_lock.slock as a * per-address-family lock class. */ #define sock_lock_init_class_and_name(sk, sname, skey, name, key) \ do { \ sk_owner_set(sk, THIS_MODULE); \ sk->sk_lock.owned = 0; \ init_waitqueue_head(&sk->sk_lock.wq); \ spin_lock_init(&(sk)->sk_lock.slock); \ debug_check_no_locks_freed((void *)&(sk)->sk_lock, \ sizeof((sk)->sk_lock)); \ lockdep_set_class_and_name(&(sk)->sk_lock.slock, \ (skey), (sname)); \ lockdep_init_map(&(sk)->sk_lock.dep_map, (name), (key), 0); \ } while (0) static inline bool lockdep_sock_is_held(const struct sock *sk) { return lockdep_is_held(&sk->sk_lock) || lockdep_is_held(&sk->sk_lock.slock); } void lock_sock_nested(struct sock *sk, int subclass); static inline void lock_sock(struct sock *sk) { lock_sock_nested(sk, 0); } void __lock_sock(struct sock *sk); void __release_sock(struct sock *sk); void release_sock(struct sock *sk); /* BH context may only use the following locking interface. */ #define bh_lock_sock(__sk) spin_lock(&((__sk)->sk_lock.slock)) #define bh_lock_sock_nested(__sk) \ spin_lock_nested(&((__sk)->sk_lock.slock), \ SINGLE_DEPTH_NESTING) #define bh_unlock_sock(__sk) spin_unlock(&((__sk)->sk_lock.slock)) bool __lock_sock_fast(struct sock *sk) __acquires(&sk->sk_lock.slock); /** * lock_sock_fast - fast version of lock_sock * @sk: socket * * This version should be used for very small section, where process won't block * return false if fast path is taken: * * sk_lock.slock locked, owned = 0, BH disabled * * return true if slow path is taken: * * sk_lock.slock unlocked, owned = 1, BH enabled */ static inline bool lock_sock_fast(struct sock *sk) { /* The sk_lock has mutex_lock() semantics here. */ mutex_acquire(&sk->sk_lock.dep_map, 0, 0, _RET_IP_); return __lock_sock_fast(sk); } /* fast socket lock variant for caller already holding a [different] socket lock */ static inline bool lock_sock_fast_nested(struct sock *sk) { mutex_acquire(&sk->sk_lock.dep_map, SINGLE_DEPTH_NESTING, 0, _RET_IP_); return __lock_sock_fast(sk); } /** * unlock_sock_fast - complement of lock_sock_fast * @sk: socket * @slow: slow mode * * fast unlock socket for user context. * If slow mode is on, we call regular release_sock() */ static inline void unlock_sock_fast(struct sock *sk, bool slow) __releases(&sk->sk_lock.slock) { if (slow) { release_sock(sk); __release(&sk->sk_lock.slock); } else { mutex_release(&sk->sk_lock.dep_map, _RET_IP_); spin_unlock_bh(&sk->sk_lock.slock); } } void sockopt_lock_sock(struct sock *sk); void sockopt_release_sock(struct sock *sk); bool sockopt_ns_capable(struct user_namespace *ns, int cap); bool sockopt_capable(int cap); /* Used by processes to "lock" a socket state, so that * interrupts and bottom half handlers won't change it * from under us. It essentially blocks any incoming * packets, so that we won't get any new data or any * packets that change the state of the socket. * * While locked, BH processing will add new packets to * the backlog queue. This queue is processed by the * owner of the socket lock right before it is released. * * Since ~2.3.5 it is also exclusive sleep lock serializing * accesses from user process context. */ static inline void sock_owned_by_me(const struct sock *sk) { #ifdef CONFIG_LOCKDEP WARN_ON_ONCE(!lockdep_sock_is_held(sk) && debug_locks); #endif } static inline void sock_not_owned_by_me(const struct sock *sk) { #ifdef CONFIG_LOCKDEP WARN_ON_ONCE(lockdep_sock_is_held(sk) && debug_locks); #endif } static inline bool sock_owned_by_user(const struct sock *sk) { sock_owned_by_me(sk); return sk->sk_lock.owned; } static inline bool sock_owned_by_user_nocheck(const struct sock *sk) { return sk->sk_lock.owned; } static inline void sock_release_ownership(struct sock *sk) { DEBUG_NET_WARN_ON_ONCE(!sock_owned_by_user_nocheck(sk)); sk->sk_lock.owned = 0; /* The sk_lock has mutex_unlock() semantics: */ mutex_release(&sk->sk_lock.dep_map, _RET_IP_); } /* no reclassification while locks are held */ static inline bool sock_allow_reclassification(const struct sock *csk) { struct sock *sk = (struct sock *)csk; return !sock_owned_by_user_nocheck(sk) && !spin_is_locked(&sk->sk_lock.slock); } struct sock *sk_alloc(struct net *net, int family, gfp_t priority, struct proto *prot, int kern); void sk_free(struct sock *sk); void sk_net_refcnt_upgrade(struct sock *sk); void sk_destruct(struct sock *sk); struct sock *sk_clone_lock(const struct sock *sk, const gfp_t priority); struct sk_buff *sock_wmalloc(struct sock *sk, unsigned long size, int force, gfp_t priority); void __sock_wfree(struct sk_buff *skb); void sock_wfree(struct sk_buff *skb); struct sk_buff *sock_omalloc(struct sock *sk, unsigned long size, gfp_t priority); void skb_orphan_partial(struct sk_buff *skb); void sock_rfree(struct sk_buff *skb); void sock_efree(struct sk_buff *skb); #ifdef CONFIG_INET void sock_edemux(struct sk_buff *skb); void sock_pfree(struct sk_buff *skb); static inline void skb_set_owner_edemux(struct sk_buff *skb, struct sock *sk) { skb_orphan(skb); if (refcount_inc_not_zero(&sk->sk_refcnt)) { skb->sk = sk; skb->destructor = sock_edemux; } } #else #define sock_edemux sock_efree #endif int sk_setsockopt(struct sock *sk, int level, int optname, sockptr_t optval, unsigned int optlen); int sock_setsockopt(struct socket *sock, int level, int op, sockptr_t optval, unsigned int optlen); int do_sock_setsockopt(struct socket *sock, bool compat, int level, int optname, sockptr_t optval, int optlen); int do_sock_getsockopt(struct socket *sock, bool compat, int level, int optname, sockptr_t optval, sockptr_t optlen); int sk_getsockopt(struct sock *sk, int level, int optname, sockptr_t optval, sockptr_t optlen); int sock_gettstamp(struct socket *sock, void __user *userstamp, bool timeval, bool time32); struct sk_buff *sock_alloc_send_pskb(struct sock *sk, unsigned long header_len, unsigned long data_len, int noblock, int *errcode, int max_page_order); static inline struct sk_buff *sock_alloc_send_skb(struct sock *sk, unsigned long size, int noblock, int *errcode) { return sock_alloc_send_pskb(sk, size, 0, noblock, errcode, 0); } void *sock_kmalloc(struct sock *sk, int size, gfp_t priority); void *sock_kmemdup(struct sock *sk, const void *src, int size, gfp_t priority); void sock_kfree_s(struct sock *sk, void *mem, int size); void sock_kzfree_s(struct sock *sk, void *mem, int size); void sk_send_sigurg(struct sock *sk); static inline void sock_replace_proto(struct sock *sk, struct proto *proto) { if (sk->sk_socket) clear_bit(SOCK_SUPPORT_ZC, &sk->sk_socket->flags); WRITE_ONCE(sk->sk_prot, proto); } struct sockcm_cookie { u64 transmit_time; u32 mark; u32 tsflags; u32 ts_opt_id; u32 priority; u32 dmabuf_id; }; static inline void sockcm_init(struct sockcm_cookie *sockc, const struct sock *sk) { *sockc = (struct sockcm_cookie) { .mark = READ_ONCE(sk->sk_mark), .tsflags = READ_ONCE(sk->sk_tsflags), .priority = READ_ONCE(sk->sk_priority), }; } int __sock_cmsg_send(struct sock *sk, struct cmsghdr *cmsg, struct sockcm_cookie *sockc); int sock_cmsg_send(struct sock *sk, struct msghdr *msg, struct sockcm_cookie *sockc); /* * Functions to fill in entries in struct proto_ops when a protocol * does not implement a particular function. */ int sock_no_bind(struct socket *, struct sockaddr *, int); int sock_no_connect(struct socket *, struct sockaddr *, int, int); int sock_no_socketpair(struct socket *, struct socket *); int sock_no_accept(struct socket *, struct socket *, struct proto_accept_arg *); int sock_no_getname(struct socket *, struct sockaddr *, int); int sock_no_ioctl(struct socket *, unsigned int, unsigned long); int sock_no_listen(struct socket *, int); int sock_no_shutdown(struct socket *, int); int sock_no_sendmsg(struct socket *, struct msghdr *, size_t); int sock_no_sendmsg_locked(struct sock *sk, struct msghdr *msg, size_t len); int sock_no_recvmsg(struct socket *, struct msghdr *, size_t, int); int sock_no_mmap(struct file *file, struct socket *sock, struct vm_area_struct *vma); /* * Functions to fill in entries in struct proto_ops when a protocol * uses the inet style. */ int sock_common_getsockopt(struct socket *sock, int level, int optname, char __user *optval, int __user *optlen); int sock_common_recvmsg(struct socket *sock, struct msghdr *msg, size_t size, int flags); int sock_common_setsockopt(struct socket *sock, int level, int optname, sockptr_t optval, unsigned int optlen); void sk_common_release(struct sock *sk); /* * Default socket callbacks and setup code */ /* Initialise core socket variables using an explicit uid. */ void sock_init_data_uid(struct socket *sock, struct sock *sk, kuid_t uid); /* Initialise core socket variables. * Assumes struct socket *sock is embedded in a struct socket_alloc. */ void sock_init_data(struct socket *sock, struct sock *sk); /* * Socket reference counting postulates. * * * Each user of socket SHOULD hold a reference count. * * Each access point to socket (an hash table bucket, reference from a list, * running timer, skb in flight MUST hold a reference count. * * When reference count hits 0, it means it will never increase back. * * When reference count hits 0, it means that no references from * outside exist to this socket and current process on current CPU * is last user and may/should destroy this socket. * * sk_free is called from any context: process, BH, IRQ. When * it is called, socket has no references from outside -> sk_free * may release descendant resources allocated by the socket, but * to the time when it is called, socket is NOT referenced by any * hash tables, lists etc. * * Packets, delivered from outside (from network or from another process) * and enqueued on receive/error queues SHOULD NOT grab reference count, * when they sit in queue. Otherwise, packets will leak to hole, when * socket is looked up by one cpu and unhasing is made by another CPU. * It is true for udp/raw, netlink (leak to receive and error queues), tcp * (leak to backlog). Packet socket does all the processing inside * BR_NETPROTO_LOCK, so that it has not this race condition. UNIX sockets * use separate SMP lock, so that they are prone too. */ /* Ungrab socket and destroy it, if it was the last reference. */ static inline void sock_put(struct sock *sk) { if (refcount_dec_and_test(&sk->sk_refcnt)) sk_free(sk); } /* Generic version of sock_put(), dealing with all sockets * (TCP_TIMEWAIT, TCP_NEW_SYN_RECV, ESTABLISHED...) */ void sock_gen_put(struct sock *sk); int __sk_receive_skb(struct sock *sk, struct sk_buff *skb, const int nested, unsigned int trim_cap, bool refcounted); static inline int sk_receive_skb(struct sock *sk, struct sk_buff *skb, const int nested) { return __sk_receive_skb(sk, skb, nested, 1, true); } static inline void sk_tx_queue_set(struct sock *sk, int tx_queue) { /* sk_tx_queue_mapping accept only upto a 16-bit value */ if (WARN_ON_ONCE((unsigned short)tx_queue >= USHRT_MAX)) return; /* Paired with READ_ONCE() in sk_tx_queue_get() and * other WRITE_ONCE() because socket lock might be not held. */ WRITE_ONCE(sk->sk_tx_queue_mapping, tx_queue); } #define NO_QUEUE_MAPPING USHRT_MAX static inline void sk_tx_queue_clear(struct sock *sk) { /* Paired with READ_ONCE() in sk_tx_queue_get() and * other WRITE_ONCE() because socket lock might be not held. */ WRITE_ONCE(sk->sk_tx_queue_mapping, NO_QUEUE_MAPPING); } static inline int sk_tx_queue_get(const struct sock *sk) { if (sk) { /* Paired with WRITE_ONCE() in sk_tx_queue_clear() * and sk_tx_queue_set(). */ int val = READ_ONCE(sk->sk_tx_queue_mapping); if (val != NO_QUEUE_MAPPING) return val; } return -1; } static inline void __sk_rx_queue_set(struct sock *sk, const struct sk_buff *skb, bool force_set) { #ifdef CONFIG_SOCK_RX_QUEUE_MAPPING if (skb_rx_queue_recorded(skb)) { u16 rx_queue = skb_get_rx_queue(skb); if (force_set || unlikely(READ_ONCE(sk->sk_rx_queue_mapping) != rx_queue)) WRITE_ONCE(sk->sk_rx_queue_mapping, rx_queue); } #endif } static inline void sk_rx_queue_set(struct sock *sk, const struct sk_buff *skb) { __sk_rx_queue_set(sk, skb, true); } static inline void sk_rx_queue_update(struct sock *sk, const struct sk_buff *skb) { __sk_rx_queue_set(sk, skb, false); } static inline void sk_rx_queue_clear(struct sock *sk) { #ifdef CONFIG_SOCK_RX_QUEUE_MAPPING WRITE_ONCE(sk->sk_rx_queue_mapping, NO_QUEUE_MAPPING); #endif } static inline int sk_rx_queue_get(const struct sock *sk) { #ifdef CONFIG_SOCK_RX_QUEUE_MAPPING if (sk) { int res = READ_ONCE(sk->sk_rx_queue_mapping); if (res != NO_QUEUE_MAPPING) return res; } #endif return -1; } static inline void sk_set_socket(struct sock *sk, struct socket *sock) { sk->sk_socket = sock; } static inline wait_queue_head_t *sk_sleep(struct sock *sk) { BUILD_BUG_ON(offsetof(struct socket_wq, wait) != 0); return &rcu_dereference_raw(sk->sk_wq)->wait; } /* Detach socket from process context. * Announce socket dead, detach it from wait queue and inode. * Note that parent inode held reference count on this struct sock, * we do not release it in this function, because protocol * probably wants some additional cleanups or even continuing * to work with this socket (TCP). */ static inline void sock_orphan(struct sock *sk) { write_lock_bh(&sk->sk_callback_lock); sock_set_flag(sk, SOCK_DEAD); sk_set_socket(sk, NULL); sk->sk_wq = NULL; /* Note: sk_uid is unchanged. */ write_unlock_bh(&sk->sk_callback_lock); } static inline void sock_graft(struct sock *sk, struct socket *parent) { WARN_ON(parent->sk); write_lock_bh(&sk->sk_callback_lock); rcu_assign_pointer(sk->sk_wq, &parent->wq); parent->sk = sk; sk_set_socket(sk, parent); WRITE_ONCE(sk->sk_uid, SOCK_INODE(parent)->i_uid); security_sock_graft(sk, parent); write_unlock_bh(&sk->sk_callback_lock); } static inline kuid_t sk_uid(const struct sock *sk) { /* Paired with WRITE_ONCE() in sockfs_setattr() */ return READ_ONCE(sk->sk_uid); } unsigned long __sock_i_ino(struct sock *sk); unsigned long sock_i_ino(struct sock *sk); static inline kuid_t sock_net_uid(const struct net *net, const struct sock *sk) { return sk ? sk_uid(sk) : make_kuid(net->user_ns, 0); } static inline u32 net_tx_rndhash(void) { u32 v = get_random_u32(); return v ?: 1; } static inline void sk_set_txhash(struct sock *sk) { /* This pairs with READ_ONCE() in skb_set_hash_from_sk() */ WRITE_ONCE(sk->sk_txhash, net_tx_rndhash()); } static inline bool sk_rethink_txhash(struct sock *sk) { if (sk->sk_txhash && sk->sk_txrehash == SOCK_TXREHASH_ENABLED) { sk_set_txhash(sk); return true; } return false; } static inline struct dst_entry * __sk_dst_get(const struct sock *sk) { return rcu_dereference_check(sk->sk_dst_cache, lockdep_sock_is_held(sk)); } static inline struct dst_entry * sk_dst_get(const struct sock *sk) { struct dst_entry *dst; rcu_read_lock(); dst = rcu_dereference(sk->sk_dst_cache); if (dst && !rcuref_get(&dst->__rcuref)) dst = NULL; rcu_read_unlock(); return dst; } static inline void __dst_negative_advice(struct sock *sk) { struct dst_entry *dst = __sk_dst_get(sk); if (dst && dst->ops->negative_advice) dst->ops->negative_advice(sk, dst); } static inline void dst_negative_advice(struct sock *sk) { sk_rethink_txhash(sk); __dst_negative_advice(sk); } static inline void __sk_dst_set(struct sock *sk, struct dst_entry *dst) { struct dst_entry *old_dst; sk_tx_queue_clear(sk); WRITE_ONCE(sk->sk_dst_pending_confirm, 0); old_dst = rcu_dereference_protected(sk->sk_dst_cache, lockdep_sock_is_held(sk)); rcu_assign_pointer(sk->sk_dst_cache, dst); dst_release(old_dst); } static inline void sk_dst_set(struct sock *sk, struct dst_entry *dst) { struct dst_entry *old_dst; sk_tx_queue_clear(sk); WRITE_ONCE(sk->sk_dst_pending_confirm, 0); old_dst = unrcu_pointer(xchg(&sk->sk_dst_cache, RCU_INITIALIZER(dst))); dst_release(old_dst); } static inline void __sk_dst_reset(struct sock *sk) { __sk_dst_set(sk, NULL); } static inline void sk_dst_reset(struct sock *sk) { sk_dst_set(sk, NULL); } struct dst_entry *__sk_dst_check(struct sock *sk, u32 cookie); struct dst_entry *sk_dst_check(struct sock *sk, u32 cookie); static inline void sk_dst_confirm(struct sock *sk) { if (!READ_ONCE(sk->sk_dst_pending_confirm)) WRITE_ONCE(sk->sk_dst_pending_confirm, 1); } static inline void sock_confirm_neigh(struct sk_buff *skb, struct neighbour *n) { if (skb_get_dst_pending_confirm(skb)) { struct sock *sk = skb->sk; if (sk && READ_ONCE(sk->sk_dst_pending_confirm)) WRITE_ONCE(sk->sk_dst_pending_confirm, 0); neigh_confirm(n); } } bool sk_mc_loop(const struct sock *sk); static inline bool sk_can_gso(const struct sock *sk) { return net_gso_ok(sk->sk_route_caps, sk->sk_gso_type); } void sk_setup_caps(struct sock *sk, struct dst_entry *dst); static inline void sk_gso_disable(struct sock *sk) { sk->sk_gso_disabled = 1; sk->sk_route_caps &= ~NETIF_F_GSO_MASK; } static inline int skb_do_copy_data_nocache(struct sock *sk, struct sk_buff *skb, struct iov_iter *from, char *to, int copy, int offset) { if (skb->ip_summed == CHECKSUM_NONE) { __wsum csum = 0; if (!csum_and_copy_from_iter_full(to, copy, &csum, from)) return -EFAULT; skb->csum = csum_block_add(skb->csum, csum, offset); } else if (sk->sk_route_caps & NETIF_F_NOCACHE_COPY) { if (!copy_from_iter_full_nocache(to, copy, from)) return -EFAULT; } else if (!copy_from_iter_full(to, copy, from)) return -EFAULT; return 0; } static inline int skb_add_data_nocache(struct sock *sk, struct sk_buff *skb, struct iov_iter *from, int copy) { int err, offset = skb->len; err = skb_do_copy_data_nocache(sk, skb, from, skb_put(skb, copy), copy, offset); if (err) __skb_trim(skb, offset); return err; } static inline int skb_copy_to_page_nocache(struct sock *sk, struct iov_iter *from, struct sk_buff *skb, struct page *page, int off, int copy) { int err; err = skb_do_copy_data_nocache(sk, skb, from, page_address(page) + off, copy, skb->len); if (err) return err; skb_len_add(skb, copy); sk_wmem_queued_add(sk, copy); sk_mem_charge(sk, copy); return 0; } /** * sk_wmem_alloc_get - returns write allocations * @sk: socket * * Return: sk_wmem_alloc minus initial offset of one */ static inline int sk_wmem_alloc_get(const struct sock *sk) { return refcount_read(&sk->sk_wmem_alloc) - 1; } /** * sk_rmem_alloc_get - returns read allocations * @sk: socket * * Return: sk_rmem_alloc */ static inline int sk_rmem_alloc_get(const struct sock *sk) { return atomic_read(&sk->sk_rmem_alloc); } /** * sk_has_allocations - check if allocations are outstanding * @sk: socket * * Return: true if socket has write or read allocations */ static inline bool sk_has_allocations(const struct sock *sk) { return sk_wmem_alloc_get(sk) || sk_rmem_alloc_get(sk); } /** * skwq_has_sleeper - check if there are any waiting processes * @wq: struct socket_wq * * Return: true if socket_wq has waiting processes * * The purpose of the skwq_has_sleeper and sock_poll_wait is to wrap the memory * barrier call. They were added due to the race found within the tcp code. * * Consider following tcp code paths:: * * CPU1 CPU2 * sys_select receive packet * ... ... * __add_wait_queue update tp->rcv_nxt * ... ... * tp->rcv_nxt check sock_def_readable * ... { * schedule rcu_read_lock(); * wq = rcu_dereference(sk->sk_wq); * if (wq && waitqueue_active(&wq->wait)) * wake_up_interruptible(&wq->wait) * ... * } * * The race for tcp fires when the __add_wait_queue changes done by CPU1 stay * in its cache, and so does the tp->rcv_nxt update on CPU2 side. The CPU1 * could then endup calling schedule and sleep forever if there are no more * data on the socket. * */ static inline bool skwq_has_sleeper(struct socket_wq *wq) { return wq && wq_has_sleeper(&wq->wait); } /** * sock_poll_wait - wrapper for the poll_wait call. * @filp: file * @sock: socket to wait on * @p: poll_table * * See the comments in the wq_has_sleeper function. */ static inline void sock_poll_wait(struct file *filp, struct socket *sock, poll_table *p) { /* Provides a barrier we need to be sure we are in sync * with the socket flags modification. * * This memory barrier is paired in the wq_has_sleeper. */ poll_wait(filp, &sock->wq.wait, p); } static inline void skb_set_hash_from_sk(struct sk_buff *skb, struct sock *sk) { /* This pairs with WRITE_ONCE() in sk_set_txhash() */ u32 txhash = READ_ONCE(sk->sk_txhash); if (txhash) { skb->l4_hash = 1; skb->hash = txhash; } } void skb_set_owner_w(struct sk_buff *skb, struct sock *sk); /* * Queue a received datagram if it will fit. Stream and sequenced * protocols can't normally use this as they need to fit buffers in * and play with them. * * Inlined as it's very short and called for pretty much every * packet ever received. */ static inline void skb_set_owner_r(struct sk_buff *skb, struct sock *sk) { skb_orphan(skb); skb->sk = sk; skb->destructor = sock_rfree; atomic_add(skb->truesize, &sk->sk_rmem_alloc); sk_mem_charge(sk, skb->truesize); } static inline __must_check bool skb_set_owner_sk_safe(struct sk_buff *skb, struct sock *sk) { if (sk && refcount_inc_not_zero(&sk->sk_refcnt)) { skb_orphan(skb); skb->destructor = sock_efree; skb->sk = sk; return true; } return false; } static inline struct sk_buff *skb_clone_and_charge_r(struct sk_buff *skb, struct sock *sk) { skb = skb_clone(skb, sk_gfp_mask(sk, GFP_ATOMIC)); if (skb) { if (sk_rmem_schedule(sk, skb, skb->truesize)) { skb_set_owner_r(skb, sk); return skb; } __kfree_skb(skb); } return NULL; } static inline void skb_prepare_for_gro(struct sk_buff *skb) { if (skb->destructor != sock_wfree) { skb_orphan(skb); return; } skb->slow_gro = 1; } void sk_reset_timer(struct sock *sk, struct timer_list *timer, unsigned long expires); void sk_stop_timer(struct sock *sk, struct timer_list *timer); void sk_stop_timer_sync(struct sock *sk, struct timer_list *timer); int __sk_queue_drop_skb(struct sock *sk, struct sk_buff_head *sk_queue, struct sk_buff *skb, unsigned int flags, void (*destructor)(struct sock *sk, struct sk_buff *skb)); int __sock_queue_rcv_skb(struct sock *sk, struct sk_buff *skb); int sock_queue_rcv_skb_reason(struct sock *sk, struct sk_buff *skb, enum skb_drop_reason *reason); static inline int sock_queue_rcv_skb(struct sock *sk, struct sk_buff *skb) { return sock_queue_rcv_skb_reason(sk, skb, NULL); } int sock_queue_err_skb(struct sock *sk, struct sk_buff *skb); struct sk_buff *sock_dequeue_err_skb(struct sock *sk); /* * Recover an error report and clear atomically */ static inline int sock_error(struct sock *sk) { int err; /* Avoid an atomic operation for the common case. * This is racy since another cpu/thread can change sk_err under us. */ if (likely(data_race(!sk->sk_err))) return 0; err = xchg(&sk->sk_err, 0); return -err; } void sk_error_report(struct sock *sk); static inline unsigned long sock_wspace(struct sock *sk) { int amt = 0; if (!(sk->sk_shutdown & SEND_SHUTDOWN)) { amt = sk->sk_sndbuf - refcount_read(&sk->sk_wmem_alloc); if (amt < 0) amt = 0; } return amt; } /* Note: * We use sk->sk_wq_raw, from contexts knowing this * pointer is not NULL and cannot disappear/change. */ static inline void sk_set_bit(int nr, struct sock *sk) { if ((nr == SOCKWQ_ASYNC_NOSPACE || nr == SOCKWQ_ASYNC_WAITDATA) && !sock_flag(sk, SOCK_FASYNC)) return; set_bit(nr, &sk->sk_wq_raw->flags); } static inline void sk_clear_bit(int nr, struct sock *sk) { if ((nr == SOCKWQ_ASYNC_NOSPACE || nr == SOCKWQ_ASYNC_WAITDATA) && !sock_flag(sk, SOCK_FASYNC)) return; clear_bit(nr, &sk->sk_wq_raw->flags); } static inline void sk_wake_async(const struct sock *sk, int how, int band) { if (sock_flag(sk, SOCK_FASYNC)) { rcu_read_lock(); sock_wake_async(rcu_dereference(sk->sk_wq), how, band); rcu_read_unlock(); } } static inline void sk_wake_async_rcu(const struct sock *sk, int how, int band) { if (unlikely(sock_flag(sk, SOCK_FASYNC))) sock_wake_async(rcu_dereference(sk->sk_wq), how, band); } /* Since sk_{r,w}mem_alloc sums skb->truesize, even a small frame might * need sizeof(sk_buff) + MTU + padding, unless net driver perform copybreak. * Note: for send buffers, TCP works better if we can build two skbs at * minimum. */ #define TCP_SKB_MIN_TRUESIZE (2048 + SKB_DATA_ALIGN(sizeof(struct sk_buff))) #define SOCK_MIN_SNDBUF (TCP_SKB_MIN_TRUESIZE * 2) #define SOCK_MIN_RCVBUF TCP_SKB_MIN_TRUESIZE static inline void sk_stream_moderate_sndbuf(struct sock *sk) { u32 val; if (sk->sk_userlocks & SOCK_SNDBUF_LOCK) return; val = min(sk->sk_sndbuf, sk->sk_wmem_queued >> 1); val = max_t(u32, val, sk_unused_reserved_mem(sk)); WRITE_ONCE(sk->sk_sndbuf, max_t(u32, val, SOCK_MIN_SNDBUF)); } /** * sk_page_frag - return an appropriate page_frag * @sk: socket * * Use the per task page_frag instead of the per socket one for * optimization when we know that we're in process context and own * everything that's associated with %current. * * Both direct reclaim and page faults can nest inside other * socket operations and end up recursing into sk_page_frag() * while it's already in use: explicitly avoid task page_frag * when users disable sk_use_task_frag. * * Return: a per task page_frag if context allows that, * otherwise a per socket one. */ static inline struct page_frag *sk_page_frag(struct sock *sk) { if (sk->sk_use_task_frag) return &current->task_frag; return &sk->sk_frag; } bool sk_page_frag_refill(struct sock *sk, struct page_frag *pfrag); /* * Default write policy as shown to user space via poll/select/SIGIO */ static inline bool sock_writeable(const struct sock *sk) { return refcount_read(&sk->sk_wmem_alloc) < (READ_ONCE(sk->sk_sndbuf) >> 1); } static inline gfp_t gfp_any(void) { return in_softirq() ? GFP_ATOMIC : GFP_KERNEL; } static inline gfp_t gfp_memcg_charge(void) { return in_softirq() ? GFP_ATOMIC : GFP_KERNEL; } static inline long sock_rcvtimeo(const struct sock *sk, bool noblock) { return noblock ? 0 : READ_ONCE(sk->sk_rcvtimeo); } static inline long sock_sndtimeo(const struct sock *sk, bool noblock) { return noblock ? 0 : READ_ONCE(sk->sk_sndtimeo); } static inline int sock_rcvlowat(const struct sock *sk, int waitall, int len) { int v = waitall ? len : min_t(int, READ_ONCE(sk->sk_rcvlowat), len); return v ?: 1; } /* Alas, with timeout socket operations are not restartable. * Compare this to poll(). */ static inline int sock_intr_errno(long timeo) { return timeo == MAX_SCHEDULE_TIMEOUT ? -ERESTARTSYS : -EINTR; } struct sock_skb_cb { u32 dropcount; }; /* Store sock_skb_cb at the end of skb->cb[] so protocol families * using skb->cb[] would keep using it directly and utilize its * alignment guarantee. */ #define SOCK_SKB_CB_OFFSET (sizeof_field(struct sk_buff, cb) - \ sizeof(struct sock_skb_cb)) #define SOCK_SKB_CB(__skb) ((struct sock_skb_cb *)((__skb)->cb + \ SOCK_SKB_CB_OFFSET)) #define sock_skb_cb_check_size(size) \ BUILD_BUG_ON((size) > SOCK_SKB_CB_OFFSET) static inline void sock_skb_set_dropcount(const struct sock *sk, struct sk_buff *skb) { SOCK_SKB_CB(skb)->dropcount = sock_flag(sk, SOCK_RXQ_OVFL) ? atomic_read(&sk->sk_drops) : 0; } static inline void sk_drops_add(struct sock *sk, const struct sk_buff *skb) { int segs = max_t(u16, 1, skb_shinfo(skb)->gso_segs); atomic_add(segs, &sk->sk_drops); } static inline ktime_t sock_read_timestamp(struct sock *sk) { #if BITS_PER_LONG==32 unsigned int seq; ktime_t kt; do { seq = read_seqbegin(&sk->sk_stamp_seq); kt = sk->sk_stamp; } while (read_seqretry(&sk->sk_stamp_seq, seq)); return kt; #else return READ_ONCE(sk->sk_stamp); #endif } static inline void sock_write_timestamp(struct sock *sk, ktime_t kt) { #if BITS_PER_LONG==32 write_seqlock(&sk->sk_stamp_seq); sk->sk_stamp = kt; write_sequnlock(&sk->sk_stamp_seq); #else WRITE_ONCE(sk->sk_stamp, kt); #endif } void __sock_recv_timestamp(struct msghdr *msg, struct sock *sk, struct sk_buff *skb); void __sock_recv_wifi_status(struct msghdr *msg, struct sock *sk, struct sk_buff *skb); bool skb_has_tx_timestamp(struct sk_buff *skb, const struct sock *sk); int skb_get_tx_timestamp(struct sk_buff *skb, struct sock *sk, struct timespec64 *ts); static inline void sock_recv_timestamp(struct msghdr *msg, struct sock *sk, struct sk_buff *skb) { struct skb_shared_hwtstamps *hwtstamps = skb_hwtstamps(skb); u32 tsflags = READ_ONCE(sk->sk_tsflags); ktime_t kt = skb->tstamp; /* * generate control messages if * - receive time stamping in software requested * - software time stamp available and wanted * - hardware time stamps available and wanted */ if (sock_flag(sk, SOCK_RCVTSTAMP) || (tsflags & SOF_TIMESTAMPING_RX_SOFTWARE) || (kt && tsflags & SOF_TIMESTAMPING_SOFTWARE) || (hwtstamps->hwtstamp && (tsflags & SOF_TIMESTAMPING_RAW_HARDWARE))) __sock_recv_timestamp(msg, sk, skb); else sock_write_timestamp(sk, kt); if (sock_flag(sk, SOCK_WIFI_STATUS) && skb_wifi_acked_valid(skb)) __sock_recv_wifi_status(msg, sk, skb); } void __sock_recv_cmsgs(struct msghdr *msg, struct sock *sk, struct sk_buff *skb); #define SK_DEFAULT_STAMP (-1L * NSEC_PER_SEC) static inline void sock_recv_cmsgs(struct msghdr *msg, struct sock *sk, struct sk_buff *skb) { #define FLAGS_RECV_CMSGS ((1UL << SOCK_RXQ_OVFL) | \ (1UL << SOCK_RCVTSTAMP) | \ (1UL << SOCK_RCVMARK) | \ (1UL << SOCK_RCVPRIORITY) | \ (1UL << SOCK_TIMESTAMPING_ANY)) #define TSFLAGS_ANY (SOF_TIMESTAMPING_SOFTWARE | \ SOF_TIMESTAMPING_RAW_HARDWARE) if (READ_ONCE(sk->sk_flags) & FLAGS_RECV_CMSGS) __sock_recv_cmsgs(msg, sk, skb); else if (unlikely(sock_flag(sk, SOCK_TIMESTAMP))) sock_write_timestamp(sk, skb->tstamp); else if (unlikely(sock_read_timestamp(sk) == SK_DEFAULT_STAMP)) sock_write_timestamp(sk, 0); } void __sock_tx_timestamp(__u32 tsflags, __u8 *tx_flags); /** * _sock_tx_timestamp - checks whether the outgoing packet is to be time stamped * @sk: socket sending this packet * @sockc: pointer to socket cmsg cookie to get timestamping info * @tx_flags: completed with instructions for time stamping * @tskey: filled in with next sk_tskey (not for TCP, which uses seqno) * * Note: callers should take care of initial ``*tx_flags`` value (usually 0) */ static inline void _sock_tx_timestamp(struct sock *sk, const struct sockcm_cookie *sockc, __u8 *tx_flags, __u32 *tskey) { __u32 tsflags = sockc->tsflags; if (unlikely(tsflags)) { __sock_tx_timestamp(tsflags, tx_flags); if (tsflags & SOF_TIMESTAMPING_OPT_ID && tskey && tsflags & SOF_TIMESTAMPING_TX_RECORD_MASK) { if (tsflags & SOCKCM_FLAG_TS_OPT_ID) *tskey = sockc->ts_opt_id; else *tskey = atomic_inc_return(&sk->sk_tskey) - 1; } } } static inline void sock_tx_timestamp(struct sock *sk, const struct sockcm_cookie *sockc, __u8 *tx_flags) { _sock_tx_timestamp(sk, sockc, tx_flags, NULL); } static inline void skb_setup_tx_timestamp(struct sk_buff *skb, const struct sockcm_cookie *sockc) { _sock_tx_timestamp(skb->sk, sockc, &skb_shinfo(skb)->tx_flags, &skb_shinfo(skb)->tskey); } static inline bool sk_is_inet(const struct sock *sk) { int family = READ_ONCE(sk->sk_family); return family == AF_INET || family == AF_INET6; } static inline bool sk_is_tcp(const struct sock *sk) { return sk_is_inet(sk) && sk->sk_type == SOCK_STREAM && sk->sk_protocol == IPPROTO_TCP; } static inline bool sk_is_udp(const struct sock *sk) { return sk_is_inet(sk) && sk->sk_type == SOCK_DGRAM && sk->sk_protocol == IPPROTO_UDP; } static inline bool sk_is_unix(const struct sock *sk) { return sk->sk_family == AF_UNIX; } static inline bool sk_is_stream_unix(const struct sock *sk) { return sk_is_unix(sk) && sk->sk_type == SOCK_STREAM; } static inline bool sk_is_vsock(const struct sock *sk) { return sk->sk_family == AF_VSOCK; } static inline bool sk_may_scm_recv(const struct sock *sk) { return (IS_ENABLED(CONFIG_UNIX) && sk->sk_family == AF_UNIX) || sk->sk_family == AF_NETLINK || (IS_ENABLED(CONFIG_BT) && sk->sk_family == AF_BLUETOOTH); } /** * sk_eat_skb - Release a skb if it is no longer needed * @sk: socket to eat this skb from * @skb: socket buffer to eat * * This routine must be called with interrupts disabled or with the socket * locked so that the sk_buff queue operation is ok. */ static inline void sk_eat_skb(struct sock *sk, struct sk_buff *skb) { __skb_unlink(skb, &sk->sk_receive_queue); __kfree_skb(skb); } static inline bool skb_sk_is_prefetched(struct sk_buff *skb) { #ifdef CONFIG_INET return skb->destructor == sock_pfree; #else return false; #endif /* CONFIG_INET */ } /* This helper checks if a socket is a full socket, * ie _not_ a timewait or request socket. */ static inline bool sk_fullsock(const struct sock *sk) { return (1 << sk->sk_state) & ~(TCPF_TIME_WAIT | TCPF_NEW_SYN_RECV); } static inline bool sk_is_refcounted(struct sock *sk) { /* Only full sockets have sk->sk_flags. */ return !sk_fullsock(sk) || !sock_flag(sk, SOCK_RCU_FREE); } static inline bool sk_requests_wifi_status(struct sock *sk) { return sk && sk_fullsock(sk) && sock_flag(sk, SOCK_WIFI_STATUS); } /* Checks if this SKB belongs to an HW offloaded socket * and whether any SW fallbacks are required based on dev. * Check decrypted mark in case skb_orphan() cleared socket. */ static inline struct sk_buff *sk_validate_xmit_skb(struct sk_buff *skb, struct net_device *dev) { #ifdef CONFIG_SOCK_VALIDATE_XMIT struct sock *sk = skb->sk; if (sk && sk_fullsock(sk) && sk->sk_validate_xmit_skb) { skb = sk->sk_validate_xmit_skb(sk, dev, skb); } else if (unlikely(skb_is_decrypted(skb))) { pr_warn_ratelimited("unencrypted skb with no associated socket - dropping\n"); kfree_skb(skb); skb = NULL; } #endif return skb; } /* This helper checks if a socket is a LISTEN or NEW_SYN_RECV * SYNACK messages can be attached to either ones (depending on SYNCOOKIE) */ static inline bool sk_listener(const struct sock *sk) { return (1 << sk->sk_state) & (TCPF_LISTEN | TCPF_NEW_SYN_RECV); } /* This helper checks if a socket is a LISTEN or NEW_SYN_RECV or TIME_WAIT * TCP SYNACK messages can be attached to LISTEN or NEW_SYN_RECV (depending on SYNCOOKIE) * TCP RST and ACK can be attached to TIME_WAIT. */ static inline bool sk_listener_or_tw(const struct sock *sk) { return (1 << READ_ONCE(sk->sk_state)) & (TCPF_LISTEN | TCPF_NEW_SYN_RECV | TCPF_TIME_WAIT); } void sock_enable_timestamp(struct sock *sk, enum sock_flags flag); int sock_recv_errqueue(struct sock *sk, struct msghdr *msg, int len, int level, int type); bool sk_ns_capable(const struct sock *sk, struct user_namespace *user_ns, int cap); bool sk_capable(const struct sock *sk, int cap); bool sk_net_capable(const struct sock *sk, int cap); void sk_get_meminfo(const struct sock *sk, u32 *meminfo); /* Take into consideration the size of the struct sk_buff overhead in the * determination of these values, since that is non-constant across * platforms. This makes socket queueing behavior and performance * not depend upon such differences. */ #define _SK_MEM_PACKETS 256 #define _SK_MEM_OVERHEAD SKB_TRUESIZE(256) #define SK_WMEM_MAX (_SK_MEM_OVERHEAD * _SK_MEM_PACKETS) #define SK_RMEM_MAX (_SK_MEM_OVERHEAD * _SK_MEM_PACKETS) extern __u32 sysctl_wmem_max; extern __u32 sysctl_rmem_max; extern __u32 sysctl_wmem_default; extern __u32 sysctl_rmem_default; #define SKB_FRAG_PAGE_ORDER get_order(32768) DECLARE_STATIC_KEY_FALSE(net_high_order_alloc_disable_key); static inline int sk_get_wmem0(const struct sock *sk, const struct proto *proto) { /* Does this proto have per netns sysctl_wmem ? */ if (proto->sysctl_wmem_offset) return READ_ONCE(*(int *)((void *)sock_net(sk) + proto->sysctl_wmem_offset)); return READ_ONCE(*proto->sysctl_wmem); } static inline int sk_get_rmem0(const struct sock *sk, const struct proto *proto) { /* Does this proto have per netns sysctl_rmem ? */ if (proto->sysctl_rmem_offset) return READ_ONCE(*(int *)((void *)sock_net(sk) + proto->sysctl_rmem_offset)); return READ_ONCE(*proto->sysctl_rmem); } /* Default TCP Small queue budget is ~1 ms of data (1sec >> 10) * Some wifi drivers need to tweak it to get more chunks. * They can use this helper from their ndo_start_xmit() */ static inline void sk_pacing_shift_update(struct sock *sk, int val) { if (!sk || !sk_fullsock(sk) || READ_ONCE(sk->sk_pacing_shift) == val) return; WRITE_ONCE(sk->sk_pacing_shift, val); } /* if a socket is bound to a device, check that the given device * index is either the same or that the socket is bound to an L3 * master device and the given device index is also enslaved to * that L3 master */ static inline bool sk_dev_equal_l3scope(struct sock *sk, int dif) { int bound_dev_if = READ_ONCE(sk->sk_bound_dev_if); int mdif; if (!bound_dev_if || bound_dev_if == dif) return true; mdif = l3mdev_master_ifindex_by_index(sock_net(sk), dif); if (mdif && mdif == bound_dev_if) return true; return false; } void sock_def_readable(struct sock *sk); int sock_bindtoindex(struct sock *sk, int ifindex, bool lock_sk); void sock_set_timestamp(struct sock *sk, int optname, bool valbool); int sock_set_timestamping(struct sock *sk, int optname, struct so_timestamping timestamping); #if defined(CONFIG_CGROUP_BPF) void bpf_skops_tx_timestamping(struct sock *sk, struct sk_buff *skb, int op); #else static inline void bpf_skops_tx_timestamping(struct sock *sk, struct sk_buff *skb, int op) { } #endif void sock_no_linger(struct sock *sk); void sock_set_keepalive(struct sock *sk); void sock_set_priority(struct sock *sk, u32 priority); void sock_set_rcvbuf(struct sock *sk, int val); void sock_set_mark(struct sock *sk, u32 val); void sock_set_reuseaddr(struct sock *sk); void sock_set_reuseport(struct sock *sk); void sock_set_sndtimeo(struct sock *sk, s64 secs); int sock_bind_add(struct sock *sk, struct sockaddr *addr, int addr_len); int sock_get_timeout(long timeo, void *optval, bool old_timeval); int sock_copy_user_timeval(struct __kernel_sock_timeval *tv, sockptr_t optval, int optlen, bool old_timeval); int sock_ioctl_inout(struct sock *sk, unsigned int cmd, void __user *arg, void *karg, size_t size); int sk_ioctl(struct sock *sk, unsigned int cmd, void __user *arg); static inline bool sk_is_readable(struct sock *sk) { const struct proto *prot = READ_ONCE(sk->sk_prot); if (prot->sock_is_readable) return prot->sock_is_readable(sk); return false; } #endif /* _SOCK_H */
83 28 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 /* SPDX-License-Identifier: GPL-2.0 */ #ifndef __ASM_CURRENT_H #define __ASM_CURRENT_H #include <linux/compiler.h> #ifndef __ASSEMBLY__ struct task_struct; /* * We don't use read_sysreg() as we want the compiler to cache the value where * possible. */ static __always_inline struct task_struct *get_current(void) { unsigned long sp_el0; asm ("mrs %0, sp_el0" : "=r" (sp_el0)); return (struct task_struct *)sp_el0; } #define current get_current() #endif /* __ASSEMBLY__ */ #endif /* __ASM_CURRENT_H */
232 211 204 321 152 203 152 204 232 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 // SPDX-License-Identifier: GPL-2.0-only /* * Copyright (C) 2020 Google LLC * Author: Will Deacon <will@kernel.org> */ #ifndef __ARM64_KVM_PGTABLE_H__ #define __ARM64_KVM_PGTABLE_H__ #include <linux/bits.h> #include <linux/kvm_host.h> #include <linux/types.h> #define KVM_PGTABLE_FIRST_LEVEL -1 #define KVM_PGTABLE_LAST_LEVEL 3 /* * The largest supported block sizes for KVM (no 52-bit PA support): * - 4K (level 1): 1GB * - 16K (level 2): 32MB * - 64K (level 2): 512MB */ #ifdef CONFIG_ARM64_4K_PAGES #define KVM_PGTABLE_MIN_BLOCK_LEVEL 1 #else #define KVM_PGTABLE_MIN_BLOCK_LEVEL 2 #endif #define kvm_lpa2_is_enabled() system_supports_lpa2() static inline u64 kvm_get_parange_max(void) { if (kvm_lpa2_is_enabled() || (IS_ENABLED(CONFIG_ARM64_PA_BITS_52) && PAGE_SHIFT == 16)) return ID_AA64MMFR0_EL1_PARANGE_52; else return ID_AA64MMFR0_EL1_PARANGE_48; } static inline u64 kvm_get_parange(u64 mmfr0) { u64 parange_max = kvm_get_parange_max(); u64 parange = cpuid_feature_extract_unsigned_field(mmfr0, ID_AA64MMFR0_EL1_PARANGE_SHIFT); if (parange > parange_max) parange = parange_max; return parange; } typedef u64 kvm_pte_t; #define KVM_PTE_VALID BIT(0) #define KVM_PTE_ADDR_MASK GENMASK(47, PAGE_SHIFT) #define KVM_PTE_ADDR_51_48 GENMASK(15, 12) #define KVM_PTE_ADDR_MASK_LPA2 GENMASK(49, PAGE_SHIFT) #define KVM_PTE_ADDR_51_50_LPA2 GENMASK(9, 8) #define KVM_PHYS_INVALID (-1ULL) #define KVM_PTE_TYPE BIT(1) #define KVM_PTE_TYPE_BLOCK 0 #define KVM_PTE_TYPE_PAGE 1 #define KVM_PTE_TYPE_TABLE 1 #define KVM_PTE_LEAF_ATTR_LO GENMASK(11, 2) #define KVM_PTE_LEAF_ATTR_LO_S1_ATTRIDX GENMASK(4, 2) #define KVM_PTE_LEAF_ATTR_LO_S1_AP GENMASK(7, 6) #define KVM_PTE_LEAF_ATTR_LO_S1_AP_RO \ ({ cpus_have_final_cap(ARM64_KVM_HVHE) ? 2 : 3; }) #define KVM_PTE_LEAF_ATTR_LO_S1_AP_RW \ ({ cpus_have_final_cap(ARM64_KVM_HVHE) ? 0 : 1; }) #define KVM_PTE_LEAF_ATTR_LO_S1_SH GENMASK(9, 8) #define KVM_PTE_LEAF_ATTR_LO_S1_SH_IS 3 #define KVM_PTE_LEAF_ATTR_LO_S1_AF BIT(10) #define KVM_PTE_LEAF_ATTR_LO_S2_MEMATTR GENMASK(5, 2) #define KVM_PTE_LEAF_ATTR_LO_S2_S2AP_R BIT(6) #define KVM_PTE_LEAF_ATTR_LO_S2_S2AP_W BIT(7) #define KVM_PTE_LEAF_ATTR_LO_S2_SH GENMASK(9, 8) #define KVM_PTE_LEAF_ATTR_LO_S2_SH_IS 3 #define KVM_PTE_LEAF_ATTR_LO_S2_AF BIT(10) #define KVM_PTE_LEAF_ATTR_HI GENMASK(63, 50) #define KVM_PTE_LEAF_ATTR_HI_SW GENMASK(58, 55) #define KVM_PTE_LEAF_ATTR_HI_S1_XN BIT(54) #define KVM_PTE_LEAF_ATTR_HI_S2_XN BIT(54) #define KVM_PTE_LEAF_ATTR_HI_S1_GP BIT(50) #define KVM_PTE_LEAF_ATTR_S2_PERMS (KVM_PTE_LEAF_ATTR_LO_S2_S2AP_R | \ KVM_PTE_LEAF_ATTR_LO_S2_S2AP_W | \ KVM_PTE_LEAF_ATTR_HI_S2_XN) #define KVM_INVALID_PTE_OWNER_MASK GENMASK(9, 2) #define KVM_MAX_OWNER_ID 1 /* * Used to indicate a pte for which a 'break-before-make' sequence is in * progress. */ #define KVM_INVALID_PTE_LOCKED BIT(10) static inline bool kvm_pte_valid(kvm_pte_t pte) { return pte & KVM_PTE_VALID; } static inline u64 kvm_pte_to_phys(kvm_pte_t pte) { u64 pa; if (kvm_lpa2_is_enabled()) { pa = pte & KVM_PTE_ADDR_MASK_LPA2; pa |= FIELD_GET(KVM_PTE_ADDR_51_50_LPA2, pte) << 50; } else { pa = pte & KVM_PTE_ADDR_MASK; if (PAGE_SHIFT == 16) pa |= FIELD_GET(KVM_PTE_ADDR_51_48, pte) << 48; } return pa; } static inline kvm_pte_t kvm_phys_to_pte(u64 pa) { kvm_pte_t pte; if (kvm_lpa2_is_enabled()) { pte = pa & KVM_PTE_ADDR_MASK_LPA2; pa &= GENMASK(51, 50); pte |= FIELD_PREP(KVM_PTE_ADDR_51_50_LPA2, pa >> 50); } else { pte = pa & KVM_PTE_ADDR_MASK; if (PAGE_SHIFT == 16) { pa &= GENMASK(51, 48); pte |= FIELD_PREP(KVM_PTE_ADDR_51_48, pa >> 48); } } return pte; } static inline kvm_pfn_t kvm_pte_to_pfn(kvm_pte_t pte) { return __phys_to_pfn(kvm_pte_to_phys(pte)); } static inline u64 kvm_granule_shift(s8 level) { /* Assumes KVM_PGTABLE_LAST_LEVEL is 3 */ return ARM64_HW_PGTABLE_LEVEL_SHIFT(level); } static inline u64 kvm_granule_size(s8 level) { return BIT(kvm_granule_shift(level)); } static inline bool kvm_level_supports_block_mapping(s8 level) { return level >= KVM_PGTABLE_MIN_BLOCK_LEVEL; } static inline u32 kvm_supported_block_sizes(void) { s8 level = KVM_PGTABLE_MIN_BLOCK_LEVEL; u32 r = 0; for (; level <= KVM_PGTABLE_LAST_LEVEL; level++) r |= BIT(kvm_granule_shift(level)); return r; } static inline bool kvm_is_block_size_supported(u64 size) { bool is_power_of_two = IS_ALIGNED(size, size); return is_power_of_two && (size & kvm_supported_block_sizes()); } /** * struct kvm_pgtable_mm_ops - Memory management callbacks. * @zalloc_page: Allocate a single zeroed memory page. * The @arg parameter can be used by the walker * to pass a memcache. The initial refcount of * the page is 1. * @zalloc_pages_exact: Allocate an exact number of zeroed memory pages. * The @size parameter is in bytes, and is rounded * up to the next page boundary. The resulting * allocation is physically contiguous. * @free_pages_exact: Free an exact number of memory pages previously * allocated by zalloc_pages_exact. * @free_unlinked_table: Free an unlinked paging structure by unlinking and * dropping references. * @get_page: Increment the refcount on a page. * @put_page: Decrement the refcount on a page. When the * refcount reaches 0 the page is automatically * freed. * @page_count: Return the refcount of a page. * @phys_to_virt: Convert a physical address into a virtual * address mapped in the current context. * @virt_to_phys: Convert a virtual address mapped in the current * context into a physical address. * @dcache_clean_inval_poc: Clean and invalidate the data cache to the PoC * for the specified memory address range. * @icache_inval_pou: Invalidate the instruction cache to the PoU * for the specified memory address range. */ struct kvm_pgtable_mm_ops { void* (*zalloc_page)(void *arg); void* (*zalloc_pages_exact)(size_t size); void (*free_pages_exact)(void *addr, size_t size); void (*free_unlinked_table)(void *addr, s8 level); void (*get_page)(void *addr); void (*put_page)(void *addr); int (*page_count)(void *addr); void* (*phys_to_virt)(phys_addr_t phys); phys_addr_t (*virt_to_phys)(void *addr); void (*dcache_clean_inval_poc)(void *addr, size_t size); void (*icache_inval_pou)(void *addr, size_t size); }; /** * enum kvm_pgtable_stage2_flags - Stage-2 page-table flags. * @KVM_PGTABLE_S2_NOFWB: Don't enforce Normal-WB even if the CPUs have * ARM64_HAS_STAGE2_FWB. * @KVM_PGTABLE_S2_IDMAP: Only use identity mappings. */ enum kvm_pgtable_stage2_flags { KVM_PGTABLE_S2_NOFWB = BIT(0), KVM_PGTABLE_S2_IDMAP = BIT(1), }; /** * enum kvm_pgtable_prot - Page-table permissions and attributes. * @KVM_PGTABLE_PROT_X: Execute permission. * @KVM_PGTABLE_PROT_W: Write permission. * @KVM_PGTABLE_PROT_R: Read permission. * @KVM_PGTABLE_PROT_DEVICE: Device attributes. * @KVM_PGTABLE_PROT_NORMAL_NC: Normal noncacheable attributes. * @KVM_PGTABLE_PROT_SW0: Software bit 0. * @KVM_PGTABLE_PROT_SW1: Software bit 1. * @KVM_PGTABLE_PROT_SW2: Software bit 2. * @KVM_PGTABLE_PROT_SW3: Software bit 3. */ enum kvm_pgtable_prot { KVM_PGTABLE_PROT_X = BIT(0), KVM_PGTABLE_PROT_W = BIT(1), KVM_PGTABLE_PROT_R = BIT(2), KVM_PGTABLE_PROT_DEVICE = BIT(3), KVM_PGTABLE_PROT_NORMAL_NC = BIT(4), KVM_PGTABLE_PROT_SW0 = BIT(55), KVM_PGTABLE_PROT_SW1 = BIT(56), KVM_PGTABLE_PROT_SW2 = BIT(57), KVM_PGTABLE_PROT_SW3 = BIT(58), }; #define KVM_PGTABLE_PROT_RW (KVM_PGTABLE_PROT_R | KVM_PGTABLE_PROT_W) #define KVM_PGTABLE_PROT_RWX (KVM_PGTABLE_PROT_RW | KVM_PGTABLE_PROT_X) #define PKVM_HOST_MEM_PROT KVM_PGTABLE_PROT_RWX #define PKVM_HOST_MMIO_PROT KVM_PGTABLE_PROT_RW #define PAGE_HYP KVM_PGTABLE_PROT_RW #define PAGE_HYP_EXEC (KVM_PGTABLE_PROT_R | KVM_PGTABLE_PROT_X) #define PAGE_HYP_RO (KVM_PGTABLE_PROT_R) #define PAGE_HYP_DEVICE (PAGE_HYP | KVM_PGTABLE_PROT_DEVICE) typedef bool (*kvm_pgtable_force_pte_cb_t)(u64 addr, u64 end, enum kvm_pgtable_prot prot); /** * enum kvm_pgtable_walk_flags - Flags to control a depth-first page-table walk. * @KVM_PGTABLE_WALK_LEAF: Visit leaf entries, including invalid * entries. * @KVM_PGTABLE_WALK_TABLE_PRE: Visit table entries before their * children. * @KVM_PGTABLE_WALK_TABLE_POST: Visit table entries after their * children. * @KVM_PGTABLE_WALK_SHARED: Indicates the page-tables may be shared * with other software walkers. * @KVM_PGTABLE_WALK_HANDLE_FAULT: Indicates the page-table walk was * invoked from a fault handler. * @KVM_PGTABLE_WALK_SKIP_BBM_TLBI: Visit and update table entries * without Break-before-make's * TLB invalidation. * @KVM_PGTABLE_WALK_SKIP_CMO: Visit and update table entries * without Cache maintenance * operations required. */ enum kvm_pgtable_walk_flags { KVM_PGTABLE_WALK_LEAF = BIT(0), KVM_PGTABLE_WALK_TABLE_PRE = BIT(1), KVM_PGTABLE_WALK_TABLE_POST = BIT(2), KVM_PGTABLE_WALK_SHARED = BIT(3), KVM_PGTABLE_WALK_HANDLE_FAULT = BIT(4), KVM_PGTABLE_WALK_SKIP_BBM_TLBI = BIT(5), KVM_PGTABLE_WALK_SKIP_CMO = BIT(6), }; struct kvm_pgtable_visit_ctx { kvm_pte_t *ptep; kvm_pte_t old; void *arg; struct kvm_pgtable_mm_ops *mm_ops; u64 start; u64 addr; u64 end; s8 level; enum kvm_pgtable_walk_flags flags; }; typedef int (*kvm_pgtable_visitor_fn_t)(const struct kvm_pgtable_visit_ctx *ctx, enum kvm_pgtable_walk_flags visit); static inline bool kvm_pgtable_walk_shared(const struct kvm_pgtable_visit_ctx *ctx) { return ctx->flags & KVM_PGTABLE_WALK_SHARED; } /** * struct kvm_pgtable_walker - Hook into a page-table walk. * @cb: Callback function to invoke during the walk. * @arg: Argument passed to the callback function. * @flags: Bitwise-OR of flags to identify the entry types on which to * invoke the callback function. */ struct kvm_pgtable_walker { const kvm_pgtable_visitor_fn_t cb; void * const arg; const enum kvm_pgtable_walk_flags flags; }; /* * RCU cannot be used in a non-kernel context such as the hyp. As such, page * table walkers used in hyp do not call into RCU and instead use other * synchronization mechanisms (such as a spinlock). */ #if defined(__KVM_NVHE_HYPERVISOR__) || defined(__KVM_VHE_HYPERVISOR__) typedef kvm_pte_t *kvm_pteref_t; static inline kvm_pte_t *kvm_dereference_pteref(struct kvm_pgtable_walker *walker, kvm_pteref_t pteref) { return pteref; } static inline kvm_pte_t *kvm_dereference_pteref_raw(kvm_pteref_t pteref) { return pteref; } static inline int kvm_pgtable_walk_begin(struct kvm_pgtable_walker *walker) { /* * Due to the lack of RCU (or a similar protection scheme), only * non-shared table walkers are allowed in the hypervisor. */ if (walker->flags & KVM_PGTABLE_WALK_SHARED) return -EPERM; return 0; } static inline void kvm_pgtable_walk_end(struct kvm_pgtable_walker *walker) {} static inline bool kvm_pgtable_walk_lock_held(void) { return true; } #else typedef kvm_pte_t __rcu *kvm_pteref_t; static inline kvm_pte_t *kvm_dereference_pteref(struct kvm_pgtable_walker *walker, kvm_pteref_t pteref) { return rcu_dereference_check(pteref, !(walker->flags & KVM_PGTABLE_WALK_SHARED)); } static inline kvm_pte_t *kvm_dereference_pteref_raw(kvm_pteref_t pteref) { return rcu_dereference_raw(pteref); } static inline int kvm_pgtable_walk_begin(struct kvm_pgtable_walker *walker) { if (walker->flags & KVM_PGTABLE_WALK_SHARED) rcu_read_lock(); return 0; } static inline void kvm_pgtable_walk_end(struct kvm_pgtable_walker *walker) { if (walker->flags & KVM_PGTABLE_WALK_SHARED) rcu_read_unlock(); } static inline bool kvm_pgtable_walk_lock_held(void) { return rcu_read_lock_held(); } #endif /** * struct kvm_pgtable - KVM page-table. * @ia_bits: Maximum input address size, in bits. * @start_level: Level at which the page-table walk starts. * @pgd: Pointer to the first top-level entry of the page-table. * @mm_ops: Memory management callbacks. * @mmu: Stage-2 KVM MMU struct. Unused for stage-1 page-tables. * @flags: Stage-2 page-table flags. * @force_pte_cb: Function that returns true if page level mappings must * be used instead of block mappings. */ struct kvm_pgtable { union { struct rb_root_cached pkvm_mappings; struct { u32 ia_bits; s8 start_level; kvm_pteref_t pgd; struct kvm_pgtable_mm_ops *mm_ops; /* Stage-2 only */ enum kvm_pgtable_stage2_flags flags; kvm_pgtable_force_pte_cb_t force_pte_cb; }; }; struct kvm_s2_mmu *mmu; }; /** * kvm_pgtable_hyp_init() - Initialise a hypervisor stage-1 page-table. * @pgt: Uninitialised page-table structure to initialise. * @va_bits: Maximum virtual address bits. * @mm_ops: Memory management callbacks. * * Return: 0 on success, negative error code on failure. */ int kvm_pgtable_hyp_init(struct kvm_pgtable *pgt, u32 va_bits, struct kvm_pgtable_mm_ops *mm_ops); /** * kvm_pgtable_hyp_destroy() - Destroy an unused hypervisor stage-1 page-table. * @pgt: Page-table structure initialised by kvm_pgtable_hyp_init(). * * The page-table is assumed to be unreachable by any hardware walkers prior * to freeing and therefore no TLB invalidation is performed. */ void kvm_pgtable_hyp_destroy(struct kvm_pgtable *pgt); /** * kvm_pgtable_hyp_map() - Install a mapping in a hypervisor stage-1 page-table. * @pgt: Page-table structure initialised by kvm_pgtable_hyp_init(). * @addr: Virtual address at which to place the mapping. * @size: Size of the mapping. * @phys: Physical address of the memory to map. * @prot: Permissions and attributes for the mapping. * * The offset of @addr within a page is ignored, @size is rounded-up to * the next page boundary and @phys is rounded-down to the previous page * boundary. * * If device attributes are not explicitly requested in @prot, then the * mapping will be normal, cacheable. Attempts to install a new mapping * for a virtual address that is already mapped will be rejected with an * error and a WARN(). * * Return: 0 on success, negative error code on failure. */ int kvm_pgtable_hyp_map(struct kvm_pgtable *pgt, u64 addr, u64 size, u64 phys, enum kvm_pgtable_prot prot); /** * kvm_pgtable_hyp_unmap() - Remove a mapping from a hypervisor stage-1 page-table. * @pgt: Page-table structure initialised by kvm_pgtable_hyp_init(). * @addr: Virtual address from which to remove the mapping. * @size: Size of the mapping. * * The offset of @addr within a page is ignored, @size is rounded-up to * the next page boundary and @phys is rounded-down to the previous page * boundary. * * TLB invalidation is performed for each page-table entry cleared during the * unmapping operation and the reference count for the page-table page * containing the cleared entry is decremented, with unreferenced pages being * freed. The unmapping operation will stop early if it encounters either an * invalid page-table entry or a valid block mapping which maps beyond the range * being unmapped. * * Return: Number of bytes unmapped, which may be 0. */ u64 kvm_pgtable_hyp_unmap(struct kvm_pgtable *pgt, u64 addr, u64 size); /** * kvm_get_vtcr() - Helper to construct VTCR_EL2 * @mmfr0: Sanitized value of SYS_ID_AA64MMFR0_EL1 register. * @mmfr1: Sanitized value of SYS_ID_AA64MMFR1_EL1 register. * @phys_shfit: Value to set in VTCR_EL2.T0SZ. * * The VTCR value is common across all the physical CPUs on the system. * We use system wide sanitised values to fill in different fields, * except for Hardware Management of Access Flags. HA Flag is set * unconditionally on all CPUs, as it is safe to run with or without * the feature and the bit is RES0 on CPUs that don't support it. * * Return: VTCR_EL2 value */ u64 kvm_get_vtcr(u64 mmfr0, u64 mmfr1, u32 phys_shift); /** * kvm_pgtable_stage2_pgd_size() - Helper to compute size of a stage-2 PGD * @vtcr: Content of the VTCR register. * * Return: the size (in bytes) of the stage-2 PGD */ size_t kvm_pgtable_stage2_pgd_size(u64 vtcr); /** * __kvm_pgtable_stage2_init() - Initialise a guest stage-2 page-table. * @pgt: Uninitialised page-table structure to initialise. * @mmu: S2 MMU context for this S2 translation * @mm_ops: Memory management callbacks. * @flags: Stage-2 configuration flags. * @force_pte_cb: Function that returns true if page level mappings must * be used instead of block mappings. * * Return: 0 on success, negative error code on failure. */ int __kvm_pgtable_stage2_init(struct kvm_pgtable *pgt, struct kvm_s2_mmu *mmu, struct kvm_pgtable_mm_ops *mm_ops, enum kvm_pgtable_stage2_flags flags, kvm_pgtable_force_pte_cb_t force_pte_cb); static inline int kvm_pgtable_stage2_init(struct kvm_pgtable *pgt, struct kvm_s2_mmu *mmu, struct kvm_pgtable_mm_ops *mm_ops) { return __kvm_pgtable_stage2_init(pgt, mmu, mm_ops, 0, NULL); } /** * kvm_pgtable_stage2_destroy() - Destroy an unused guest stage-2 page-table. * @pgt: Page-table structure initialised by kvm_pgtable_stage2_init*(). * * The page-table is assumed to be unreachable by any hardware walkers prior * to freeing and therefore no TLB invalidation is performed. */ void kvm_pgtable_stage2_destroy(struct kvm_pgtable *pgt); /** * kvm_pgtable_stage2_destroy_range() - Destroy the unlinked range of addresses. * @pgt: Page-table structure initialised by kvm_pgtable_stage2_init*(). * @addr: Intermediate physical address at which to place the mapping. * @size: Size of the mapping. * * The page-table is assumed to be unreachable by any hardware walkers prior * to freeing and therefore no TLB invalidation is performed. */ void kvm_pgtable_stage2_destroy_range(struct kvm_pgtable *pgt, u64 addr, u64 size); /** * kvm_pgtable_stage2_destroy_pgd() - Destroy the PGD of guest stage-2 page-table. * @pgt: Page-table structure initialised by kvm_pgtable_stage2_init*(). * * It is assumed that the rest of the page-table is freed before this operation. */ void kvm_pgtable_stage2_destroy_pgd(struct kvm_pgtable *pgt); /** * kvm_pgtable_stage2_free_unlinked() - Free an unlinked stage-2 paging structure. * @mm_ops: Memory management callbacks. * @pgtable: Unlinked stage-2 paging structure to be freed. * @level: Level of the stage-2 paging structure to be freed. * * The page-table is assumed to be unreachable by any hardware walkers prior to * freeing and therefore no TLB invalidation is performed. */ void kvm_pgtable_stage2_free_unlinked(struct kvm_pgtable_mm_ops *mm_ops, void *pgtable, s8 level); /** * kvm_pgtable_stage2_create_unlinked() - Create an unlinked stage-2 paging structure. * @pgt: Page-table structure initialised by kvm_pgtable_stage2_init*(). * @phys: Physical address of the memory to map. * @level: Starting level of the stage-2 paging structure to be created. * @prot: Permissions and attributes for the mapping. * @mc: Cache of pre-allocated and zeroed memory from which to allocate * page-table pages. * @force_pte: Force mappings to PAGE_SIZE granularity. * * Returns an unlinked page-table tree. This new page-table tree is * not reachable (i.e., it is unlinked) from the root pgd and it's * therefore unreachableby the hardware page-table walker. No TLB * invalidation or CMOs are performed. * * If device attributes are not explicitly requested in @prot, then the * mapping will be normal, cacheable. * * Return: The fully populated (unlinked) stage-2 paging structure, or * an ERR_PTR(error) on failure. */ kvm_pte_t *kvm_pgtable_stage2_create_unlinked(struct kvm_pgtable *pgt, u64 phys, s8 level, enum kvm_pgtable_prot prot, void *mc, bool force_pte); /** * kvm_pgtable_stage2_map() - Install a mapping in a guest stage-2 page-table. * @pgt: Page-table structure initialised by kvm_pgtable_stage2_init*(). * @addr: Intermediate physical address at which to place the mapping. * @size: Size of the mapping. * @phys: Physical address of the memory to map. * @prot: Permissions and attributes for the mapping. * @mc: Cache of pre-allocated and zeroed memory from which to allocate * page-table pages. * @flags: Flags to control the page-table walk (ex. a shared walk) * * The offset of @addr within a page is ignored, @size is rounded-up to * the next page boundary and @phys is rounded-down to the previous page * boundary. * * If device attributes are not explicitly requested in @prot, then the * mapping will be normal, cacheable. * * Note that the update of a valid leaf PTE in this function will be aborted, * if it's trying to recreate the exact same mapping or only change the access * permissions. Instead, the vCPU will exit one more time from guest if still * needed and then go through the path of relaxing permissions. * * Note that this function will both coalesce existing table entries and split * existing block mappings, relying on page-faults to fault back areas outside * of the new mapping lazily. * * Return: 0 on success, negative error code on failure. */ int kvm_pgtable_stage2_map(struct kvm_pgtable *pgt, u64 addr, u64 size, u64 phys, enum kvm_pgtable_prot prot, void *mc, enum kvm_pgtable_walk_flags flags); /** * kvm_pgtable_stage2_set_owner() - Unmap and annotate pages in the IPA space to * track ownership. * @pgt: Page-table structure initialised by kvm_pgtable_stage2_init*(). * @addr: Base intermediate physical address to annotate. * @size: Size of the annotated range. * @mc: Cache of pre-allocated and zeroed memory from which to allocate * page-table pages. * @owner_id: Unique identifier for the owner of the page. * * By default, all page-tables are owned by identifier 0. This function can be * used to mark portions of the IPA space as owned by other entities. When a * stage 2 is used with identity-mappings, these annotations allow to use the * page-table data structure as a simple rmap. * * Return: 0 on success, negative error code on failure. */ int kvm_pgtable_stage2_set_owner(struct kvm_pgtable *pgt, u64 addr, u64 size, void *mc, u8 owner_id); /** * kvm_pgtable_stage2_unmap() - Remove a mapping from a guest stage-2 page-table. * @pgt: Page-table structure initialised by kvm_pgtable_stage2_init*(). * @addr: Intermediate physical address from which to remove the mapping. * @size: Size of the mapping. * * The offset of @addr within a page is ignored and @size is rounded-up to * the next page boundary. * * TLB invalidation is performed for each page-table entry cleared during the * unmapping operation and the reference count for the page-table page * containing the cleared entry is decremented, with unreferenced pages being * freed. Unmapping a cacheable page will ensure that it is clean to the PoC if * FWB is not supported by the CPU. * * Return: 0 on success, negative error code on failure. */ int kvm_pgtable_stage2_unmap(struct kvm_pgtable *pgt, u64 addr, u64 size); /** * kvm_pgtable_stage2_wrprotect() - Write-protect guest stage-2 address range * without TLB invalidation. * @pgt: Page-table structure initialised by kvm_pgtable_stage2_init*(). * @addr: Intermediate physical address from which to write-protect, * @size: Size of the range. * * The offset of @addr within a page is ignored and @size is rounded-up to * the next page boundary. * * Note that it is the caller's responsibility to invalidate the TLB after * calling this function to ensure that the updated permissions are visible * to the CPUs. * * Return: 0 on success, negative error code on failure. */ int kvm_pgtable_stage2_wrprotect(struct kvm_pgtable *pgt, u64 addr, u64 size); /** * kvm_pgtable_stage2_mkyoung() - Set the access flag in a page-table entry. * @pgt: Page-table structure initialised by kvm_pgtable_stage2_init*(). * @addr: Intermediate physical address to identify the page-table entry. * @flags: Flags to control the page-table walk (ex. a shared walk) * * The offset of @addr within a page is ignored. * * If there is a valid, leaf page-table entry used to translate @addr, then * set the access flag in that entry. */ void kvm_pgtable_stage2_mkyoung(struct kvm_pgtable *pgt, u64 addr, enum kvm_pgtable_walk_flags flags); /** * kvm_pgtable_stage2_test_clear_young() - Test and optionally clear the access * flag in a page-table entry. * @pgt: Page-table structure initialised by kvm_pgtable_stage2_init*(). * @addr: Intermediate physical address to identify the page-table entry. * @size: Size of the address range to visit. * @mkold: True if the access flag should be cleared. * * The offset of @addr within a page is ignored. * * Tests and conditionally clears the access flag for every valid, leaf * page-table entry used to translate the range [@addr, @addr + @size). * * Note that it is the caller's responsibility to invalidate the TLB after * calling this function to ensure that the updated permissions are visible * to the CPUs. * * Return: True if any of the visited PTEs had the access flag set. */ bool kvm_pgtable_stage2_test_clear_young(struct kvm_pgtable *pgt, u64 addr, u64 size, bool mkold); /** * kvm_pgtable_stage2_relax_perms() - Relax the permissions enforced by a * page-table entry. * @pgt: Page-table structure initialised by kvm_pgtable_stage2_init*(). * @addr: Intermediate physical address to identify the page-table entry. * @prot: Additional permissions to grant for the mapping. * @flags: Flags to control the page-table walk (ex. a shared walk) * * The offset of @addr within a page is ignored. * * If there is a valid, leaf page-table entry used to translate @addr, then * relax the permissions in that entry according to the read, write and * execute permissions specified by @prot. No permissions are removed, and * TLB invalidation is performed after updating the entry. Software bits cannot * be set or cleared using kvm_pgtable_stage2_relax_perms(). * * Return: 0 on success, negative error code on failure. */ int kvm_pgtable_stage2_relax_perms(struct kvm_pgtable *pgt, u64 addr, enum kvm_pgtable_prot prot, enum kvm_pgtable_walk_flags flags); /** * kvm_pgtable_stage2_flush_range() - Clean and invalidate data cache to Point * of Coherency for guest stage-2 address * range. * @pgt: Page-table structure initialised by kvm_pgtable_stage2_init*(). * @addr: Intermediate physical address from which to flush. * @size: Size of the range. * * The offset of @addr within a page is ignored and @size is rounded-up to * the next page boundary. * * Return: 0 on success, negative error code on failure. */ int kvm_pgtable_stage2_flush(struct kvm_pgtable *pgt, u64 addr, u64 size); /** * kvm_pgtable_stage2_split() - Split a range of huge pages into leaf PTEs pointing * to PAGE_SIZE guest pages. * @pgt: Page-table structure initialised by kvm_pgtable_stage2_init(). * @addr: Intermediate physical address from which to split. * @size: Size of the range. * @mc: Cache of pre-allocated and zeroed memory from which to allocate * page-table pages. * * The function tries to split any level 1 or 2 entry that overlaps * with the input range (given by @addr and @size). * * Return: 0 on success, negative error code on failure. Note that * kvm_pgtable_stage2_split() is best effort: it tries to break as many * blocks in the input range as allowed by @mc_capacity. */ int kvm_pgtable_stage2_split(struct kvm_pgtable *pgt, u64 addr, u64 size, struct kvm_mmu_memory_cache *mc); /** * kvm_pgtable_walk() - Walk a page-table. * @pgt: Page-table structure initialised by kvm_pgtable_*_init(). * @addr: Input address for the start of the walk. * @size: Size of the range to walk. * @walker: Walker callback description. * * The offset of @addr within a page is ignored and @size is rounded-up to * the next page boundary. * * The walker will walk the page-table entries corresponding to the input * address range specified, visiting entries according to the walker flags. * Invalid entries are treated as leaf entries. The visited page table entry is * reloaded after invoking the walker callback, allowing the walker to descend * into a newly installed table. * * Returning a negative error code from the walker callback function will * terminate the walk immediately with the same error code. * * Return: 0 on success, negative error code on failure. */ int kvm_pgtable_walk(struct kvm_pgtable *pgt, u64 addr, u64 size, struct kvm_pgtable_walker *walker); /** * kvm_pgtable_get_leaf() - Walk a page-table and retrieve the leaf entry * with its level. * @pgt: Page-table structure initialised by kvm_pgtable_*_init() * or a similar initialiser. * @addr: Input address for the start of the walk. * @ptep: Pointer to storage for the retrieved PTE. * @level: Pointer to storage for the level of the retrieved PTE. * * The offset of @addr within a page is ignored. * * The walker will walk the page-table entries corresponding to the input * address specified, retrieving the leaf corresponding to this address. * Invalid entries are treated as leaf entries. * * Return: 0 on success, negative error code on failure. */ int kvm_pgtable_get_leaf(struct kvm_pgtable *pgt, u64 addr, kvm_pte_t *ptep, s8 *level); /** * kvm_pgtable_stage2_pte_prot() - Retrieve the protection attributes of a * stage-2 Page-Table Entry. * @pte: Page-table entry * * Return: protection attributes of the page-table entry in the enum * kvm_pgtable_prot format. */ enum kvm_pgtable_prot kvm_pgtable_stage2_pte_prot(kvm_pte_t pte); /** * kvm_pgtable_hyp_pte_prot() - Retrieve the protection attributes of a stage-1 * Page-Table Entry. * @pte: Page-table entry * * Return: protection attributes of the page-table entry in the enum * kvm_pgtable_prot format. */ enum kvm_pgtable_prot kvm_pgtable_hyp_pte_prot(kvm_pte_t pte); /** * kvm_tlb_flush_vmid_range() - Invalidate/flush a range of TLB entries * * @mmu: Stage-2 KVM MMU struct * @addr: The base Intermediate physical address from which to invalidate * @size: Size of the range from the base to invalidate */ void kvm_tlb_flush_vmid_range(struct kvm_s2_mmu *mmu, phys_addr_t addr, size_t size); #endif /* __ARM64_KVM_PGTABLE_H__ */
11 245 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 /* SPDX-License-Identifier: GPL-2.0 */ #undef TRACE_SYSTEM #define TRACE_SYSTEM skb #if !defined(_TRACE_SKB_H) || defined(TRACE_HEADER_MULTI_READ) #define _TRACE_SKB_H #include <linux/skbuff.h> #include <linux/netdevice.h> #include <linux/tracepoint.h> #undef FN #define FN(reason) TRACE_DEFINE_ENUM(SKB_DROP_REASON_##reason); DEFINE_DROP_REASON(FN, FN) #undef FN #undef FNe #define FN(reason) { SKB_DROP_REASON_##reason, #reason }, #define FNe(reason) { SKB_DROP_REASON_##reason, #reason } /* * Tracepoint for free an sk_buff: */ TRACE_EVENT(kfree_skb, TP_PROTO(struct sk_buff *skb, void *location, enum skb_drop_reason reason, struct sock *rx_sk), TP_ARGS(skb, location, reason, rx_sk), TP_STRUCT__entry( __field(void *, skbaddr) __field(void *, location) __field(void *, rx_sk) __field(unsigned short, protocol) __field(enum skb_drop_reason, reason) ), TP_fast_assign( __entry->skbaddr = skb; __entry->location = location; __entry->rx_sk = rx_sk; __entry->protocol = ntohs(skb->protocol); __entry->reason = reason; ), TP_printk("skbaddr=%p rx_sk=%p protocol=%u location=%pS reason: %s", __entry->skbaddr, __entry->rx_sk, __entry->protocol, __entry->location, __print_symbolic(__entry->reason, DEFINE_DROP_REASON(FN, FNe))) ); #undef FN #undef FNe TRACE_EVENT(consume_skb, TP_PROTO(struct sk_buff *skb, void *location), TP_ARGS(skb, location), TP_STRUCT__entry( __field( void *, skbaddr) __field( void *, location) ), TP_fast_assign( __entry->skbaddr = skb; __entry->location = location; ), TP_printk("skbaddr=%p location=%pS", __entry->skbaddr, __entry->location) ); TRACE_EVENT(skb_copy_datagram_iovec, TP_PROTO(const struct sk_buff *skb, int len), TP_ARGS(skb, len), TP_STRUCT__entry( __field( const void *, skbaddr ) __field( int, len ) ), TP_fast_assign( __entry->skbaddr = skb; __entry->len = len; ), TP_printk("skbaddr=%p len=%d", __entry->skbaddr, __entry->len) ); #endif /* _TRACE_SKB_H */ /* This part must be outside protection */ #include <trace/define_trace.h>
205 700 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 /* SPDX-License-Identifier: GPL-2.0-only */ /* * Security-Enhanced Linux (SELinux) security module * * This file contains the SELinux security data structures for kernel objects. * * Author(s): Stephen Smalley, <stephen.smalley.work@gmail.com> * Chris Vance, <cvance@nai.com> * Wayne Salamon, <wsalamon@nai.com> * James Morris <jmorris@redhat.com> * * Copyright (C) 2001,2002 Networks Associates Technology, Inc. * Copyright (C) 2003 Red Hat, Inc., James Morris <jmorris@redhat.com> * Copyright (C) 2016 Mellanox Technologies */ #ifndef _SELINUX_OBJSEC_H_ #define _SELINUX_OBJSEC_H_ #include <linux/list.h> #include <linux/sched.h> #include <linux/fs.h> #include <linux/binfmts.h> #include <linux/in.h> #include <linux/spinlock.h> #include <linux/lsm_hooks.h> #include <linux/msg.h> #include <net/net_namespace.h> #include "flask.h" #include "avc.h" struct avdc_entry { u32 isid; /* inode SID */ u32 allowed; /* allowed permission bitmask */ u32 audited; /* audited permission bitmask */ bool permissive; /* AVC permissive flag */ }; struct task_security_struct { u32 osid; /* SID prior to last execve */ u32 sid; /* current SID */ u32 exec_sid; /* exec SID */ u32 create_sid; /* fscreate SID */ u32 keycreate_sid; /* keycreate SID */ u32 sockcreate_sid; /* fscreate SID */ #define TSEC_AVDC_DIR_SIZE (1 << 2) struct { u32 sid; /* current SID for cached entries */ u32 seqno; /* AVC sequence number */ unsigned int dir_spot; /* dir cache index to check first */ struct avdc_entry dir[TSEC_AVDC_DIR_SIZE]; /* dir entries */ bool permissive_neveraudit; /* permissive and neveraudit */ } avdcache; } __randomize_layout; static inline bool task_avdcache_permnoaudit(struct task_security_struct *tsec) { return (tsec->avdcache.permissive_neveraudit && tsec->sid == tsec->avdcache.sid && tsec->avdcache.seqno == avc_policy_seqno()); } enum label_initialized { LABEL_INVALID, /* invalid or not initialized */ LABEL_INITIALIZED, /* initialized */ LABEL_PENDING }; struct inode_security_struct { struct inode *inode; /* back pointer to inode object */ struct list_head list; /* list of inode_security_struct */ u32 task_sid; /* SID of creating task */ u32 sid; /* SID of this object */ u16 sclass; /* security class of this object */ unsigned char initialized; /* initialization flag */ spinlock_t lock; }; struct file_security_struct { u32 sid; /* SID of open file description */ u32 fown_sid; /* SID of file owner (for SIGIO) */ u32 isid; /* SID of inode at the time of file open */ u32 pseqno; /* Policy seqno at the time of file open */ }; struct superblock_security_struct { u32 sid; /* SID of file system superblock */ u32 def_sid; /* default SID for labeling */ u32 mntpoint_sid; /* SECURITY_FS_USE_MNTPOINT context for files */ unsigned short behavior; /* labeling behavior */ unsigned short flags; /* which mount options were specified */ struct mutex lock; struct list_head isec_head; spinlock_t isec_lock; }; struct msg_security_struct { u32 sid; /* SID of message */ }; struct ipc_security_struct { u16 sclass; /* security class of this object */ u32 sid; /* SID of IPC resource */ }; struct netif_security_struct { const struct net *ns; /* network namespace */ int ifindex; /* device index */ u32 sid; /* SID for this interface */ }; struct netnode_security_struct { union { __be32 ipv4; /* IPv4 node address */ struct in6_addr ipv6; /* IPv6 node address */ } addr; u32 sid; /* SID for this node */ u16 family; /* address family */ }; struct netport_security_struct { u32 sid; /* SID for this node */ u16 port; /* port number */ u8 protocol; /* transport protocol */ }; struct sk_security_struct { #ifdef CONFIG_NETLABEL enum { /* NetLabel state */ NLBL_UNSET = 0, NLBL_REQUIRE, NLBL_LABELED, NLBL_REQSKB, NLBL_CONNLABELED, } nlbl_state; struct netlbl_lsm_secattr *nlbl_secattr; /* NetLabel sec attributes */ #endif u32 sid; /* SID of this object */ u32 peer_sid; /* SID of peer */ u16 sclass; /* sock security class */ enum { /* SCTP association state */ SCTP_ASSOC_UNSET = 0, SCTP_ASSOC_SET, } sctp_assoc_state; }; struct tun_security_struct { u32 sid; /* SID for the tun device sockets */ }; struct key_security_struct { u32 sid; /* SID of key */ }; struct ib_security_struct { u32 sid; /* SID of the queue pair or MAD agent */ }; struct pkey_security_struct { u64 subnet_prefix; /* Port subnet prefix */ u16 pkey; /* PKey number */ u32 sid; /* SID of pkey */ }; struct bpf_security_struct { u32 sid; /* SID of bpf obj creator */ }; struct perf_event_security_struct { u32 sid; /* SID of perf_event obj creator */ }; extern struct lsm_blob_sizes selinux_blob_sizes; static inline struct task_security_struct *selinux_cred(const struct cred *cred) { return cred->security + selinux_blob_sizes.lbs_cred; } static inline struct file_security_struct *selinux_file(const struct file *file) { return file->f_security + selinux_blob_sizes.lbs_file; } static inline struct inode_security_struct * selinux_inode(const struct inode *inode) { if (unlikely(!inode->i_security)) return NULL; return inode->i_security + selinux_blob_sizes.lbs_inode; } static inline struct msg_security_struct * selinux_msg_msg(const struct msg_msg *msg_msg) { return msg_msg->security + selinux_blob_sizes.lbs_msg_msg; } static inline struct ipc_security_struct * selinux_ipc(const struct kern_ipc_perm *ipc) { return ipc->security + selinux_blob_sizes.lbs_ipc; } /* * get the subjective security ID of the current task */ static inline u32 current_sid(void) { const struct task_security_struct *tsec = selinux_cred(current_cred()); return tsec->sid; } static inline struct superblock_security_struct * selinux_superblock(const struct super_block *superblock) { return superblock->s_security + selinux_blob_sizes.lbs_superblock; } #ifdef CONFIG_KEYS static inline struct key_security_struct *selinux_key(const struct key *key) { return key->security + selinux_blob_sizes.lbs_key; } #endif /* CONFIG_KEYS */ static inline struct sk_security_struct *selinux_sock(const struct sock *sock) { return sock->sk_security + selinux_blob_sizes.lbs_sock; } static inline struct tun_security_struct *selinux_tun_dev(void *security) { return security + selinux_blob_sizes.lbs_tun_dev; } static inline struct ib_security_struct *selinux_ib(void *ib_sec) { return ib_sec + selinux_blob_sizes.lbs_ib; } static inline struct perf_event_security_struct * selinux_perf_event(void *perf_event) { return perf_event + selinux_blob_sizes.lbs_perf_event; } #endif /* _SELINUX_OBJSEC_H_ */
451 449 452 448 452 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 /* SPDX-License-Identifier: GPL-2.0 */ #ifndef _LINUX_LOCAL_LOCK_H # error "Do not include directly, include linux/local_lock.h" #endif #include <linux/percpu-defs.h> #include <linux/lockdep.h> #ifndef CONFIG_PREEMPT_RT typedef struct { #ifdef CONFIG_DEBUG_LOCK_ALLOC struct lockdep_map dep_map; struct task_struct *owner; #endif } local_lock_t; /* local_trylock() and local_trylock_irqsave() only work with local_trylock_t */ typedef struct { local_lock_t llock; u8 acquired; } local_trylock_t; #ifdef CONFIG_DEBUG_LOCK_ALLOC # define LOCAL_LOCK_DEBUG_INIT(lockname) \ .dep_map = { \ .name = #lockname, \ .wait_type_inner = LD_WAIT_CONFIG, \ .lock_type = LD_LOCK_PERCPU, \ }, \ .owner = NULL, # define LOCAL_TRYLOCK_DEBUG_INIT(lockname) \ .llock = { LOCAL_LOCK_DEBUG_INIT((lockname).llock) }, static inline void local_lock_acquire(local_lock_t *l) { lock_map_acquire(&l->dep_map); DEBUG_LOCKS_WARN_ON(l->owner); l->owner = current; } static inline void local_trylock_acquire(local_lock_t *l) { lock_map_acquire_try(&l->dep_map); DEBUG_LOCKS_WARN_ON(l->owner); l->owner = current; } static inline void local_lock_release(local_lock_t *l) { DEBUG_LOCKS_WARN_ON(l->owner != current); l->owner = NULL; lock_map_release(&l->dep_map); } static inline void local_lock_debug_init(local_lock_t *l) { l->owner = NULL; } #else /* CONFIG_DEBUG_LOCK_ALLOC */ # define LOCAL_LOCK_DEBUG_INIT(lockname) # define LOCAL_TRYLOCK_DEBUG_INIT(lockname) static inline void local_lock_acquire(local_lock_t *l) { } static inline void local_trylock_acquire(local_lock_t *l) { } static inline void local_lock_release(local_lock_t *l) { } static inline void local_lock_debug_init(local_lock_t *l) { } #endif /* !CONFIG_DEBUG_LOCK_ALLOC */ #define INIT_LOCAL_LOCK(lockname) { LOCAL_LOCK_DEBUG_INIT(lockname) } #define INIT_LOCAL_TRYLOCK(lockname) { LOCAL_TRYLOCK_DEBUG_INIT(lockname) } #define __local_lock_init(lock) \ do { \ static struct lock_class_key __key; \ \ debug_check_no_locks_freed((void *)lock, sizeof(*lock));\ lockdep_init_map_type(&(lock)->dep_map, #lock, &__key, \ 0, LD_WAIT_CONFIG, LD_WAIT_INV, \ LD_LOCK_PERCPU); \ local_lock_debug_init(lock); \ } while (0) #define __local_trylock_init(lock) __local_lock_init(lock.llock) #define __spinlock_nested_bh_init(lock) \ do { \ static struct lock_class_key __key; \ \ debug_check_no_locks_freed((void *)lock, sizeof(*lock));\ lockdep_init_map_type(&(lock)->dep_map, #lock, &__key, \ 0, LD_WAIT_CONFIG, LD_WAIT_INV, \ LD_LOCK_NORMAL); \ local_lock_debug_init(lock); \ } while (0) #define __local_lock_acquire(lock) \ do { \ local_trylock_t *tl; \ local_lock_t *l; \ \ l = (local_lock_t *)(lock); \ tl = (local_trylock_t *)l; \ _Generic((lock), \ local_trylock_t *: ({ \ lockdep_assert(tl->acquired == 0); \ WRITE_ONCE(tl->acquired, 1); \ }), \ local_lock_t *: (void)0); \ local_lock_acquire(l); \ } while (0) #define __local_lock(lock) \ do { \ preempt_disable(); \ __local_lock_acquire(lock); \ } while (0) #define __local_lock_irq(lock) \ do { \ local_irq_disable(); \ __local_lock_acquire(lock); \ } while (0) #define __local_lock_irqsave(lock, flags) \ do { \ local_irq_save(flags); \ __local_lock_acquire(lock); \ } while (0) #define __local_trylock(lock) \ ({ \ local_trylock_t *tl; \ \ preempt_disable(); \ tl = (lock); \ if (READ_ONCE(tl->acquired)) { \ preempt_enable(); \ tl = NULL; \ } else { \ WRITE_ONCE(tl->acquired, 1); \ local_trylock_acquire( \ (local_lock_t *)tl); \ } \ !!tl; \ }) #define __local_trylock_irqsave(lock, flags) \ ({ \ local_trylock_t *tl; \ \ local_irq_save(flags); \ tl = (lock); \ if (READ_ONCE(tl->acquired)) { \ local_irq_restore(flags); \ tl = NULL; \ } else { \ WRITE_ONCE(tl->acquired, 1); \ local_trylock_acquire( \ (local_lock_t *)tl); \ } \ !!tl; \ }) #define __local_lock_release(lock) \ do { \ local_trylock_t *tl; \ local_lock_t *l; \ \ l = (local_lock_t *)(lock); \ tl = (local_trylock_t *)l; \ local_lock_release(l); \ _Generic((lock), \ local_trylock_t *: ({ \ lockdep_assert(tl->acquired == 1); \ WRITE_ONCE(tl->acquired, 0); \ }), \ local_lock_t *: (void)0); \ } while (0) #define __local_unlock(lock) \ do { \ __local_lock_release(lock); \ preempt_enable(); \ } while (0) #define __local_unlock_irq(lock) \ do { \ __local_lock_release(lock); \ local_irq_enable(); \ } while (0) #define __local_unlock_irqrestore(lock, flags) \ do { \ __local_lock_release(lock); \ local_irq_restore(flags); \ } while (0) #define __local_lock_nested_bh(lock) \ do { \ lockdep_assert_in_softirq(); \ local_lock_acquire((lock)); \ } while (0) #define __local_unlock_nested_bh(lock) \ local_lock_release((lock)) #else /* !CONFIG_PREEMPT_RT */ /* * On PREEMPT_RT local_lock maps to a per CPU spinlock, which protects the * critical section while staying preemptible. */ typedef spinlock_t local_lock_t; typedef spinlock_t local_trylock_t; #define INIT_LOCAL_LOCK(lockname) __LOCAL_SPIN_LOCK_UNLOCKED((lockname)) #define INIT_LOCAL_TRYLOCK(lockname) __LOCAL_SPIN_LOCK_UNLOCKED((lockname)) #define __local_lock_init(l) \ do { \ local_spin_lock_init((l)); \ } while (0) #define __local_trylock_init(l) __local_lock_init(l) #define __local_lock(__lock) \ do { \ migrate_disable(); \ spin_lock((__lock)); \ } while (0) #define __local_lock_irq(lock) __local_lock(lock) #define __local_lock_irqsave(lock, flags) \ do { \ typecheck(unsigned long, flags); \ flags = 0; \ __local_lock(lock); \ } while (0) #define __local_unlock(__lock) \ do { \ spin_unlock((__lock)); \ migrate_enable(); \ } while (0) #define __local_unlock_irq(lock) __local_unlock(lock) #define __local_unlock_irqrestore(lock, flags) __local_unlock(lock) #define __local_lock_nested_bh(lock) \ do { \ lockdep_assert_in_softirq_func(); \ spin_lock((lock)); \ } while (0) #define __local_unlock_nested_bh(lock) \ do { \ spin_unlock((lock)); \ } while (0) #define __local_trylock(lock) \ ({ \ int __locked; \ \ if (in_nmi() | in_hardirq()) { \ __locked = 0; \ } else { \ migrate_disable(); \ __locked = spin_trylock((lock)); \ if (!__locked) \ migrate_enable(); \ } \ __locked; \ }) #define __local_trylock_irqsave(lock, flags) \ ({ \ typecheck(unsigned long, flags); \ flags = 0; \ __local_trylock(lock); \ }) #endif /* CONFIG_PREEMPT_RT */
211 1 2 3 4 5 6 7 8 9 10 11 12 // SPDX-License-Identifier: GPL-2.0-only /* * Copyright (C) 2012-2015 - ARM Ltd * Author: Marc Zyngier <marc.zyngier@arm.com> */ #include <asm/kvm_hyp.h> void __kvm_timer_set_cntvoff(u64 cntvoff) { write_sysreg(cntvoff, cntvoff_el2); }
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 // SPDX-License-Identifier: GPL-2.0 /* * kobject.h - generic kernel object infrastructure. * * Copyright (c) 2002-2003 Patrick Mochel * Copyright (c) 2002-2003 Open Source Development Labs * Copyright (c) 2006-2008 Greg Kroah-Hartman <greg@kroah.com> * Copyright (c) 2006-2008 Novell Inc. * * Please read Documentation/core-api/kobject.rst before using the kobject * interface, ESPECIALLY the parts about reference counts and object * destructors. */ #ifndef _KOBJECT_H_ #define _KOBJECT_H_ #include <linux/types.h> #include <linux/list.h> #include <linux/sysfs.h> #include <linux/compiler.h> #include <linux/container_of.h> #include <linux/spinlock.h> #include <linux/kref.h> #include <linux/kobject_ns.h> #include <linux/wait.h> #include <linux/atomic.h> #include <linux/workqueue.h> #include <linux/uidgid.h> #define UEVENT_HELPER_PATH_LEN 256 #define UEVENT_NUM_ENVP 64 /* number of env pointers */ #define UEVENT_BUFFER_SIZE 2048 /* buffer for the variables */ #ifdef CONFIG_UEVENT_HELPER /* path to the userspace helper executed on an event */ extern char uevent_helper[]; #endif /* counter to tag the uevent, read only except for the kobject core */ extern atomic64_t uevent_seqnum; /* * The actions here must match the index to the string array * in lib/kobject_uevent.c * * Do not add new actions here without checking with the driver-core * maintainers. Action strings are not meant to express subsystem * or device specific properties. In most cases you want to send a * kobject_uevent_env(kobj, KOBJ_CHANGE, env) with additional event * specific variables added to the event environment. */ enum kobject_action { KOBJ_ADD, KOBJ_REMOVE, KOBJ_CHANGE, KOBJ_MOVE, KOBJ_ONLINE, KOBJ_OFFLINE, KOBJ_BIND, KOBJ_UNBIND, }; struct kobject { const char *name; struct list_head entry; struct kobject *parent; struct kset *kset; const struct kobj_type *ktype; struct kernfs_node *sd; /* sysfs directory entry */ struct kref kref; unsigned int state_initialized:1; unsigned int state_in_sysfs:1; unsigned int state_add_uevent_sent:1; unsigned int state_remove_uevent_sent:1; unsigned int uevent_suppress:1; #ifdef CONFIG_DEBUG_KOBJECT_RELEASE struct delayed_work release; #endif }; __printf(2, 3) int kobject_set_name(struct kobject *kobj, const char *name, ...); __printf(2, 0) int kobject_set_name_vargs(struct kobject *kobj, const char *fmt, va_list vargs); static inline const char *kobject_name(const struct kobject *kobj) { return kobj->name; } void kobject_init(struct kobject *kobj, const struct kobj_type *ktype); __printf(3, 4) __must_check int kobject_add(struct kobject *kobj, struct kobject *parent, const char *fmt, ...); __printf(4, 5) __must_check int kobject_init_and_add(struct kobject *kobj, const struct kobj_type *ktype, struct kobject *parent, const char *fmt, ...); void kobject_del(struct kobject *kobj); struct kobject * __must_check kobject_create_and_add(const char *name, struct kobject *parent); int __must_check kobject_rename(struct kobject *, const char *new_name); int __must_check kobject_move(struct kobject *, struct kobject *); struct kobject *kobject_get(struct kobject *kobj); struct kobject * __must_check kobject_get_unless_zero(struct kobject *kobj); void kobject_put(struct kobject *kobj); const void *kobject_namespace(const struct kobject *kobj); void kobject_get_ownership(const struct kobject *kobj, kuid_t *uid, kgid_t *gid); char *kobject_get_path(const struct kobject *kobj, gfp_t flag); struct kobj_type { void (*release)(struct kobject *kobj); const struct sysfs_ops *sysfs_ops; const struct attribute_group **default_groups; const struct kobj_ns_type_operations *(*child_ns_type)(const struct kobject *kobj); const void *(*namespace)(const struct kobject *kobj); void (*get_ownership)(const struct kobject *kobj, kuid_t *uid, kgid_t *gid); }; struct kobj_uevent_env { char *argv[3]; char *envp[UEVENT_NUM_ENVP]; int envp_idx; char buf[UEVENT_BUFFER_SIZE]; int buflen; }; struct kset_uevent_ops { int (* const filter)(const struct kobject *kobj); const char *(* const name)(const struct kobject *kobj); int (* const uevent)(const struct kobject *kobj, struct kobj_uevent_env *env); }; struct kobj_attribute { struct attribute attr; ssize_t (*show)(struct kobject *kobj, struct kobj_attribute *attr, char *buf); ssize_t (*store)(struct kobject *kobj, struct kobj_attribute *attr, const char *buf, size_t count); }; extern const struct sysfs_ops kobj_sysfs_ops; struct sock; /** * struct kset - a set of kobjects of a specific type, belonging to a specific subsystem. * * A kset defines a group of kobjects. They can be individually * different "types" but overall these kobjects all want to be grouped * together and operated on in the same manner. ksets are used to * define the attribute callbacks and other common events that happen to * a kobject. * * @list: the list of all kobjects for this kset * @list_lock: a lock for iterating over the kobjects * @kobj: the embedded kobject for this kset (recursion, isn't it fun...) * @uevent_ops: the set of uevent operations for this kset. These are * called whenever a kobject has something happen to it so that the kset * can add new environment variables, or filter out the uevents if so * desired. */ struct kset { struct list_head list; spinlock_t list_lock; struct kobject kobj; const struct kset_uevent_ops *uevent_ops; } __randomize_layout; void kset_init(struct kset *kset); int __must_check kset_register(struct kset *kset); void kset_unregister(struct kset *kset); struct kset * __must_check kset_create_and_add(const char *name, const struct kset_uevent_ops *u, struct kobject *parent_kobj); static inline struct kset *to_kset(struct kobject *kobj) { return kobj ? container_of(kobj, struct kset, kobj) : NULL; } static inline struct kset *kset_get(struct kset *k) { return k ? to_kset(kobject_get(&k->kobj)) : NULL; } static inline void kset_put(struct kset *k) { kobject_put(&k->kobj); } static inline const struct kobj_type *get_ktype(const struct kobject *kobj) { return kobj->ktype; } struct kobject *kset_find_obj(struct kset *, const char *); /* The global /sys/kernel/ kobject for people to chain off of */ extern struct kobject *kernel_kobj; /* The global /sys/kernel/mm/ kobject for people to chain off of */ extern struct kobject *mm_kobj; /* The global /sys/hypervisor/ kobject for people to chain off of */ extern struct kobject *hypervisor_kobj; /* The global /sys/power/ kobject for people to chain off of */ extern struct kobject *power_kobj; /* The global /sys/firmware/ kobject for people to chain off of */ extern struct kobject *firmware_kobj; int kobject_uevent(struct kobject *kobj, enum kobject_action action); int kobject_uevent_env(struct kobject *kobj, enum kobject_action action, char *envp[]); int kobject_synth_uevent(struct kobject *kobj, const char *buf, size_t count); __printf(2, 3) int add_uevent_var(struct kobj_uevent_env *env, const char *format, ...); #endif /* _KOBJECT_H_ */
84 193 193 193 192 200 200 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 // SPDX-License-Identifier: GPL-2.0-or-later /* * Copyright (C) 2008 Red Hat, Inc., Eric Paris <eparis@redhat.com> */ #include <linux/dcache.h> #include <linux/fs.h> #include <linux/gfp.h> #include <linux/init.h> #include <linux/module.h> #include <linux/mount.h> #include <linux/srcu.h> #include <linux/fsnotify_backend.h> #include "fsnotify.h" /* * Clear all of the marks on an inode when it is being evicted from core */ void __fsnotify_inode_delete(struct inode *inode) { fsnotify_clear_marks_by_inode(inode); } EXPORT_SYMBOL_GPL(__fsnotify_inode_delete); void __fsnotify_vfsmount_delete(struct vfsmount *mnt) { fsnotify_clear_marks_by_mount(mnt); } void __fsnotify_mntns_delete(struct mnt_namespace *mntns) { fsnotify_clear_marks_by_mntns(mntns); } /** * fsnotify_unmount_inodes - an sb is unmounting. handle any watched inodes. * @sb: superblock being unmounted. * * Called during unmount with no locks held, so needs to be safe against * concurrent modifiers. We temporarily drop sb->s_inode_list_lock and CAN block. */ static void fsnotify_unmount_inodes(struct super_block *sb) { struct inode *inode, *iput_inode = NULL; spin_lock(&sb->s_inode_list_lock); list_for_each_entry(inode, &sb->s_inodes, i_sb_list) { /* * We cannot __iget() an inode in state I_FREEING, * I_WILL_FREE, or I_NEW which is fine because by that point * the inode cannot have any associated watches. */ spin_lock(&inode->i_lock); if (inode->i_state & (I_FREEING|I_WILL_FREE|I_NEW)) { spin_unlock(&inode->i_lock); continue; } /* * If i_count is zero, the inode cannot have any watches and * doing an __iget/iput with SB_ACTIVE clear would actually * evict all inodes with zero i_count from icache which is * unnecessarily violent and may in fact be illegal to do. * However, we should have been called /after/ evict_inodes * removed all zero refcount inodes, in any case. Test to * be sure. */ if (!atomic_read(&inode->i_count)) { spin_unlock(&inode->i_lock); continue; } __iget(inode); spin_unlock(&inode->i_lock); spin_unlock(&sb->s_inode_list_lock); iput(iput_inode); /* for each watch, send FS_UNMOUNT and then remove it */ fsnotify_inode(inode, FS_UNMOUNT); fsnotify_inode_delete(inode); iput_inode = inode; cond_resched(); spin_lock(&sb->s_inode_list_lock); } spin_unlock(&sb->s_inode_list_lock); iput(iput_inode); } void fsnotify_sb_delete(struct super_block *sb) { struct fsnotify_sb_info *sbinfo = fsnotify_sb_info(sb); /* Were any marks ever added to any object on this sb? */ if (!sbinfo) return; fsnotify_unmount_inodes(sb); fsnotify_clear_marks_by_sb(sb); /* Wait for outstanding object references from connectors */ wait_var_event(fsnotify_sb_watched_objects(sb), !atomic_long_read(fsnotify_sb_watched_objects(sb))); WARN_ON(fsnotify_sb_has_priority_watchers(sb, FSNOTIFY_PRIO_CONTENT)); WARN_ON(fsnotify_sb_has_priority_watchers(sb, FSNOTIFY_PRIO_PRE_CONTENT)); } void fsnotify_sb_free(struct super_block *sb) { kfree(sb->s_fsnotify_info); } /* * Given an inode, first check if we care what happens to our children. Inotify * and dnotify both tell their parents about events. If we care about any event * on a child we run all of our children and set a dentry flag saying that the * parent cares. Thus when an event happens on a child it can quickly tell * if there is a need to find a parent and send the event to the parent. */ void fsnotify_set_children_dentry_flags(struct inode *inode) { struct dentry *alias; if (!S_ISDIR(inode->i_mode)) return; spin_lock(&inode->i_lock); /* run all of the dentries associated with this inode. Since this is a * directory, there damn well better only be one item on this list */ hlist_for_each_entry(alias, &inode->i_dentry, d_u.d_alias) { struct dentry *child; /* run all of the children of the original inode and fix their * d_flags to indicate parental interest (their parent is the * original inode) */ spin_lock(&alias->d_lock); hlist_for_each_entry(child, &alias->d_children, d_sib) { if (!child->d_inode) continue; spin_lock_nested(&child->d_lock, DENTRY_D_LOCK_NESTED); child->d_flags |= DCACHE_FSNOTIFY_PARENT_WATCHED; spin_unlock(&child->d_lock); } spin_unlock(&alias->d_lock); } spin_unlock(&inode->i_lock); } /* * Lazily clear false positive PARENT_WATCHED flag for child whose parent had * stopped watching children. */ static void fsnotify_clear_child_dentry_flag(struct inode *pinode, struct dentry *dentry) { spin_lock(&dentry->d_lock); /* * d_lock is a sufficient barrier to prevent observing a non-watched * parent state from before the fsnotify_set_children_dentry_flags() * or fsnotify_update_flags() call that had set PARENT_WATCHED. */ if (!fsnotify_inode_watches_children(pinode)) dentry->d_flags &= ~DCACHE_FSNOTIFY_PARENT_WATCHED; spin_unlock(&dentry->d_lock); } /* Are inode/sb/mount interested in parent and name info with this event? */ static bool fsnotify_event_needs_parent(struct inode *inode, __u32 mnt_mask, __u32 mask) { __u32 marks_mask = 0; /* We only send parent/name to inode/sb/mount for events on non-dir */ if (mask & FS_ISDIR) return false; /* * All events that are possible on child can also may be reported with * parent/name info to inode/sb/mount. Otherwise, a watching parent * could result in events reported with unexpected name info to sb/mount. */ BUILD_BUG_ON(FS_EVENTS_POSS_ON_CHILD & ~FS_EVENTS_POSS_TO_PARENT); /* Did either inode/sb/mount subscribe for events with parent/name? */ marks_mask |= fsnotify_parent_needed_mask( READ_ONCE(inode->i_fsnotify_mask)); marks_mask |= fsnotify_parent_needed_mask( READ_ONCE(inode->i_sb->s_fsnotify_mask)); marks_mask |= fsnotify_parent_needed_mask(mnt_mask); /* Did they subscribe for this event with parent/name info? */ return mask & marks_mask; } /* Are there any inode/mount/sb objects that watch for these events? */ static inline __u32 fsnotify_object_watched(struct inode *inode, __u32 mnt_mask, __u32 mask) { __u32 marks_mask = READ_ONCE(inode->i_fsnotify_mask) | mnt_mask | READ_ONCE(inode->i_sb->s_fsnotify_mask); return mask & marks_mask & ALL_FSNOTIFY_EVENTS; } /* Report pre-content event with optional range info */ int fsnotify_pre_content(const struct path *path, const loff_t *ppos, size_t count) { struct file_range range; /* Report page aligned range only when pos is known */ if (!ppos) return fsnotify_path(path, FS_PRE_ACCESS); range.path = path; range.pos = PAGE_ALIGN_DOWN(*ppos); range.count = PAGE_ALIGN(*ppos + count) - range.pos; return fsnotify_parent(path->dentry, FS_PRE_ACCESS, &range, FSNOTIFY_EVENT_FILE_RANGE); } /* * Notify this dentry's parent about a child's events with child name info * if parent is watching or if inode/sb/mount are interested in events with * parent and name info. * * Notify only the child without name info if parent is not watching and * inode/sb/mount are not interested in events with parent and name info. */ int __fsnotify_parent(struct dentry *dentry, __u32 mask, const void *data, int data_type) { const struct path *path = fsnotify_data_path(data, data_type); __u32 mnt_mask = path ? READ_ONCE(real_mount(path->mnt)->mnt_fsnotify_mask) : 0; struct inode *inode = d_inode(dentry); struct dentry *parent; bool parent_watched = dentry->d_flags & DCACHE_FSNOTIFY_PARENT_WATCHED; bool parent_needed, parent_interested; __u32 p_mask; struct inode *p_inode = NULL; struct name_snapshot name; struct qstr *file_name = NULL; int ret = 0; /* Optimize the likely case of nobody watching this path */ if (likely(!parent_watched && !fsnotify_object_watched(inode, mnt_mask, mask))) return 0; parent = NULL; parent_needed = fsnotify_event_needs_parent(inode, mnt_mask, mask); if (!parent_watched && !parent_needed) goto notify; /* Does parent inode care about events on children? */ parent = dget_parent(dentry); p_inode = parent->d_inode; p_mask = fsnotify_inode_watches_children(p_inode); if (unlikely(parent_watched && !p_mask)) fsnotify_clear_child_dentry_flag(p_inode, dentry); /* * Include parent/name in notification either if some notification * groups require parent info or the parent is interested in this event. */ parent_interested = mask & p_mask & ALL_FSNOTIFY_EVENTS; if (parent_needed || parent_interested) { /* When notifying parent, child should be passed as data */ WARN_ON_ONCE(inode != fsnotify_data_inode(data, data_type)); /* Notify both parent and child with child name info */ take_dentry_name_snapshot(&name, dentry); file_name = &name.name; if (parent_interested) mask |= FS_EVENT_ON_CHILD; } notify: ret = fsnotify(mask, data, data_type, p_inode, file_name, inode, 0); if (file_name) release_dentry_name_snapshot(&name); dput(parent); return ret; } EXPORT_SYMBOL_GPL(__fsnotify_parent); static int fsnotify_handle_inode_event(struct fsnotify_group *group, struct fsnotify_mark *inode_mark, u32 mask, const void *data, int data_type, struct inode *dir, const struct qstr *name, u32 cookie) { const struct path *path = fsnotify_data_path(data, data_type); struct inode *inode = fsnotify_data_inode(data, data_type); const struct fsnotify_ops *ops = group->ops; if (WARN_ON_ONCE(!ops->handle_inode_event)) return 0; if (WARN_ON_ONCE(!inode && !dir)) return 0; if ((inode_mark->flags & FSNOTIFY_MARK_FLAG_EXCL_UNLINK) && path && d_unlinked(path->dentry)) return 0; /* Check interest of this mark in case event was sent with two marks */ if (!(mask & inode_mark->mask & ALL_FSNOTIFY_EVENTS)) return 0; return ops->handle_inode_event(inode_mark, mask, inode, dir, name, cookie); } static int fsnotify_handle_event(struct fsnotify_group *group, __u32 mask, const void *data, int data_type, struct inode *dir, const struct qstr *name, u32 cookie, struct fsnotify_iter_info *iter_info) { struct fsnotify_mark *inode_mark = fsnotify_iter_inode_mark(iter_info); struct fsnotify_mark *parent_mark = fsnotify_iter_parent_mark(iter_info); int ret; if (WARN_ON_ONCE(fsnotify_iter_sb_mark(iter_info)) || WARN_ON_ONCE(fsnotify_iter_vfsmount_mark(iter_info))) return 0; /* * For FS_RENAME, 'dir' is old dir and 'data' is new dentry. * The only ->handle_inode_event() backend that supports FS_RENAME is * dnotify, where it means file was renamed within same parent. */ if (mask & FS_RENAME) { struct dentry *moved = fsnotify_data_dentry(data, data_type); if (dir != moved->d_parent->d_inode) return 0; } if (parent_mark) { ret = fsnotify_handle_inode_event(group, parent_mark, mask, data, data_type, dir, name, 0); if (ret) return ret; } if (!inode_mark) return 0; /* * Some events can be sent on both parent dir and child marks (e.g. * FS_ATTRIB). If both parent dir and child are watching, report the * event once to parent dir with name (if interested) and once to child * without name (if interested). * * In any case regardless whether the parent is watching or not, the * child watcher is expecting an event without the FS_EVENT_ON_CHILD * flag. The file name is expected if and only if this is a directory * event. */ mask &= ~FS_EVENT_ON_CHILD; if (!(mask & ALL_FSNOTIFY_DIRENT_EVENTS)) { dir = NULL; name = NULL; } return fsnotify_handle_inode_event(group, inode_mark, mask, data, data_type, dir, name, cookie); } static int send_to_group(__u32 mask, const void *data, int data_type, struct inode *dir, const struct qstr *file_name, u32 cookie, struct fsnotify_iter_info *iter_info) { struct fsnotify_group *group = NULL; __u32 test_mask = (mask & ALL_FSNOTIFY_EVENTS); __u32 marks_mask = 0; __u32 marks_ignore_mask = 0; bool is_dir = mask & FS_ISDIR; struct fsnotify_mark *mark; int type; if (!iter_info->report_mask) return 0; /* clear ignored on inode modification */ if (mask & FS_MODIFY) { fsnotify_foreach_iter_mark_type(iter_info, mark, type) { if (!(mark->flags & FSNOTIFY_MARK_FLAG_IGNORED_SURV_MODIFY)) mark->ignore_mask = 0; } } /* Are any of the group marks interested in this event? */ fsnotify_foreach_iter_mark_type(iter_info, mark, type) { group = mark->group; marks_mask |= mark->mask; marks_ignore_mask |= fsnotify_effective_ignore_mask(mark, is_dir, type); } pr_debug("%s: group=%p mask=%x marks_mask=%x marks_ignore_mask=%x data=%p data_type=%d dir=%p cookie=%d\n", __func__, group, mask, marks_mask, marks_ignore_mask, data, data_type, dir, cookie); if (!(test_mask & marks_mask & ~marks_ignore_mask)) return 0; if (group->ops->handle_event) { return group->ops->handle_event(group, mask, data, data_type, dir, file_name, cookie, iter_info); } return fsnotify_handle_event(group, mask, data, data_type, dir, file_name, cookie, iter_info); } static struct fsnotify_mark *fsnotify_first_mark(struct fsnotify_mark_connector *const *connp) { struct fsnotify_mark_connector *conn; struct hlist_node *node = NULL; conn = srcu_dereference(*connp, &fsnotify_mark_srcu); if (conn) node = srcu_dereference(conn->list.first, &fsnotify_mark_srcu); return hlist_entry_safe(node, struct fsnotify_mark, obj_list); } static struct fsnotify_mark *fsnotify_next_mark(struct fsnotify_mark *mark) { struct hlist_node *node = NULL; if (mark) node = srcu_dereference(mark->obj_list.next, &fsnotify_mark_srcu); return hlist_entry_safe(node, struct fsnotify_mark, obj_list); } /* * iter_info is a multi head priority queue of marks. * Pick a subset of marks from queue heads, all with the same group * and set the report_mask to a subset of the selected marks. * Returns false if there are no more groups to iterate. */ static bool fsnotify_iter_select_report_types( struct fsnotify_iter_info *iter_info) { struct fsnotify_group *max_prio_group = NULL; struct fsnotify_mark *mark; int type; /* Choose max prio group among groups of all queue heads */ fsnotify_foreach_iter_type(type) { mark = iter_info->marks[type]; if (mark && fsnotify_compare_groups(max_prio_group, mark->group) > 0) max_prio_group = mark->group; } if (!max_prio_group) return false; /* Set the report mask for marks from same group as max prio group */ iter_info->current_group = max_prio_group; iter_info->report_mask = 0; fsnotify_foreach_iter_type(type) { mark = iter_info->marks[type]; if (mark && mark->group == iter_info->current_group) { /* * FSNOTIFY_ITER_TYPE_PARENT indicates that this inode * is watching children and interested in this event, * which is an event possible on child. * But is *this mark* watching children? */ if (type == FSNOTIFY_ITER_TYPE_PARENT && !(mark->mask & FS_EVENT_ON_CHILD) && !(fsnotify_ignore_mask(mark) & FS_EVENT_ON_CHILD)) continue; fsnotify_iter_set_report_type(iter_info, type); } } return true; } /* * Pop from iter_info multi head queue, the marks that belong to the group of * current iteration step. */ static void fsnotify_iter_next(struct fsnotify_iter_info *iter_info) { struct fsnotify_mark *mark; int type; /* * We cannot use fsnotify_foreach_iter_mark_type() here because we * may need to advance a mark of type X that belongs to current_group * but was not selected for reporting. */ fsnotify_foreach_iter_type(type) { mark = iter_info->marks[type]; if (mark && mark->group == iter_info->current_group) iter_info->marks[type] = fsnotify_next_mark(iter_info->marks[type]); } } /* * fsnotify - This is the main call to fsnotify. * * The VFS calls into hook specific functions in linux/fsnotify.h. * Those functions then in turn call here. Here will call out to all of the * registered fsnotify_group. Those groups can then use the notification event * in whatever means they feel necessary. * * @mask: event type and flags * @data: object that event happened on * @data_type: type of object for fanotify_data_XXX() accessors * @dir: optional directory associated with event - * if @file_name is not NULL, this is the directory that * @file_name is relative to * @file_name: optional file name associated with event * @inode: optional inode associated with event - * If @dir and @inode are both non-NULL, event may be * reported to both. * @cookie: inotify rename cookie */ int fsnotify(__u32 mask, const void *data, int data_type, struct inode *dir, const struct qstr *file_name, struct inode *inode, u32 cookie) { const struct path *path = fsnotify_data_path(data, data_type); struct super_block *sb = fsnotify_data_sb(data, data_type); const struct fsnotify_mnt *mnt_data = fsnotify_data_mnt(data, data_type); struct fsnotify_sb_info *sbinfo = sb ? fsnotify_sb_info(sb) : NULL; struct fsnotify_iter_info iter_info = {}; struct mount *mnt = NULL; struct inode *inode2 = NULL; struct dentry *moved; int inode2_type; int ret = 0; __u32 test_mask, marks_mask = 0; if (path) mnt = real_mount(path->mnt); if (!inode) { /* Dirent event - report on TYPE_INODE to dir */ inode = dir; /* For FS_RENAME, inode is old_dir and inode2 is new_dir */ if (mask & FS_RENAME) { moved = fsnotify_data_dentry(data, data_type); inode2 = moved->d_parent->d_inode; inode2_type = FSNOTIFY_ITER_TYPE_INODE2; } } else if (mask & FS_EVENT_ON_CHILD) { /* * Event on child - report on TYPE_PARENT to dir if it is * watching children and on TYPE_INODE to child. */ inode2 = dir; inode2_type = FSNOTIFY_ITER_TYPE_PARENT; } /* * Optimization: srcu_read_lock() has a memory barrier which can * be expensive. It protects walking the *_fsnotify_marks lists. * However, if we do not walk the lists, we do not have to do * SRCU because we have no references to any objects and do not * need SRCU to keep them "alive". */ if ((!sbinfo || !sbinfo->sb_marks) && (!mnt || !mnt->mnt_fsnotify_marks) && (!inode || !inode->i_fsnotify_marks) && (!inode2 || !inode2->i_fsnotify_marks) && (!mnt_data || !mnt_data->ns->n_fsnotify_marks)) return 0; if (sb) marks_mask |= READ_ONCE(sb->s_fsnotify_mask); if (mnt) marks_mask |= READ_ONCE(mnt->mnt_fsnotify_mask); if (inode) marks_mask |= READ_ONCE(inode->i_fsnotify_mask); if (inode2) marks_mask |= READ_ONCE(inode2->i_fsnotify_mask); if (mnt_data) marks_mask |= READ_ONCE(mnt_data->ns->n_fsnotify_mask); /* * If this is a modify event we may need to clear some ignore masks. * In that case, the object with ignore masks will have the FS_MODIFY * event in its mask. * Otherwise, return if none of the marks care about this type of event. */ test_mask = (mask & ALL_FSNOTIFY_EVENTS); if (!(test_mask & marks_mask)) return 0; iter_info.srcu_idx = srcu_read_lock(&fsnotify_mark_srcu); if (sbinfo) { iter_info.marks[FSNOTIFY_ITER_TYPE_SB] = fsnotify_first_mark(&sbinfo->sb_marks); } if (mnt) { iter_info.marks[FSNOTIFY_ITER_TYPE_VFSMOUNT] = fsnotify_first_mark(&mnt->mnt_fsnotify_marks); } if (inode) { iter_info.marks[FSNOTIFY_ITER_TYPE_INODE] = fsnotify_first_mark(&inode->i_fsnotify_marks); } if (inode2) { iter_info.marks[inode2_type] = fsnotify_first_mark(&inode2->i_fsnotify_marks); } if (mnt_data) { iter_info.marks[FSNOTIFY_ITER_TYPE_MNTNS] = fsnotify_first_mark(&mnt_data->ns->n_fsnotify_marks); } /* * We need to merge inode/vfsmount/sb mark lists so that e.g. inode mark * ignore masks are properly reflected for mount/sb mark notifications. * That's why this traversal is so complicated... */ while (fsnotify_iter_select_report_types(&iter_info)) { ret = send_to_group(mask, data, data_type, dir, file_name, cookie, &iter_info); if (ret && (mask & ALL_FSNOTIFY_PERM_EVENTS)) goto out; fsnotify_iter_next(&iter_info); } ret = 0; out: srcu_read_unlock(&fsnotify_mark_srcu, iter_info.srcu_idx); return ret; } EXPORT_SYMBOL_GPL(fsnotify); #ifdef CONFIG_FANOTIFY_ACCESS_PERMISSIONS /* * At open time we check fsnotify_sb_has_priority_watchers(), call the open perm * hook and set the FMODE_NONOTIFY_ mode bits accordignly. * Later, fsnotify permission hooks do not check if there are permission event * watches, but that there were permission event watches at open time. */ int fsnotify_open_perm_and_set_mode(struct file *file) { struct dentry *dentry = file->f_path.dentry, *parent; struct super_block *sb = dentry->d_sb; __u32 mnt_mask, p_mask = 0; /* Is it a file opened by fanotify? */ if (FMODE_FSNOTIFY_NONE(file->f_mode)) return 0; /* * Permission events is a super set of pre-content events, so if there * are no permission event watchers, there are also no pre-content event * watchers and this is implied from the single FMODE_NONOTIFY_PERM bit. */ if (likely(!fsnotify_sb_has_priority_watchers(sb, FSNOTIFY_PRIO_CONTENT))) { file_set_fsnotify_mode(file, FMODE_NONOTIFY_PERM); return 0; } /* * OK, there are some permission event watchers. Check if anybody is * watching for permission events on *this* file. */ mnt_mask = READ_ONCE(real_mount(file->f_path.mnt)->mnt_fsnotify_mask); p_mask = fsnotify_object_watched(d_inode(dentry), mnt_mask, ALL_FSNOTIFY_PERM_EVENTS); if (dentry->d_flags & DCACHE_FSNOTIFY_PARENT_WATCHED) { parent = dget_parent(dentry); p_mask |= fsnotify_inode_watches_children(d_inode(parent)); dput(parent); } /* * Legacy FAN_ACCESS_PERM events have very high performance overhead, * so unlikely to be used in the wild. If they are used there will be * no optimizations at all. */ if (unlikely(p_mask & FS_ACCESS_PERM)) { /* Enable all permission and pre-content events */ file_set_fsnotify_mode(file, 0); goto open_perm; } /* * Pre-content events are only supported on regular files. * If there are pre-content event watchers and no permission access * watchers, set FMODE_NONOTIFY | FMODE_NONOTIFY_PERM to indicate that. * That is the common case with HSM service. */ if (d_is_reg(dentry) && (p_mask & FSNOTIFY_PRE_CONTENT_EVENTS)) { file_set_fsnotify_mode(file, FMODE_NONOTIFY | FMODE_NONOTIFY_PERM); goto open_perm; } /* Nobody watching permission and pre-content events on this file */ file_set_fsnotify_mode(file, FMODE_NONOTIFY_PERM); open_perm: /* * Send open perm events depending on object masks and regardless of * FMODE_NONOTIFY_PERM. */ if (file->f_flags & __FMODE_EXEC && p_mask & FS_OPEN_EXEC_PERM) { int ret = fsnotify_path(&file->f_path, FS_OPEN_EXEC_PERM); if (ret) return ret; } if (p_mask & FS_OPEN_PERM) return fsnotify_path(&file->f_path, FS_OPEN_PERM); return 0; } #endif void fsnotify_mnt(__u32 mask, struct mnt_namespace *ns, struct vfsmount *mnt) { struct fsnotify_mnt data = { .ns = ns, .mnt_id = real_mount(mnt)->mnt_id_unique, }; if (WARN_ON_ONCE(!ns)) return; /* * This is an optimization as well as making sure fsnotify_init() has * been called. */ if (!ns->n_fsnotify_marks) return; fsnotify(mask, &data, FSNOTIFY_EVENT_MNT, NULL, NULL, NULL, 0); } static __init int fsnotify_init(void) { int ret; BUILD_BUG_ON(HWEIGHT32(ALL_FSNOTIFY_BITS) != 26); ret = init_srcu_struct(&fsnotify_mark_srcu); if (ret) panic("initializing fsnotify_mark_srcu"); fsnotify_mark_connector_cachep = KMEM_CACHE(fsnotify_mark_connector, SLAB_PANIC); return 0; } core_initcall(fsnotify_init);
210 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 /* SPDX-License-Identifier: GPL-2.0 */ #ifndef __LINUX_ENTRYKVM_H #define __LINUX_ENTRYKVM_H #include <linux/static_call_types.h> #include <linux/resume_user_mode.h> #include <linux/syscalls.h> #include <linux/seccomp.h> #include <linux/sched.h> #include <linux/tick.h> /* Transfer to guest mode work */ #ifdef CONFIG_KVM_XFER_TO_GUEST_WORK #ifndef ARCH_XFER_TO_GUEST_MODE_WORK # define ARCH_XFER_TO_GUEST_MODE_WORK (0) #endif #define XFER_TO_GUEST_MODE_WORK \ (_TIF_NEED_RESCHED | _TIF_NEED_RESCHED_LAZY | _TIF_SIGPENDING | \ _TIF_NOTIFY_SIGNAL | _TIF_NOTIFY_RESUME | \ ARCH_XFER_TO_GUEST_MODE_WORK) struct kvm_vcpu; /** * arch_xfer_to_guest_mode_handle_work - Architecture specific xfer to guest * mode work handling function. * @vcpu: Pointer to current's VCPU data * @ti_work: Cached TIF flags gathered in xfer_to_guest_mode_handle_work() * * Invoked from xfer_to_guest_mode_handle_work(). Defaults to NOOP. Can be * replaced by architecture specific code. */ static inline int arch_xfer_to_guest_mode_handle_work(struct kvm_vcpu *vcpu, unsigned long ti_work); #ifndef arch_xfer_to_guest_mode_work static inline int arch_xfer_to_guest_mode_handle_work(struct kvm_vcpu *vcpu, unsigned long ti_work) { return 0; } #endif /** * xfer_to_guest_mode_handle_work - Check and handle pending work which needs * to be handled before going to guest mode * @vcpu: Pointer to current's VCPU data * * Returns: 0 or an error code */ int xfer_to_guest_mode_handle_work(struct kvm_vcpu *vcpu); /** * xfer_to_guest_mode_prepare - Perform last minute preparation work that * need to be handled while IRQs are disabled * upon entering to guest. * * Has to be invoked with interrupts disabled before the last call * to xfer_to_guest_mode_work_pending(). */ static inline void xfer_to_guest_mode_prepare(void) { lockdep_assert_irqs_disabled(); tick_nohz_user_enter_prepare(); } /** * __xfer_to_guest_mode_work_pending - Check if work is pending * * Returns: True if work pending, False otherwise. * * Bare variant of xfer_to_guest_mode_work_pending(). Can be called from * interrupt enabled code for racy quick checks with care. */ static inline bool __xfer_to_guest_mode_work_pending(void) { unsigned long ti_work = read_thread_flags(); return !!(ti_work & XFER_TO_GUEST_MODE_WORK); } /** * xfer_to_guest_mode_work_pending - Check if work is pending which needs to be * handled before returning to guest mode * * Returns: True if work pending, False otherwise. * * Has to be invoked with interrupts disabled before the transition to * guest mode. */ static inline bool xfer_to_guest_mode_work_pending(void) { lockdep_assert_irqs_disabled(); return __xfer_to_guest_mode_work_pending(); } #endif /* CONFIG_KVM_XFER_TO_GUEST_WORK */ #endif
109 108 3 3 271 9 14 17 264 109 108 93 93 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 /* SPDX-License-Identifier: GPL-2.0 */ /* * Implementation of the extensible bitmap type. * * Author : Stephen Smalley, <stephen.smalley.work@gmail.com> */ /* * Updated: Hewlett-Packard <paul@paul-moore.com> * Added support to import/export the NetLabel category bitmap * (c) Copyright Hewlett-Packard Development Company, L.P., 2006 * * Updated: KaiGai Kohei <kaigai@ak.jp.nec.com> * Applied standard bit operations to improve bitmap scanning. */ #include <linux/kernel.h> #include <linux/slab.h> #include <linux/errno.h> #include <linux/jhash.h> #include <net/netlabel.h> #include "ebitmap.h" #include "policydb.h" #define BITS_PER_U64 ((u32)(sizeof(u64) * 8)) static struct kmem_cache *ebitmap_node_cachep __ro_after_init; bool ebitmap_equal(const struct ebitmap *e1, const struct ebitmap *e2) { const struct ebitmap_node *n1, *n2; if (e1->highbit != e2->highbit) return false; n1 = e1->node; n2 = e2->node; while (n1 && n2 && (n1->startbit == n2->startbit) && !memcmp(n1->maps, n2->maps, EBITMAP_SIZE / 8)) { n1 = n1->next; n2 = n2->next; } if (n1 || n2) return false; return true; } int ebitmap_cpy(struct ebitmap *dst, const struct ebitmap *src) { struct ebitmap_node *new, *prev; const struct ebitmap_node *n; ebitmap_init(dst); n = src->node; prev = NULL; while (n) { new = kmem_cache_zalloc(ebitmap_node_cachep, GFP_ATOMIC); if (!new) { ebitmap_destroy(dst); return -ENOMEM; } new->startbit = n->startbit; memcpy(new->maps, n->maps, EBITMAP_SIZE / 8); new->next = NULL; if (prev) prev->next = new; else dst->node = new; prev = new; n = n->next; } dst->highbit = src->highbit; return 0; } int ebitmap_and(struct ebitmap *dst, const struct ebitmap *e1, const struct ebitmap *e2) { struct ebitmap_node *n; u32 bit; int rc; ebitmap_init(dst); ebitmap_for_each_positive_bit(e1, n, bit) { if (ebitmap_get_bit(e2, bit)) { rc = ebitmap_set_bit(dst, bit, 1); if (rc < 0) return rc; } } return 0; } #ifdef CONFIG_NETLABEL /** * ebitmap_netlbl_export - Export an ebitmap into a NetLabel category bitmap * @ebmap: the ebitmap to export * @catmap: the NetLabel category bitmap * * Description: * Export a SELinux extensibile bitmap into a NetLabel category bitmap. * Returns zero on success, negative values on error. * */ int ebitmap_netlbl_export(struct ebitmap *ebmap, struct netlbl_lsm_catmap **catmap) { struct ebitmap_node *e_iter = ebmap->node; unsigned long e_map; u32 offset; unsigned int iter; int rc; if (e_iter == NULL) { *catmap = NULL; return 0; } if (*catmap != NULL) netlbl_catmap_free(*catmap); *catmap = NULL; while (e_iter) { offset = e_iter->startbit; for (iter = 0; iter < EBITMAP_UNIT_NUMS; iter++) { e_map = e_iter->maps[iter]; if (e_map != 0) { rc = netlbl_catmap_setlong(catmap, offset, e_map, GFP_ATOMIC); if (rc != 0) goto netlbl_export_failure; } offset += EBITMAP_UNIT_SIZE; } e_iter = e_iter->next; } return 0; netlbl_export_failure: netlbl_catmap_free(*catmap); return -ENOMEM; } /** * ebitmap_netlbl_import - Import a NetLabel category bitmap into an ebitmap * @ebmap: the ebitmap to import * @catmap: the NetLabel category bitmap * * Description: * Import a NetLabel category bitmap into a SELinux extensibile bitmap. * Returns zero on success, negative values on error. * */ int ebitmap_netlbl_import(struct ebitmap *ebmap, struct netlbl_lsm_catmap *catmap) { int rc; struct ebitmap_node *e_iter = NULL; struct ebitmap_node *e_prev = NULL; u32 offset = 0, idx; unsigned long bitmap; for (;;) { rc = netlbl_catmap_getlong(catmap, &offset, &bitmap); if (rc < 0) goto netlbl_import_failure; if (offset == (u32)-1) return 0; /* don't waste ebitmap space if the netlabel bitmap is empty */ if (bitmap == 0) { offset += EBITMAP_UNIT_SIZE; continue; } if (e_iter == NULL || offset >= e_iter->startbit + EBITMAP_SIZE) { e_prev = e_iter; e_iter = kmem_cache_zalloc(ebitmap_node_cachep, GFP_ATOMIC); if (e_iter == NULL) goto netlbl_import_failure; e_iter->startbit = offset - (offset % EBITMAP_SIZE); if (e_prev == NULL) ebmap->node = e_iter; else e_prev->next = e_iter; ebmap->highbit = e_iter->startbit + EBITMAP_SIZE; } /* offset will always be aligned to an unsigned long */ idx = EBITMAP_NODE_INDEX(e_iter, offset); e_iter->maps[idx] = bitmap; /* next */ offset += EBITMAP_UNIT_SIZE; } /* NOTE: we should never reach this return */ return 0; netlbl_import_failure: ebitmap_destroy(ebmap); return -ENOMEM; } #endif /* CONFIG_NETLABEL */ /* * Check to see if all the bits set in e2 are also set in e1. Optionally, * if last_e2bit is non-zero, the highest set bit in e2 cannot exceed * last_e2bit. */ int ebitmap_contains(const struct ebitmap *e1, const struct ebitmap *e2, u32 last_e2bit) { const struct ebitmap_node *n1, *n2; int i; if (e1->highbit < e2->highbit) return 0; n1 = e1->node; n2 = e2->node; while (n1 && n2 && (n1->startbit <= n2->startbit)) { if (n1->startbit < n2->startbit) { n1 = n1->next; continue; } for (i = EBITMAP_UNIT_NUMS - 1; (i >= 0) && !n2->maps[i];) i--; /* Skip trailing NULL map entries */ if (last_e2bit && (i >= 0)) { u32 lastsetbit = n2->startbit + i * EBITMAP_UNIT_SIZE + __fls(n2->maps[i]); if (lastsetbit > last_e2bit) return 0; } while (i >= 0) { if ((n1->maps[i] & n2->maps[i]) != n2->maps[i]) return 0; i--; } n1 = n1->next; n2 = n2->next; } if (n2) return 0; return 1; } int ebitmap_get_bit(const struct ebitmap *e, u32 bit) { const struct ebitmap_node *n; if (e->highbit < bit) return 0; n = e->node; while (n && (n->startbit <= bit)) { if ((n->startbit + EBITMAP_SIZE) > bit) return ebitmap_node_get_bit(n, bit); n = n->next; } return 0; } int ebitmap_set_bit(struct ebitmap *e, u32 bit, int value) { struct ebitmap_node *n, *prev, *new; prev = NULL; n = e->node; while (n && n->startbit <= bit) { if ((n->startbit + EBITMAP_SIZE) > bit) { if (value) { ebitmap_node_set_bit(n, bit); } else { u32 s; ebitmap_node_clr_bit(n, bit); s = find_first_bit(n->maps, EBITMAP_SIZE); if (s < EBITMAP_SIZE) return 0; /* drop this node from the bitmap */ if (!n->next) { /* * this was the highest map * within the bitmap */ if (prev) e->highbit = prev->startbit + EBITMAP_SIZE; else e->highbit = 0; } if (prev) prev->next = n->next; else e->node = n->next; kmem_cache_free(ebitmap_node_cachep, n); } return 0; } prev = n; n = n->next; } if (!value) return 0; new = kmem_cache_zalloc(ebitmap_node_cachep, GFP_ATOMIC); if (!new) return -ENOMEM; new->startbit = bit - (bit % EBITMAP_SIZE); ebitmap_node_set_bit(new, bit); if (!n) /* this node will be the highest map within the bitmap */ e->highbit = new->startbit + EBITMAP_SIZE; if (prev) { new->next = prev->next; prev->next = new; } else { new->next = e->node; e->node = new; } return 0; } void ebitmap_destroy(struct ebitmap *e) { struct ebitmap_node *n, *temp; if (!e) return; n = e->node; while (n) { temp = n; n = n->next; kmem_cache_free(ebitmap_node_cachep, temp); } e->highbit = 0; e->node = NULL; } int ebitmap_read(struct ebitmap *e, struct policy_file *fp) { struct ebitmap_node *n = NULL; u32 mapunit, count, startbit, index, i; __le32 ebitmap_start; u64 map; __le64 mapbits; __le32 buf[3]; int rc; ebitmap_init(e); rc = next_entry(buf, fp, sizeof buf); if (rc < 0) goto out; mapunit = le32_to_cpu(buf[0]); e->highbit = le32_to_cpu(buf[1]); count = le32_to_cpu(buf[2]); if (mapunit != BITS_PER_U64) { pr_err("SELinux: ebitmap: map size %u does not " "match my size %u (high bit was %u)\n", mapunit, BITS_PER_U64, e->highbit); goto bad; } /* round up e->highbit */ e->highbit += EBITMAP_SIZE - 1; e->highbit -= (e->highbit % EBITMAP_SIZE); if (!e->highbit) { e->node = NULL; goto ok; } if (e->highbit && !count) goto bad; for (i = 0; i < count; i++) { rc = next_entry(&ebitmap_start, fp, sizeof(u32)); if (rc < 0) { pr_err("SELinux: ebitmap: truncated map\n"); goto bad; } startbit = le32_to_cpu(ebitmap_start); if (startbit & (mapunit - 1)) { pr_err("SELinux: ebitmap start bit (%u) is " "not a multiple of the map unit size (%u)\n", startbit, mapunit); goto bad; } if (startbit > e->highbit - mapunit) { pr_err("SELinux: ebitmap start bit (%u) is " "beyond the end of the bitmap (%u)\n", startbit, (e->highbit - mapunit)); goto bad; } if (!n || startbit >= n->startbit + EBITMAP_SIZE) { struct ebitmap_node *tmp; tmp = kmem_cache_zalloc(ebitmap_node_cachep, GFP_KERNEL); if (!tmp) { pr_err("SELinux: ebitmap: out of memory\n"); rc = -ENOMEM; goto bad; } /* round down */ tmp->startbit = startbit - (startbit % EBITMAP_SIZE); if (n) n->next = tmp; else e->node = tmp; n = tmp; } else if (startbit <= n->startbit) { pr_err("SELinux: ebitmap: start bit %u" " comes after start bit %u\n", startbit, n->startbit); goto bad; } rc = next_entry(&mapbits, fp, sizeof(u64)); if (rc < 0) { pr_err("SELinux: ebitmap: truncated map\n"); goto bad; } map = le64_to_cpu(mapbits); if (!map) { pr_err("SELinux: ebitmap: empty map\n"); goto bad; } index = (startbit - n->startbit) / EBITMAP_UNIT_SIZE; while (map) { n->maps[index++] = map & (-1UL); map = EBITMAP_SHIFT_UNIT_SIZE(map); } } if (n && n->startbit + EBITMAP_SIZE != e->highbit) { pr_err("SELinux: ebitmap: high bit %u is not equal to the expected value %zu\n", e->highbit, n->startbit + EBITMAP_SIZE); goto bad; } ok: rc = 0; out: return rc; bad: if (!rc) rc = -EINVAL; ebitmap_destroy(e); goto out; } int ebitmap_write(const struct ebitmap *e, struct policy_file *fp) { struct ebitmap_node *n; u32 bit, count, last_bit, last_startbit; __le32 buf[3]; u64 map; int rc; buf[0] = cpu_to_le32(BITS_PER_U64); count = 0; last_bit = 0; last_startbit = U32_MAX; ebitmap_for_each_positive_bit(e, n, bit) { if (last_startbit == U32_MAX || rounddown(bit, BITS_PER_U64) > last_startbit) { count++; last_startbit = rounddown(bit, BITS_PER_U64); } last_bit = roundup(bit + 1, BITS_PER_U64); } buf[1] = cpu_to_le32(last_bit); buf[2] = cpu_to_le32(count); rc = put_entry(buf, sizeof(u32), 3, fp); if (rc) return rc; map = 0; last_startbit = U32_MAX; ebitmap_for_each_positive_bit(e, n, bit) { if (last_startbit == U32_MAX || rounddown(bit, BITS_PER_U64) > last_startbit) { __le64 buf64[1]; /* this is the very first bit */ if (!map) { last_startbit = rounddown(bit, BITS_PER_U64); map = (u64)1 << (bit - last_startbit); continue; } /* write the last node */ buf[0] = cpu_to_le32(last_startbit); rc = put_entry(buf, sizeof(u32), 1, fp); if (rc) return rc; buf64[0] = cpu_to_le64(map); rc = put_entry(buf64, sizeof(u64), 1, fp); if (rc) return rc; /* set up for the next node */ map = 0; last_startbit = rounddown(bit, BITS_PER_U64); } map |= (u64)1 << (bit - last_startbit); } /* write the last node */ if (map) { __le64 buf64[1]; /* write the last node */ buf[0] = cpu_to_le32(last_startbit); rc = put_entry(buf, sizeof(u32), 1, fp); if (rc) return rc; buf64[0] = cpu_to_le64(map); rc = put_entry(buf64, sizeof(u64), 1, fp); if (rc) return rc; } return 0; } u32 ebitmap_hash(const struct ebitmap *e, u32 hash) { struct ebitmap_node *node; /* need to change hash even if ebitmap is empty */ hash = jhash_1word(e->highbit, hash); for (node = e->node; node; node = node->next) { hash = jhash_1word(node->startbit, hash); hash = jhash(node->maps, sizeof(node->maps), hash); } return hash; } void __init ebitmap_cache_init(void) { ebitmap_node_cachep = KMEM_CACHE(ebitmap_node, SLAB_PANIC); }
29 29 29 29 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 // SPDX-License-Identifier: GPL-2.0-only /* * Based on arch/arm/kernel/traps.c * * Copyright (C) 1995-2009 Russell King * Copyright (C) 2012 ARM Ltd. */ #include <linux/bug.h> #include <linux/context_tracking.h> #include <linux/signal.h> #include <linux/kallsyms.h> #include <linux/kprobes.h> #include <linux/spinlock.h> #include <linux/uaccess.h> #include <linux/hardirq.h> #include <linux/kdebug.h> #include <linux/module.h> #include <linux/kexec.h> #include <linux/delay.h> #include <linux/efi.h> #include <linux/init.h> #include <linux/sched/signal.h> #include <linux/sched/debug.h> #include <linux/sched/task_stack.h> #include <linux/sizes.h> #include <linux/syscalls.h> #include <linux/mm_types.h> #include <linux/kasan.h> #include <linux/ubsan.h> #include <linux/cfi.h> #include <asm/atomic.h> #include <asm/bug.h> #include <asm/cpufeature.h> #include <asm/daifflags.h> #include <asm/debug-monitors.h> #include <asm/efi.h> #include <asm/esr.h> #include <asm/exception.h> #include <asm/extable.h> #include <asm/insn.h> #include <asm/kprobes.h> #include <asm/text-patching.h> #include <asm/traps.h> #include <asm/smp.h> #include <asm/stack_pointer.h> #include <asm/stacktrace.h> #include <asm/system_misc.h> #include <asm/sysreg.h> static bool __kprobes __check_eq(unsigned long pstate) { return (pstate & PSR_Z_BIT) != 0; } static bool __kprobes __check_ne(unsigned long pstate) { return (pstate & PSR_Z_BIT) == 0; } static bool __kprobes __check_cs(unsigned long pstate) { return (pstate & PSR_C_BIT) != 0; } static bool __kprobes __check_cc(unsigned long pstate) { return (pstate & PSR_C_BIT) == 0; } static bool __kprobes __check_mi(unsigned long pstate) { return (pstate & PSR_N_BIT) != 0; } static bool __kprobes __check_pl(unsigned long pstate) { return (pstate & PSR_N_BIT) == 0; } static bool __kprobes __check_vs(unsigned long pstate) { return (pstate & PSR_V_BIT) != 0; } static bool __kprobes __check_vc(unsigned long pstate) { return (pstate & PSR_V_BIT) == 0; } static bool __kprobes __check_hi(unsigned long pstate) { pstate &= ~(pstate >> 1); /* PSR_C_BIT &= ~PSR_Z_BIT */ return (pstate & PSR_C_BIT) != 0; } static bool __kprobes __check_ls(unsigned long pstate) { pstate &= ~(pstate >> 1); /* PSR_C_BIT &= ~PSR_Z_BIT */ return (pstate & PSR_C_BIT) == 0; } static bool __kprobes __check_ge(unsigned long pstate) { pstate ^= (pstate << 3); /* PSR_N_BIT ^= PSR_V_BIT */ return (pstate & PSR_N_BIT) == 0; } static bool __kprobes __check_lt(unsigned long pstate) { pstate ^= (pstate << 3); /* PSR_N_BIT ^= PSR_V_BIT */ return (pstate & PSR_N_BIT) != 0; } static bool __kprobes __check_gt(unsigned long pstate) { /*PSR_N_BIT ^= PSR_V_BIT */ unsigned long temp = pstate ^ (pstate << 3); temp |= (pstate << 1); /*PSR_N_BIT |= PSR_Z_BIT */ return (temp & PSR_N_BIT) == 0; } static bool __kprobes __check_le(unsigned long pstate) { /*PSR_N_BIT ^= PSR_V_BIT */ unsigned long temp = pstate ^ (pstate << 3); temp |= (pstate << 1); /*PSR_N_BIT |= PSR_Z_BIT */ return (temp & PSR_N_BIT) != 0; } static bool __kprobes __check_al(unsigned long pstate) { return true; } /* * Note that the ARMv8 ARM calls condition code 0b1111 "nv", but states that * it behaves identically to 0b1110 ("al"). */ pstate_check_t * const aarch32_opcode_cond_checks[16] = { __check_eq, __check_ne, __check_cs, __check_cc, __check_mi, __check_pl, __check_vs, __check_vc, __check_hi, __check_ls, __check_ge, __check_lt, __check_gt, __check_le, __check_al, __check_al }; int show_unhandled_signals = 0; void dump_kernel_instr(unsigned long kaddr) { char str[sizeof("00000000 ") * 5 + 2 + 1], *p = str; int i; if (!is_ttbr1_addr(kaddr)) return; for (i = -4; i < 1; i++) { unsigned int val, bad; bad = aarch64_insn_read(&((u32 *)kaddr)[i], &val); if (!bad) p += sprintf(p, i == 0 ? "(%08x) " : "%08x ", val); else p += sprintf(p, i == 0 ? "(????????) " : "???????? "); } printk(KERN_EMERG "Code: %s\n", str); } #define S_SMP " SMP" static int __die(const char *str, long err, struct pt_regs *regs) { static int die_counter; int ret; unsigned long addr = instruction_pointer(regs); pr_emerg("Internal error: %s: %016lx [#%d] " S_SMP "\n", str, err, ++die_counter); /* trap and error numbers are mostly meaningless on ARM */ ret = notify_die(DIE_OOPS, str, regs, err, 0, SIGSEGV); if (ret == NOTIFY_STOP) return ret; print_modules(); show_regs(regs); if (user_mode(regs)) return ret; dump_kernel_instr(addr); return ret; } static DEFINE_RAW_SPINLOCK(die_lock); /* * This function is protected against re-entrancy. */ void die(const char *str, struct pt_regs *regs, long err) { int ret; unsigned long flags; raw_spin_lock_irqsave(&die_lock, flags); oops_enter(); console_verbose(); bust_spinlocks(1); ret = __die(str, err, regs); if (regs && kexec_should_crash(current)) crash_kexec(regs); bust_spinlocks(0); add_taint(TAINT_DIE, LOCKDEP_NOW_UNRELIABLE); oops_exit(); if (in_interrupt()) panic("%s: Fatal exception in interrupt", str); if (panic_on_oops) panic("%s: Fatal exception", str); raw_spin_unlock_irqrestore(&die_lock, flags); if (ret != NOTIFY_STOP) make_task_dead(SIGSEGV); } static void arm64_show_signal(int signo, const char *str) { static DEFINE_RATELIMIT_STATE(rs, DEFAULT_RATELIMIT_INTERVAL, DEFAULT_RATELIMIT_BURST); struct task_struct *tsk = current; unsigned long esr = tsk->thread.fault_code; struct pt_regs *regs = task_pt_regs(tsk); /* Leave if the signal won't be shown */ if (!show_unhandled_signals || !unhandled_signal(tsk, signo) || !__ratelimit(&rs)) return; pr_info("%s[%d]: unhandled exception: ", tsk->comm, task_pid_nr(tsk)); if (esr) pr_cont("%s, ESR 0x%016lx, ", esr_get_class_string(esr), esr); pr_cont("%s", str); print_vma_addr(KERN_CONT " in ", regs->pc); pr_cont("\n"); __show_regs(regs); } void arm64_force_sig_fault(int signo, int code, unsigned long far, const char *str) { arm64_show_signal(signo, str); if (signo == SIGKILL) force_sig(SIGKILL); else force_sig_fault(signo, code, (void __user *)far); } void arm64_force_sig_fault_pkey(unsigned long far, const char *str, int pkey) { arm64_show_signal(SIGSEGV, str); force_sig_pkuerr((void __user *)far, pkey); } void arm64_force_sig_mceerr(int code, unsigned long far, short lsb, const char *str) { arm64_show_signal(SIGBUS, str); force_sig_mceerr(code, (void __user *)far, lsb); } void arm64_force_sig_ptrace_errno_trap(int errno, unsigned long far, const char *str) { arm64_show_signal(SIGTRAP, str); force_sig_ptrace_errno_trap(errno, (void __user *)far); } void arm64_notify_die(const char *str, struct pt_regs *regs, int signo, int sicode, unsigned long far, unsigned long err) { if (user_mode(regs)) { WARN_ON(regs != current_pt_regs()); current->thread.fault_address = 0; current->thread.fault_code = err; arm64_force_sig_fault(signo, sicode, far, str); } else { die(str, regs, err); } } #ifdef CONFIG_COMPAT #define PSTATE_IT_1_0_SHIFT 25 #define PSTATE_IT_1_0_MASK (0x3 << PSTATE_IT_1_0_SHIFT) #define PSTATE_IT_7_2_SHIFT 10 #define PSTATE_IT_7_2_MASK (0x3f << PSTATE_IT_7_2_SHIFT) static u32 compat_get_it_state(struct pt_regs *regs) { u32 it, pstate = regs->pstate; it = (pstate & PSTATE_IT_1_0_MASK) >> PSTATE_IT_1_0_SHIFT; it |= ((pstate & PSTATE_IT_7_2_MASK) >> PSTATE_IT_7_2_SHIFT) << 2; return it; } static void compat_set_it_state(struct pt_regs *regs, u32 it) { u32 pstate_it; pstate_it = (it << PSTATE_IT_1_0_SHIFT) & PSTATE_IT_1_0_MASK; pstate_it |= ((it >> 2) << PSTATE_IT_7_2_SHIFT) & PSTATE_IT_7_2_MASK; regs->pstate &= ~PSR_AA32_IT_MASK; regs->pstate |= pstate_it; } static void advance_itstate(struct pt_regs *regs) { u32 it; /* ARM mode */ if (!(regs->pstate & PSR_AA32_T_BIT) || !(regs->pstate & PSR_AA32_IT_MASK)) return; it = compat_get_it_state(regs); /* * If this is the last instruction of the block, wipe the IT * state. Otherwise advance it. */ if (!(it & 7)) it = 0; else it = (it & 0xe0) | ((it << 1) & 0x1f); compat_set_it_state(regs, it); } #else static void advance_itstate(struct pt_regs *regs) { } #endif void arm64_skip_faulting_instruction(struct pt_regs *regs, unsigned long size) { regs->pc += size; /* * If we were single stepping, we want to get the step exception after * we return from the trap. */ if (user_mode(regs)) user_fastforward_single_step(current); if (compat_user_mode(regs)) advance_itstate(regs); else regs->pstate &= ~PSR_BTYPE_MASK; } static int user_insn_read(struct pt_regs *regs, u32 *insnp) { u32 instr; unsigned long pc = instruction_pointer(regs); if (compat_thumb_mode(regs)) { /* 16-bit Thumb instruction */ __le16 instr_le; if (get_user(instr_le, (__le16 __user *)pc)) return -EFAULT; instr = le16_to_cpu(instr_le); if (aarch32_insn_is_wide(instr)) { u32 instr2; if (get_user(instr_le, (__le16 __user *)(pc + 2))) return -EFAULT; instr2 = le16_to_cpu(instr_le); instr = (instr << 16) | instr2; } } else { /* 32-bit ARM instruction */ __le32 instr_le; if (get_user(instr_le, (__le32 __user *)pc)) return -EFAULT; instr = le32_to_cpu(instr_le); } *insnp = instr; return 0; } void force_signal_inject(int signal, int code, unsigned long address, unsigned long err) { const char *desc; struct pt_regs *regs = current_pt_regs(); if (WARN_ON(!user_mode(regs))) return; switch (signal) { case SIGILL: desc = "undefined instruction"; break; case SIGSEGV: desc = "illegal memory access"; break; default: desc = "unknown or unrecoverable error"; break; } /* Force signals we don't understand to SIGKILL */ if (WARN_ON(signal != SIGKILL && siginfo_layout(signal, code) != SIL_FAULT)) { signal = SIGKILL; } arm64_notify_die(desc, regs, signal, code, address, err); } /* * Set up process info to signal segmentation fault - called on access error. */ void arm64_notify_segfault(unsigned long addr) { int code; mmap_read_lock(current->mm); if (find_vma(current->mm, untagged_addr(addr)) == NULL) code = SEGV_MAPERR; else code = SEGV_ACCERR; mmap_read_unlock(current->mm); force_signal_inject(SIGSEGV, code, addr, 0); } void do_el0_undef(struct pt_regs *regs, unsigned long esr) { u32 insn; /* check for AArch32 breakpoint instructions */ if (try_handle_aarch32_break(regs)) return; if (user_insn_read(regs, &insn)) goto out_err; if (try_emulate_mrs(regs, insn)) return; if (try_emulate_armv8_deprecated(regs, insn)) return; out_err: force_signal_inject(SIGILL, ILL_ILLOPC, regs->pc, 0); } void do_el1_undef(struct pt_regs *regs, unsigned long esr) { u32 insn; if (aarch64_insn_read((void *)regs->pc, &insn)) goto out_err; if (try_emulate_el1_ssbs(regs, insn)) return; out_err: die("Oops - Undefined instruction", regs, esr); } void do_el0_bti(struct pt_regs *regs) { force_signal_inject(SIGILL, ILL_ILLOPC, regs->pc, 0); } void do_el1_bti(struct pt_regs *regs, unsigned long esr) { if (efi_runtime_fixup_exception(regs, "BTI violation")) { regs->pstate &= ~PSR_BTYPE_MASK; return; } die("Oops - BTI", regs, esr); } void do_el0_gcs(struct pt_regs *regs, unsigned long esr) { force_signal_inject(SIGSEGV, SEGV_CPERR, regs->pc, 0); } void do_el1_gcs(struct pt_regs *regs, unsigned long esr) { die("Oops - GCS", regs, esr); } void do_el0_fpac(struct pt_regs *regs, unsigned long esr) { force_signal_inject(SIGILL, ILL_ILLOPN, regs->pc, esr); } void do_el1_fpac(struct pt_regs *regs, unsigned long esr) { /* * Unexpected FPAC exception in the kernel: kill the task before it * does any more harm. */ die("Oops - FPAC", regs, esr); } void do_el0_mops(struct pt_regs *regs, unsigned long esr) { arm64_mops_reset_regs(&regs->user_regs, esr); /* * If single stepping then finish the step before executing the * prologue instruction. */ user_fastforward_single_step(current); } void do_el1_mops(struct pt_regs *regs, unsigned long esr) { arm64_mops_reset_regs(&regs->user_regs, esr); kernel_fastforward_single_step(regs); } #define __user_cache_maint(insn, address, res) \ if (address >= TASK_SIZE_MAX) { \ res = -EFAULT; \ } else { \ uaccess_ttbr0_enable(); \ asm volatile ( \ "1: " insn ", %1\n" \ " mov %w0, #0\n" \ "2:\n" \ _ASM_EXTABLE_UACCESS_ERR(1b, 2b, %w0) \ : "=r" (res) \ : "r" (address)); \ uaccess_ttbr0_disable(); \ } static void user_cache_maint_handler(unsigned long esr, struct pt_regs *regs) { unsigned long tagged_address, address; int rt = ESR_ELx_SYS64_ISS_RT(esr); int crm = (esr & ESR_ELx_SYS64_ISS_CRM_MASK) >> ESR_ELx_SYS64_ISS_CRM_SHIFT; int ret = 0; tagged_address = pt_regs_read_reg(regs, rt); address = untagged_addr(tagged_address); switch (crm) { case ESR_ELx_SYS64_ISS_CRM_DC_CVAU: /* DC CVAU, gets promoted */ __user_cache_maint("dc civac", address, ret); break; case ESR_ELx_SYS64_ISS_CRM_DC_CVAC: /* DC CVAC, gets promoted */ __user_cache_maint("dc civac", address, ret); break; case ESR_ELx_SYS64_ISS_CRM_DC_CVADP: /* DC CVADP */ __user_cache_maint("sys 3, c7, c13, 1", address, ret); break; case ESR_ELx_SYS64_ISS_CRM_DC_CVAP: /* DC CVAP */ __user_cache_maint("sys 3, c7, c12, 1", address, ret); break; case ESR_ELx_SYS64_ISS_CRM_DC_CIVAC: /* DC CIVAC */ __user_cache_maint("dc civac", address, ret); break; case ESR_ELx_SYS64_ISS_CRM_IC_IVAU: /* IC IVAU */ __user_cache_maint("ic ivau", address, ret); break; default: force_signal_inject(SIGILL, ILL_ILLOPC, regs->pc, 0); return; } if (ret) arm64_notify_segfault(tagged_address); else arm64_skip_faulting_instruction(regs, AARCH64_INSN_SIZE); } static void ctr_read_handler(unsigned long esr, struct pt_regs *regs) { int rt = ESR_ELx_SYS64_ISS_RT(esr); unsigned long val = arm64_ftr_reg_user_value(&arm64_ftr_reg_ctrel0); if (cpus_have_final_cap(ARM64_WORKAROUND_1542419)) { /* Hide DIC so that we can trap the unnecessary maintenance...*/ val &= ~BIT(CTR_EL0_DIC_SHIFT); /* ... and fake IminLine to reduce the number of traps. */ val &= ~CTR_EL0_IminLine_MASK; val |= (PAGE_SHIFT - 2) & CTR_EL0_IminLine_MASK; } pt_regs_write_reg(regs, rt, val); arm64_skip_faulting_instruction(regs, AARCH64_INSN_SIZE); } static void cntvct_read_handler(unsigned long esr, struct pt_regs *regs) { if (test_thread_flag(TIF_TSC_SIGSEGV)) { force_sig(SIGSEGV); } else { int rt = ESR_ELx_SYS64_ISS_RT(esr); pt_regs_write_reg(regs, rt, arch_timer_read_counter()); arm64_skip_faulting_instruction(regs, AARCH64_INSN_SIZE); } } static void cntfrq_read_handler(unsigned long esr, struct pt_regs *regs) { if (test_thread_flag(TIF_TSC_SIGSEGV)) { force_sig(SIGSEGV); } else { int rt = ESR_ELx_SYS64_ISS_RT(esr); pt_regs_write_reg(regs, rt, arch_timer_get_rate()); arm64_skip_faulting_instruction(regs, AARCH64_INSN_SIZE); } } static void mrs_handler(unsigned long esr, struct pt_regs *regs) { u32 sysreg, rt; rt = ESR_ELx_SYS64_ISS_RT(esr); sysreg = esr_sys64_to_sysreg(esr); if (do_emulate_mrs(regs, sysreg, rt) != 0) force_signal_inject(SIGILL, ILL_ILLOPC, regs->pc, 0); } static void wfi_handler(unsigned long esr, struct pt_regs *regs) { arm64_skip_faulting_instruction(regs, AARCH64_INSN_SIZE); } struct sys64_hook { unsigned long esr_mask; unsigned long esr_val; void (*handler)(unsigned long esr, struct pt_regs *regs); }; static const struct sys64_hook sys64_hooks[] = { { .esr_mask = ESR_ELx_SYS64_ISS_EL0_CACHE_OP_MASK, .esr_val = ESR_ELx_SYS64_ISS_EL0_CACHE_OP_VAL, .handler = user_cache_maint_handler, }, { /* Trap read access to CTR_EL0 */ .esr_mask = ESR_ELx_SYS64_ISS_SYS_OP_MASK, .esr_val = ESR_ELx_SYS64_ISS_SYS_CTR_READ, .handler = ctr_read_handler, }, { /* Trap read access to CNTVCT_EL0 */ .esr_mask = ESR_ELx_SYS64_ISS_SYS_OP_MASK, .esr_val = ESR_ELx_SYS64_ISS_SYS_CNTVCT, .handler = cntvct_read_handler, }, { /* Trap read access to CNTVCTSS_EL0 */ .esr_mask = ESR_ELx_SYS64_ISS_SYS_OP_MASK, .esr_val = ESR_ELx_SYS64_ISS_SYS_CNTVCTSS, .handler = cntvct_read_handler, }, { /* Trap read access to CNTFRQ_EL0 */ .esr_mask = ESR_ELx_SYS64_ISS_SYS_OP_MASK, .esr_val = ESR_ELx_SYS64_ISS_SYS_CNTFRQ, .handler = cntfrq_read_handler, }, { /* Trap read access to CPUID registers */ .esr_mask = ESR_ELx_SYS64_ISS_SYS_MRS_OP_MASK, .esr_val = ESR_ELx_SYS64_ISS_SYS_MRS_OP_VAL, .handler = mrs_handler, }, { /* Trap WFI instructions executed in userspace */ .esr_mask = ESR_ELx_WFx_MASK, .esr_val = ESR_ELx_WFx_WFI_VAL, .handler = wfi_handler, }, {}, }; #ifdef CONFIG_COMPAT static bool cp15_cond_valid(unsigned long esr, struct pt_regs *regs) { int cond; /* Only a T32 instruction can trap without CV being set */ if (!(esr & ESR_ELx_CV)) { u32 it; it = compat_get_it_state(regs); if (!it) return true; cond = it >> 4; } else { cond = (esr & ESR_ELx_COND_MASK) >> ESR_ELx_COND_SHIFT; } return aarch32_opcode_cond_checks[cond](regs->pstate); } static void compat_cntfrq_read_handler(unsigned long esr, struct pt_regs *regs) { int reg = (esr & ESR_ELx_CP15_32_ISS_RT_MASK) >> ESR_ELx_CP15_32_ISS_RT_SHIFT; pt_regs_write_reg(regs, reg, arch_timer_get_rate()); arm64_skip_faulting_instruction(regs, 4); } static const struct sys64_hook cp15_32_hooks[] = { { .esr_mask = ESR_ELx_CP15_32_ISS_SYS_MASK, .esr_val = ESR_ELx_CP15_32_ISS_SYS_CNTFRQ, .handler = compat_cntfrq_read_handler, }, {}, }; static void compat_cntvct_read_handler(unsigned long esr, struct pt_regs *regs) { int rt = (esr & ESR_ELx_CP15_64_ISS_RT_MASK) >> ESR_ELx_CP15_64_ISS_RT_SHIFT; int rt2 = (esr & ESR_ELx_CP15_64_ISS_RT2_MASK) >> ESR_ELx_CP15_64_ISS_RT2_SHIFT; u64 val = arch_timer_read_counter(); pt_regs_write_reg(regs, rt, lower_32_bits(val)); pt_regs_write_reg(regs, rt2, upper_32_bits(val)); arm64_skip_faulting_instruction(regs, 4); } static const struct sys64_hook cp15_64_hooks[] = { { .esr_mask = ESR_ELx_CP15_64_ISS_SYS_MASK, .esr_val = ESR_ELx_CP15_64_ISS_SYS_CNTVCT, .handler = compat_cntvct_read_handler, }, { .esr_mask = ESR_ELx_CP15_64_ISS_SYS_MASK, .esr_val = ESR_ELx_CP15_64_ISS_SYS_CNTVCTSS, .handler = compat_cntvct_read_handler, }, {}, }; void do_el0_cp15(unsigned long esr, struct pt_regs *regs) { const struct sys64_hook *hook, *hook_base; if (!cp15_cond_valid(esr, regs)) { /* * There is no T16 variant of a CP access, so we * always advance PC by 4 bytes. */ arm64_skip_faulting_instruction(regs, 4); return; } switch (ESR_ELx_EC(esr)) { case ESR_ELx_EC_CP15_32: hook_base = cp15_32_hooks; break; case ESR_ELx_EC_CP15_64: hook_base = cp15_64_hooks; break; default: do_el0_undef(regs, esr); return; } for (hook = hook_base; hook->handler; hook++) if ((hook->esr_mask & esr) == hook->esr_val) { hook->handler(esr, regs); return; } /* * New cp15 instructions may previously have been undefined at * EL0. Fall back to our usual undefined instruction handler * so that we handle these consistently. */ do_el0_undef(regs, esr); } #endif void do_el0_sys(unsigned long esr, struct pt_regs *regs) { const struct sys64_hook *hook; for (hook = sys64_hooks; hook->handler; hook++) if ((hook->esr_mask & esr) == hook->esr_val) { hook->handler(esr, regs); return; } /* * New SYS instructions may previously have been undefined at EL0. Fall * back to our usual undefined instruction handler so that we handle * these consistently. */ do_el0_undef(regs, esr); } static const char *esr_class_str[] = { [0 ... ESR_ELx_EC_MAX] = "UNRECOGNIZED EC", [ESR_ELx_EC_UNKNOWN] = "Unknown/Uncategorized", [ESR_ELx_EC_WFx] = "WFI/WFE", [ESR_ELx_EC_CP15_32] = "CP15 MCR/MRC", [ESR_ELx_EC_CP15_64] = "CP15 MCRR/MRRC", [ESR_ELx_EC_CP14_MR] = "CP14 MCR/MRC", [ESR_ELx_EC_CP14_LS] = "CP14 LDC/STC", [ESR_ELx_EC_FP_ASIMD] = "ASIMD", [ESR_ELx_EC_CP10_ID] = "CP10 MRC/VMRS", [ESR_ELx_EC_PAC] = "PAC", [ESR_ELx_EC_CP14_64] = "CP14 MCRR/MRRC", [ESR_ELx_EC_BTI] = "BTI", [ESR_ELx_EC_ILL] = "PSTATE.IL", [ESR_ELx_EC_SVC32] = "SVC (AArch32)", [ESR_ELx_EC_HVC32] = "HVC (AArch32)", [ESR_ELx_EC_SMC32] = "SMC (AArch32)", [ESR_ELx_EC_SVC64] = "SVC (AArch64)", [ESR_ELx_EC_HVC64] = "HVC (AArch64)", [ESR_ELx_EC_SMC64] = "SMC (AArch64)", [ESR_ELx_EC_SYS64] = "MSR/MRS (AArch64)", [ESR_ELx_EC_SVE] = "SVE", [ESR_ELx_EC_ERET] = "ERET/ERETAA/ERETAB", [ESR_ELx_EC_FPAC] = "FPAC", [ESR_ELx_EC_SME] = "SME", [ESR_ELx_EC_IMP_DEF] = "EL3 IMP DEF", [ESR_ELx_EC_IABT_LOW] = "IABT (lower EL)", [ESR_ELx_EC_IABT_CUR] = "IABT (current EL)", [ESR_ELx_EC_PC_ALIGN] = "PC Alignment", [ESR_ELx_EC_DABT_LOW] = "DABT (lower EL)", [ESR_ELx_EC_DABT_CUR] = "DABT (current EL)", [ESR_ELx_EC_SP_ALIGN] = "SP Alignment", [ESR_ELx_EC_MOPS] = "MOPS", [ESR_ELx_EC_FP_EXC32] = "FP (AArch32)", [ESR_ELx_EC_FP_EXC64] = "FP (AArch64)", [ESR_ELx_EC_GCS] = "Guarded Control Stack", [ESR_ELx_EC_SERROR] = "SError", [ESR_ELx_EC_BREAKPT_LOW] = "Breakpoint (lower EL)", [ESR_ELx_EC_BREAKPT_CUR] = "Breakpoint (current EL)", [ESR_ELx_EC_SOFTSTP_LOW] = "Software Step (lower EL)", [ESR_ELx_EC_SOFTSTP_CUR] = "Software Step (current EL)", [ESR_ELx_EC_WATCHPT_LOW] = "Watchpoint (lower EL)", [ESR_ELx_EC_WATCHPT_CUR] = "Watchpoint (current EL)", [ESR_ELx_EC_BKPT32] = "BKPT (AArch32)", [ESR_ELx_EC_VECTOR32] = "Vector catch (AArch32)", [ESR_ELx_EC_BRK64] = "BRK (AArch64)", }; const char *esr_get_class_string(unsigned long esr) { return esr_class_str[ESR_ELx_EC(esr)]; } /* * bad_el0_sync handles unexpected, but potentially recoverable synchronous * exceptions taken from EL0. */ void bad_el0_sync(struct pt_regs *regs, int reason, unsigned long esr) { unsigned long pc = instruction_pointer(regs); current->thread.fault_address = 0; current->thread.fault_code = esr; arm64_force_sig_fault(SIGILL, ILL_ILLOPC, pc, "Bad EL0 synchronous exception"); } DEFINE_PER_CPU(unsigned long [OVERFLOW_STACK_SIZE/sizeof(long)], overflow_stack) __aligned(16); void __noreturn panic_bad_stack(struct pt_regs *regs, unsigned long esr, unsigned long far) { unsigned long tsk_stk = (unsigned long)current->stack; unsigned long irq_stk = (unsigned long)this_cpu_read(irq_stack_ptr); unsigned long ovf_stk = (unsigned long)this_cpu_ptr(overflow_stack); console_verbose(); pr_emerg("Insufficient stack space to handle exception!"); pr_emerg("ESR: 0x%016lx -- %s\n", esr, esr_get_class_string(esr)); pr_emerg("FAR: 0x%016lx\n", far); pr_emerg("Task stack: [0x%016lx..0x%016lx]\n", tsk_stk, tsk_stk + THREAD_SIZE); pr_emerg("IRQ stack: [0x%016lx..0x%016lx]\n", irq_stk, irq_stk + IRQ_STACK_SIZE); pr_emerg("Overflow stack: [0x%016lx..0x%016lx]\n", ovf_stk, ovf_stk + OVERFLOW_STACK_SIZE); __show_regs(regs); /* * We use nmi_panic to limit the potential for recusive overflows, and * to get a better stack trace. */ nmi_panic(NULL, "kernel stack overflow"); cpu_park_loop(); } void __noreturn arm64_serror_panic(struct pt_regs *regs, unsigned long esr) { add_taint(TAINT_MACHINE_CHECK, LOCKDEP_STILL_OK); console_verbose(); pr_crit("SError Interrupt on CPU%d, code 0x%016lx -- %s\n", smp_processor_id(), esr, esr_get_class_string(esr)); if (regs) __show_regs(regs); nmi_panic(regs, "Asynchronous SError Interrupt"); cpu_park_loop(); } bool arm64_is_fatal_ras_serror(struct pt_regs *regs, unsigned long esr) { unsigned long aet = arm64_ras_serror_get_severity(esr); switch (aet) { case ESR_ELx_AET_CE: /* corrected error */ case ESR_ELx_AET_UEO: /* restartable, not yet consumed */ /* * The CPU can make progress. We may take UEO again as * a more severe error. */ return false; case ESR_ELx_AET_UEU: /* Uncorrected Unrecoverable */ case ESR_ELx_AET_UER: /* Uncorrected Recoverable */ /* * The CPU can't make progress. The exception may have * been imprecise. * * Neoverse-N1 #1349291 means a non-KVM SError reported as * Unrecoverable should be treated as Uncontainable. We * call arm64_serror_panic() in both cases. */ return true; case ESR_ELx_AET_UC: /* Uncontainable or Uncategorized error */ default: /* Error has been silently propagated */ arm64_serror_panic(regs, esr); } } void do_serror(struct pt_regs *regs, unsigned long esr) { /* non-RAS errors are not containable */ if (!arm64_is_ras_serror(esr) || arm64_is_fatal_ras_serror(regs, esr)) arm64_serror_panic(regs, esr); } /* GENERIC_BUG traps */ #ifdef CONFIG_GENERIC_BUG int is_valid_bugaddr(unsigned long addr) { /* * bug_brk_handler() only called for BRK #BUG_BRK_IMM. * So the answer is trivial -- any spurious instances with no * bug table entry will be rejected by report_bug() and passed * back to the debug-monitors code and handled as a fatal * unexpected debug exception. */ return 1; } #endif int bug_brk_handler(struct pt_regs *regs, unsigned long esr) { switch (report_bug(regs->pc, regs)) { case BUG_TRAP_TYPE_BUG: die("Oops - BUG", regs, esr); break; case BUG_TRAP_TYPE_WARN: break; default: /* unknown/unrecognised bug trap type */ return DBG_HOOK_ERROR; } /* If thread survives, skip over the BUG instruction and continue: */ arm64_skip_faulting_instruction(regs, AARCH64_INSN_SIZE); return DBG_HOOK_HANDLED; } #ifdef CONFIG_CFI_CLANG int cfi_brk_handler(struct pt_regs *regs, unsigned long esr) { unsigned long target; u32 type; target = pt_regs_read_reg(regs, FIELD_GET(CFI_BRK_IMM_TARGET, esr)); type = (u32)pt_regs_read_reg(regs, FIELD_GET(CFI_BRK_IMM_TYPE, esr)); switch (report_cfi_failure(regs, regs->pc, &target, type)) { case BUG_TRAP_TYPE_BUG: die("Oops - CFI", regs, esr); break; case BUG_TRAP_TYPE_WARN: break; default: return DBG_HOOK_ERROR; } arm64_skip_faulting_instruction(regs, AARCH64_INSN_SIZE); return DBG_HOOK_HANDLED; } #endif /* CONFIG_CFI_CLANG */ int reserved_fault_brk_handler(struct pt_regs *regs, unsigned long esr) { pr_err("%s generated an invalid instruction at %pS!\n", "Kernel text patching", (void *)instruction_pointer(regs)); /* We cannot handle this */ return DBG_HOOK_ERROR; } #ifdef CONFIG_KASAN_SW_TAGS #define KASAN_ESR_RECOVER 0x20 #define KASAN_ESR_WRITE 0x10 #define KASAN_ESR_SIZE_MASK 0x0f #define KASAN_ESR_SIZE(esr) (1 << ((esr) & KASAN_ESR_SIZE_MASK)) int kasan_brk_handler(struct pt_regs *regs, unsigned long esr) { bool recover = esr & KASAN_ESR_RECOVER; bool write = esr & KASAN_ESR_WRITE; size_t size = KASAN_ESR_SIZE(esr); void *addr = (void *)regs->regs[0]; u64 pc = regs->pc; kasan_report(addr, size, write, pc); /* * The instrumentation allows to control whether we can proceed after * a crash was detected. This is done by passing the -recover flag to * the compiler. Disabling recovery allows to generate more compact * code. * * Unfortunately disabling recovery doesn't work for the kernel right * now. KASAN reporting is disabled in some contexts (for example when * the allocator accesses slab object metadata; this is controlled by * current->kasan_depth). All these accesses are detected by the tool, * even though the reports for them are not printed. * * This is something that might be fixed at some point in the future. */ if (!recover) die("Oops - KASAN", regs, esr); /* If thread survives, skip over the brk instruction and continue: */ arm64_skip_faulting_instruction(regs, AARCH64_INSN_SIZE); return DBG_HOOK_HANDLED; } #endif #ifdef CONFIG_UBSAN_TRAP int ubsan_brk_handler(struct pt_regs *regs, unsigned long esr) { die(report_ubsan_failure(esr & UBSAN_BRK_MASK), regs, esr); return DBG_HOOK_HANDLED; } #endif
29 69 18 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 1517 1518 1519 1520 1521 1522 1523 1524 1525 1526 1527 1528 1529 1530 1531 1532 1533 1534 1535 1536 1537 1538 1539 1540 1541 1542 1543 1544 1545 1546 1547 1548 1549 1550 1551 1552 1553 1554 1555 1556 1557 1558 1559 1560 1561 1562 1563 1564 1565 1566 1567 1568 1569 1570 1571 1572 1573 1574 1575 1576 1577 1578 1579 1580 1581 1582 1583 1584 1585 1586 1587 1588 1589 1590 1591 1592 1593 1594 1595 1596 1597 1598 1599 1600 1601 1602 1603 1604 1605 1606 1607 1608 1609 1610 1611 1612 1613 1614 1615 1616 1617 1618 1619 1620 1621 1622 1623 1624 1625 1626 1627 1628 1629 1630 1631 1632 1633 1634 1635 1636 1637 1638 1639 1640 1641 1642 1643 1644 1645 1646 1647 1648 1649 1650 1651 1652 1653 1654 1655 1656 1657 1658 1659 1660 1661 1662 1663 1664 1665 1666 1667 1668 1669 1670 1671 1672 1673 1674 1675 1676 1677 1678 1679 1680 1681 1682 1683 1684 1685 1686 1687 1688 1689 1690 1691 1692 1693 1694 1695 1696 1697 1698 1699 1700 1701 1702 1703 1704 1705 1706 1707 1708 1709 1710 1711 1712 1713 1714 1715 1716 1717 1718 1719 1720 1721 1722 1723 1724 1725 1726 1727 1728 1729 1730 1731 1732 1733 1734 1735 1736 1737 1738 1739 1740 1741 1742 1743 1744 1745 1746 1747 1748 1749 1750 1751 1752 1753 1754 1755 1756 1757 1758 1759 1760 1761 1762 1763 1764 1765 1766 1767 1768 1769 1770 1771 1772 1773 1774 1775 1776 1777 1778 1779 1780 1781 1782 1783 1784 1785 1786 1787 1788 1789 1790 1791 1792 1793 1794 1795 1796 1797 1798 1799 1800 1801 1802 1803 1804 1805 1806 1807 1808 1809 1810 1811 1812 1813 1814 1815 1816 1817 1818 1819 1820 1821 1822 1823 1824 1825 1826 1827 1828 1829 1830 1831 1832 1833 1834 1835 1836 1837 1838 1839 1840 1841 1842 1843 1844 1845 1846 1847 1848 1849 1850 1851 1852 1853 1854 1855 1856 1857 1858 1859 1860 1861 1862 1863 1864 1865 1866 1867 1868 1869 1870 1871 1872 1873 1874 1875 1876 1877 1878 1879 1880 1881 1882 1883 1884 1885 1886 1887 1888 1889 1890 1891 1892 1893 1894 1895 1896 1897 1898 1899 1900 1901 1902 1903 1904 1905 1906 1907 1908 1909 1910 1911 1912 1913 1914 1915 1916 1917 1918 1919 1920 1921 1922 1923 1924 1925 1926 1927 1928 1929 1930 1931 1932 1933 1934 1935 1936 1937 1938 1939 1940 1941 1942 1943 1944 1945 1946 1947 1948 1949 1950 1951 1952 1953 1954 1955 1956 1957 1958 1959 1960 1961 1962 1963 1964 1965 1966 1967 1968 1969 1970 1971 1972 1973 1974 1975 1976 1977 1978 1979 1980 1981 1982 1983 1984 1985 1986 1987 1988 1989 1990 1991 1992 1993 1994 1995 1996 1997 1998 1999 2000 2001 2002 2003 2004 2005 2006 2007 2008 2009 2010 2011 2012 2013 2014 2015 2016 2017 2018 2019 2020 2021 2022 2023 2024 2025 2026 2027 2028 2029 2030 2031 2032 2033 2034 2035 2036 2037 2038 2039 2040 2041 2042 2043 2044 2045 2046 2047 2048 2049 2050 2051 2052 2053 2054 2055 2056 2057 2058 2059 2060 2061 2062 2063 2064 2065 2066 2067 2068 2069 2070 2071 2072 2073 2074 2075 2076 2077 2078 2079 2080 2081 2082 2083 2084 2085 2086 2087 2088 2089 2090 2091 2092 2093 2094 2095 2096 2097 2098 2099 2100 2101 2102 2103 2104 2105 2106 2107 2108 2109 2110 2111 2112 2113 2114 2115 2116 2117 2118 2119 2120 2121 2122 2123 2124 2125 2126 2127 2128 2129 2130 2131 2132 2133 2134 2135 2136 2137 2138 2139 2140 2141 2142 2143 2144 2145 2146 2147 2148 2149 2150 2151 2152 2153 2154 2155 2156 2157 2158 2159 2160 2161 2162 2163 2164 2165 2166 2167 2168 2169 2170 2171 2172 2173 2174 2175 2176 2177 2178 2179 2180 2181 2182 2183 2184 2185 2186 2187 2188 2189 2190 2191 2192 2193 2194 2195 2196 2197 2198 2199 2200 2201 2202 2203 2204 2205 2206 2207 2208 2209 2210 2211 2212 2213 2214 2215 2216 2217 2218 2219 2220 2221 2222 2223 2224 2225 2226 2227 2228 2229 2230 2231 2232 2233 2234 2235 2236 2237 2238 2239 2240 2241 2242 2243 2244 2245 2246 2247 2248 2249 2250 2251 2252 2253 2254 2255 2256 2257 2258 2259 2260 2261 2262 2263 2264 2265 2266 2267 2268 2269 2270 2271 2272 2273 2274 2275 2276 2277 2278 2279 2280 2281 2282 2283 2284 2285 2286 2287 2288 2289 2290 2291 2292 2293 2294 2295 2296 2297 2298 2299 2300 2301 2302 2303 2304 2305 2306 2307 2308 2309 2310 2311 2312 2313 2314 2315 /* SPDX-License-Identifier: GPL-2.0 */ #ifndef _LINUX_SCHED_H #define _LINUX_SCHED_H /* * Define 'struct task_struct' and provide the main scheduler * APIs (schedule(), wakeup variants, etc.) */ #include <uapi/linux/sched.h> #include <asm/current.h> #include <asm/processor.h> #include <linux/thread_info.h> #include <linux/preempt.h> #include <linux/cpumask_types.h> #include <linux/cache.h> #include <linux/irqflags_types.h> #include <linux/smp_types.h> #include <linux/pid_types.h> #include <linux/sem_types.h> #include <linux/shm.h> #include <linux/kmsan_types.h> #include <linux/mutex_types.h> #include <linux/plist_types.h> #include <linux/hrtimer_types.h> #include <linux/timer_types.h> #include <linux/seccomp_types.h> #include <linux/nodemask_types.h> #include <linux/refcount_types.h> #include <linux/resource.h> #include <linux/latencytop.h> #include <linux/sched/prio.h> #include <linux/sched/types.h> #include <linux/signal_types.h> #include <linux/spinlock.h> #include <linux/syscall_user_dispatch_types.h> #include <linux/mm_types_task.h> #include <linux/netdevice_xmit.h> #include <linux/task_io_accounting.h> #include <linux/posix-timers_types.h> #include <linux/restart_block.h> #include <uapi/linux/rseq.h> #include <linux/seqlock_types.h> #include <linux/kcsan.h> #include <linux/rv.h> #include <linux/uidgid_types.h> #include <linux/tracepoint-defs.h> #include <linux/unwind_deferred_types.h> #include <asm/kmap_size.h> /* task_struct member predeclarations (sorted alphabetically): */ struct audit_context; struct bio_list; struct blk_plug; struct bpf_local_storage; struct bpf_run_ctx; struct bpf_net_context; struct capture_control; struct cfs_rq; struct fs_struct; struct futex_pi_state; struct io_context; struct io_uring_task; struct mempolicy; struct nameidata; struct nsproxy; struct perf_event_context; struct perf_ctx_data; struct pid_namespace; struct pipe_inode_info; struct rcu_node; struct reclaim_state; struct robust_list_head; struct root_domain; struct rq; struct sched_attr; struct sched_dl_entity; struct seq_file; struct sighand_struct; struct signal_struct; struct task_delay_info; struct task_group; struct task_struct; struct user_event_mm; #include <linux/sched/ext.h> /* * Task state bitmask. NOTE! These bits are also * encoded in fs/proc/array.c: get_task_state(). * * We have two separate sets of flags: task->__state * is about runnability, while task->exit_state are * about the task exiting. Confusing, but this way * modifying one set can't modify the other one by * mistake. */ /* Used in tsk->__state: */ #define TASK_RUNNING 0x00000000 #define TASK_INTERRUPTIBLE 0x00000001 #define TASK_UNINTERRUPTIBLE 0x00000002 #define __TASK_STOPPED 0x00000004 #define __TASK_TRACED 0x00000008 /* Used in tsk->exit_state: */ #define EXIT_DEAD 0x00000010 #define EXIT_ZOMBIE 0x00000020 #define EXIT_TRACE (EXIT_ZOMBIE | EXIT_DEAD) /* Used in tsk->__state again: */ #define TASK_PARKED 0x00000040 #define TASK_DEAD 0x00000080 #define TASK_WAKEKILL 0x00000100 #define TASK_WAKING 0x00000200 #define TASK_NOLOAD 0x00000400 #define TASK_NEW 0x00000800 #define TASK_RTLOCK_WAIT 0x00001000 #define TASK_FREEZABLE 0x00002000 #define __TASK_FREEZABLE_UNSAFE (0x00004000 * IS_ENABLED(CONFIG_LOCKDEP)) #define TASK_FROZEN 0x00008000 #define TASK_STATE_MAX 0x00010000 #define TASK_ANY (TASK_STATE_MAX-1) /* * DO NOT ADD ANY NEW USERS ! */ #define TASK_FREEZABLE_UNSAFE (TASK_FREEZABLE | __TASK_FREEZABLE_UNSAFE) /* Convenience macros for the sake of set_current_state: */ #define TASK_KILLABLE (TASK_WAKEKILL | TASK_UNINTERRUPTIBLE) #define TASK_STOPPED (TASK_WAKEKILL | __TASK_STOPPED) #define TASK_TRACED __TASK_TRACED #define TASK_IDLE (TASK_UNINTERRUPTIBLE | TASK_NOLOAD) /* Convenience macros for the sake of wake_up(): */ #define TASK_NORMAL (TASK_INTERRUPTIBLE | TASK_UNINTERRUPTIBLE) /* get_task_state(): */ #define TASK_REPORT (TASK_RUNNING | TASK_INTERRUPTIBLE | \ TASK_UNINTERRUPTIBLE | __TASK_STOPPED | \ __TASK_TRACED | EXIT_DEAD | EXIT_ZOMBIE | \ TASK_PARKED) #define task_is_running(task) (READ_ONCE((task)->__state) == TASK_RUNNING) #define task_is_traced(task) ((READ_ONCE(task->jobctl) & JOBCTL_TRACED) != 0) #define task_is_stopped(task) ((READ_ONCE(task->jobctl) & JOBCTL_STOPPED) != 0) #define task_is_stopped_or_traced(task) ((READ_ONCE(task->jobctl) & (JOBCTL_STOPPED | JOBCTL_TRACED)) != 0) /* * Special states are those that do not use the normal wait-loop pattern. See * the comment with set_special_state(). */ #define is_special_task_state(state) \ ((state) & (__TASK_STOPPED | __TASK_TRACED | TASK_PARKED | \ TASK_DEAD | TASK_FROZEN)) #ifdef CONFIG_DEBUG_ATOMIC_SLEEP # define debug_normal_state_change(state_value) \ do { \ WARN_ON_ONCE(is_special_task_state(state_value)); \ current->task_state_change = _THIS_IP_; \ } while (0) # define debug_special_state_change(state_value) \ do { \ WARN_ON_ONCE(!is_special_task_state(state_value)); \ current->task_state_change = _THIS_IP_; \ } while (0) # define debug_rtlock_wait_set_state() \ do { \ current->saved_state_change = current->task_state_change;\ current->task_state_change = _THIS_IP_; \ } while (0) # define debug_rtlock_wait_restore_state() \ do { \ current->task_state_change = current->saved_state_change;\ } while (0) #else # define debug_normal_state_change(cond) do { } while (0) # define debug_special_state_change(cond) do { } while (0) # define debug_rtlock_wait_set_state() do { } while (0) # define debug_rtlock_wait_restore_state() do { } while (0) #endif #define trace_set_current_state(state_value) \ do { \ if (tracepoint_enabled(sched_set_state_tp)) \ __trace_set_current_state(state_value); \ } while (0) /* * set_current_state() includes a barrier so that the write of current->__state * is correctly serialised wrt the caller's subsequent test of whether to * actually sleep: * * for (;;) { * set_current_state(TASK_UNINTERRUPTIBLE); * if (CONDITION) * break; * * schedule(); * } * __set_current_state(TASK_RUNNING); * * If the caller does not need such serialisation (because, for instance, the * CONDITION test and condition change and wakeup are under the same lock) then * use __set_current_state(). * * The above is typically ordered against the wakeup, which does: * * CONDITION = 1; * wake_up_state(p, TASK_UNINTERRUPTIBLE); * * where wake_up_state()/try_to_wake_up() executes a full memory barrier before * accessing p->__state. * * Wakeup will do: if (@state & p->__state) p->__state = TASK_RUNNING, that is, * once it observes the TASK_UNINTERRUPTIBLE store the waking CPU can issue a * TASK_RUNNING store which can collide with __set_current_state(TASK_RUNNING). * * However, with slightly different timing the wakeup TASK_RUNNING store can * also collide with the TASK_UNINTERRUPTIBLE store. Losing that store is not * a problem either because that will result in one extra go around the loop * and our @cond test will save the day. * * Also see the comments of try_to_wake_up(). */ #define __set_current_state(state_value) \ do { \ debug_normal_state_change((state_value)); \ trace_set_current_state(state_value); \ WRITE_ONCE(current->__state, (state_value)); \ } while (0) #define set_current_state(state_value) \ do { \ debug_normal_state_change((state_value)); \ trace_set_current_state(state_value); \ smp_store_mb(current->__state, (state_value)); \ } while (0) /* * set_special_state() should be used for those states when the blocking task * can not use the regular condition based wait-loop. In that case we must * serialize against wakeups such that any possible in-flight TASK_RUNNING * stores will not collide with our state change. */ #define set_special_state(state_value) \ do { \ unsigned long flags; /* may shadow */ \ \ raw_spin_lock_irqsave(&current->pi_lock, flags); \ debug_special_state_change((state_value)); \ trace_set_current_state(state_value); \ WRITE_ONCE(current->__state, (state_value)); \ raw_spin_unlock_irqrestore(&current->pi_lock, flags); \ } while (0) /* * PREEMPT_RT specific variants for "sleeping" spin/rwlocks * * RT's spin/rwlock substitutions are state preserving. The state of the * task when blocking on the lock is saved in task_struct::saved_state and * restored after the lock has been acquired. These operations are * serialized by task_struct::pi_lock against try_to_wake_up(). Any non RT * lock related wakeups while the task is blocked on the lock are * redirected to operate on task_struct::saved_state to ensure that these * are not dropped. On restore task_struct::saved_state is set to * TASK_RUNNING so any wakeup attempt redirected to saved_state will fail. * * The lock operation looks like this: * * current_save_and_set_rtlock_wait_state(); * for (;;) { * if (try_lock()) * break; * raw_spin_unlock_irq(&lock->wait_lock); * schedule_rtlock(); * raw_spin_lock_irq(&lock->wait_lock); * set_current_state(TASK_RTLOCK_WAIT); * } * current_restore_rtlock_saved_state(); */ #define current_save_and_set_rtlock_wait_state() \ do { \ lockdep_assert_irqs_disabled(); \ raw_spin_lock(&current->pi_lock); \ current->saved_state = current->__state; \ debug_rtlock_wait_set_state(); \ trace_set_current_state(TASK_RTLOCK_WAIT); \ WRITE_ONCE(current->__state, TASK_RTLOCK_WAIT); \ raw_spin_unlock(&current->pi_lock); \ } while (0); #define current_restore_rtlock_saved_state() \ do { \ lockdep_assert_irqs_disabled(); \ raw_spin_lock(&current->pi_lock); \ debug_rtlock_wait_restore_state(); \ trace_set_current_state(current->saved_state); \ WRITE_ONCE(current->__state, current->saved_state); \ current->saved_state = TASK_RUNNING; \ raw_spin_unlock(&current->pi_lock); \ } while (0); #define get_current_state() READ_ONCE(current->__state) /* * Define the task command name length as enum, then it can be visible to * BPF programs. */ enum { TASK_COMM_LEN = 16, }; extern void sched_tick(void); #define MAX_SCHEDULE_TIMEOUT LONG_MAX extern long schedule_timeout(long timeout); extern long schedule_timeout_interruptible(long timeout); extern long schedule_timeout_killable(long timeout); extern long schedule_timeout_uninterruptible(long timeout); extern long schedule_timeout_idle(long timeout); asmlinkage void schedule(void); extern void schedule_preempt_disabled(void); asmlinkage void preempt_schedule_irq(void); #ifdef CONFIG_PREEMPT_RT extern void schedule_rtlock(void); #endif extern int __must_check io_schedule_prepare(void); extern void io_schedule_finish(int token); extern long io_schedule_timeout(long timeout); extern void io_schedule(void); /* wrapper functions to trace from this header file */ DECLARE_TRACEPOINT(sched_set_state_tp); extern void __trace_set_current_state(int state_value); DECLARE_TRACEPOINT(sched_set_need_resched_tp); extern void __trace_set_need_resched(struct task_struct *curr, int tif); /** * struct prev_cputime - snapshot of system and user cputime * @utime: time spent in user mode * @stime: time spent in system mode * @lock: protects the above two fields * * Stores previous user/system time values such that we can guarantee * monotonicity. */ struct prev_cputime { #ifndef CONFIG_VIRT_CPU_ACCOUNTING_NATIVE u64 utime; u64 stime; raw_spinlock_t lock; #endif }; enum vtime_state { /* Task is sleeping or running in a CPU with VTIME inactive: */ VTIME_INACTIVE = 0, /* Task is idle */ VTIME_IDLE, /* Task runs in kernelspace in a CPU with VTIME active: */ VTIME_SYS, /* Task runs in userspace in a CPU with VTIME active: */ VTIME_USER, /* Task runs as guests in a CPU with VTIME active: */ VTIME_GUEST, }; struct vtime { seqcount_t seqcount; unsigned long long starttime; enum vtime_state state; unsigned int cpu; u64 utime; u64 stime; u64 gtime; }; /* * Utilization clamp constraints. * @UCLAMP_MIN: Minimum utilization * @UCLAMP_MAX: Maximum utilization * @UCLAMP_CNT: Utilization clamp constraints count */ enum uclamp_id { UCLAMP_MIN = 0, UCLAMP_MAX, UCLAMP_CNT }; extern struct root_domain def_root_domain; extern struct mutex sched_domains_mutex; extern void sched_domains_mutex_lock(void); extern void sched_domains_mutex_unlock(void); struct sched_param { int sched_priority; }; struct sched_info { #ifdef CONFIG_SCHED_INFO /* Cumulative counters: */ /* # of times we have run on this CPU: */ unsigned long pcount; /* Time spent waiting on a runqueue: */ unsigned long long run_delay; /* Max time spent waiting on a runqueue: */ unsigned long long max_run_delay; /* Min time spent waiting on a runqueue: */ unsigned long long min_run_delay; /* Timestamps: */ /* When did we last run on a CPU? */ unsigned long long last_arrival; /* When were we last queued to run? */ unsigned long long last_queued; #endif /* CONFIG_SCHED_INFO */ }; /* * Integer metrics need fixed point arithmetic, e.g., sched/fair * has a few: load, load_avg, util_avg, freq, and capacity. * * We define a basic fixed point arithmetic range, and then formalize * all these metrics based on that basic range. */ # define SCHED_FIXEDPOINT_SHIFT 10 # define SCHED_FIXEDPOINT_SCALE (1L << SCHED_FIXEDPOINT_SHIFT) /* Increase resolution of cpu_capacity calculations */ # define SCHED_CAPACITY_SHIFT SCHED_FIXEDPOINT_SHIFT # define SCHED_CAPACITY_SCALE (1L << SCHED_CAPACITY_SHIFT) struct load_weight { unsigned long weight; u32 inv_weight; }; /* * The load/runnable/util_avg accumulates an infinite geometric series * (see __update_load_avg_cfs_rq() in kernel/sched/pelt.c). * * [load_avg definition] * * load_avg = runnable% * scale_load_down(load) * * [runnable_avg definition] * * runnable_avg = runnable% * SCHED_CAPACITY_SCALE * * [util_avg definition] * * util_avg = running% * SCHED_CAPACITY_SCALE * * where runnable% is the time ratio that a sched_entity is runnable and * running% the time ratio that a sched_entity is running. * * For cfs_rq, they are the aggregated values of all runnable and blocked * sched_entities. * * The load/runnable/util_avg doesn't directly factor frequency scaling and CPU * capacity scaling. The scaling is done through the rq_clock_pelt that is used * for computing those signals (see update_rq_clock_pelt()) * * N.B., the above ratios (runnable% and running%) themselves are in the * range of [0, 1]. To do fixed point arithmetics, we therefore scale them * to as large a range as necessary. This is for example reflected by * util_avg's SCHED_CAPACITY_SCALE. * * [Overflow issue] * * The 64-bit load_sum can have 4353082796 (=2^64/47742/88761) entities * with the highest load (=88761), always runnable on a single cfs_rq, * and should not overflow as the number already hits PID_MAX_LIMIT. * * For all other cases (including 32-bit kernels), struct load_weight's * weight will overflow first before we do, because: * * Max(load_avg) <= Max(load.weight) * * Then it is the load_weight's responsibility to consider overflow * issues. */ struct sched_avg { u64 last_update_time; u64 load_sum; u64 runnable_sum; u32 util_sum; u32 period_contrib; unsigned long load_avg; unsigned long runnable_avg; unsigned long util_avg; unsigned int util_est; } ____cacheline_aligned; /* * The UTIL_AVG_UNCHANGED flag is used to synchronize util_est with util_avg * updates. When a task is dequeued, its util_est should not be updated if its * util_avg has not been updated in the meantime. * This information is mapped into the MSB bit of util_est at dequeue time. * Since max value of util_est for a task is 1024 (PELT util_avg for a task) * it is safe to use MSB. */ #define UTIL_EST_WEIGHT_SHIFT 2 #define UTIL_AVG_UNCHANGED 0x80000000 struct sched_statistics { #ifdef CONFIG_SCHEDSTATS u64 wait_start; u64 wait_max; u64 wait_count; u64 wait_sum; u64 iowait_count; u64 iowait_sum; u64 sleep_start; u64 sleep_max; s64 sum_sleep_runtime; u64 block_start; u64 block_max; s64 sum_block_runtime; s64 exec_max; u64 slice_max; u64 nr_migrations_cold; u64 nr_failed_migrations_affine; u64 nr_failed_migrations_running; u64 nr_failed_migrations_hot; u64 nr_forced_migrations; u64 nr_wakeups; u64 nr_wakeups_sync; u64 nr_wakeups_migrate; u64 nr_wakeups_local; u64 nr_wakeups_remote; u64 nr_wakeups_affine; u64 nr_wakeups_affine_attempts; u64 nr_wakeups_passive; u64 nr_wakeups_idle; #ifdef CONFIG_SCHED_CORE u64 core_forceidle_sum; #endif #endif /* CONFIG_SCHEDSTATS */ } ____cacheline_aligned; struct sched_entity { /* For load-balancing: */ struct load_weight load; struct rb_node run_node; u64 deadline; u64 min_vruntime; u64 min_slice; struct list_head group_node; unsigned char on_rq; unsigned char sched_delayed; unsigned char rel_deadline; unsigned char custom_slice; /* hole */ u64 exec_start; u64 sum_exec_runtime; u64 prev_sum_exec_runtime; u64 vruntime; union { /* * When !@on_rq this field is vlag. * When cfs_rq->curr == se (which implies @on_rq) * this field is vprot. See protect_slice(). */ s64 vlag; u64 vprot; }; u64 slice; u64 nr_migrations; #ifdef CONFIG_FAIR_GROUP_SCHED int depth; struct sched_entity *parent; /* rq on which this entity is (to be) queued: */ struct cfs_rq *cfs_rq; /* rq "owned" by this entity/group: */ struct cfs_rq *my_q; /* cached value of my_q->h_nr_running */ unsigned long runnable_weight; #endif /* * Per entity load average tracking. * * Put into separate cache line so it does not * collide with read-mostly values above. */ struct sched_avg avg; }; struct sched_rt_entity { struct list_head run_list; unsigned long timeout; unsigned long watchdog_stamp; unsigned int time_slice; unsigned short on_rq; unsigned short on_list; struct sched_rt_entity *back; #ifdef CONFIG_RT_GROUP_SCHED struct sched_rt_entity *parent; /* rq on which this entity is (to be) queued: */ struct rt_rq *rt_rq; /* rq "owned" by this entity/group: */ struct rt_rq *my_q; #endif } __randomize_layout; typedef bool (*dl_server_has_tasks_f)(struct sched_dl_entity *); typedef struct task_struct *(*dl_server_pick_f)(struct sched_dl_entity *); struct sched_dl_entity { struct rb_node rb_node; /* * Original scheduling parameters. Copied here from sched_attr * during sched_setattr(), they will remain the same until * the next sched_setattr(). */ u64 dl_runtime; /* Maximum runtime for each instance */ u64 dl_deadline; /* Relative deadline of each instance */ u64 dl_period; /* Separation of two instances (period) */ u64 dl_bw; /* dl_runtime / dl_period */ u64 dl_density; /* dl_runtime / dl_deadline */ /* * Actual scheduling parameters. Initialized with the values above, * they are continuously updated during task execution. Note that * the remaining runtime could be < 0 in case we are in overrun. */ s64 runtime; /* Remaining runtime for this instance */ u64 deadline; /* Absolute deadline for this instance */ unsigned int flags; /* Specifying the scheduler behaviour */ /* * Some bool flags: * * @dl_throttled tells if we exhausted the runtime. If so, the * task has to wait for a replenishment to be performed at the * next firing of dl_timer. * * @dl_yielded tells if task gave up the CPU before consuming * all its available runtime during the last job. * * @dl_non_contending tells if the task is inactive while still * contributing to the active utilization. In other words, it * indicates if the inactive timer has been armed and its handler * has not been executed yet. This flag is useful to avoid race * conditions between the inactive timer handler and the wakeup * code. * * @dl_overrun tells if the task asked to be informed about runtime * overruns. * * @dl_server tells if this is a server entity. * * @dl_defer tells if this is a deferred or regular server. For * now only defer server exists. * * @dl_defer_armed tells if the deferrable server is waiting * for the replenishment timer to activate it. * * @dl_server_active tells if the dlserver is active(started). * dlserver is started on first cfs enqueue on an idle runqueue * and is stopped when a dequeue results in 0 cfs tasks on the * runqueue. In other words, dlserver is active only when cpu's * runqueue has atleast one cfs task. * * @dl_defer_running tells if the deferrable server is actually * running, skipping the defer phase. */ unsigned int dl_throttled : 1; unsigned int dl_yielded : 1; unsigned int dl_non_contending : 1; unsigned int dl_overrun : 1; unsigned int dl_server : 1; unsigned int dl_server_active : 1; unsigned int dl_defer : 1; unsigned int dl_defer_armed : 1; unsigned int dl_defer_running : 1; unsigned int dl_server_idle : 1; /* * Bandwidth enforcement timer. Each -deadline task has its * own bandwidth to be enforced, thus we need one timer per task. */ struct hrtimer dl_timer; /* * Inactive timer, responsible for decreasing the active utilization * at the "0-lag time". When a -deadline task blocks, it contributes * to GRUB's active utilization until the "0-lag time", hence a * timer is needed to decrease the active utilization at the correct * time. */ struct hrtimer inactive_timer; /* * Bits for DL-server functionality. Also see the comment near * dl_server_update(). * * @rq the runqueue this server is for * * @server_has_tasks() returns true if @server_pick return a * runnable task. */ struct rq *rq; dl_server_has_tasks_f server_has_tasks; dl_server_pick_f server_pick_task; #ifdef CONFIG_RT_MUTEXES /* * Priority Inheritance. When a DEADLINE scheduling entity is boosted * pi_se points to the donor, otherwise points to the dl_se it belongs * to (the original one/itself). */ struct sched_dl_entity *pi_se; #endif }; #ifdef CONFIG_UCLAMP_TASK /* Number of utilization clamp buckets (shorter alias) */ #define UCLAMP_BUCKETS CONFIG_UCLAMP_BUCKETS_COUNT /* * Utilization clamp for a scheduling entity * @value: clamp value "assigned" to a se * @bucket_id: bucket index corresponding to the "assigned" value * @active: the se is currently refcounted in a rq's bucket * @user_defined: the requested clamp value comes from user-space * * The bucket_id is the index of the clamp bucket matching the clamp value * which is pre-computed and stored to avoid expensive integer divisions from * the fast path. * * The active bit is set whenever a task has got an "effective" value assigned, * which can be different from the clamp value "requested" from user-space. * This allows to know a task is refcounted in the rq's bucket corresponding * to the "effective" bucket_id. * * The user_defined bit is set whenever a task has got a task-specific clamp * value requested from userspace, i.e. the system defaults apply to this task * just as a restriction. This allows to relax default clamps when a less * restrictive task-specific value has been requested, thus allowing to * implement a "nice" semantic. For example, a task running with a 20% * default boost can still drop its own boosting to 0%. */ struct uclamp_se { unsigned int value : bits_per(SCHED_CAPACITY_SCALE); unsigned int bucket_id : bits_per(UCLAMP_BUCKETS); unsigned int active : 1; unsigned int user_defined : 1; }; #endif /* CONFIG_UCLAMP_TASK */ union rcu_special { struct { u8 blocked; u8 need_qs; u8 exp_hint; /* Hint for performance. */ u8 need_mb; /* Readers need smp_mb(). */ } b; /* Bits. */ u32 s; /* Set of bits. */ }; enum perf_event_task_context { perf_invalid_context = -1, perf_hw_context = 0, perf_sw_context, perf_nr_task_contexts, }; /* * Number of contexts where an event can trigger: * task, softirq, hardirq, nmi. */ #define PERF_NR_CONTEXTS 4 struct wake_q_node { struct wake_q_node *next; }; struct kmap_ctrl { #ifdef CONFIG_KMAP_LOCAL int idx; pte_t pteval[KM_MAX_IDX]; #endif }; struct task_struct { #ifdef CONFIG_THREAD_INFO_IN_TASK /* * For reasons of header soup (see current_thread_info()), this * must be the first element of task_struct. */ struct thread_info thread_info; #endif unsigned int __state; /* saved state for "spinlock sleepers" */ unsigned int saved_state; /* * This begins the randomizable portion of task_struct. Only * scheduling-critical items should be added above here. */ randomized_struct_fields_start void *stack; refcount_t usage; /* Per task flags (PF_*), defined further below: */ unsigned int flags; unsigned int ptrace; #ifdef CONFIG_MEM_ALLOC_PROFILING struct alloc_tag *alloc_tag; #endif int on_cpu; struct __call_single_node wake_entry; unsigned int wakee_flips; unsigned long wakee_flip_decay_ts; struct task_struct *last_wakee; /* * recent_used_cpu is initially set as the last CPU used by a task * that wakes affine another task. Waker/wakee relationships can * push tasks around a CPU where each wakeup moves to the next one. * Tracking a recently used CPU allows a quick search for a recently * used CPU that may be idle. */ int recent_used_cpu; int wake_cpu; int on_rq; int prio; int static_prio; int normal_prio; unsigned int rt_priority; struct sched_entity se; struct sched_rt_entity rt; struct sched_dl_entity dl; struct sched_dl_entity *dl_server; #ifdef CONFIG_SCHED_CLASS_EXT struct sched_ext_entity scx; #endif const struct sched_class *sched_class; #ifdef CONFIG_SCHED_CORE struct rb_node core_node; unsigned long core_cookie; unsigned int core_occupation; #endif #ifdef CONFIG_CGROUP_SCHED struct task_group *sched_task_group; #endif #ifdef CONFIG_UCLAMP_TASK /* * Clamp values requested for a scheduling entity. * Must be updated with task_rq_lock() held. */ struct uclamp_se uclamp_req[UCLAMP_CNT]; /* * Effective clamp values used for a scheduling entity. * Must be updated with task_rq_lock() held. */ struct uclamp_se uclamp[UCLAMP_CNT]; #endif struct sched_statistics stats; #ifdef CONFIG_PREEMPT_NOTIFIERS /* List of struct preempt_notifier: */ struct hlist_head preempt_notifiers; #endif #ifdef CONFIG_BLK_DEV_IO_TRACE unsigned int btrace_seq; #endif unsigned int policy; unsigned long max_allowed_capacity; int nr_cpus_allowed; const cpumask_t *cpus_ptr; cpumask_t *user_cpus_ptr; cpumask_t cpus_mask; void *migration_pending; unsigned short migration_disabled; unsigned short migration_flags; #ifdef CONFIG_PREEMPT_RCU int rcu_read_lock_nesting; union rcu_special rcu_read_unlock_special; struct list_head rcu_node_entry; struct rcu_node *rcu_blocked_node; #endif /* #ifdef CONFIG_PREEMPT_RCU */ #ifdef CONFIG_TASKS_RCU unsigned long rcu_tasks_nvcsw; u8 rcu_tasks_holdout; u8 rcu_tasks_idx; int rcu_tasks_idle_cpu; struct list_head rcu_tasks_holdout_list; int rcu_tasks_exit_cpu; struct list_head rcu_tasks_exit_list; #endif /* #ifdef CONFIG_TASKS_RCU */ #ifdef CONFIG_TASKS_TRACE_RCU int trc_reader_nesting; int trc_ipi_to_cpu; union rcu_special trc_reader_special; struct list_head trc_holdout_list; struct list_head trc_blkd_node; int trc_blkd_cpu; #endif /* #ifdef CONFIG_TASKS_TRACE_RCU */ struct sched_info sched_info; struct list_head tasks; struct plist_node pushable_tasks; struct rb_node pushable_dl_tasks; struct mm_struct *mm; struct mm_struct *active_mm; struct address_space *faults_disabled_mapping; int exit_state; int exit_code; int exit_signal; /* The signal sent when the parent dies: */ int pdeath_signal; /* JOBCTL_*, siglock protected: */ unsigned long jobctl; /* Used for emulating ABI behavior of previous Linux versions: */ unsigned int personality; /* Scheduler bits, serialized by scheduler locks: */ unsigned sched_reset_on_fork:1; unsigned sched_contributes_to_load:1; unsigned sched_migrated:1; unsigned sched_task_hot:1; /* Force alignment to the next boundary: */ unsigned :0; /* Unserialized, strictly 'current' */ /* * This field must not be in the scheduler word above due to wakelist * queueing no longer being serialized by p->on_cpu. However: * * p->XXX = X; ttwu() * schedule() if (p->on_rq && ..) // false * smp_mb__after_spinlock(); if (smp_load_acquire(&p->on_cpu) && //true * deactivate_task() ttwu_queue_wakelist()) * p->on_rq = 0; p->sched_remote_wakeup = Y; * * guarantees all stores of 'current' are visible before * ->sched_remote_wakeup gets used, so it can be in this word. */ unsigned sched_remote_wakeup:1; #ifdef CONFIG_RT_MUTEXES unsigned sched_rt_mutex:1; #endif /* Bit to tell TOMOYO we're in execve(): */ unsigned in_execve:1; unsigned in_iowait:1; #ifndef TIF_RESTORE_SIGMASK unsigned restore_sigmask:1; #endif #ifdef CONFIG_MEMCG_V1 unsigned in_user_fault:1; #endif #ifdef CONFIG_LRU_GEN /* whether the LRU algorithm may apply to this access */ unsigned in_lru_fault:1; #endif #ifdef CONFIG_COMPAT_BRK unsigned brk_randomized:1; #endif #ifdef CONFIG_CGROUPS /* disallow userland-initiated cgroup migration */ unsigned no_cgroup_migration:1; /* task is frozen/stopped (used by the cgroup freezer) */ unsigned frozen:1; #endif #ifdef CONFIG_BLK_CGROUP unsigned use_memdelay:1; #endif #ifdef CONFIG_PSI /* Stalled due to lack of memory */ unsigned in_memstall:1; #endif #ifdef CONFIG_PAGE_OWNER /* Used by page_owner=on to detect recursion in page tracking. */ unsigned in_page_owner:1; #endif #ifdef CONFIG_EVENTFD /* Recursion prevention for eventfd_signal() */ unsigned in_eventfd:1; #endif #ifdef CONFIG_ARCH_HAS_CPU_PASID unsigned pasid_activated:1; #endif #ifdef CONFIG_X86_BUS_LOCK_DETECT unsigned reported_split_lock:1; #endif #ifdef CONFIG_TASK_DELAY_ACCT /* delay due to memory thrashing */ unsigned in_thrashing:1; #endif unsigned in_nf_duplicate:1; #ifdef CONFIG_PREEMPT_RT struct netdev_xmit net_xmit; #endif unsigned long atomic_flags; /* Flags requiring atomic access. */ struct restart_block restart_block; pid_t pid; pid_t tgid; #ifdef CONFIG_STACKPROTECTOR /* Canary value for the -fstack-protector GCC feature: */ unsigned long stack_canary; #endif /* * Pointers to the (original) parent process, youngest child, younger sibling, * older sibling, respectively. (p->father can be replaced with * p->real_parent->pid) */ /* Real parent process: */ struct task_struct __rcu *real_parent; /* Recipient of SIGCHLD, wait4() reports: */ struct task_struct __rcu *parent; /* * Children/sibling form the list of natural children: */ struct list_head children; struct list_head sibling; struct task_struct *group_leader; /* * 'ptraced' is the list of tasks this task is using ptrace() on. * * This includes both natural children and PTRACE_ATTACH targets. * 'ptrace_entry' is this task's link on the p->parent->ptraced list. */ struct list_head ptraced; struct list_head ptrace_entry; /* PID/PID hash table linkage. */ struct pid *thread_pid; struct hlist_node pid_links[PIDTYPE_MAX]; struct list_head thread_node; struct completion *vfork_done; /* CLONE_CHILD_SETTID: */ int __user *set_child_tid; /* CLONE_CHILD_CLEARTID: */ int __user *clear_child_tid; /* PF_KTHREAD | PF_IO_WORKER */ void *worker_private; u64 utime; u64 stime; #ifdef CONFIG_ARCH_HAS_SCALED_CPUTIME u64 utimescaled; u64 stimescaled; #endif u64 gtime; struct prev_cputime prev_cputime; #ifdef CONFIG_VIRT_CPU_ACCOUNTING_GEN struct vtime vtime; #endif #ifdef CONFIG_NO_HZ_FULL atomic_t tick_dep_mask; #endif /* Context switch counts: */ unsigned long nvcsw; unsigned long nivcsw; /* Monotonic time in nsecs: */ u64 start_time; /* Boot based time in nsecs: */ u64 start_boottime; /* MM fault and swap info: this can arguably be seen as either mm-specific or thread-specific: */ unsigned long min_flt; unsigned long maj_flt; /* Empty if CONFIG_POSIX_CPUTIMERS=n */ struct posix_cputimers posix_cputimers; #ifdef CONFIG_POSIX_CPU_TIMERS_TASK_WORK struct posix_cputimers_work posix_cputimers_work; #endif /* Process credentials: */ /* Tracer's credentials at attach: */ const struct cred __rcu *ptracer_cred; /* Objective and real subjective task credentials (COW): */ const struct cred __rcu *real_cred; /* Effective (overridable) subjective task credentials (COW): */ const struct cred __rcu *cred; #ifdef CONFIG_KEYS /* Cached requested key. */ struct key *cached_requested_key; #endif /* * executable name, excluding path. * * - normally initialized begin_new_exec() * - set it with set_task_comm() * - strscpy_pad() to ensure it is always NUL-terminated and * zero-padded * - task_lock() to ensure the operation is atomic and the name is * fully updated. */ char comm[TASK_COMM_LEN]; struct nameidata *nameidata; #ifdef CONFIG_SYSVIPC struct sysv_sem sysvsem; struct sysv_shm sysvshm; #endif #ifdef CONFIG_DETECT_HUNG_TASK unsigned long last_switch_count; unsigned long last_switch_time; #endif /* Filesystem information: */ struct fs_struct *fs; /* Open file information: */ struct files_struct *files; #ifdef CONFIG_IO_URING struct io_uring_task *io_uring; #endif /* Namespaces: */ struct nsproxy *nsproxy; /* Signal handlers: */ struct signal_struct *signal; struct sighand_struct __rcu *sighand; sigset_t blocked; sigset_t real_blocked; /* Restored if set_restore_sigmask() was used: */ sigset_t saved_sigmask; struct sigpending pending; unsigned long sas_ss_sp; size_t sas_ss_size; unsigned int sas_ss_flags; struct callback_head *task_works; #ifdef CONFIG_AUDIT #ifdef CONFIG_AUDITSYSCALL struct audit_context *audit_context; #endif kuid_t loginuid; unsigned int sessionid; #endif struct seccomp seccomp; struct syscall_user_dispatch syscall_dispatch; /* Thread group tracking: */ u64 parent_exec_id; u64 self_exec_id; /* Protection against (de-)allocation: mm, files, fs, tty, keyrings, mems_allowed, mempolicy: */ spinlock_t alloc_lock; /* Protection of the PI data structures: */ raw_spinlock_t pi_lock; struct wake_q_node wake_q; #ifdef CONFIG_RT_MUTEXES /* PI waiters blocked on a rt_mutex held by this task: */ struct rb_root_cached pi_waiters; /* Updated under owner's pi_lock and rq lock */ struct task_struct *pi_top_task; /* Deadlock detection and priority inheritance handling: */ struct rt_mutex_waiter *pi_blocked_on; #endif struct mutex *blocked_on; /* lock we're blocked on */ #ifdef CONFIG_DETECT_HUNG_TASK_BLOCKER /* * Encoded lock address causing task block (lower 2 bits = type from * <linux/hung_task.h>). Accessed via hung_task_*() helpers. */ unsigned long blocker; #endif #ifdef CONFIG_DEBUG_ATOMIC_SLEEP int non_block_count; #endif #ifdef CONFIG_TRACE_IRQFLAGS struct irqtrace_events irqtrace; unsigned int hardirq_threaded; u64 hardirq_chain_key; int softirqs_enabled; int softirq_context; int irq_config; #endif #ifdef CONFIG_PREEMPT_RT int softirq_disable_cnt; #endif #ifdef CONFIG_LOCKDEP # define MAX_LOCK_DEPTH 48UL u64 curr_chain_key; int lockdep_depth; unsigned int lockdep_recursion; struct held_lock held_locks[MAX_LOCK_DEPTH]; #endif #if defined(CONFIG_UBSAN) && !defined(CONFIG_UBSAN_TRAP) unsigned int in_ubsan; #endif /* Journalling filesystem info: */ void *journal_info; /* Stacked block device info: */ struct bio_list *bio_list; /* Stack plugging: */ struct blk_plug *plug; /* VM state: */ struct reclaim_state *reclaim_state; struct io_context *io_context; #ifdef CONFIG_COMPACTION struct capture_control *capture_control; #endif /* Ptrace state: */ unsigned long ptrace_message; kernel_siginfo_t *last_siginfo; struct task_io_accounting ioac; #ifdef CONFIG_PSI /* Pressure stall state */ unsigned int psi_flags; #endif #ifdef CONFIG_TASK_XACCT /* Accumulated RSS usage: */ u64 acct_rss_mem1; /* Accumulated virtual memory usage: */ u64 acct_vm_mem1; /* stime + utime since last update: */ u64 acct_timexpd; #endif #ifdef CONFIG_CPUSETS /* Protected by ->alloc_lock: */ nodemask_t mems_allowed; /* Sequence number to catch updates: */ seqcount_spinlock_t mems_allowed_seq; int cpuset_mem_spread_rotor; #endif #ifdef CONFIG_CGROUPS /* Control Group info protected by css_set_lock: */ struct css_set __rcu *cgroups; /* cg_list protected by css_set_lock and tsk->alloc_lock: */ struct list_head cg_list; #endif #ifdef CONFIG_X86_CPU_RESCTRL u32 closid; u32 rmid; #endif #ifdef CONFIG_FUTEX struct robust_list_head __user *robust_list; #ifdef CONFIG_COMPAT struct compat_robust_list_head __user *compat_robust_list; #endif struct list_head pi_state_list; struct futex_pi_state *pi_state_cache; struct mutex futex_exit_mutex; unsigned int futex_state; #endif #ifdef CONFIG_PERF_EVENTS u8 perf_recursion[PERF_NR_CONTEXTS]; struct perf_event_context *perf_event_ctxp; struct mutex perf_event_mutex; struct list_head perf_event_list; struct perf_ctx_data __rcu *perf_ctx_data; #endif #ifdef CONFIG_DEBUG_PREEMPT unsigned long preempt_disable_ip; #endif #ifdef CONFIG_NUMA /* Protected by alloc_lock: */ struct mempolicy *mempolicy; short il_prev; u8 il_weight; short pref_node_fork; #endif #ifdef CONFIG_NUMA_BALANCING int numa_scan_seq; unsigned int numa_scan_period; unsigned int numa_scan_period_max; int numa_preferred_nid; unsigned long numa_migrate_retry; /* Migration stamp: */ u64 node_stamp; u64 last_task_numa_placement; u64 last_sum_exec_runtime; struct callback_head numa_work; /* * This pointer is only modified for current in syscall and * pagefault context (and for tasks being destroyed), so it can be read * from any of the following contexts: * - RCU read-side critical section * - current->numa_group from everywhere * - task's runqueue locked, task not running */ struct numa_group __rcu *numa_group; /* * numa_faults is an array split into four regions: * faults_memory, faults_cpu, faults_memory_buffer, faults_cpu_buffer * in this precise order. * * faults_memory: Exponential decaying average of faults on a per-node * basis. Scheduling placement decisions are made based on these * counts. The values remain static for the duration of a PTE scan. * faults_cpu: Track the nodes the process was running on when a NUMA * hinting fault was incurred. * faults_memory_buffer and faults_cpu_buffer: Record faults per node * during the current scan window. When the scan completes, the counts * in faults_memory and faults_cpu decay and these values are copied. */ unsigned long *numa_faults; unsigned long total_numa_faults; /* * numa_faults_locality tracks if faults recorded during the last * scan window were remote/local or failed to migrate. The task scan * period is adapted based on the locality of the faults with different * weights depending on whether they were shared or private faults */ unsigned long numa_faults_locality[3]; unsigned long numa_pages_migrated; #endif /* CONFIG_NUMA_BALANCING */ #ifdef CONFIG_RSEQ struct rseq __user *rseq; u32 rseq_len; u32 rseq_sig; /* * RmW on rseq_event_mask must be performed atomically * with respect to preemption. */ unsigned long rseq_event_mask; # ifdef CONFIG_DEBUG_RSEQ /* * This is a place holder to save a copy of the rseq fields for * validation of read-only fields. The struct rseq has a * variable-length array at the end, so it cannot be used * directly. Reserve a size large enough for the known fields. */ char rseq_fields[sizeof(struct rseq)]; # endif #endif #ifdef CONFIG_SCHED_MM_CID int mm_cid; /* Current cid in mm */ int last_mm_cid; /* Most recent cid in mm */ int migrate_from_cpu; int mm_cid_active; /* Whether cid bitmap is active */ struct callback_head cid_work; #endif struct tlbflush_unmap_batch tlb_ubc; /* Cache last used pipe for splice(): */ struct pipe_inode_info *splice_pipe; struct page_frag task_frag; #ifdef CONFIG_TASK_DELAY_ACCT struct task_delay_info *delays; #endif #ifdef CONFIG_FAULT_INJECTION int make_it_fail; unsigned int fail_nth; #endif /* * When (nr_dirtied >= nr_dirtied_pause), it's time to call * balance_dirty_pages() for a dirty throttling pause: */ int nr_dirtied; int nr_dirtied_pause; /* Start of a write-and-pause period: */ unsigned long dirty_paused_when; #ifdef CONFIG_LATENCYTOP int latency_record_count; struct latency_record latency_record[LT_SAVECOUNT]; #endif /* * Time slack values; these are used to round up poll() and * select() etc timeout values. These are in nanoseconds. */ u64 timer_slack_ns; u64 default_timer_slack_ns; #if defined(CONFIG_KASAN_GENERIC) || defined(CONFIG_KASAN_SW_TAGS) unsigned int kasan_depth; #endif #ifdef CONFIG_KCSAN struct kcsan_ctx kcsan_ctx; #ifdef CONFIG_TRACE_IRQFLAGS struct irqtrace_events kcsan_save_irqtrace; #endif #ifdef CONFIG_KCSAN_WEAK_MEMORY int kcsan_stack_depth; #endif #endif #ifdef CONFIG_KMSAN struct kmsan_ctx kmsan_ctx; #endif #if IS_ENABLED(CONFIG_KUNIT) struct kunit *kunit_test; #endif #ifdef CONFIG_FUNCTION_GRAPH_TRACER /* Index of current stored address in ret_stack: */ int curr_ret_stack; int curr_ret_depth; /* Stack of return addresses for return function tracing: */ unsigned long *ret_stack; /* Timestamp for last schedule: */ unsigned long long ftrace_timestamp; unsigned long long ftrace_sleeptime; /* * Number of functions that haven't been traced * because of depth overrun: */ atomic_t trace_overrun; /* Pause tracing: */ atomic_t tracing_graph_pause; #endif #ifdef CONFIG_TRACING /* Bitmask and counter of trace recursion: */ unsigned long trace_recursion; #endif /* CONFIG_TRACING */ #ifdef CONFIG_KCOV /* See kernel/kcov.c for more details. */ /* Coverage collection mode enabled for this task (0 if disabled): */ unsigned int kcov_mode; /* Size of the kcov_area: */ unsigned int kcov_size; /* Buffer for coverage collection: */ void *kcov_area; /* KCOV descriptor wired with this task or NULL: */ struct kcov *kcov; /* KCOV common handle for remote coverage collection: */ u64 kcov_handle; /* KCOV sequence number: */ int kcov_sequence; /* Collect coverage from softirq context: */ unsigned int kcov_softirq; #endif #ifdef CONFIG_MEMCG_V1 struct mem_cgroup *memcg_in_oom; #endif #ifdef CONFIG_MEMCG /* Number of pages to reclaim on returning to userland: */ unsigned int memcg_nr_pages_over_high; /* Used by memcontrol for targeted memcg charge: */ struct mem_cgroup *active_memcg; /* Cache for current->cgroups->memcg->objcg lookups: */ struct obj_cgroup *objcg; #endif #ifdef CONFIG_BLK_CGROUP struct gendisk *throttle_disk; #endif #ifdef CONFIG_UPROBES struct uprobe_task *utask; #endif #if defined(CONFIG_BCACHE) || defined(CONFIG_BCACHE_MODULE) unsigned int sequential_io; unsigned int sequential_io_avg; #endif struct kmap_ctrl kmap_ctrl; #ifdef CONFIG_DEBUG_ATOMIC_SLEEP unsigned long task_state_change; # ifdef CONFIG_PREEMPT_RT unsigned long saved_state_change; # endif #endif struct rcu_head rcu; refcount_t rcu_users; int pagefault_disabled; #ifdef CONFIG_MMU struct task_struct *oom_reaper_list; struct timer_list oom_reaper_timer; #endif #ifdef CONFIG_VMAP_STACK struct vm_struct *stack_vm_area; #endif #ifdef CONFIG_THREAD_INFO_IN_TASK /* A live task holds one reference: */ refcount_t stack_refcount; #endif #ifdef CONFIG_LIVEPATCH int patch_state; #endif #ifdef CONFIG_SECURITY /* Used by LSM modules for access restriction: */ void *security; #endif #ifdef CONFIG_BPF_SYSCALL /* Used by BPF task local storage */ struct bpf_local_storage __rcu *bpf_storage; /* Used for BPF run context */ struct bpf_run_ctx *bpf_ctx; #endif /* Used by BPF for per-TASK xdp storage */ struct bpf_net_context *bpf_net_context; #ifdef CONFIG_KSTACK_ERASE unsigned long lowest_stack; #endif #ifdef CONFIG_KSTACK_ERASE_METRICS unsigned long prev_lowest_stack; #endif #ifdef CONFIG_X86_MCE void __user *mce_vaddr; __u64 mce_kflags; u64 mce_addr; __u64 mce_ripv : 1, mce_whole_page : 1, __mce_reserved : 62; struct callback_head mce_kill_me; int mce_count; #endif #ifdef CONFIG_KRETPROBES struct llist_head kretprobe_instances; #endif #ifdef CONFIG_RETHOOK struct llist_head rethooks; #endif #ifdef CONFIG_ARCH_HAS_PARANOID_L1D_FLUSH /* * If L1D flush is supported on mm context switch * then we use this callback head to queue kill work * to kill tasks that are not running on SMT disabled * cores */ struct callback_head l1d_flush_kill; #endif #ifdef CONFIG_RV /* * Per-task RV monitor, fixed in CONFIG_RV_PER_TASK_MONITORS. * If memory becomes a concern, we can think about a dynamic method. */ union rv_task_monitor rv[CONFIG_RV_PER_TASK_MONITORS]; #endif #ifdef CONFIG_USER_EVENTS struct user_event_mm *user_event_mm; #endif #ifdef CONFIG_UNWIND_USER struct unwind_task_info unwind_info; #endif /* CPU-specific state of this task: */ struct thread_struct thread; /* * New fields for task_struct should be added above here, so that * they are included in the randomized portion of task_struct. */ randomized_struct_fields_end } __attribute__ ((aligned (64))); #ifdef CONFIG_SCHED_PROXY_EXEC DECLARE_STATIC_KEY_TRUE(__sched_proxy_exec); static inline bool sched_proxy_exec(void) { return static_branch_likely(&__sched_proxy_exec); } #else static inline bool sched_proxy_exec(void) { return false; } #endif #define TASK_REPORT_IDLE (TASK_REPORT + 1) #define TASK_REPORT_MAX (TASK_REPORT_IDLE << 1) static inline unsigned int __task_state_index(unsigned int tsk_state, unsigned int tsk_exit_state) { unsigned int state = (tsk_state | tsk_exit_state) & TASK_REPORT; BUILD_BUG_ON_NOT_POWER_OF_2(TASK_REPORT_MAX); if ((tsk_state & TASK_IDLE) == TASK_IDLE) state = TASK_REPORT_IDLE; /* * We're lying here, but rather than expose a completely new task state * to userspace, we can make this appear as if the task has gone through * a regular rt_mutex_lock() call. * Report frozen tasks as uninterruptible. */ if ((tsk_state & TASK_RTLOCK_WAIT) || (tsk_state & TASK_FROZEN)) state = TASK_UNINTERRUPTIBLE; return fls(state); } static inline unsigned int task_state_index(struct task_struct *tsk) { return __task_state_index(READ_ONCE(tsk->__state), tsk->exit_state); } static inline char task_index_to_char(unsigned int state) { static const char state_char[] = "RSDTtXZPI"; BUILD_BUG_ON(TASK_REPORT_MAX * 2 != 1 << (sizeof(state_char) - 1)); return state_char[state]; } static inline char task_state_to_char(struct task_struct *tsk) { return task_index_to_char(task_state_index(tsk)); } extern struct pid *cad_pid; /* * Per process flags */ #define PF_VCPU 0x00000001 /* I'm a virtual CPU */ #define PF_IDLE 0x00000002 /* I am an IDLE thread */ #define PF_EXITING 0x00000004 /* Getting shut down */ #define PF_POSTCOREDUMP 0x00000008 /* Coredumps should ignore this task */ #define PF_IO_WORKER 0x00000010 /* Task is an IO worker */ #define PF_WQ_WORKER 0x00000020 /* I'm a workqueue worker */ #define PF_FORKNOEXEC 0x00000040 /* Forked but didn't exec */ #define PF_MCE_PROCESS 0x00000080 /* Process policy on mce errors */ #define PF_SUPERPRIV 0x00000100 /* Used super-user privileges */ #define PF_DUMPCORE 0x00000200 /* Dumped core */ #define PF_SIGNALED 0x00000400 /* Killed by a signal */ #define PF_MEMALLOC 0x00000800 /* Allocating memory to free memory. See memalloc_noreclaim_save() */ #define PF_NPROC_EXCEEDED 0x00001000 /* set_user() noticed that RLIMIT_NPROC was exceeded */ #define PF_USED_MATH 0x00002000 /* If unset the fpu must be initialized before use */ #define PF_USER_WORKER 0x00004000 /* Kernel thread cloned from userspace thread */ #define PF_NOFREEZE 0x00008000 /* This thread should not be frozen */ #define PF_KCOMPACTD 0x00010000 /* I am kcompactd */ #define PF_KSWAPD 0x00020000 /* I am kswapd */ #define PF_MEMALLOC_NOFS 0x00040000 /* All allocations inherit GFP_NOFS. See memalloc_nfs_save() */ #define PF_MEMALLOC_NOIO 0x00080000 /* All allocations inherit GFP_NOIO. See memalloc_noio_save() */ #define PF_LOCAL_THROTTLE 0x00100000 /* Throttle writes only against the bdi I write to, * I am cleaning dirty pages from some other bdi. */ #define PF_KTHREAD 0x00200000 /* I am a kernel thread */ #define PF_RANDOMIZE 0x00400000 /* Randomize virtual address space */ #define PF__HOLE__00800000 0x00800000 #define PF__HOLE__01000000 0x01000000 #define PF__HOLE__02000000 0x02000000 #define PF_NO_SETAFFINITY 0x04000000 /* Userland is not allowed to meddle with cpus_mask */ #define PF_MCE_EARLY 0x08000000 /* Early kill for mce process policy */ #define PF_MEMALLOC_PIN 0x10000000 /* Allocations constrained to zones which allow long term pinning. * See memalloc_pin_save() */ #define PF_BLOCK_TS 0x20000000 /* plug has ts that needs updating */ #define PF__HOLE__40000000 0x40000000 #define PF_SUSPEND_TASK 0x80000000 /* This thread called freeze_processes() and should not be frozen */ /* * Only the _current_ task can read/write to tsk->flags, but other * tasks can access tsk->flags in readonly mode for example * with tsk_used_math (like during threaded core dumping). * There is however an exception to this rule during ptrace * or during fork: the ptracer task is allowed to write to the * child->flags of its traced child (same goes for fork, the parent * can write to the child->flags), because we're guaranteed the * child is not running and in turn not changing child->flags * at the same time the parent does it. */ #define clear_stopped_child_used_math(child) do { (child)->flags &= ~PF_USED_MATH; } while (0) #define set_stopped_child_used_math(child) do { (child)->flags |= PF_USED_MATH; } while (0) #define clear_used_math() clear_stopped_child_used_math(current) #define set_used_math() set_stopped_child_used_math(current) #define conditional_stopped_child_used_math(condition, child) \ do { (child)->flags &= ~PF_USED_MATH, (child)->flags |= (condition) ? PF_USED_MATH : 0; } while (0) #define conditional_used_math(condition) conditional_stopped_child_used_math(condition, current) #define copy_to_stopped_child_used_math(child) \ do { (child)->flags &= ~PF_USED_MATH, (child)->flags |= current->flags & PF_USED_MATH; } while (0) /* NOTE: this will return 0 or PF_USED_MATH, it will never return 1 */ #define tsk_used_math(p) ((p)->flags & PF_USED_MATH) #define used_math() tsk_used_math(current) static __always_inline bool is_percpu_thread(void) { return (current->flags & PF_NO_SETAFFINITY) && (current->nr_cpus_allowed == 1); } /* Per-process atomic flags. */ #define PFA_NO_NEW_PRIVS 0 /* May not gain new privileges. */ #define PFA_SPREAD_PAGE 1 /* Spread page cache over cpuset */ #define PFA_SPREAD_SLAB 2 /* Spread some slab caches over cpuset */ #define PFA_SPEC_SSB_DISABLE 3 /* Speculative Store Bypass disabled */ #define PFA_SPEC_SSB_FORCE_DISABLE 4 /* Speculative Store Bypass force disabled*/ #define PFA_SPEC_IB_DISABLE 5 /* Indirect branch speculation restricted */ #define PFA_SPEC_IB_FORCE_DISABLE 6 /* Indirect branch speculation permanently restricted */ #define PFA_SPEC_SSB_NOEXEC 7 /* Speculative Store Bypass clear on execve() */ #define TASK_PFA_TEST(name, func) \ static inline bool task_##func(struct task_struct *p) \ { return test_bit(PFA_##name, &p->atomic_flags); } #define TASK_PFA_SET(name, func) \ static inline void task_set_##func(struct task_struct *p) \ { set_bit(PFA_##name, &p->atomic_flags); } #define TASK_PFA_CLEAR(name, func) \ static inline void task_clear_##func(struct task_struct *p) \ { clear_bit(PFA_##name, &p->atomic_flags); } TASK_PFA_TEST(NO_NEW_PRIVS, no_new_privs) TASK_PFA_SET(NO_NEW_PRIVS, no_new_privs) TASK_PFA_TEST(SPREAD_PAGE, spread_page) TASK_PFA_SET(SPREAD_PAGE, spread_page) TASK_PFA_CLEAR(SPREAD_PAGE, spread_page) TASK_PFA_TEST(SPREAD_SLAB, spread_slab) TASK_PFA_SET(SPREAD_SLAB, spread_slab) TASK_PFA_CLEAR(SPREAD_SLAB, spread_slab) TASK_PFA_TEST(SPEC_SSB_DISABLE, spec_ssb_disable) TASK_PFA_SET(SPEC_SSB_DISABLE, spec_ssb_disable) TASK_PFA_CLEAR(SPEC_SSB_DISABLE, spec_ssb_disable) TASK_PFA_TEST(SPEC_SSB_NOEXEC, spec_ssb_noexec) TASK_PFA_SET(SPEC_SSB_NOEXEC, spec_ssb_noexec) TASK_PFA_CLEAR(SPEC_SSB_NOEXEC, spec_ssb_noexec) TASK_PFA_TEST(SPEC_SSB_FORCE_DISABLE, spec_ssb_force_disable) TASK_PFA_SET(SPEC_SSB_FORCE_DISABLE, spec_ssb_force_disable) TASK_PFA_TEST(SPEC_IB_DISABLE, spec_ib_disable) TASK_PFA_SET(SPEC_IB_DISABLE, spec_ib_disable) TASK_PFA_CLEAR(SPEC_IB_DISABLE, spec_ib_disable) TASK_PFA_TEST(SPEC_IB_FORCE_DISABLE, spec_ib_force_disable) TASK_PFA_SET(SPEC_IB_FORCE_DISABLE, spec_ib_force_disable) static inline void current_restore_flags(unsigned long orig_flags, unsigned long flags) { current->flags &= ~flags; current->flags |= orig_flags & flags; } extern int cpuset_cpumask_can_shrink(const struct cpumask *cur, const struct cpumask *trial); extern int task_can_attach(struct task_struct *p); extern int dl_bw_alloc(int cpu, u64 dl_bw); extern void dl_bw_free(int cpu, u64 dl_bw); /* do_set_cpus_allowed() - consider using set_cpus_allowed_ptr() instead */ extern void do_set_cpus_allowed(struct task_struct *p, const struct cpumask *new_mask); /** * set_cpus_allowed_ptr - set CPU affinity mask of a task * @p: the task * @new_mask: CPU affinity mask * * Return: zero if successful, or a negative error code */ extern int set_cpus_allowed_ptr(struct task_struct *p, const struct cpumask *new_mask); extern int dup_user_cpus_ptr(struct task_struct *dst, struct task_struct *src, int node); extern void release_user_cpus_ptr(struct task_struct *p); extern int dl_task_check_affinity(struct task_struct *p, const struct cpumask *mask); extern void force_compatible_cpus_allowed_ptr(struct task_struct *p); extern void relax_compatible_cpus_allowed_ptr(struct task_struct *p); extern int yield_to(struct task_struct *p, bool preempt); extern void set_user_nice(struct task_struct *p, long nice); extern int task_prio(const struct task_struct *p); /** * task_nice - return the nice value of a given task. * @p: the task in question. * * Return: The nice value [ -20 ... 0 ... 19 ]. */ static inline int task_nice(const struct task_struct *p) { return PRIO_TO_NICE((p)->static_prio); } extern int can_nice(const struct task_struct *p, const int nice); extern int task_curr(const struct task_struct *p); extern int idle_cpu(int cpu); extern int available_idle_cpu(int cpu); extern int sched_setscheduler(struct task_struct *, int, const struct sched_param *); extern int sched_setscheduler_nocheck(struct task_struct *, int, const struct sched_param *); extern void sched_set_fifo(struct task_struct *p); extern void sched_set_fifo_low(struct task_struct *p); extern void sched_set_normal(struct task_struct *p, int nice); extern int sched_setattr(struct task_struct *, const struct sched_attr *); extern int sched_setattr_nocheck(struct task_struct *, const struct sched_attr *); extern struct task_struct *idle_task(int cpu); /** * is_idle_task - is the specified task an idle task? * @p: the task in question. * * Return: 1 if @p is an idle task. 0 otherwise. */ static __always_inline bool is_idle_task(const struct task_struct *p) { return !!(p->flags & PF_IDLE); } extern struct task_struct *curr_task(int cpu); extern void ia64_set_curr_task(int cpu, struct task_struct *p); void yield(void); union thread_union { struct task_struct task; #ifndef CONFIG_THREAD_INFO_IN_TASK struct thread_info thread_info; #endif unsigned long stack[THREAD_SIZE/sizeof(long)]; }; #ifndef CONFIG_THREAD_INFO_IN_TASK extern struct thread_info init_thread_info; #endif extern unsigned long init_stack[THREAD_SIZE / sizeof(unsigned long)]; #ifdef CONFIG_THREAD_INFO_IN_TASK # define task_thread_info(task) (&(task)->thread_info) #else # define task_thread_info(task) ((struct thread_info *)(task)->stack) #endif /* * find a task by one of its numerical ids * * find_task_by_pid_ns(): * finds a task by its pid in the specified namespace * find_task_by_vpid(): * finds a task by its virtual pid * * see also find_vpid() etc in include/linux/pid.h */ extern struct task_struct *find_task_by_vpid(pid_t nr); extern struct task_struct *find_task_by_pid_ns(pid_t nr, struct pid_namespace *ns); /* * find a task by its virtual pid and get the task struct */ extern struct task_struct *find_get_task_by_vpid(pid_t nr); extern int wake_up_state(struct task_struct *tsk, unsigned int state); extern int wake_up_process(struct task_struct *tsk); extern void wake_up_new_task(struct task_struct *tsk); extern void kick_process(struct task_struct *tsk); extern void __set_task_comm(struct task_struct *tsk, const char *from, bool exec); #define set_task_comm(tsk, from) ({ \ BUILD_BUG_ON(sizeof(from) != TASK_COMM_LEN); \ __set_task_comm(tsk, from, false); \ }) /* * - Why not use task_lock()? * User space can randomly change their names anyway, so locking for readers * doesn't make sense. For writers, locking is probably necessary, as a race * condition could lead to long-term mixed results. * The strscpy_pad() in __set_task_comm() can ensure that the task comm is * always NUL-terminated and zero-padded. Therefore the race condition between * reader and writer is not an issue. * * - BUILD_BUG_ON() can help prevent the buf from being truncated. * Since the callers don't perform any return value checks, this safeguard is * necessary. */ #define get_task_comm(buf, tsk) ({ \ BUILD_BUG_ON(sizeof(buf) < TASK_COMM_LEN); \ strscpy_pad(buf, (tsk)->comm); \ buf; \ }) static __always_inline void scheduler_ipi(void) { /* * Fold TIF_NEED_RESCHED into the preempt_count; anybody setting * TIF_NEED_RESCHED remotely (for the first time) will also send * this IPI. */ preempt_fold_need_resched(); } extern unsigned long wait_task_inactive(struct task_struct *, unsigned int match_state); /* * Set thread flags in other task's structures. * See asm/thread_info.h for TIF_xxxx flags available: */ static inline void set_tsk_thread_flag(struct task_struct *tsk, int flag) { set_ti_thread_flag(task_thread_info(tsk), flag); } static inline void clear_tsk_thread_flag(struct task_struct *tsk, int flag) { clear_ti_thread_flag(task_thread_info(tsk), flag); } static inline void update_tsk_thread_flag(struct task_struct *tsk, int flag, bool value) { update_ti_thread_flag(task_thread_info(tsk), flag, value); } static inline int test_and_set_tsk_thread_flag(struct task_struct *tsk, int flag) { return test_and_set_ti_thread_flag(task_thread_info(tsk), flag); } static inline int test_and_clear_tsk_thread_flag(struct task_struct *tsk, int flag) { return test_and_clear_ti_thread_flag(task_thread_info(tsk), flag); } static inline int test_tsk_thread_flag(struct task_struct *tsk, int flag) { return test_ti_thread_flag(task_thread_info(tsk), flag); } static inline void set_tsk_need_resched(struct task_struct *tsk) { if (tracepoint_enabled(sched_set_need_resched_tp) && !test_tsk_thread_flag(tsk, TIF_NEED_RESCHED)) __trace_set_need_resched(tsk, TIF_NEED_RESCHED); set_tsk_thread_flag(tsk,TIF_NEED_RESCHED); } static inline void clear_tsk_need_resched(struct task_struct *tsk) { atomic_long_andnot(_TIF_NEED_RESCHED | _TIF_NEED_RESCHED_LAZY, (atomic_long_t *)&task_thread_info(tsk)->flags); } static inline int test_tsk_need_resched(struct task_struct *tsk) { return unlikely(test_tsk_thread_flag(tsk,TIF_NEED_RESCHED)); } /* * cond_resched() and cond_resched_lock(): latency reduction via * explicit rescheduling in places that are safe. The return * value indicates whether a reschedule was done in fact. * cond_resched_lock() will drop the spinlock before scheduling, */ #if !defined(CONFIG_PREEMPTION) || defined(CONFIG_PREEMPT_DYNAMIC) extern int __cond_resched(void); #if defined(CONFIG_PREEMPT_DYNAMIC) && defined(CONFIG_HAVE_PREEMPT_DYNAMIC_CALL) DECLARE_STATIC_CALL(cond_resched, __cond_resched); static __always_inline int _cond_resched(void) { return static_call_mod(cond_resched)(); } #elif defined(CONFIG_PREEMPT_DYNAMIC) && defined(CONFIG_HAVE_PREEMPT_DYNAMIC_KEY) extern int dynamic_cond_resched(void); static __always_inline int _cond_resched(void) { return dynamic_cond_resched(); } #else /* !CONFIG_PREEMPTION */ static inline int _cond_resched(void) { return __cond_resched(); } #endif /* PREEMPT_DYNAMIC && CONFIG_HAVE_PREEMPT_DYNAMIC_CALL */ #else /* CONFIG_PREEMPTION && !CONFIG_PREEMPT_DYNAMIC */ static inline int _cond_resched(void) { return 0; } #endif /* !CONFIG_PREEMPTION || CONFIG_PREEMPT_DYNAMIC */ #define cond_resched() ({ \ __might_resched(__FILE__, __LINE__, 0); \ _cond_resched(); \ }) extern int __cond_resched_lock(spinlock_t *lock); extern int __cond_resched_rwlock_read(rwlock_t *lock); extern int __cond_resched_rwlock_write(rwlock_t *lock); #define MIGHT_RESCHED_RCU_SHIFT 8 #define MIGHT_RESCHED_PREEMPT_MASK ((1U << MIGHT_RESCHED_RCU_SHIFT) - 1) #ifndef CONFIG_PREEMPT_RT /* * Non RT kernels have an elevated preempt count due to the held lock, * but are not allowed to be inside a RCU read side critical section */ # define PREEMPT_LOCK_RESCHED_OFFSETS PREEMPT_LOCK_OFFSET #else /* * spin/rw_lock() on RT implies rcu_read_lock(). The might_sleep() check in * cond_resched*lock() has to take that into account because it checks for * preempt_count() and rcu_preempt_depth(). */ # define PREEMPT_LOCK_RESCHED_OFFSETS \ (PREEMPT_LOCK_OFFSET + (1U << MIGHT_RESCHED_RCU_SHIFT)) #endif #define cond_resched_lock(lock) ({ \ __might_resched(__FILE__, __LINE__, PREEMPT_LOCK_RESCHED_OFFSETS); \ __cond_resched_lock(lock); \ }) #define cond_resched_rwlock_read(lock) ({ \ __might_resched(__FILE__, __LINE__, PREEMPT_LOCK_RESCHED_OFFSETS); \ __cond_resched_rwlock_read(lock); \ }) #define cond_resched_rwlock_write(lock) ({ \ __might_resched(__FILE__, __LINE__, PREEMPT_LOCK_RESCHED_OFFSETS); \ __cond_resched_rwlock_write(lock); \ }) #ifndef CONFIG_PREEMPT_RT static inline struct mutex *__get_task_blocked_on(struct task_struct *p) { struct mutex *m = p->blocked_on; if (m) lockdep_assert_held_once(&m->wait_lock); return m; } static inline void __set_task_blocked_on(struct task_struct *p, struct mutex *m) { struct mutex *blocked_on = READ_ONCE(p->blocked_on); WARN_ON_ONCE(!m); /* The task should only be setting itself as blocked */ WARN_ON_ONCE(p != current); /* Currently we serialize blocked_on under the mutex::wait_lock */ lockdep_assert_held_once(&m->wait_lock); /* * Check ensure we don't overwrite existing mutex value * with a different mutex. Note, setting it to the same * lock repeatedly is ok. */ WARN_ON_ONCE(blocked_on && blocked_on != m); WRITE_ONCE(p->blocked_on, m); } static inline void set_task_blocked_on(struct task_struct *p, struct mutex *m) { guard(raw_spinlock_irqsave)(&m->wait_lock); __set_task_blocked_on(p, m); } static inline void __clear_task_blocked_on(struct task_struct *p, struct mutex *m) { if (m) { struct mutex *blocked_on = READ_ONCE(p->blocked_on); /* Currently we serialize blocked_on under the mutex::wait_lock */ lockdep_assert_held_once(&m->wait_lock); /* * There may be cases where we re-clear already cleared * blocked_on relationships, but make sure we are not * clearing the relationship with a different lock. */ WARN_ON_ONCE(blocked_on && blocked_on != m); } WRITE_ONCE(p->blocked_on, NULL); } static inline void clear_task_blocked_on(struct task_struct *p, struct mutex *m) { guard(raw_spinlock_irqsave)(&m->wait_lock); __clear_task_blocked_on(p, m); } #else static inline void __clear_task_blocked_on(struct task_struct *p, struct rt_mutex *m) { } static inline void clear_task_blocked_on(struct task_struct *p, struct rt_mutex *m) { } #endif /* !CONFIG_PREEMPT_RT */ static __always_inline bool need_resched(void) { return unlikely(tif_need_resched()); } /* * Wrappers for p->thread_info->cpu access. No-op on UP. */ #ifdef CONFIG_SMP static inline unsigned int task_cpu(const struct task_struct *p) { return READ_ONCE(task_thread_info(p)->cpu); } extern void set_task_cpu(struct task_struct *p, unsigned int cpu); #else static inline unsigned int task_cpu(const struct task_struct *p) { return 0; } static inline void set_task_cpu(struct task_struct *p, unsigned int cpu) { } #endif /* CONFIG_SMP */ static inline bool task_is_runnable(struct task_struct *p) { return p->on_rq && !p->se.sched_delayed; } extern bool sched_task_on_rq(struct task_struct *p); extern unsigned long get_wchan(struct task_struct *p); extern struct task_struct *cpu_curr_snapshot(int cpu); /* * In order to reduce various lock holder preemption latencies provide an * interface to see if a vCPU is currently running or not. * * This allows us to terminate optimistic spin loops and block, analogous to * the native optimistic spin heuristic of testing if the lock owner task is * running or not. */ #ifndef vcpu_is_preempted static inline bool vcpu_is_preempted(int cpu) { return false; } #endif extern long sched_setaffinity(pid_t pid, const struct cpumask *new_mask); extern long sched_getaffinity(pid_t pid, struct cpumask *mask); #ifndef TASK_SIZE_OF #define TASK_SIZE_OF(tsk) TASK_SIZE #endif static inline bool owner_on_cpu(struct task_struct *owner) { /* * As lock holder preemption issue, we both skip spinning if * task is not on cpu or its cpu is preempted */ return READ_ONCE(owner->on_cpu) && !vcpu_is_preempted(task_cpu(owner)); } /* Returns effective CPU energy utilization, as seen by the scheduler */ unsigned long sched_cpu_util(int cpu); #ifdef CONFIG_SCHED_CORE extern void sched_core_free(struct task_struct *tsk); extern void sched_core_fork(struct task_struct *p); extern int sched_core_share_pid(unsigned int cmd, pid_t pid, enum pid_type type, unsigned long uaddr); extern int sched_core_idle_cpu(int cpu); #else static inline void sched_core_free(struct task_struct *tsk) { } static inline void sched_core_fork(struct task_struct *p) { } static inline int sched_core_idle_cpu(int cpu) { return idle_cpu(cpu); } #endif extern void sched_set_stop_task(int cpu, struct task_struct *stop); #ifdef CONFIG_MEM_ALLOC_PROFILING static __always_inline struct alloc_tag *alloc_tag_save(struct alloc_tag *tag) { swap(current->alloc_tag, tag); return tag; } static __always_inline void alloc_tag_restore(struct alloc_tag *tag, struct alloc_tag *old) { #ifdef CONFIG_MEM_ALLOC_PROFILING_DEBUG WARN(current->alloc_tag != tag, "current->alloc_tag was changed:\n"); #endif current->alloc_tag = old; } #else #define alloc_tag_save(_tag) NULL #define alloc_tag_restore(_tag, _old) do {} while (0) #endif #endif
1438 1435 244 247 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 // SPDX-License-Identifier: GPL-2.0 #include <linux/bug.h> #include <linux/export.h> #include <linux/types.h> #include <linux/mmdebug.h> #include <linux/mm.h> #include <asm/memory.h> phys_addr_t __virt_to_phys(unsigned long x) { WARN(!__is_lm_address(__tag_reset(x)), "virt_to_phys used for non-linear address: %p (%pS)\n", (void *)x, (void *)x); return __virt_to_phys_nodebug(x); } EXPORT_SYMBOL(__virt_to_phys); phys_addr_t __phys_addr_symbol(unsigned long x) { /* * This is bounds checking against the kernel image only. * __pa_symbol should only be used on kernel symbol addresses. */ VIRTUAL_BUG_ON(x < (unsigned long) KERNEL_START || x > (unsigned long) KERNEL_END); return __pa_symbol_nodebug(x); } EXPORT_SYMBOL(__phys_addr_symbol);
4 4 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 // SPDX-License-Identifier: GPL-2.0-only /* * Network interface table. * * Network interfaces (devices) do not have a security field, so we * maintain a table associating each interface with a SID. * * Author: James Morris <jmorris@redhat.com> * * Copyright (C) 2003 Red Hat, Inc., James Morris <jmorris@redhat.com> * Copyright (C) 2007 Hewlett-Packard Development Company, L.P. * Paul Moore <paul@paul-moore.com> */ #include <linux/init.h> #include <linux/types.h> #include <linux/slab.h> #include <linux/stddef.h> #include <linux/kernel.h> #include <linux/list.h> #include <linux/notifier.h> #include <linux/netdevice.h> #include <linux/rcupdate.h> #include <net/net_namespace.h> #include "security.h" #include "objsec.h" #include "netif.h" #define SEL_NETIF_HASH_SIZE 64 #define SEL_NETIF_HASH_MAX 1024 struct sel_netif { struct list_head list; struct netif_security_struct nsec; struct rcu_head rcu_head; }; static u32 sel_netif_total; static DEFINE_SPINLOCK(sel_netif_lock); static struct list_head sel_netif_hash[SEL_NETIF_HASH_SIZE]; /** * sel_netif_hashfn - Hashing function for the interface table * @ns: the network namespace * @ifindex: the network interface * * Description: * This is the hashing function for the network interface table, it returns the * bucket number for the given interface. * */ static inline u32 sel_netif_hashfn(const struct net *ns, int ifindex) { return (((uintptr_t)ns + ifindex) & (SEL_NETIF_HASH_SIZE - 1)); } /** * sel_netif_find - Search for an interface record * @ns: the network namespace * @ifindex: the network interface * * Description: * Search the network interface table and return the record matching @ifindex. * If an entry can not be found in the table return NULL. * */ static inline struct sel_netif *sel_netif_find(const struct net *ns, int ifindex) { u32 idx = sel_netif_hashfn(ns, ifindex); struct sel_netif *netif; list_for_each_entry_rcu(netif, &sel_netif_hash[idx], list) if (net_eq(netif->nsec.ns, ns) && netif->nsec.ifindex == ifindex) return netif; return NULL; } /** * sel_netif_insert - Insert a new interface into the table * @netif: the new interface record * * Description: * Add a new interface record to the network interface hash table. Returns * zero on success, negative values on failure. * */ static int sel_netif_insert(struct sel_netif *netif) { u32 idx; if (sel_netif_total >= SEL_NETIF_HASH_MAX) return -ENOSPC; idx = sel_netif_hashfn(netif->nsec.ns, netif->nsec.ifindex); list_add_rcu(&netif->list, &sel_netif_hash[idx]); sel_netif_total++; return 0; } /** * sel_netif_destroy - Remove an interface record from the table * @netif: the existing interface record * * Description: * Remove an existing interface record from the network interface table. * */ static void sel_netif_destroy(struct sel_netif *netif) { list_del_rcu(&netif->list); sel_netif_total--; kfree_rcu(netif, rcu_head); } /** * sel_netif_sid_slow - Lookup the SID of a network interface using the policy * @ns: the network namespace * @ifindex: the network interface * @sid: interface SID * * Description: * This function determines the SID of a network interface by querying the * security policy. The result is added to the network interface table to * speedup future queries. Returns zero on success, negative values on * failure. * */ static int sel_netif_sid_slow(struct net *ns, int ifindex, u32 *sid) { int ret = 0; struct sel_netif *netif; struct sel_netif *new; struct net_device *dev; /* NOTE: we always use init's network namespace since we don't * currently support containers */ dev = dev_get_by_index(ns, ifindex); if (unlikely(dev == NULL)) { pr_warn("SELinux: failure in %s(), invalid network interface (%d)\n", __func__, ifindex); return -ENOENT; } spin_lock_bh(&sel_netif_lock); netif = sel_netif_find(ns, ifindex); if (netif != NULL) { *sid = netif->nsec.sid; goto out; } ret = security_netif_sid(dev->name, sid); if (ret != 0) goto out; /* If this memory allocation fails still return 0. The SID * is valid, it just won't be added to the cache. */ new = kmalloc(sizeof(*new), GFP_ATOMIC); if (new) { new->nsec.ns = ns; new->nsec.ifindex = ifindex; new->nsec.sid = *sid; if (sel_netif_insert(new)) kfree(new); } out: spin_unlock_bh(&sel_netif_lock); dev_put(dev); if (unlikely(ret)) pr_warn("SELinux: failure in %s(), unable to determine network interface label (%d)\n", __func__, ifindex); return ret; } /** * sel_netif_sid - Lookup the SID of a network interface * @ns: the network namespace * @ifindex: the network interface * @sid: interface SID * * Description: * This function determines the SID of a network interface using the fastest * method possible. First the interface table is queried, but if an entry * can't be found then the policy is queried and the result is added to the * table to speedup future queries. Returns zero on success, negative values * on failure. * */ int sel_netif_sid(struct net *ns, int ifindex, u32 *sid) { struct sel_netif *netif; rcu_read_lock(); netif = sel_netif_find(ns, ifindex); if (likely(netif != NULL)) { *sid = netif->nsec.sid; rcu_read_unlock(); return 0; } rcu_read_unlock(); return sel_netif_sid_slow(ns, ifindex, sid); } /** * sel_netif_kill - Remove an entry from the network interface table * @ns: the network namespace * @ifindex: the network interface * * Description: * This function removes the entry matching @ifindex from the network interface * table if it exists. * */ static void sel_netif_kill(const struct net *ns, int ifindex) { struct sel_netif *netif; rcu_read_lock(); spin_lock_bh(&sel_netif_lock); netif = sel_netif_find(ns, ifindex); if (netif) sel_netif_destroy(netif); spin_unlock_bh(&sel_netif_lock); rcu_read_unlock(); } /** * sel_netif_flush - Flush the entire network interface table * * Description: * Remove all entries from the network interface table. * */ void sel_netif_flush(void) { int idx; struct sel_netif *netif; spin_lock_bh(&sel_netif_lock); for (idx = 0; idx < SEL_NETIF_HASH_SIZE; idx++) list_for_each_entry(netif, &sel_netif_hash[idx], list) sel_netif_destroy(netif); spin_unlock_bh(&sel_netif_lock); } static int sel_netif_netdev_notifier_handler(struct notifier_block *this, unsigned long event, void *ptr) { struct net_device *dev = netdev_notifier_info_to_dev(ptr); if (event == NETDEV_DOWN) sel_netif_kill(dev_net(dev), dev->ifindex); return NOTIFY_DONE; } static struct notifier_block sel_netif_netdev_notifier = { .notifier_call = sel_netif_netdev_notifier_handler, }; static __init int sel_netif_init(void) { int i; if (!selinux_enabled_boot) return 0; for (i = 0; i < SEL_NETIF_HASH_SIZE; i++) INIT_LIST_HEAD(&sel_netif_hash[i]); register_netdevice_notifier(&sel_netif_netdev_notifier); return 0; } __initcall(sel_netif_init);
336 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 /* SPDX-License-Identifier: GPL-2.0 */ #ifndef __ARM64_KVM_NESTED_H #define __ARM64_KVM_NESTED_H #include <linux/bitfield.h> #include <linux/kvm_host.h> #include <asm/kvm_emulate.h> #include <asm/kvm_pgtable.h> static inline bool vcpu_has_nv(const struct kvm_vcpu *vcpu) { return (!__is_defined(__KVM_NVHE_HYPERVISOR__) && cpus_have_final_cap(ARM64_HAS_NESTED_VIRT) && vcpu_has_feature(vcpu, KVM_ARM_VCPU_HAS_EL2)); } /* Translation helpers from non-VHE EL2 to EL1 */ static inline u64 tcr_el2_ps_to_tcr_el1_ips(u64 tcr_el2) { return (u64)FIELD_GET(TCR_EL2_PS_MASK, tcr_el2) << TCR_IPS_SHIFT; } static inline u64 translate_tcr_el2_to_tcr_el1(u64 tcr) { return TCR_EPD1_MASK | /* disable TTBR1_EL1 */ ((tcr & TCR_EL2_TBI) ? TCR_TBI0 : 0) | tcr_el2_ps_to_tcr_el1_ips(tcr) | (tcr & TCR_EL2_TG0_MASK) | (tcr & TCR_EL2_ORGN0_MASK) | (tcr & TCR_EL2_IRGN0_MASK) | (tcr & TCR_EL2_T0SZ_MASK); } static inline u64 translate_cptr_el2_to_cpacr_el1(u64 cptr_el2) { u64 cpacr_el1 = CPACR_EL1_RES1; if (cptr_el2 & CPTR_EL2_TTA) cpacr_el1 |= CPACR_EL1_TTA; if (!(cptr_el2 & CPTR_EL2_TFP)) cpacr_el1 |= CPACR_EL1_FPEN; if (!(cptr_el2 & CPTR_EL2_TZ)) cpacr_el1 |= CPACR_EL1_ZEN; cpacr_el1 |= cptr_el2 & (CPTR_EL2_TCPAC | CPTR_EL2_TAM); return cpacr_el1; } static inline u64 translate_sctlr_el2_to_sctlr_el1(u64 val) { /* Only preserve the minimal set of bits we support */ val &= (SCTLR_ELx_M | SCTLR_ELx_A | SCTLR_ELx_C | SCTLR_ELx_SA | SCTLR_ELx_I | SCTLR_ELx_IESB | SCTLR_ELx_WXN | SCTLR_ELx_EE); val |= SCTLR_EL1_RES1; return val; } static inline u64 translate_ttbr0_el2_to_ttbr0_el1(u64 ttbr0) { /* Clear the ASID field */ return ttbr0 & ~GENMASK_ULL(63, 48); } extern bool forward_smc_trap(struct kvm_vcpu *vcpu); extern bool forward_debug_exception(struct kvm_vcpu *vcpu); extern void kvm_init_nested(struct kvm *kvm); extern int kvm_vcpu_init_nested(struct kvm_vcpu *vcpu); extern void kvm_init_nested_s2_mmu(struct kvm_s2_mmu *mmu); extern struct kvm_s2_mmu *lookup_s2_mmu(struct kvm_vcpu *vcpu); union tlbi_info; extern void kvm_s2_mmu_iterate_by_vmid(struct kvm *kvm, u16 vmid, const union tlbi_info *info, void (*)(struct kvm_s2_mmu *, const union tlbi_info *)); extern void kvm_vcpu_load_hw_mmu(struct kvm_vcpu *vcpu); extern void kvm_vcpu_put_hw_mmu(struct kvm_vcpu *vcpu); extern void check_nested_vcpu_requests(struct kvm_vcpu *vcpu); extern void kvm_nested_flush_hwstate(struct kvm_vcpu *vcpu); extern void kvm_nested_sync_hwstate(struct kvm_vcpu *vcpu); extern void kvm_nested_setup_mdcr_el2(struct kvm_vcpu *vcpu); struct kvm_s2_trans { phys_addr_t output; unsigned long block_size; bool writable; bool readable; int level; u32 esr; u64 desc; }; static inline phys_addr_t kvm_s2_trans_output(struct kvm_s2_trans *trans) { return trans->output; } static inline unsigned long kvm_s2_trans_size(struct kvm_s2_trans *trans) { return trans->block_size; } static inline u32 kvm_s2_trans_esr(struct kvm_s2_trans *trans) { return trans->esr; } static inline bool kvm_s2_trans_readable(struct kvm_s2_trans *trans) { return trans->readable; } static inline bool kvm_s2_trans_writable(struct kvm_s2_trans *trans) { return trans->writable; } static inline bool kvm_s2_trans_executable(struct kvm_s2_trans *trans) { return !(trans->desc & BIT(54)); } extern int kvm_walk_nested_s2(struct kvm_vcpu *vcpu, phys_addr_t gipa, struct kvm_s2_trans *result); extern int kvm_s2_handle_perm_fault(struct kvm_vcpu *vcpu, struct kvm_s2_trans *trans); extern int kvm_inject_s2_fault(struct kvm_vcpu *vcpu, u64 esr_el2); extern void kvm_nested_s2_wp(struct kvm *kvm); extern void kvm_nested_s2_unmap(struct kvm *kvm, bool may_block); extern void kvm_nested_s2_flush(struct kvm *kvm); unsigned long compute_tlb_inval_range(struct kvm_s2_mmu *mmu, u64 val); static inline bool kvm_supported_tlbi_s1e1_op(struct kvm_vcpu *vpcu, u32 instr) { struct kvm *kvm = vpcu->kvm; u8 CRm = sys_reg_CRm(instr); if (!(sys_reg_Op0(instr) == TLBI_Op0 && sys_reg_Op1(instr) == TLBI_Op1_EL1)) return false; if (!(sys_reg_CRn(instr) == TLBI_CRn_XS || (sys_reg_CRn(instr) == TLBI_CRn_nXS && kvm_has_feat(kvm, ID_AA64ISAR1_EL1, XS, IMP)))) return false; if (CRm == TLBI_CRm_nROS && !kvm_has_feat(kvm, ID_AA64ISAR0_EL1, TLB, OS)) return false; if ((CRm == TLBI_CRm_RIS || CRm == TLBI_CRm_ROS || CRm == TLBI_CRm_RNS) && !kvm_has_feat(kvm, ID_AA64ISAR0_EL1, TLB, RANGE)) return false; return true; } static inline bool kvm_supported_tlbi_s1e2_op(struct kvm_vcpu *vpcu, u32 instr) { struct kvm *kvm = vpcu->kvm; u8 CRm = sys_reg_CRm(instr); if (!(sys_reg_Op0(instr) == TLBI_Op0 && sys_reg_Op1(instr) == TLBI_Op1_EL2)) return false; if (!(sys_reg_CRn(instr) == TLBI_CRn_XS || (sys_reg_CRn(instr) == TLBI_CRn_nXS && kvm_has_feat(kvm, ID_AA64ISAR1_EL1, XS, IMP)))) return false; if (CRm == TLBI_CRm_IPAIS || CRm == TLBI_CRm_IPAONS) return false; if (CRm == TLBI_CRm_nROS && !kvm_has_feat(kvm, ID_AA64ISAR0_EL1, TLB, OS)) return false; if ((CRm == TLBI_CRm_RIS || CRm == TLBI_CRm_ROS || CRm == TLBI_CRm_RNS) && !kvm_has_feat(kvm, ID_AA64ISAR0_EL1, TLB, RANGE)) return false; return true; } int kvm_init_nv_sysregs(struct kvm_vcpu *vcpu); u64 limit_nv_id_reg(struct kvm *kvm, u32 reg, u64 val); #ifdef CONFIG_ARM64_PTR_AUTH bool kvm_auth_eretax(struct kvm_vcpu *vcpu, u64 *elr); #else static inline bool kvm_auth_eretax(struct kvm_vcpu *vcpu, u64 *elr) { /* We really should never execute this... */ WARN_ON_ONCE(1); *elr = 0xbad9acc0debadbad; return false; } #endif #define KVM_NV_GUEST_MAP_SZ (KVM_PGTABLE_PROT_SW1 | KVM_PGTABLE_PROT_SW0) static inline u64 kvm_encode_nested_level(struct kvm_s2_trans *trans) { return FIELD_PREP(KVM_NV_GUEST_MAP_SZ, trans->level); } /* Adjust alignment for the contiguous bit as per StageOA() */ #define contiguous_bit_shift(d, wi, l) \ ({ \ u8 shift = 0; \ \ if ((d) & PTE_CONT) { \ switch (BIT((wi)->pgshift)) { \ case SZ_4K: \ shift = 4; \ break; \ case SZ_16K: \ shift = (l) == 2 ? 5 : 7; \ break; \ case SZ_64K: \ shift = 5; \ break; \ } \ } \ \ shift; \ }) static inline u64 decode_range_tlbi(u64 val, u64 *range, u16 *asid) { u64 base, tg, num, scale; int shift; tg = FIELD_GET(GENMASK(47, 46), val); switch(tg) { case 1: shift = 12; break; case 2: shift = 14; break; case 3: default: /* IMPDEF: handle tg==0 as 64k */ shift = 16; break; } base = (val & GENMASK(36, 0)) << shift; if (asid) *asid = FIELD_GET(TLBIR_ASID_MASK, val); scale = FIELD_GET(GENMASK(45, 44), val); num = FIELD_GET(GENMASK(43, 39), val); *range = __TLBI_RANGE_PAGES(num, scale) << shift; return base; } static inline unsigned int ps_to_output_size(unsigned int ps, bool pa52bit) { switch (ps) { case 0: return 32; case 1: return 36; case 2: return 40; case 3: return 42; case 4: return 44; case 5: return 48; case 6: if (pa52bit) return 52; fallthrough; default: return 48; } } enum trans_regime { TR_EL10, TR_EL20, TR_EL2, }; struct s1_walk_info; struct s1_walk_context { struct s1_walk_info *wi; u64 table_ipa; int level; }; struct s1_walk_filter { int (*fn)(struct s1_walk_context *, void *); void *priv; }; struct s1_walk_info { struct s1_walk_filter *filter; u64 baddr; enum trans_regime regime; unsigned int max_oa_bits; unsigned int pgshift; unsigned int txsz; int sl; u8 sh; bool as_el0; bool hpd; bool e0poe; bool poe; bool pan; bool be; bool s2; bool pa52bit; }; struct s1_walk_result { union { struct { u64 desc; u64 pa; s8 level; u8 APTable; bool nG; u16 asid; bool UXNTable; bool PXNTable; bool uwxn; bool uov; bool ur; bool uw; bool ux; bool pwxn; bool pov; bool pr; bool pw; bool px; }; struct { u8 fst; bool ptw; bool s2; }; }; bool failed; }; int __kvm_translate_va(struct kvm_vcpu *vcpu, struct s1_walk_info *wi, struct s1_walk_result *wr, u64 va); int __kvm_find_s1_desc_level(struct kvm_vcpu *vcpu, u64 va, u64 ipa, int *level); /* VNCR management */ int kvm_vcpu_allocate_vncr_tlb(struct kvm_vcpu *vcpu); int kvm_handle_vncr_abort(struct kvm_vcpu *vcpu); void kvm_handle_s1e2_tlbi(struct kvm_vcpu *vcpu, u32 inst, u64 val); #define vncr_fixmap(c) \ ({ \ u32 __c = (c); \ BUG_ON(__c >= NR_CPUS); \ (FIX_VNCR - __c); \ }) #endif /* __ARM64_KVM_NESTED_H */
182 3 119 629 5 629 330 116 553 6 560 558 631 351 555 202 83 81 121 121 95 3 82 84 44 196 25 60 34 10 10 14 10 10 4 10 196 58 25 39 62 27 27 27 3 6 23 27 203 196 196 197 194 196 50 204 64 63 60 84 33 98 91 14 223 204 43 225 224 151 98 202 43 225 223 94 14 14 14 98 5 5 5 56 10 43 14 43 31 5 52 242 129 29 244 14 42 38 38 28 37 5 5 5 57 57 5 7 7 4 6 569 567 566 120 4 3 34 34 34 3 33 33 120 120 87 87 143 143 142 142 47 136 130 130 130 129 38 38 131 129 38 16 16 16 16 16 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 1517 1518 1519 1520 1521 1522 1523 1524 1525 1526 1527 1528 1529 1530 1531 1532 1533 1534 1535 1536 1537 1538 1539 1540 1541 1542 1543 1544 1545 1546 1547 1548 1549 1550 1551 1552 1553 1554 1555 1556 1557 1558 1559 1560 1561 1562 1563 1564 1565 1566 1567 1568 1569 1570 1571 1572 1573 1574 1575 1576 1577 1578 1579 1580 1581 1582 1583 1584 1585 1586 1587 1588 1589 1590 1591 1592 1593 1594 1595 1596 1597 1598 1599 1600 1601 1602 1603 1604 1605 1606 1607 1608 1609 1610 1611 1612 1613 1614 1615 1616 1617 1618 1619 1620 1621 1622 1623 1624 1625 1626 1627 1628 1629 1630 1631 1632 1633 1634 1635 1636 1637 1638 1639 1640 1641 1642 1643 1644 1645 1646 1647 1648 1649 1650 1651 1652 1653 1654 1655 1656 1657 1658 1659 1660 1661 1662 1663 1664 1665 1666 1667 1668 1669 1670 1671 1672 1673 1674 1675 1676 1677 1678 1679 1680 1681 1682 1683 1684 1685 1686 1687 1688 1689 1690 1691 1692 1693 1694 1695 1696 1697 1698 1699 1700 1701 1702 1703 1704 1705 1706 1707 1708 1709 1710 1711 1712 1713 1714 1715 1716 1717 1718 1719 1720 1721 1722 1723 1724 1725 1726 1727 1728 1729 1730 1731 1732 1733 1734 1735 1736 1737 1738 1739 1740 1741 1742 1743 1744 1745 1746 1747 1748 1749 1750 1751 1752 1753 1754 1755 1756 1757 1758 1759 1760 1761 1762 1763 1764 1765 1766 1767 1768 1769 1770 1771 1772 1773 1774 1775 1776 1777 1778 1779 1780 1781 1782 1783 1784 1785 1786 1787 1788 1789 1790 1791 1792 1793 1794 1795 1796 1797 1798 1799 1800 1801 1802 1803 1804 1805 1806 1807 1808 1809 1810 1811 1812 1813 1814 1815 1816 1817 1818 1819 1820 1821 1822 1823 1824 1825 1826 1827 1828 1829 1830 1831 1832 1833 1834 1835 1836 1837 1838 1839 1840 1841 1842 1843 1844 1845 1846 1847 1848 1849 1850 1851 1852 1853 1854 1855 1856 1857 1858 1859 1860 1861 1862 1863 1864 1865 1866 1867 1868 1869 1870 1871 1872 1873 1874 1875 1876 1877 1878 1879 1880 1881 1882 1883 1884 1885 1886 1887 1888 1889 1890 1891 1892 1893 1894 1895 1896 1897 1898 1899 1900 1901 1902 1903 1904 1905 1906 1907 1908 1909 1910 1911 1912 1913 1914 1915 1916 1917 1918 1919 1920 1921 1922 1923 1924 1925 1926 1927 1928 1929 1930 1931 1932 1933 1934 1935 1936 1937 1938 1939 1940 1941 1942 1943 1944 1945 1946 1947 1948 1949 1950 1951 1952 1953 1954 1955 1956 1957 1958 1959 1960 1961 1962 1963 1964 1965 1966 1967 1968 1969 1970 1971 1972 1973 1974 1975 1976 1977 1978 1979 1980 1981 1982 1983 1984 1985 1986 1987 1988 1989 1990 1991 1992 1993 1994 1995 1996 1997 1998 1999 2000 2001 2002 2003 2004 2005 2006 2007 2008 2009 2010 2011 2012 2013 2014 2015 2016 2017 2018 2019 2020 2021 2022 2023 2024 2025 2026 2027 2028 2029 2030 2031 2032 2033 2034 2035 2036 2037 2038 2039 2040 2041 2042 2043 2044 2045 2046 2047 2048 2049 2050 2051 2052 2053 2054 2055 2056 2057 2058 2059 2060 2061 2062 2063 2064 2065 2066 2067 2068 2069 2070 2071 2072 2073 2074 2075 2076 2077 2078 2079 2080 2081 2082 2083 2084 2085 2086 2087 2088 2089 2090 2091 2092 2093 2094 2095 2096 2097 2098 2099 2100 2101 2102 2103 2104 2105 2106 2107 2108 2109 2110 2111 2112 2113 2114 2115 2116 2117 2118 2119 2120 2121 2122 2123 2124 2125 2126 2127 2128 2129 2130 2131 2132 2133 2134 2135 2136 2137 2138 2139 2140 2141 2142 2143 2144 2145 2146 2147 2148 2149 2150 2151 2152 2153 2154 2155 2156 2157 2158 2159 2160 2161 2162 2163 2164 2165 2166 2167 2168 2169 2170 2171 2172 2173 2174 2175 2176 2177 2178 2179 2180 2181 2182 2183 2184 2185 2186 2187 2188 2189 2190 2191 2192 2193 2194 2195 2196 2197 2198 2199 2200 2201 2202 2203 2204 2205 2206 2207 2208 2209 2210 2211 2212 2213 2214 2215 2216 2217 2218 2219 2220 2221 2222 2223 2224 2225 2226 2227 2228 2229 2230 2231 2232 2233 2234 2235 2236 2237 2238 2239 2240 2241 2242 2243 2244 2245 2246 2247 2248 2249 2250 2251 2252 2253 2254 2255 2256 2257 2258 2259 2260 2261 2262 2263 2264 2265 2266 2267 2268 2269 2270 2271 2272 2273 2274 2275 2276 2277 2278 2279 2280 2281 2282 2283 2284 2285 2286 2287 2288 2289 2290 2291 2292 2293 2294 2295 2296 2297 2298 2299 2300 2301 2302 2303 2304 2305 2306 2307 2308 2309 2310 2311 2312 2313 2314 2315 2316 2317 2318 2319 2320 2321 2322 2323 2324 2325 2326 2327 2328 2329 2330 2331 2332 2333 2334 2335 2336 2337 2338 2339 2340 2341 2342 2343 2344 2345 2346 2347 2348 2349 2350 2351 2352 2353 2354 2355 2356 2357 2358 2359 2360 2361 2362 2363 2364 2365 2366 2367 2368 2369 2370 2371 2372 2373 2374 2375 2376 2377 2378 2379 2380 2381 2382 2383 2384 2385 2386 2387 2388 2389 2390 2391 2392 2393 2394 2395 2396 2397 2398 2399 2400 2401 2402 2403 2404 2405 2406 2407 2408 2409 2410 2411 2412 2413 2414 2415 2416 2417 2418 2419 2420 2421 2422 2423 2424 2425 2426 2427 2428 2429 2430 2431 2432 2433 2434 2435 2436 2437 2438 2439 2440 2441 2442 2443 2444 2445 2446 2447 2448 2449 2450 2451 2452 2453 2454 2455 2456 2457 2458 2459 2460 2461 2462 2463 2464 2465 2466 2467 2468 2469 2470 2471 2472 2473 2474 2475 2476 2477 2478 2479 2480 2481 // SPDX-License-Identifier: GPL-2.0+ /* * XArray implementation * Copyright (c) 2017-2018 Microsoft Corporation * Copyright (c) 2018-2020 Oracle * Author: Matthew Wilcox <willy@infradead.org> */ #include <linux/bitmap.h> #include <linux/export.h> #include <linux/list.h> #include <linux/slab.h> #include <linux/xarray.h> #include "radix-tree.h" /* * Coding conventions in this file: * * @xa is used to refer to the entire xarray. * @xas is the 'xarray operation state'. It may be either a pointer to * an xa_state, or an xa_state stored on the stack. This is an unfortunate * ambiguity. * @index is the index of the entry being operated on * @mark is an xa_mark_t; a small number indicating one of the mark bits. * @node refers to an xa_node; usually the primary one being operated on by * this function. * @offset is the index into the slots array inside an xa_node. * @parent refers to the @xa_node closer to the head than @node. * @entry refers to something stored in a slot in the xarray */ static inline unsigned int xa_lock_type(const struct xarray *xa) { return (__force unsigned int)xa->xa_flags & 3; } static inline void xas_lock_type(struct xa_state *xas, unsigned int lock_type) { if (lock_type == XA_LOCK_IRQ) xas_lock_irq(xas); else if (lock_type == XA_LOCK_BH) xas_lock_bh(xas); else xas_lock(xas); } static inline void xas_unlock_type(struct xa_state *xas, unsigned int lock_type) { if (lock_type == XA_LOCK_IRQ) xas_unlock_irq(xas); else if (lock_type == XA_LOCK_BH) xas_unlock_bh(xas); else xas_unlock(xas); } static inline bool xa_track_free(const struct xarray *xa) { return xa->xa_flags & XA_FLAGS_TRACK_FREE; } static inline bool xa_zero_busy(const struct xarray *xa) { return xa->xa_flags & XA_FLAGS_ZERO_BUSY; } static inline void xa_mark_set(struct xarray *xa, xa_mark_t mark) { if (!(xa->xa_flags & XA_FLAGS_MARK(mark))) xa->xa_flags |= XA_FLAGS_MARK(mark); } static inline void xa_mark_clear(struct xarray *xa, xa_mark_t mark) { if (xa->xa_flags & XA_FLAGS_MARK(mark)) xa->xa_flags &= ~(XA_FLAGS_MARK(mark)); } static inline unsigned long *node_marks(struct xa_node *node, xa_mark_t mark) { return node->marks[(__force unsigned)mark]; } static inline bool node_get_mark(struct xa_node *node, unsigned int offset, xa_mark_t mark) { return test_bit(offset, node_marks(node, mark)); } /* returns true if the bit was set */ static inline bool node_set_mark(struct xa_node *node, unsigned int offset, xa_mark_t mark) { return __test_and_set_bit(offset, node_marks(node, mark)); } /* returns true if the bit was set */ static inline bool node_clear_mark(struct xa_node *node, unsigned int offset, xa_mark_t mark) { return __test_and_clear_bit(offset, node_marks(node, mark)); } static inline bool node_any_mark(struct xa_node *node, xa_mark_t mark) { return !bitmap_empty(node_marks(node, mark), XA_CHUNK_SIZE); } static inline void node_mark_all(struct xa_node *node, xa_mark_t mark) { bitmap_fill(node_marks(node, mark), XA_CHUNK_SIZE); } #define mark_inc(mark) do { \ mark = (__force xa_mark_t)((__force unsigned)(mark) + 1); \ } while (0) /* * xas_squash_marks() - Merge all marks to the first entry * @xas: Array operation state. * * Set a mark on the first entry if any entry has it set. Clear marks on * all sibling entries. */ static void xas_squash_marks(const struct xa_state *xas) { xa_mark_t mark = 0; unsigned int limit = xas->xa_offset + xas->xa_sibs + 1; for (;;) { unsigned long *marks = node_marks(xas->xa_node, mark); if (find_next_bit(marks, limit, xas->xa_offset + 1) != limit) { __set_bit(xas->xa_offset, marks); bitmap_clear(marks, xas->xa_offset + 1, xas->xa_sibs); } if (mark == XA_MARK_MAX) break; mark_inc(mark); } } /* extracts the offset within this node from the index */ static unsigned int get_offset(unsigned long index, struct xa_node *node) { return (index >> node->shift) & XA_CHUNK_MASK; } static void xas_set_offset(struct xa_state *xas) { xas->xa_offset = get_offset(xas->xa_index, xas->xa_node); } /* move the index either forwards (find) or backwards (sibling slot) */ static void xas_move_index(struct xa_state *xas, unsigned long offset) { unsigned int shift = xas->xa_node->shift; xas->xa_index &= ~XA_CHUNK_MASK << shift; xas->xa_index += offset << shift; } static void xas_next_offset(struct xa_state *xas) { xas->xa_offset++; xas_move_index(xas, xas->xa_offset); } static void *set_bounds(struct xa_state *xas) { xas->xa_node = XAS_BOUNDS; return NULL; } /* * Starts a walk. If the @xas is already valid, we assume that it's on * the right path and just return where we've got to. If we're in an * error state, return NULL. If the index is outside the current scope * of the xarray, return NULL without changing @xas->xa_node. Otherwise * set @xas->xa_node to NULL and return the current head of the array. */ static void *xas_start(struct xa_state *xas) { void *entry; if (xas_valid(xas)) return xas_reload(xas); if (xas_error(xas)) return NULL; entry = xa_head(xas->xa); if (!xa_is_node(entry)) { if (xas->xa_index) return set_bounds(xas); } else { if ((xas->xa_index >> xa_to_node(entry)->shift) > XA_CHUNK_MASK) return set_bounds(xas); } xas->xa_node = NULL; return entry; } static __always_inline void *xas_descend(struct xa_state *xas, struct xa_node *node) { unsigned int offset = get_offset(xas->xa_index, node); void *entry = xa_entry(xas->xa, node, offset); xas->xa_node = node; while (xa_is_sibling(entry)) { offset = xa_to_sibling(entry); entry = xa_entry(xas->xa, node, offset); if (node->shift && xa_is_node(entry)) entry = XA_RETRY_ENTRY; } xas->xa_offset = offset; return entry; } /** * xas_load() - Load an entry from the XArray (advanced). * @xas: XArray operation state. * * Usually walks the @xas to the appropriate state to load the entry * stored at xa_index. However, it will do nothing and return %NULL if * @xas is in an error state. xas_load() will never expand the tree. * * If the xa_state is set up to operate on a multi-index entry, xas_load() * may return %NULL or an internal entry, even if there are entries * present within the range specified by @xas. * * Context: Any context. The caller should hold the xa_lock or the RCU lock. * Return: Usually an entry in the XArray, but see description for exceptions. */ void *xas_load(struct xa_state *xas) { void *entry = xas_start(xas); while (xa_is_node(entry)) { struct xa_node *node = xa_to_node(entry); if (xas->xa_shift > node->shift) break; entry = xas_descend(xas, node); if (node->shift == 0) break; } return entry; } EXPORT_SYMBOL_GPL(xas_load); #define XA_RCU_FREE ((struct xarray *)1) static void xa_node_free(struct xa_node *node) { XA_NODE_BUG_ON(node, !list_empty(&node->private_list)); node->array = XA_RCU_FREE; call_rcu(&node->rcu_head, radix_tree_node_rcu_free); } /* * xas_destroy() - Free any resources allocated during the XArray operation. * @xas: XArray operation state. * * Most users will not need to call this function; it is called for you * by xas_nomem(). */ void xas_destroy(struct xa_state *xas) { struct xa_node *next, *node = xas->xa_alloc; while (node) { XA_NODE_BUG_ON(node, !list_empty(&node->private_list)); next = rcu_dereference_raw(node->parent); radix_tree_node_rcu_free(&node->rcu_head); xas->xa_alloc = node = next; } } EXPORT_SYMBOL_GPL(xas_destroy); /** * xas_nomem() - Allocate memory if needed. * @xas: XArray operation state. * @gfp: Memory allocation flags. * * If we need to add new nodes to the XArray, we try to allocate memory * with GFP_NOWAIT while holding the lock, which will usually succeed. * If it fails, @xas is flagged as needing memory to continue. The caller * should drop the lock and call xas_nomem(). If xas_nomem() succeeds, * the caller should retry the operation. * * Forward progress is guaranteed as one node is allocated here and * stored in the xa_state where it will be found by xas_alloc(). More * nodes will likely be found in the slab allocator, but we do not tie * them up here. * * Return: true if memory was needed, and was successfully allocated. */ bool xas_nomem(struct xa_state *xas, gfp_t gfp) { if (xas->xa_node != XA_ERROR(-ENOMEM)) { xas_destroy(xas); return false; } if (xas->xa->xa_flags & XA_FLAGS_ACCOUNT) gfp |= __GFP_ACCOUNT; xas->xa_alloc = kmem_cache_alloc_lru(radix_tree_node_cachep, xas->xa_lru, gfp); if (!xas->xa_alloc) return false; xas->xa_alloc->parent = NULL; XA_NODE_BUG_ON(xas->xa_alloc, !list_empty(&xas->xa_alloc->private_list)); xas->xa_node = XAS_RESTART; return true; } EXPORT_SYMBOL_GPL(xas_nomem); /* * __xas_nomem() - Drop locks and allocate memory if needed. * @xas: XArray operation state. * @gfp: Memory allocation flags. * * Internal variant of xas_nomem(). * * Return: true if memory was needed, and was successfully allocated. */ static bool __xas_nomem(struct xa_state *xas, gfp_t gfp) __must_hold(xas->xa->xa_lock) { unsigned int lock_type = xa_lock_type(xas->xa); if (xas->xa_node != XA_ERROR(-ENOMEM)) { xas_destroy(xas); return false; } if (xas->xa->xa_flags & XA_FLAGS_ACCOUNT) gfp |= __GFP_ACCOUNT; if (gfpflags_allow_blocking(gfp)) { xas_unlock_type(xas, lock_type); xas->xa_alloc = kmem_cache_alloc_lru(radix_tree_node_cachep, xas->xa_lru, gfp); xas_lock_type(xas, lock_type); } else { xas->xa_alloc = kmem_cache_alloc_lru(radix_tree_node_cachep, xas->xa_lru, gfp); } if (!xas->xa_alloc) return false; xas->xa_alloc->parent = NULL; XA_NODE_BUG_ON(xas->xa_alloc, !list_empty(&xas->xa_alloc->private_list)); xas->xa_node = XAS_RESTART; return true; } static void xas_update(struct xa_state *xas, struct xa_node *node) { if (xas->xa_update) xas->xa_update(node); else XA_NODE_BUG_ON(node, !list_empty(&node->private_list)); } static void *xas_alloc(struct xa_state *xas, unsigned int shift) { struct xa_node *parent = xas->xa_node; struct xa_node *node = xas->xa_alloc; if (xas_invalid(xas)) return NULL; if (node) { xas->xa_alloc = NULL; } else { gfp_t gfp = GFP_NOWAIT | __GFP_NOWARN; if (xas->xa->xa_flags & XA_FLAGS_ACCOUNT) gfp |= __GFP_ACCOUNT; node = kmem_cache_alloc_lru(radix_tree_node_cachep, xas->xa_lru, gfp); if (!node) { xas_set_err(xas, -ENOMEM); return NULL; } } if (parent) { node->offset = xas->xa_offset; parent->count++; XA_NODE_BUG_ON(node, parent->count > XA_CHUNK_SIZE); xas_update(xas, parent); } XA_NODE_BUG_ON(node, shift > BITS_PER_LONG); XA_NODE_BUG_ON(node, !list_empty(&node->private_list)); node->shift = shift; node->count = 0; node->nr_values = 0; RCU_INIT_POINTER(node->parent, xas->xa_node); node->array = xas->xa; return node; } #ifdef CONFIG_XARRAY_MULTI /* Returns the number of indices covered by a given xa_state */ static unsigned long xas_size(const struct xa_state *xas) { return (xas->xa_sibs + 1UL) << xas->xa_shift; } #endif /* * Use this to calculate the maximum index that will need to be created * in order to add the entry described by @xas. Because we cannot store a * multi-index entry at index 0, the calculation is a little more complex * than you might expect. */ static unsigned long xas_max(struct xa_state *xas) { unsigned long max = xas->xa_index; #ifdef CONFIG_XARRAY_MULTI if (xas->xa_shift || xas->xa_sibs) { unsigned long mask = xas_size(xas) - 1; max |= mask; if (mask == max) max++; } #endif return max; } /* The maximum index that can be contained in the array without expanding it */ static unsigned long max_index(void *entry) { if (!xa_is_node(entry)) return 0; return (XA_CHUNK_SIZE << xa_to_node(entry)->shift) - 1; } static inline void *xa_zero_to_null(void *entry) { return xa_is_zero(entry) ? NULL : entry; } static void xas_shrink(struct xa_state *xas) { struct xarray *xa = xas->xa; struct xa_node *node = xas->xa_node; for (;;) { void *entry; XA_NODE_BUG_ON(node, node->count > XA_CHUNK_SIZE); if (node->count != 1) break; entry = xa_entry_locked(xa, node, 0); if (!entry) break; if (!xa_is_node(entry) && node->shift) break; if (xa_zero_busy(xa)) entry = xa_zero_to_null(entry); xas->xa_node = XAS_BOUNDS; RCU_INIT_POINTER(xa->xa_head, entry); if (xa_track_free(xa) && !node_get_mark(node, 0, XA_FREE_MARK)) xa_mark_clear(xa, XA_FREE_MARK); node->count = 0; node->nr_values = 0; if (!xa_is_node(entry)) RCU_INIT_POINTER(node->slots[0], XA_RETRY_ENTRY); xas_update(xas, node); xa_node_free(node); if (!xa_is_node(entry)) break; node = xa_to_node(entry); node->parent = NULL; } } /* * xas_delete_node() - Attempt to delete an xa_node * @xas: Array operation state. * * Attempts to delete the @xas->xa_node. This will fail if xa->node has * a non-zero reference count. */ static void xas_delete_node(struct xa_state *xas) { struct xa_node *node = xas->xa_node; for (;;) { struct xa_node *parent; XA_NODE_BUG_ON(node, node->count > XA_CHUNK_SIZE); if (node->count) break; parent = xa_parent_locked(xas->xa, node); xas->xa_node = parent; xas->xa_offset = node->offset; xa_node_free(node); if (!parent) { xas->xa->xa_head = NULL; xas->xa_node = XAS_BOUNDS; return; } parent->slots[xas->xa_offset] = NULL; parent->count--; XA_NODE_BUG_ON(parent, parent->count > XA_CHUNK_SIZE); node = parent; xas_update(xas, node); } if (!node->parent) xas_shrink(xas); } /** * xas_free_nodes() - Free this node and all nodes that it references * @xas: Array operation state. * @top: Node to free * * This node has been removed from the tree. We must now free it and all * of its subnodes. There may be RCU walkers with references into the tree, * so we must replace all entries with retry markers. */ static void xas_free_nodes(struct xa_state *xas, struct xa_node *top) { unsigned int offset = 0; struct xa_node *node = top; for (;;) { void *entry = xa_entry_locked(xas->xa, node, offset); if (node->shift && xa_is_node(entry)) { node = xa_to_node(entry); offset = 0; continue; } if (entry) RCU_INIT_POINTER(node->slots[offset], XA_RETRY_ENTRY); offset++; while (offset == XA_CHUNK_SIZE) { struct xa_node *parent; parent = xa_parent_locked(xas->xa, node); offset = node->offset + 1; node->count = 0; node->nr_values = 0; xas_update(xas, node); xa_node_free(node); if (node == top) return; node = parent; } } } /* * xas_expand adds nodes to the head of the tree until it has reached * sufficient height to be able to contain @xas->xa_index */ static int xas_expand(struct xa_state *xas, void *head) { struct xarray *xa = xas->xa; struct xa_node *node = NULL; unsigned int shift = 0; unsigned long max = xas_max(xas); if (!head) { if (max == 0) return 0; while ((max >> shift) >= XA_CHUNK_SIZE) shift += XA_CHUNK_SHIFT; return shift + XA_CHUNK_SHIFT; } else if (xa_is_node(head)) { node = xa_to_node(head); shift = node->shift + XA_CHUNK_SHIFT; } xas->xa_node = NULL; while (max > max_index(head)) { xa_mark_t mark = 0; XA_NODE_BUG_ON(node, shift > BITS_PER_LONG); node = xas_alloc(xas, shift); if (!node) return -ENOMEM; node->count = 1; if (xa_is_value(head)) node->nr_values = 1; RCU_INIT_POINTER(node->slots[0], head); /* Propagate the aggregated mark info to the new child */ for (;;) { if (xa_track_free(xa) && mark == XA_FREE_MARK) { node_mark_all(node, XA_FREE_MARK); if (!xa_marked(xa, XA_FREE_MARK)) { node_clear_mark(node, 0, XA_FREE_MARK); xa_mark_set(xa, XA_FREE_MARK); } } else if (xa_marked(xa, mark)) { node_set_mark(node, 0, mark); } if (mark == XA_MARK_MAX) break; mark_inc(mark); } /* * Now that the new node is fully initialised, we can add * it to the tree */ if (xa_is_node(head)) { xa_to_node(head)->offset = 0; rcu_assign_pointer(xa_to_node(head)->parent, node); } head = xa_mk_node(node); rcu_assign_pointer(xa->xa_head, head); xas_update(xas, node); shift += XA_CHUNK_SHIFT; } xas->xa_node = node; return shift; } /* * xas_create() - Create a slot to store an entry in. * @xas: XArray operation state. * @allow_root: %true if we can store the entry in the root directly * * Most users will not need to call this function directly, as it is called * by xas_store(). It is useful for doing conditional store operations * (see the xa_cmpxchg() implementation for an example). * * Return: If the slot already existed, returns the contents of this slot. * If the slot was newly created, returns %NULL. If it failed to create the * slot, returns %NULL and indicates the error in @xas. */ static void *xas_create(struct xa_state *xas, bool allow_root) { struct xarray *xa = xas->xa; void *entry; void __rcu **slot; struct xa_node *node = xas->xa_node; int shift; unsigned int order = xas->xa_shift; if (xas_top(node)) { entry = xa_head_locked(xa); xas->xa_node = NULL; if (!entry && xa_zero_busy(xa)) entry = XA_ZERO_ENTRY; shift = xas_expand(xas, entry); if (shift < 0) return NULL; if (!shift && !allow_root) shift = XA_CHUNK_SHIFT; entry = xa_head_locked(xa); slot = &xa->xa_head; } else if (xas_error(xas)) { return NULL; } else if (node) { unsigned int offset = xas->xa_offset; shift = node->shift; entry = xa_entry_locked(xa, node, offset); slot = &node->slots[offset]; } else { shift = 0; entry = xa_head_locked(xa); slot = &xa->xa_head; } while (shift > order) { shift -= XA_CHUNK_SHIFT; if (!entry) { node = xas_alloc(xas, shift); if (!node) break; if (xa_track_free(xa)) node_mark_all(node, XA_FREE_MARK); rcu_assign_pointer(*slot, xa_mk_node(node)); } else if (xa_is_node(entry)) { node = xa_to_node(entry); } else { break; } entry = xas_descend(xas, node); slot = &node->slots[xas->xa_offset]; } return entry; } /** * xas_create_range() - Ensure that stores to this range will succeed * @xas: XArray operation state. * * Creates all of the slots in the range covered by @xas. Sets @xas to * create single-index entries and positions it at the beginning of the * range. This is for the benefit of users which have not yet been * converted to use multi-index entries. */ void xas_create_range(struct xa_state *xas) { unsigned long index = xas->xa_index; unsigned char shift = xas->xa_shift; unsigned char sibs = xas->xa_sibs; xas->xa_index |= ((sibs + 1UL) << shift) - 1; if (xas_is_node(xas) && xas->xa_node->shift == xas->xa_shift) xas->xa_offset |= sibs; xas->xa_shift = 0; xas->xa_sibs = 0; for (;;) { xas_create(xas, true); if (xas_error(xas)) goto restore; if (xas->xa_index <= (index | XA_CHUNK_MASK)) goto success; xas->xa_index -= XA_CHUNK_SIZE; for (;;) { struct xa_node *node = xas->xa_node; if (node->shift >= shift) break; xas->xa_node = xa_parent_locked(xas->xa, node); xas->xa_offset = node->offset - 1; if (node->offset != 0) break; } } restore: xas->xa_shift = shift; xas->xa_sibs = sibs; xas->xa_index = index; return; success: xas->xa_index = index; if (xas->xa_node) xas_set_offset(xas); } EXPORT_SYMBOL_GPL(xas_create_range); static void update_node(struct xa_state *xas, struct xa_node *node, int count, int values) { if (!node || (!count && !values)) return; node->count += count; node->nr_values += values; XA_NODE_BUG_ON(node, node->count > XA_CHUNK_SIZE); XA_NODE_BUG_ON(node, node->nr_values > XA_CHUNK_SIZE); xas_update(xas, node); if (count < 0) xas_delete_node(xas); } /** * xas_store() - Store this entry in the XArray. * @xas: XArray operation state. * @entry: New entry. * * If @xas is operating on a multi-index entry, the entry returned by this * function is essentially meaningless (it may be an internal entry or it * may be %NULL, even if there are non-NULL entries at some of the indices * covered by the range). This is not a problem for any current users, * and can be changed if needed. * * Return: The old entry at this index. */ void *xas_store(struct xa_state *xas, void *entry) { struct xa_node *node; void __rcu **slot = &xas->xa->xa_head; unsigned int offset, max; int count = 0; int values = 0; void *first, *next; bool value = xa_is_value(entry); if (entry) { bool allow_root = !xa_is_node(entry) && !xa_is_zero(entry); first = xas_create(xas, allow_root); } else { first = xas_load(xas); } if (xas_invalid(xas)) return first; node = xas->xa_node; if (node && (xas->xa_shift < node->shift)) xas->xa_sibs = 0; if ((first == entry) && !xas->xa_sibs) return first; next = first; offset = xas->xa_offset; max = xas->xa_offset + xas->xa_sibs; if (node) { slot = &node->slots[offset]; if (xas->xa_sibs) xas_squash_marks(xas); } if (!entry) xas_init_marks(xas); for (;;) { /* * Must clear the marks before setting the entry to NULL, * otherwise xas_for_each_marked may find a NULL entry and * stop early. rcu_assign_pointer contains a release barrier * so the mark clearing will appear to happen before the * entry is set to NULL. */ rcu_assign_pointer(*slot, entry); if (xa_is_node(next) && (!node || node->shift)) xas_free_nodes(xas, xa_to_node(next)); if (!node) break; count += !next - !entry; values += !xa_is_value(first) - !value; if (entry) { if (offset == max) break; if (!xa_is_sibling(entry)) entry = xa_mk_sibling(xas->xa_offset); } else { if (offset == XA_CHUNK_MASK) break; } next = xa_entry_locked(xas->xa, node, ++offset); if (!xa_is_sibling(next)) { if (!entry && (offset > max)) break; first = next; } slot++; } update_node(xas, node, count, values); return first; } EXPORT_SYMBOL_GPL(xas_store); /** * xas_get_mark() - Returns the state of this mark. * @xas: XArray operation state. * @mark: Mark number. * * Return: true if the mark is set, false if the mark is clear or @xas * is in an error state. */ bool xas_get_mark(const struct xa_state *xas, xa_mark_t mark) { if (xas_invalid(xas)) return false; if (!xas->xa_node) return xa_marked(xas->xa, mark); return node_get_mark(xas->xa_node, xas->xa_offset, mark); } EXPORT_SYMBOL_GPL(xas_get_mark); /** * xas_set_mark() - Sets the mark on this entry and its parents. * @xas: XArray operation state. * @mark: Mark number. * * Sets the specified mark on this entry, and walks up the tree setting it * on all the ancestor entries. Does nothing if @xas has not been walked to * an entry, or is in an error state. */ void xas_set_mark(const struct xa_state *xas, xa_mark_t mark) { struct xa_node *node = xas->xa_node; unsigned int offset = xas->xa_offset; if (xas_invalid(xas)) return; while (node) { if (node_set_mark(node, offset, mark)) return; offset = node->offset; node = xa_parent_locked(xas->xa, node); } if (!xa_marked(xas->xa, mark)) xa_mark_set(xas->xa, mark); } EXPORT_SYMBOL_GPL(xas_set_mark); /** * xas_clear_mark() - Clears the mark on this entry and its parents. * @xas: XArray operation state. * @mark: Mark number. * * Clears the specified mark on this entry, and walks back to the head * attempting to clear it on all the ancestor entries. Does nothing if * @xas has not been walked to an entry, or is in an error state. */ void xas_clear_mark(const struct xa_state *xas, xa_mark_t mark) { struct xa_node *node = xas->xa_node; unsigned int offset = xas->xa_offset; if (xas_invalid(xas)) return; while (node) { if (!node_clear_mark(node, offset, mark)) return; if (node_any_mark(node, mark)) return; offset = node->offset; node = xa_parent_locked(xas->xa, node); } if (xa_marked(xas->xa, mark)) xa_mark_clear(xas->xa, mark); } EXPORT_SYMBOL_GPL(xas_clear_mark); /** * xas_init_marks() - Initialise all marks for the entry * @xas: Array operations state. * * Initialise all marks for the entry specified by @xas. If we're tracking * free entries with a mark, we need to set it on all entries. All other * marks are cleared. * * This implementation is not as efficient as it could be; we may walk * up the tree multiple times. */ void xas_init_marks(const struct xa_state *xas) { xa_mark_t mark = 0; for (;;) { if (xa_track_free(xas->xa) && mark == XA_FREE_MARK) xas_set_mark(xas, mark); else xas_clear_mark(xas, mark); if (mark == XA_MARK_MAX) break; mark_inc(mark); } } EXPORT_SYMBOL_GPL(xas_init_marks); #ifdef CONFIG_XARRAY_MULTI static unsigned int node_get_marks(struct xa_node *node, unsigned int offset) { unsigned int marks = 0; xa_mark_t mark = XA_MARK_0; for (;;) { if (node_get_mark(node, offset, mark)) marks |= 1 << (__force unsigned int)mark; if (mark == XA_MARK_MAX) break; mark_inc(mark); } return marks; } static inline void node_mark_slots(struct xa_node *node, unsigned int sibs, xa_mark_t mark) { int i; if (sibs == 0) node_mark_all(node, mark); else { for (i = 0; i < XA_CHUNK_SIZE; i += sibs + 1) node_set_mark(node, i, mark); } } static void node_set_marks(struct xa_node *node, unsigned int offset, struct xa_node *child, unsigned int sibs, unsigned int marks) { xa_mark_t mark = XA_MARK_0; for (;;) { if (marks & (1 << (__force unsigned int)mark)) { node_set_mark(node, offset, mark); if (child) node_mark_slots(child, sibs, mark); } if (mark == XA_MARK_MAX) break; mark_inc(mark); } } static void __xas_init_node_for_split(struct xa_state *xas, struct xa_node *node, void *entry) { unsigned int i; void *sibling = NULL; unsigned int mask = xas->xa_sibs; if (!node) return; node->array = xas->xa; for (i = 0; i < XA_CHUNK_SIZE; i++) { if ((i & mask) == 0) { RCU_INIT_POINTER(node->slots[i], entry); sibling = xa_mk_sibling(i); } else { RCU_INIT_POINTER(node->slots[i], sibling); } } } /** * xas_split_alloc() - Allocate memory for splitting an entry. * @xas: XArray operation state. * @entry: New entry which will be stored in the array. * @order: Current entry order. * @gfp: Memory allocation flags. * * This function should be called before calling xas_split(). * If necessary, it will allocate new nodes (and fill them with @entry) * to prepare for the upcoming split of an entry of @order size into * entries of the order stored in the @xas. * * Context: May sleep if @gfp flags permit. */ void xas_split_alloc(struct xa_state *xas, void *entry, unsigned int order, gfp_t gfp) { unsigned int sibs = (1 << (order % XA_CHUNK_SHIFT)) - 1; /* XXX: no support for splitting really large entries yet */ if (WARN_ON(xas->xa_shift + 2 * XA_CHUNK_SHIFT <= order)) goto nomem; if (xas->xa_shift + XA_CHUNK_SHIFT > order) return; do { struct xa_node *node; node = kmem_cache_alloc_lru(radix_tree_node_cachep, xas->xa_lru, gfp); if (!node) goto nomem; __xas_init_node_for_split(xas, node, entry); RCU_INIT_POINTER(node->parent, xas->xa_alloc); xas->xa_alloc = node; } while (sibs-- > 0); return; nomem: xas_destroy(xas); xas_set_err(xas, -ENOMEM); } EXPORT_SYMBOL_GPL(xas_split_alloc); /** * xas_split() - Split a multi-index entry into smaller entries. * @xas: XArray operation state. * @entry: New entry to store in the array. * @order: Current entry order. * * The size of the new entries is set in @xas. The value in @entry is * copied to all the replacement entries. * * Context: Any context. The caller should hold the xa_lock. */ void xas_split(struct xa_state *xas, void *entry, unsigned int order) { unsigned int sibs = (1 << (order % XA_CHUNK_SHIFT)) - 1; unsigned int offset, marks; struct xa_node *node; void *curr = xas_load(xas); int values = 0; node = xas->xa_node; if (xas_top(node)) return; marks = node_get_marks(node, xas->xa_offset); offset = xas->xa_offset + sibs; do { if (xas->xa_shift < node->shift) { struct xa_node *child = xas->xa_alloc; xas->xa_alloc = rcu_dereference_raw(child->parent); child->shift = node->shift - XA_CHUNK_SHIFT; child->offset = offset; child->count = XA_CHUNK_SIZE; child->nr_values = xa_is_value(entry) ? XA_CHUNK_SIZE : 0; RCU_INIT_POINTER(child->parent, node); node_set_marks(node, offset, child, xas->xa_sibs, marks); rcu_assign_pointer(node->slots[offset], xa_mk_node(child)); if (xa_is_value(curr)) values--; xas_update(xas, child); } else { unsigned int canon = offset - xas->xa_sibs; node_set_marks(node, canon, NULL, 0, marks); rcu_assign_pointer(node->slots[canon], entry); while (offset > canon) rcu_assign_pointer(node->slots[offset--], xa_mk_sibling(canon)); values += (xa_is_value(entry) - xa_is_value(curr)) * (xas->xa_sibs + 1); } } while (offset-- > xas->xa_offset); node->nr_values += values; xas_update(xas, node); } EXPORT_SYMBOL_GPL(xas_split); /** * xas_try_split_min_order() - Minimal split order xas_try_split() can accept * @order: Current entry order. * * xas_try_split() can split a multi-index entry to smaller than @order - 1 if * no new xa_node is needed. This function provides the minimal order * xas_try_split() supports. * * Return: the minimal order xas_try_split() supports * * Context: Any context. * */ unsigned int xas_try_split_min_order(unsigned int order) { if (order % XA_CHUNK_SHIFT == 0) return order == 0 ? 0 : order - 1; return order - (order % XA_CHUNK_SHIFT); } EXPORT_SYMBOL_GPL(xas_try_split_min_order); /** * xas_try_split() - Try to split a multi-index entry. * @xas: XArray operation state. * @entry: New entry to store in the array. * @order: Current entry order. * * The size of the new entries is set in @xas. The value in @entry is * copied to all the replacement entries. If and only if one new xa_node is * needed, the function will use GFP_NOWAIT to get one if xas->xa_alloc is * NULL. If more new xa_node are needed, the function gives EINVAL error. * * NOTE: use xas_try_split_min_order() to get next split order instead of * @order - 1 if you want to minmize xas_try_split() calls. * * Context: Any context. The caller should hold the xa_lock. */ void xas_try_split(struct xa_state *xas, void *entry, unsigned int order) { unsigned int sibs = (1 << (order % XA_CHUNK_SHIFT)) - 1; unsigned int offset, marks; struct xa_node *node; void *curr = xas_load(xas); int values = 0; gfp_t gfp = GFP_NOWAIT; node = xas->xa_node; if (xas_top(node)) return; if (xas->xa->xa_flags & XA_FLAGS_ACCOUNT) gfp |= __GFP_ACCOUNT; marks = node_get_marks(node, xas->xa_offset); offset = xas->xa_offset + sibs; if (xas->xa_shift < node->shift) { struct xa_node *child = xas->xa_alloc; unsigned int expected_sibs = (1 << ((order - 1) % XA_CHUNK_SHIFT)) - 1; /* * No support for splitting sibling entries * (horizontally) or cascade split (vertically), which * requires two or more new xa_nodes. * Since if one xa_node allocation fails, * it is hard to free the prior allocations. */ if (sibs || xas->xa_sibs != expected_sibs) { xas_destroy(xas); xas_set_err(xas, -EINVAL); return; } if (!child) { child = kmem_cache_alloc_lru(radix_tree_node_cachep, xas->xa_lru, gfp); if (!child) { xas_destroy(xas); xas_set_err(xas, -ENOMEM); return; } RCU_INIT_POINTER(child->parent, xas->xa_alloc); } __xas_init_node_for_split(xas, child, entry); xas->xa_alloc = rcu_dereference_raw(child->parent); child->shift = node->shift - XA_CHUNK_SHIFT; child->offset = offset; child->count = XA_CHUNK_SIZE; child->nr_values = xa_is_value(entry) ? XA_CHUNK_SIZE : 0; RCU_INIT_POINTER(child->parent, node); node_set_marks(node, offset, child, xas->xa_sibs, marks); rcu_assign_pointer(node->slots[offset], xa_mk_node(child)); if (xa_is_value(curr)) values--; xas_update(xas, child); } else { do { unsigned int canon = offset - xas->xa_sibs; node_set_marks(node, canon, NULL, 0, marks); rcu_assign_pointer(node->slots[canon], entry); while (offset > canon) rcu_assign_pointer(node->slots[offset--], xa_mk_sibling(canon)); values += (xa_is_value(entry) - xa_is_value(curr)) * (xas->xa_sibs + 1); } while (offset-- > xas->xa_offset); } node->nr_values += values; xas_update(xas, node); } EXPORT_SYMBOL_GPL(xas_try_split); #endif /** * xas_pause() - Pause a walk to drop a lock. * @xas: XArray operation state. * * Some users need to pause a walk and drop the lock they're holding in * order to yield to a higher priority thread or carry out an operation * on an entry. Those users should call this function before they drop * the lock. It resets the @xas to be suitable for the next iteration * of the loop after the user has reacquired the lock. If most entries * found during a walk require you to call xas_pause(), the xa_for_each() * iterator may be more appropriate. * * Note that xas_pause() only works for forward iteration. If a user needs * to pause a reverse iteration, we will need a xas_pause_rev(). */ void xas_pause(struct xa_state *xas) { struct xa_node *node = xas->xa_node; if (xas_invalid(xas)) return; xas->xa_node = XAS_RESTART; if (node) { unsigned long offset = xas->xa_offset; while (++offset < XA_CHUNK_SIZE) { if (!xa_is_sibling(xa_entry(xas->xa, node, offset))) break; } xas->xa_index &= ~0UL << node->shift; xas->xa_index += (offset - xas->xa_offset) << node->shift; if (xas->xa_index == 0) xas->xa_node = XAS_BOUNDS; } else { xas->xa_index++; } } EXPORT_SYMBOL_GPL(xas_pause); /* * __xas_prev() - Find the previous entry in the XArray. * @xas: XArray operation state. * * Helper function for xas_prev() which handles all the complex cases * out of line. */ void *__xas_prev(struct xa_state *xas) { void *entry; if (!xas_frozen(xas->xa_node)) xas->xa_index--; if (!xas->xa_node) return set_bounds(xas); if (xas_not_node(xas->xa_node)) return xas_load(xas); if (xas->xa_offset != get_offset(xas->xa_index, xas->xa_node)) xas->xa_offset--; while (xas->xa_offset == 255) { xas->xa_offset = xas->xa_node->offset - 1; xas->xa_node = xa_parent(xas->xa, xas->xa_node); if (!xas->xa_node) return set_bounds(xas); } for (;;) { entry = xa_entry(xas->xa, xas->xa_node, xas->xa_offset); if (!xa_is_node(entry)) return entry; xas->xa_node = xa_to_node(entry); xas_set_offset(xas); } } EXPORT_SYMBOL_GPL(__xas_prev); /* * __xas_next() - Find the next entry in the XArray. * @xas: XArray operation state. * * Helper function for xas_next() which handles all the complex cases * out of line. */ void *__xas_next(struct xa_state *xas) { void *entry; if (!xas_frozen(xas->xa_node)) xas->xa_index++; if (!xas->xa_node) return set_bounds(xas); if (xas_not_node(xas->xa_node)) return xas_load(xas); if (xas->xa_offset != get_offset(xas->xa_index, xas->xa_node)) xas->xa_offset++; while (xas->xa_offset == XA_CHUNK_SIZE) { xas->xa_offset = xas->xa_node->offset + 1; xas->xa_node = xa_parent(xas->xa, xas->xa_node); if (!xas->xa_node) return set_bounds(xas); } for (;;) { entry = xa_entry(xas->xa, xas->xa_node, xas->xa_offset); if (!xa_is_node(entry)) return entry; xas->xa_node = xa_to_node(entry); xas_set_offset(xas); } } EXPORT_SYMBOL_GPL(__xas_next); /** * xas_find() - Find the next present entry in the XArray. * @xas: XArray operation state. * @max: Highest index to return. * * If the @xas has not yet been walked to an entry, return the entry * which has an index >= xas.xa_index. If it has been walked, the entry * currently being pointed at has been processed, and so we move to the * next entry. * * If no entry is found and the array is smaller than @max, the iterator * is set to the smallest index not yet in the array. This allows @xas * to be immediately passed to xas_store(). * * Return: The entry, if found, otherwise %NULL. */ void *xas_find(struct xa_state *xas, unsigned long max) { void *entry; if (xas_error(xas) || xas->xa_node == XAS_BOUNDS) return NULL; if (xas->xa_index > max) return set_bounds(xas); if (!xas->xa_node) { xas->xa_index = 1; return set_bounds(xas); } else if (xas->xa_node == XAS_RESTART) { entry = xas_load(xas); if (entry || xas_not_node(xas->xa_node)) return entry; } else if (!xas->xa_node->shift && xas->xa_offset != (xas->xa_index & XA_CHUNK_MASK)) { xas->xa_offset = ((xas->xa_index - 1) & XA_CHUNK_MASK) + 1; } xas_next_offset(xas); while (xas->xa_node && (xas->xa_index <= max)) { if (unlikely(xas->xa_offset == XA_CHUNK_SIZE)) { xas->xa_offset = xas->xa_node->offset + 1; xas->xa_node = xa_parent(xas->xa, xas->xa_node); continue; } entry = xa_entry(xas->xa, xas->xa_node, xas->xa_offset); if (xa_is_node(entry)) { xas->xa_node = xa_to_node(entry); xas->xa_offset = 0; continue; } if (entry && !xa_is_sibling(entry)) return entry; xas_next_offset(xas); } if (!xas->xa_node) xas->xa_node = XAS_BOUNDS; return NULL; } EXPORT_SYMBOL_GPL(xas_find); /** * xas_find_marked() - Find the next marked entry in the XArray. * @xas: XArray operation state. * @max: Highest index to return. * @mark: Mark number to search for. * * If the @xas has not yet been walked to an entry, return the marked entry * which has an index >= xas.xa_index. If it has been walked, the entry * currently being pointed at has been processed, and so we return the * first marked entry with an index > xas.xa_index. * * If no marked entry is found and the array is smaller than @max, @xas is * set to the bounds state and xas->xa_index is set to the smallest index * not yet in the array. This allows @xas to be immediately passed to * xas_store(). * * If no entry is found before @max is reached, @xas is set to the restart * state. * * Return: The entry, if found, otherwise %NULL. */ void *xas_find_marked(struct xa_state *xas, unsigned long max, xa_mark_t mark) { bool advance = true; unsigned int offset; void *entry; if (xas_error(xas)) return NULL; if (xas->xa_index > max) goto max; if (!xas->xa_node) { xas->xa_index = 1; goto out; } else if (xas_top(xas->xa_node)) { advance = false; entry = xa_head(xas->xa); xas->xa_node = NULL; if (xas->xa_index > max_index(entry)) goto out; if (!xa_is_node(entry)) { if (xa_marked(xas->xa, mark)) return entry; xas->xa_index = 1; goto out; } xas->xa_node = xa_to_node(entry); xas->xa_offset = xas->xa_index >> xas->xa_node->shift; } while (xas->xa_index <= max) { if (unlikely(xas->xa_offset == XA_CHUNK_SIZE)) { xas->xa_offset = xas->xa_node->offset + 1; xas->xa_node = xa_parent(xas->xa, xas->xa_node); if (!xas->xa_node) break; advance = false; continue; } if (!advance) { entry = xa_entry(xas->xa, xas->xa_node, xas->xa_offset); if (xa_is_sibling(entry)) { xas->xa_offset = xa_to_sibling(entry); xas_move_index(xas, xas->xa_offset); } } offset = xas_find_chunk(xas, advance, mark); if (offset > xas->xa_offset) { advance = false; xas_move_index(xas, offset); /* Mind the wrap */ if ((xas->xa_index - 1) >= max) goto max; xas->xa_offset = offset; if (offset == XA_CHUNK_SIZE) continue; } entry = xa_entry(xas->xa, xas->xa_node, xas->xa_offset); if (!entry && !(xa_track_free(xas->xa) && mark == XA_FREE_MARK)) continue; if (xa_is_sibling(entry)) continue; if (!xa_is_node(entry)) return entry; xas->xa_node = xa_to_node(entry); xas_set_offset(xas); } out: if (xas->xa_index > max) goto max; return set_bounds(xas); max: xas->xa_node = XAS_RESTART; return NULL; } EXPORT_SYMBOL_GPL(xas_find_marked); /** * xas_find_conflict() - Find the next present entry in a range. * @xas: XArray operation state. * * The @xas describes both a range and a position within that range. * * Context: Any context. Expects xa_lock to be held. * Return: The next entry in the range covered by @xas or %NULL. */ void *xas_find_conflict(struct xa_state *xas) { void *curr; if (xas_error(xas)) return NULL; if (!xas->xa_node) return NULL; if (xas_top(xas->xa_node)) { curr = xas_start(xas); if (!curr) return NULL; while (xa_is_node(curr)) { struct xa_node *node = xa_to_node(curr); curr = xas_descend(xas, node); } if (curr) return curr; } if (xas->xa_node->shift > xas->xa_shift) return NULL; for (;;) { if (xas->xa_node->shift == xas->xa_shift) { if ((xas->xa_offset & xas->xa_sibs) == xas->xa_sibs) break; } else if (xas->xa_offset == XA_CHUNK_MASK) { xas->xa_offset = xas->xa_node->offset; xas->xa_node = xa_parent_locked(xas->xa, xas->xa_node); if (!xas->xa_node) break; continue; } curr = xa_entry_locked(xas->xa, xas->xa_node, ++xas->xa_offset); if (xa_is_sibling(curr)) continue; while (xa_is_node(curr)) { xas->xa_node = xa_to_node(curr); xas->xa_offset = 0; curr = xa_entry_locked(xas->xa, xas->xa_node, 0); } if (curr) return curr; } xas->xa_offset -= xas->xa_sibs; return NULL; } EXPORT_SYMBOL_GPL(xas_find_conflict); /** * xa_load() - Load an entry from an XArray. * @xa: XArray. * @index: index into array. * * Context: Any context. Takes and releases the RCU lock. * Return: The entry at @index in @xa. */ void *xa_load(struct xarray *xa, unsigned long index) { XA_STATE(xas, xa, index); void *entry; rcu_read_lock(); do { entry = xa_zero_to_null(xas_load(&xas)); } while (xas_retry(&xas, entry)); rcu_read_unlock(); return entry; } EXPORT_SYMBOL(xa_load); static void *xas_result(struct xa_state *xas, void *curr) { if (xas_error(xas)) curr = xas->xa_node; return curr; } /** * __xa_erase() - Erase this entry from the XArray while locked. * @xa: XArray. * @index: Index into array. * * After this function returns, loading from @index will return %NULL. * If the index is part of a multi-index entry, all indices will be erased * and none of the entries will be part of a multi-index entry. * * Context: Any context. Expects xa_lock to be held on entry. * Return: The entry which used to be at this index. */ void *__xa_erase(struct xarray *xa, unsigned long index) { XA_STATE(xas, xa, index); return xas_result(&xas, xa_zero_to_null(xas_store(&xas, NULL))); } EXPORT_SYMBOL(__xa_erase); /** * xa_erase() - Erase this entry from the XArray. * @xa: XArray. * @index: Index of entry. * * After this function returns, loading from @index will return %NULL. * If the index is part of a multi-index entry, all indices will be erased * and none of the entries will be part of a multi-index entry. * * Context: Any context. Takes and releases the xa_lock. * Return: The entry which used to be at this index. */ void *xa_erase(struct xarray *xa, unsigned long index) { void *entry; xa_lock(xa); entry = __xa_erase(xa, index); xa_unlock(xa); return entry; } EXPORT_SYMBOL(xa_erase); /** * __xa_store() - Store this entry in the XArray. * @xa: XArray. * @index: Index into array. * @entry: New entry. * @gfp: Memory allocation flags. * * You must already be holding the xa_lock when calling this function. * It will drop the lock if needed to allocate memory, and then reacquire * it afterwards. * * Context: Any context. Expects xa_lock to be held on entry. May * release and reacquire xa_lock if @gfp flags permit. * Return: The old entry at this index or xa_err() if an error happened. */ void *__xa_store(struct xarray *xa, unsigned long index, void *entry, gfp_t gfp) { XA_STATE(xas, xa, index); void *curr; if (WARN_ON_ONCE(xa_is_advanced(entry))) return XA_ERROR(-EINVAL); if (xa_track_free(xa) && !entry) entry = XA_ZERO_ENTRY; do { curr = xas_store(&xas, entry); if (xa_track_free(xa)) xas_clear_mark(&xas, XA_FREE_MARK); } while (__xas_nomem(&xas, gfp)); return xas_result(&xas, xa_zero_to_null(curr)); } EXPORT_SYMBOL(__xa_store); /** * xa_store() - Store this entry in the XArray. * @xa: XArray. * @index: Index into array. * @entry: New entry. * @gfp: Memory allocation flags. * * After this function returns, loads from this index will return @entry. * Storing into an existing multi-index entry updates the entry of every index. * The marks associated with @index are unaffected unless @entry is %NULL. * * Context: Any context. Takes and releases the xa_lock. * May sleep if the @gfp flags permit. * Return: The old entry at this index on success, xa_err(-EINVAL) if @entry * cannot be stored in an XArray, or xa_err(-ENOMEM) if memory allocation * failed. */ void *xa_store(struct xarray *xa, unsigned long index, void *entry, gfp_t gfp) { void *curr; xa_lock(xa); curr = __xa_store(xa, index, entry, gfp); xa_unlock(xa); return curr; } EXPORT_SYMBOL(xa_store); static inline void *__xa_cmpxchg_raw(struct xarray *xa, unsigned long index, void *old, void *entry, gfp_t gfp); /** * __xa_cmpxchg() - Conditionally replace an entry in the XArray. * @xa: XArray. * @index: Index into array. * @old: Old value to test against. * @entry: New value to place in array. * @gfp: Memory allocation flags. * * You must already be holding the xa_lock when calling this function. * It will drop the lock if needed to allocate memory, and then reacquire * it afterwards. * * If the entry at @index is the same as @old, replace it with @entry. * If the return value is equal to @old, then the exchange was successful. * * Context: Any context. Expects xa_lock to be held on entry. May * release and reacquire xa_lock if @gfp flags permit. * Return: The old value at this index or xa_err() if an error happened. */ void *__xa_cmpxchg(struct xarray *xa, unsigned long index, void *old, void *entry, gfp_t gfp) { return xa_zero_to_null(__xa_cmpxchg_raw(xa, index, old, entry, gfp)); } EXPORT_SYMBOL(__xa_cmpxchg); static inline void *__xa_cmpxchg_raw(struct xarray *xa, unsigned long index, void *old, void *entry, gfp_t gfp) { XA_STATE(xas, xa, index); void *curr; if (WARN_ON_ONCE(xa_is_advanced(entry))) return XA_ERROR(-EINVAL); do { curr = xas_load(&xas); if (curr == old) { xas_store(&xas, entry); if (xa_track_free(xa) && entry && !curr) xas_clear_mark(&xas, XA_FREE_MARK); } } while (__xas_nomem(&xas, gfp)); return xas_result(&xas, curr); } /** * __xa_insert() - Store this entry in the XArray if no entry is present. * @xa: XArray. * @index: Index into array. * @entry: New entry. * @gfp: Memory allocation flags. * * Inserting a NULL entry will store a reserved entry (like xa_reserve()) * if no entry is present. Inserting will fail if a reserved entry is * present, even though loading from this index will return NULL. * * Context: Any context. Expects xa_lock to be held on entry. May * release and reacquire xa_lock if @gfp flags permit. * Return: 0 if the store succeeded. -EBUSY if another entry was present. * -ENOMEM if memory could not be allocated. */ int __xa_insert(struct xarray *xa, unsigned long index, void *entry, gfp_t gfp) { void *curr; int errno; if (!entry) entry = XA_ZERO_ENTRY; curr = __xa_cmpxchg_raw(xa, index, NULL, entry, gfp); errno = xa_err(curr); if (errno) return errno; return (curr != NULL) ? -EBUSY : 0; } EXPORT_SYMBOL(__xa_insert); #ifdef CONFIG_XARRAY_MULTI static void xas_set_range(struct xa_state *xas, unsigned long first, unsigned long last) { unsigned int shift = 0; unsigned long sibs = last - first; unsigned int offset = XA_CHUNK_MASK; xas_set(xas, first); while ((first & XA_CHUNK_MASK) == 0) { if (sibs < XA_CHUNK_MASK) break; if ((sibs == XA_CHUNK_MASK) && (offset < XA_CHUNK_MASK)) break; shift += XA_CHUNK_SHIFT; if (offset == XA_CHUNK_MASK) offset = sibs & XA_CHUNK_MASK; sibs >>= XA_CHUNK_SHIFT; first >>= XA_CHUNK_SHIFT; } offset = first & XA_CHUNK_MASK; if (offset + sibs > XA_CHUNK_MASK) sibs = XA_CHUNK_MASK - offset; if ((((first + sibs + 1) << shift) - 1) > last) sibs -= 1; xas->xa_shift = shift; xas->xa_sibs = sibs; } /** * xa_store_range() - Store this entry at a range of indices in the XArray. * @xa: XArray. * @first: First index to affect. * @last: Last index to affect. * @entry: New entry. * @gfp: Memory allocation flags. * * After this function returns, loads from any index between @first and @last, * inclusive will return @entry. * Storing into an existing multi-index entry updates the entry of every index. * The marks associated with @index are unaffected unless @entry is %NULL. * * Context: Process context. Takes and releases the xa_lock. May sleep * if the @gfp flags permit. * Return: %NULL on success, xa_err(-EINVAL) if @entry cannot be stored in * an XArray, or xa_err(-ENOMEM) if memory allocation failed. */ void *xa_store_range(struct xarray *xa, unsigned long first, unsigned long last, void *entry, gfp_t gfp) { XA_STATE(xas, xa, 0); if (WARN_ON_ONCE(xa_is_internal(entry))) return XA_ERROR(-EINVAL); if (last < first) return XA_ERROR(-EINVAL); do { xas_lock(&xas); if (entry) { unsigned int order = BITS_PER_LONG; if (last + 1) order = __ffs(last + 1); xas_set_order(&xas, last, order); xas_create(&xas, true); if (xas_error(&xas)) goto unlock; } do { xas_set_range(&xas, first, last); xas_store(&xas, entry); if (xas_error(&xas)) goto unlock; first += xas_size(&xas); } while (first <= last); unlock: xas_unlock(&xas); } while (xas_nomem(&xas, gfp)); return xas_result(&xas, NULL); } EXPORT_SYMBOL(xa_store_range); /** * xas_get_order() - Get the order of an entry. * @xas: XArray operation state. * * Called after xas_load, the xas should not be in an error state. * The xas should not be pointing to a sibling entry. * * Return: A number between 0 and 63 indicating the order of the entry. */ int xas_get_order(struct xa_state *xas) { int order = 0; if (!xas->xa_node) return 0; XA_NODE_BUG_ON(xas->xa_node, xa_is_sibling(xa_entry(xas->xa, xas->xa_node, xas->xa_offset))); for (;;) { unsigned int slot = xas->xa_offset + (1 << order); if (slot >= XA_CHUNK_SIZE) break; if (!xa_is_sibling(xa_entry(xas->xa, xas->xa_node, slot))) break; order++; } order += xas->xa_node->shift; return order; } EXPORT_SYMBOL_GPL(xas_get_order); /** * xa_get_order() - Get the order of an entry. * @xa: XArray. * @index: Index of the entry. * * Return: A number between 0 and 63 indicating the order of the entry. */ int xa_get_order(struct xarray *xa, unsigned long index) { XA_STATE(xas, xa, index); int order = 0; void *entry; rcu_read_lock(); entry = xas_load(&xas); if (entry) order = xas_get_order(&xas); rcu_read_unlock(); return order; } EXPORT_SYMBOL(xa_get_order); #endif /* CONFIG_XARRAY_MULTI */ /** * __xa_alloc() - Find somewhere to store this entry in the XArray. * @xa: XArray. * @id: Pointer to ID. * @limit: Range for allocated ID. * @entry: New entry. * @gfp: Memory allocation flags. * * Finds an empty entry in @xa between @limit.min and @limit.max, * stores the index into the @id pointer, then stores the entry at * that index. A concurrent lookup will not see an uninitialised @id. * * Must only be operated on an xarray initialized with flag XA_FLAGS_ALLOC set * in xa_init_flags(). * * Context: Any context. Expects xa_lock to be held on entry. May * release and reacquire xa_lock if @gfp flags permit. * Return: 0 on success, -ENOMEM if memory could not be allocated or * -EBUSY if there are no free entries in @limit. */ int __xa_alloc(struct xarray *xa, u32 *id, void *entry, struct xa_limit limit, gfp_t gfp) { XA_STATE(xas, xa, 0); if (WARN_ON_ONCE(xa_is_advanced(entry))) return -EINVAL; if (WARN_ON_ONCE(!xa_track_free(xa))) return -EINVAL; if (!entry) entry = XA_ZERO_ENTRY; do { xas.xa_index = limit.min; xas_find_marked(&xas, limit.max, XA_FREE_MARK); if (xas.xa_node == XAS_RESTART) xas_set_err(&xas, -EBUSY); else *id = xas.xa_index; xas_store(&xas, entry); xas_clear_mark(&xas, XA_FREE_MARK); } while (__xas_nomem(&xas, gfp)); return xas_error(&xas); } EXPORT_SYMBOL(__xa_alloc); /** * __xa_alloc_cyclic() - Find somewhere to store this entry in the XArray. * @xa: XArray. * @id: Pointer to ID. * @entry: New entry. * @limit: Range of allocated ID. * @next: Pointer to next ID to allocate. * @gfp: Memory allocation flags. * * Finds an empty entry in @xa between @limit.min and @limit.max, * stores the index into the @id pointer, then stores the entry at * that index. A concurrent lookup will not see an uninitialised @id. * The search for an empty entry will start at @next and will wrap * around if necessary. * * Must only be operated on an xarray initialized with flag XA_FLAGS_ALLOC set * in xa_init_flags(). * * Context: Any context. Expects xa_lock to be held on entry. May * release and reacquire xa_lock if @gfp flags permit. * Return: 0 if the allocation succeeded without wrapping. 1 if the * allocation succeeded after wrapping, -ENOMEM if memory could not be * allocated or -EBUSY if there are no free entries in @limit. */ int __xa_alloc_cyclic(struct xarray *xa, u32 *id, void *entry, struct xa_limit limit, u32 *next, gfp_t gfp) { u32 min = limit.min; int ret; limit.min = max(min, *next); ret = __xa_alloc(xa, id, entry, limit, gfp); if ((xa->xa_flags & XA_FLAGS_ALLOC_WRAPPED) && ret == 0) { xa->xa_flags &= ~XA_FLAGS_ALLOC_WRAPPED; ret = 1; } if (ret < 0 && limit.min > min) { limit.min = min; ret = __xa_alloc(xa, id, entry, limit, gfp); if (ret == 0) ret = 1; } if (ret >= 0) { *next = *id + 1; if (*next == 0) xa->xa_flags |= XA_FLAGS_ALLOC_WRAPPED; } return ret; } EXPORT_SYMBOL(__xa_alloc_cyclic); /** * __xa_set_mark() - Set this mark on this entry while locked. * @xa: XArray. * @index: Index of entry. * @mark: Mark number. * * Attempting to set a mark on a %NULL entry does not succeed. * * Context: Any context. Expects xa_lock to be held on entry. */ void __xa_set_mark(struct xarray *xa, unsigned long index, xa_mark_t mark) { XA_STATE(xas, xa, index); void *entry = xas_load(&xas); if (entry) xas_set_mark(&xas, mark); } EXPORT_SYMBOL(__xa_set_mark); /** * __xa_clear_mark() - Clear this mark on this entry while locked. * @xa: XArray. * @index: Index of entry. * @mark: Mark number. * * Context: Any context. Expects xa_lock to be held on entry. */ void __xa_clear_mark(struct xarray *xa, unsigned long index, xa_mark_t mark) { XA_STATE(xas, xa, index); void *entry = xas_load(&xas); if (entry) xas_clear_mark(&xas, mark); } EXPORT_SYMBOL(__xa_clear_mark); /** * xa_get_mark() - Inquire whether this mark is set on this entry. * @xa: XArray. * @index: Index of entry. * @mark: Mark number. * * This function uses the RCU read lock, so the result may be out of date * by the time it returns. If you need the result to be stable, use a lock. * * Context: Any context. Takes and releases the RCU lock. * Return: True if the entry at @index has this mark set, false if it doesn't. */ bool xa_get_mark(struct xarray *xa, unsigned long index, xa_mark_t mark) { XA_STATE(xas, xa, index); void *entry; rcu_read_lock(); entry = xas_start(&xas); while (xas_get_mark(&xas, mark)) { if (!xa_is_node(entry)) goto found; entry = xas_descend(&xas, xa_to_node(entry)); } rcu_read_unlock(); return false; found: rcu_read_unlock(); return true; } EXPORT_SYMBOL(xa_get_mark); /** * xa_set_mark() - Set this mark on this entry. * @xa: XArray. * @index: Index of entry. * @mark: Mark number. * * Attempting to set a mark on a %NULL entry does not succeed. * * Context: Process context. Takes and releases the xa_lock. */ void xa_set_mark(struct xarray *xa, unsigned long index, xa_mark_t mark) { xa_lock(xa); __xa_set_mark(xa, index, mark); xa_unlock(xa); } EXPORT_SYMBOL(xa_set_mark); /** * xa_clear_mark() - Clear this mark on this entry. * @xa: XArray. * @index: Index of entry. * @mark: Mark number. * * Clearing a mark always succeeds. * * Context: Process context. Takes and releases the xa_lock. */ void xa_clear_mark(struct xarray *xa, unsigned long index, xa_mark_t mark) { xa_lock(xa); __xa_clear_mark(xa, index, mark); xa_unlock(xa); } EXPORT_SYMBOL(xa_clear_mark); /** * xa_find() - Search the XArray for an entry. * @xa: XArray. * @indexp: Pointer to an index. * @max: Maximum index to search to. * @filter: Selection criterion. * * Finds the entry in @xa which matches the @filter, and has the lowest * index that is at least @indexp and no more than @max. * If an entry is found, @indexp is updated to be the index of the entry. * This function is protected by the RCU read lock, so it may not find * entries which are being simultaneously added. It will not return an * %XA_RETRY_ENTRY; if you need to see retry entries, use xas_find(). * * Context: Any context. Takes and releases the RCU lock. * Return: The entry, if found, otherwise %NULL. */ void *xa_find(struct xarray *xa, unsigned long *indexp, unsigned long max, xa_mark_t filter) { XA_STATE(xas, xa, *indexp); void *entry; rcu_read_lock(); do { if ((__force unsigned int)filter < XA_MAX_MARKS) entry = xas_find_marked(&xas, max, filter); else entry = xas_find(&xas, max); } while (xas_retry(&xas, entry)); rcu_read_unlock(); if (entry) *indexp = xas.xa_index; return entry; } EXPORT_SYMBOL(xa_find); static bool xas_sibling(struct xa_state *xas) { struct xa_node *node = xas->xa_node; unsigned long mask; if (!IS_ENABLED(CONFIG_XARRAY_MULTI) || !node) return false; mask = (XA_CHUNK_SIZE << node->shift) - 1; return (xas->xa_index & mask) > ((unsigned long)xas->xa_offset << node->shift); } /** * xa_find_after() - Search the XArray for a present entry. * @xa: XArray. * @indexp: Pointer to an index. * @max: Maximum index to search to. * @filter: Selection criterion. * * Finds the entry in @xa which matches the @filter and has the lowest * index that is above @indexp and no more than @max. * If an entry is found, @indexp is updated to be the index of the entry. * This function is protected by the RCU read lock, so it may miss entries * which are being simultaneously added. It will not return an * %XA_RETRY_ENTRY; if you need to see retry entries, use xas_find(). * * Context: Any context. Takes and releases the RCU lock. * Return: The pointer, if found, otherwise %NULL. */ void *xa_find_after(struct xarray *xa, unsigned long *indexp, unsigned long max, xa_mark_t filter) { XA_STATE(xas, xa, *indexp + 1); void *entry; if (xas.xa_index == 0) return NULL; rcu_read_lock(); for (;;) { if ((__force unsigned int)filter < XA_MAX_MARKS) entry = xas_find_marked(&xas, max, filter); else entry = xas_find(&xas, max); if (xas_invalid(&xas)) break; if (xas_sibling(&xas)) continue; if (!xas_retry(&xas, entry)) break; } rcu_read_unlock(); if (entry) *indexp = xas.xa_index; return entry; } EXPORT_SYMBOL(xa_find_after); static unsigned int xas_extract_present(struct xa_state *xas, void **dst, unsigned long max, unsigned int n) { void *entry; unsigned int i = 0; rcu_read_lock(); xas_for_each(xas, entry, max) { if (xas_retry(xas, entry)) continue; dst[i++] = entry; if (i == n) break; } rcu_read_unlock(); return i; } static unsigned int xas_extract_marked(struct xa_state *xas, void **dst, unsigned long max, unsigned int n, xa_mark_t mark) { void *entry; unsigned int i = 0; rcu_read_lock(); xas_for_each_marked(xas, entry, max, mark) { if (xas_retry(xas, entry)) continue; dst[i++] = entry; if (i == n) break; } rcu_read_unlock(); return i; } /** * xa_extract() - Copy selected entries from the XArray into a normal array. * @xa: The source XArray to copy from. * @dst: The buffer to copy entries into. * @start: The first index in the XArray eligible to be selected. * @max: The last index in the XArray eligible to be selected. * @n: The maximum number of entries to copy. * @filter: Selection criterion. * * Copies up to @n entries that match @filter from the XArray. The * copied entries will have indices between @start and @max, inclusive. * * The @filter may be an XArray mark value, in which case entries which are * marked with that mark will be copied. It may also be %XA_PRESENT, in * which case all entries which are not %NULL will be copied. * * The entries returned may not represent a snapshot of the XArray at a * moment in time. For example, if another thread stores to index 5, then * index 10, calling xa_extract() may return the old contents of index 5 * and the new contents of index 10. Indices not modified while this * function is running will not be skipped. * * If you need stronger guarantees, holding the xa_lock across calls to this * function will prevent concurrent modification. * * Context: Any context. Takes and releases the RCU lock. * Return: The number of entries copied. */ unsigned int xa_extract(struct xarray *xa, void **dst, unsigned long start, unsigned long max, unsigned int n, xa_mark_t filter) { XA_STATE(xas, xa, start); if (!n) return 0; if ((__force unsigned int)filter < XA_MAX_MARKS) return xas_extract_marked(&xas, dst, max, n, filter); return xas_extract_present(&xas, dst, max, n); } EXPORT_SYMBOL(xa_extract); /** * xa_delete_node() - Private interface for workingset code. * @node: Node to be removed from the tree. * @update: Function to call to update ancestor nodes. * * Context: xa_lock must be held on entry and will not be released. */ void xa_delete_node(struct xa_node *node, xa_update_node_t update) { struct xa_state xas = { .xa = node->array, .xa_index = (unsigned long)node->offset << (node->shift + XA_CHUNK_SHIFT), .xa_shift = node->shift + XA_CHUNK_SHIFT, .xa_offset = node->offset, .xa_node = xa_parent_locked(node->array, node), .xa_update = update, }; xas_store(&xas, NULL); } EXPORT_SYMBOL_GPL(xa_delete_node); /* For the benefit of the test suite */ /** * xa_destroy() - Free all internal data structures. * @xa: XArray. * * After calling this function, the XArray is empty and has freed all memory * allocated for its internal data structures. You are responsible for * freeing the objects referenced by the XArray. * * Context: Any context. Takes and releases the xa_lock, interrupt-safe. */ void xa_destroy(struct xarray *xa) { XA_STATE(xas, xa, 0); unsigned long flags; void *entry; xas.xa_node = NULL; xas_lock_irqsave(&xas, flags); entry = xa_head_locked(xa); RCU_INIT_POINTER(xa->xa_head, NULL); xas_init_marks(&xas); if (xa_zero_busy(xa)) xa_mark_clear(xa, XA_FREE_MARK); /* lockdep checks we're still holding the lock in xas_free_nodes() */ if (xa_is_node(entry)) xas_free_nodes(&xas, xa_to_node(entry)); xas_unlock_irqrestore(&xas, flags); } EXPORT_SYMBOL(xa_destroy); #ifdef XA_DEBUG void xa_dump_node(const struct xa_node *node) { unsigned i, j; if (!node) return; if ((unsigned long)node & 3) { pr_cont("node %px\n", node); return; } pr_cont("node %px %s %d parent %px shift %d count %d values %d " "array %px list %px %px marks", node, node->parent ? "offset" : "max", node->offset, node->parent, node->shift, node->count, node->nr_values, node->array, node->private_list.prev, node->private_list.next); for (i = 0; i < XA_MAX_MARKS; i++) for (j = 0; j < XA_MARK_LONGS; j++) pr_cont(" %lx", node->marks[i][j]); pr_cont("\n"); } void xa_dump_index(unsigned long index, unsigned int shift) { if (!shift) pr_info("%lu: ", index); else if (shift >= BITS_PER_LONG) pr_info("0-%lu: ", ~0UL); else pr_info("%lu-%lu: ", index, index | ((1UL << shift) - 1)); } void xa_dump_entry(const void *entry, unsigned long index, unsigned long shift) { if (!entry) return; xa_dump_index(index, shift); if (xa_is_node(entry)) { if (shift == 0) { pr_cont("%px\n", entry); } else { unsigned long i; struct xa_node *node = xa_to_node(entry); xa_dump_node(node); for (i = 0; i < XA_CHUNK_SIZE; i++) xa_dump_entry(node->slots[i], index + (i << node->shift), node->shift); } } else if (xa_is_value(entry)) pr_cont("value %ld (0x%lx) [%px]\n", xa_to_value(entry), xa_to_value(entry), entry); else if (!xa_is_internal(entry)) pr_cont("%px\n", entry); else if (xa_is_retry(entry)) pr_cont("retry (%ld)\n", xa_to_internal(entry)); else if (xa_is_sibling(entry)) pr_cont("sibling (slot %ld)\n", xa_to_sibling(entry)); else if (xa_is_zero(entry)) pr_cont("zero (%ld)\n", xa_to_internal(entry)); else pr_cont("UNKNOWN ENTRY (%px)\n", entry); } void xa_dump(const struct xarray *xa) { void *entry = xa->xa_head; unsigned int shift = 0; pr_info("xarray: %px head %px flags %x marks %d %d %d\n", xa, entry, xa->xa_flags, xa_marked(xa, XA_MARK_0), xa_marked(xa, XA_MARK_1), xa_marked(xa, XA_MARK_2)); if (xa_is_node(entry)) shift = xa_to_node(entry)->shift + XA_CHUNK_SHIFT; xa_dump_entry(entry, 0, shift); } #endif
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 /* SPDX-License-Identifier: GPL-2.0-only */ /* include/net/xdp.h * * Copyright (c) 2017 Jesper Dangaard Brouer, Red Hat Inc. */ #ifndef __LINUX_NET_XDP_H__ #define __LINUX_NET_XDP_H__ #include <linux/bitfield.h> #include <linux/filter.h> #include <linux/netdevice.h> #include <linux/skbuff.h> /* skb_shared_info */ #include <net/page_pool/types.h> /** * DOC: XDP RX-queue information * * The XDP RX-queue info (xdp_rxq_info) is associated with the driver * level RX-ring queues. It is information that is specific to how * the driver has configured a given RX-ring queue. * * Each xdp_buff frame received in the driver carries a (pointer) * reference to this xdp_rxq_info structure. This provides the XDP * data-path read-access to RX-info for both kernel and bpf-side * (limited subset). * * For now, direct access is only safe while running in NAPI/softirq * context. Contents are read-mostly and must not be updated during * driver NAPI/softirq poll. * * The driver usage API is a register and unregister API. * * The struct is not directly tied to the XDP prog. A new XDP prog * can be attached as long as it doesn't change the underlying * RX-ring. If the RX-ring does change significantly, the NIC driver * naturally needs to stop the RX-ring before purging and reallocating * memory. In that process the driver MUST call unregister (which * also applies for driver shutdown and unload). The register API is * also mandatory during RX-ring setup. */ enum xdp_mem_type { MEM_TYPE_PAGE_SHARED = 0, /* Split-page refcnt based model */ MEM_TYPE_PAGE_ORDER0, /* Orig XDP full page model */ MEM_TYPE_PAGE_POOL, MEM_TYPE_XSK_BUFF_POOL, MEM_TYPE_MAX, }; /* XDP flags for ndo_xdp_xmit */ #define XDP_XMIT_FLUSH (1U << 0) /* doorbell signal consumer */ #define XDP_XMIT_FLAGS_MASK XDP_XMIT_FLUSH struct xdp_mem_info { u32 type; /* enum xdp_mem_type, but known size type */ u32 id; }; struct page_pool; struct xdp_rxq_info { struct net_device *dev; u32 queue_index; u32 reg_state; struct xdp_mem_info mem; u32 frag_size; } ____cacheline_aligned; /* perf critical, avoid false-sharing */ struct xdp_txq_info { struct net_device *dev; }; enum xdp_buff_flags { XDP_FLAGS_HAS_FRAGS = BIT(0), /* non-linear xdp buff */ XDP_FLAGS_FRAGS_PF_MEMALLOC = BIT(1), /* xdp paged memory is under * pressure */ }; struct xdp_buff { void *data; void *data_end; void *data_meta; void *data_hard_start; struct xdp_rxq_info *rxq; struct xdp_txq_info *txq; u32 frame_sz; /* frame size to deduce data_hard_end/reserved tailroom*/ u32 flags; /* supported values defined in xdp_buff_flags */ }; static __always_inline bool xdp_buff_has_frags(const struct xdp_buff *xdp) { return !!(xdp->flags & XDP_FLAGS_HAS_FRAGS); } static __always_inline void xdp_buff_set_frags_flag(struct xdp_buff *xdp) { xdp->flags |= XDP_FLAGS_HAS_FRAGS; } static __always_inline void xdp_buff_clear_frags_flag(struct xdp_buff *xdp) { xdp->flags &= ~XDP_FLAGS_HAS_FRAGS; } static __always_inline bool xdp_buff_is_frag_pfmemalloc(const struct xdp_buff *xdp) { return !!(xdp->flags & XDP_FLAGS_FRAGS_PF_MEMALLOC); } static __always_inline void xdp_buff_set_frag_pfmemalloc(struct xdp_buff *xdp) { xdp->flags |= XDP_FLAGS_FRAGS_PF_MEMALLOC; } static __always_inline void xdp_init_buff(struct xdp_buff *xdp, u32 frame_sz, struct xdp_rxq_info *rxq) { xdp->frame_sz = frame_sz; xdp->rxq = rxq; xdp->flags = 0; } static __always_inline void xdp_prepare_buff(struct xdp_buff *xdp, unsigned char *hard_start, int headroom, int data_len, const bool meta_valid) { unsigned char *data = hard_start + headroom; xdp->data_hard_start = hard_start; xdp->data = data; xdp->data_end = data + data_len; xdp->data_meta = meta_valid ? data : data + 1; } /* Reserve memory area at end-of data area. * * This macro reserves tailroom in the XDP buffer by limiting the * XDP/BPF data access to data_hard_end. Notice same area (and size) * is used for XDP_PASS, when constructing the SKB via build_skb(). */ #define xdp_data_hard_end(xdp) \ ((xdp)->data_hard_start + (xdp)->frame_sz - \ SKB_DATA_ALIGN(sizeof(struct skb_shared_info))) static inline struct skb_shared_info * xdp_get_shared_info_from_buff(const struct xdp_buff *xdp) { return (struct skb_shared_info *)xdp_data_hard_end(xdp); } static __always_inline unsigned int xdp_get_buff_len(const struct xdp_buff *xdp) { unsigned int len = xdp->data_end - xdp->data; const struct skb_shared_info *sinfo; if (likely(!xdp_buff_has_frags(xdp))) goto out; sinfo = xdp_get_shared_info_from_buff(xdp); len += sinfo->xdp_frags_size; out: return len; } void xdp_return_frag(netmem_ref netmem, const struct xdp_buff *xdp); /** * __xdp_buff_add_frag - attach frag to &xdp_buff * @xdp: XDP buffer to attach the frag to * @netmem: network memory containing the frag * @offset: offset at which the frag starts * @size: size of the frag * @truesize: total memory size occupied by the frag * @try_coalesce: whether to try coalescing the frags (not valid for XSk) * * Attach frag to the XDP buffer. If it currently has no frags attached, * initialize the related fields, otherwise check that the frag number * didn't reach the limit of ``MAX_SKB_FRAGS``. If possible, try coalescing * the frag with the previous one. * The function doesn't check/update the pfmemalloc bit. Please use the * non-underscored wrapper in drivers. * * Return: true on success, false if there's no space for the frag in * the shared info struct. */ static inline bool __xdp_buff_add_frag(struct xdp_buff *xdp, netmem_ref netmem, u32 offset, u32 size, u32 truesize, bool try_coalesce) { struct skb_shared_info *sinfo = xdp_get_shared_info_from_buff(xdp); skb_frag_t *prev; u32 nr_frags; if (!xdp_buff_has_frags(xdp)) { xdp_buff_set_frags_flag(xdp); nr_frags = 0; sinfo->xdp_frags_size = 0; sinfo->xdp_frags_truesize = 0; goto fill; } nr_frags = sinfo->nr_frags; prev = &sinfo->frags[nr_frags - 1]; if (try_coalesce && netmem == skb_frag_netmem(prev) && offset == skb_frag_off(prev) + skb_frag_size(prev)) { skb_frag_size_add(prev, size); /* Guaranteed to only decrement the refcount */ xdp_return_frag(netmem, xdp); } else if (unlikely(nr_frags == MAX_SKB_FRAGS)) { return false; } else { fill: __skb_fill_netmem_desc_noacc(sinfo, nr_frags++, netmem, offset, size); } sinfo->nr_frags = nr_frags; sinfo->xdp_frags_size += size; sinfo->xdp_frags_truesize += truesize; return true; } /** * xdp_buff_add_frag - attach frag to &xdp_buff * @xdp: XDP buffer to attach the frag to * @netmem: network memory containing the frag * @offset: offset at which the frag starts * @size: size of the frag * @truesize: total memory size occupied by the frag * * Version of __xdp_buff_add_frag() which takes care of the pfmemalloc bit. * * Return: true on success, false if there's no space for the frag in * the shared info struct. */ static inline bool xdp_buff_add_frag(struct xdp_buff *xdp, netmem_ref netmem, u32 offset, u32 size, u32 truesize) { if (!__xdp_buff_add_frag(xdp, netmem, offset, size, truesize, true)) return false; if (unlikely(netmem_is_pfmemalloc(netmem))) xdp_buff_set_frag_pfmemalloc(xdp); return true; } struct xdp_frame { void *data; u32 len; u32 headroom; u32 metasize; /* uses lower 8-bits */ /* Lifetime of xdp_rxq_info is limited to NAPI/enqueue time, * while mem_type is valid on remote CPU. */ enum xdp_mem_type mem_type:32; struct net_device *dev_rx; /* used by cpumap */ u32 frame_sz; u32 flags; /* supported values defined in xdp_buff_flags */ }; static __always_inline bool xdp_frame_has_frags(const struct xdp_frame *frame) { return !!(frame->flags & XDP_FLAGS_HAS_FRAGS); } static __always_inline bool xdp_frame_is_frag_pfmemalloc(const struct xdp_frame *frame) { return !!(frame->flags & XDP_FLAGS_FRAGS_PF_MEMALLOC); } #define XDP_BULK_QUEUE_SIZE 16 struct xdp_frame_bulk { int count; netmem_ref q[XDP_BULK_QUEUE_SIZE]; }; static __always_inline void xdp_frame_bulk_init(struct xdp_frame_bulk *bq) { bq->count = 0; } static inline struct skb_shared_info * xdp_get_shared_info_from_frame(const struct xdp_frame *frame) { void *data_hard_start = frame->data - frame->headroom - sizeof(*frame); return (struct skb_shared_info *)(data_hard_start + frame->frame_sz - SKB_DATA_ALIGN(sizeof(struct skb_shared_info))); } struct xdp_cpumap_stats { unsigned int redirect; unsigned int pass; unsigned int drop; }; /* Clear kernel pointers in xdp_frame */ static inline void xdp_scrub_frame(struct xdp_frame *frame) { frame->data = NULL; frame->dev_rx = NULL; } static inline void xdp_update_skb_shared_info(struct sk_buff *skb, u8 nr_frags, unsigned int size, unsigned int truesize, bool pfmemalloc) { struct skb_shared_info *sinfo = skb_shinfo(skb); sinfo->nr_frags = nr_frags; /* * ``destructor_arg`` is unionized with ``xdp_frags_{,true}size``, * reset it after that these fields aren't used anymore. */ sinfo->destructor_arg = NULL; skb->len += size; skb->data_len += size; skb->truesize += truesize; skb->pfmemalloc |= pfmemalloc; } /* Avoids inlining WARN macro in fast-path */ void xdp_warn(const char *msg, const char *func, const int line); #define XDP_WARN(msg) xdp_warn(msg, __func__, __LINE__) struct sk_buff *xdp_build_skb_from_buff(const struct xdp_buff *xdp); struct sk_buff *xdp_build_skb_from_zc(struct xdp_buff *xdp); struct xdp_frame *xdp_convert_zc_to_xdp_frame(struct xdp_buff *xdp); struct sk_buff *__xdp_build_skb_from_frame(struct xdp_frame *xdpf, struct sk_buff *skb, struct net_device *dev); struct sk_buff *xdp_build_skb_from_frame(struct xdp_frame *xdpf, struct net_device *dev); struct xdp_frame *xdpf_clone(struct xdp_frame *xdpf); static inline void xdp_convert_frame_to_buff(const struct xdp_frame *frame, struct xdp_buff *xdp) { xdp->data_hard_start = frame->data - frame->headroom - sizeof(*frame); xdp->data = frame->data; xdp->data_end = frame->data + frame->len; xdp->data_meta = frame->data - frame->metasize; xdp->frame_sz = frame->frame_sz; xdp->flags = frame->flags; } static inline int xdp_update_frame_from_buff(const struct xdp_buff *xdp, struct xdp_frame *xdp_frame) { int metasize, headroom; /* Assure headroom is available for storing info */ headroom = xdp->data - xdp->data_hard_start; metasize = xdp->data - xdp->data_meta; metasize = metasize > 0 ? metasize : 0; if (unlikely((headroom - metasize) < sizeof(*xdp_frame))) return -ENOSPC; /* Catch if driver didn't reserve tailroom for skb_shared_info */ if (unlikely(xdp->data_end > xdp_data_hard_end(xdp))) { XDP_WARN("Driver BUG: missing reserved tailroom"); return -ENOSPC; } xdp_frame->data = xdp->data; xdp_frame->len = xdp->data_end - xdp->data; xdp_frame->headroom = headroom - sizeof(*xdp_frame); xdp_frame->metasize = metasize; xdp_frame->frame_sz = xdp->frame_sz; xdp_frame->flags = xdp->flags; return 0; } /* Convert xdp_buff to xdp_frame */ static inline struct xdp_frame *xdp_convert_buff_to_frame(struct xdp_buff *xdp) { struct xdp_frame *xdp_frame; if (xdp->rxq->mem.type == MEM_TYPE_XSK_BUFF_POOL) return xdp_convert_zc_to_xdp_frame(xdp); /* Store info in top of packet */ xdp_frame = xdp->data_hard_start; if (unlikely(xdp_update_frame_from_buff(xdp, xdp_frame) < 0)) return NULL; /* rxq only valid until napi_schedule ends, convert to xdp_mem_type */ xdp_frame->mem_type = xdp->rxq->mem.type; return xdp_frame; } void __xdp_return(netmem_ref netmem, enum xdp_mem_type mem_type, bool napi_direct, struct xdp_buff *xdp); void xdp_return_frame(struct xdp_frame *xdpf); void xdp_return_frame_rx_napi(struct xdp_frame *xdpf); void xdp_return_buff(struct xdp_buff *xdp); void xdp_return_frame_bulk(struct xdp_frame *xdpf, struct xdp_frame_bulk *bq); static inline void xdp_flush_frame_bulk(struct xdp_frame_bulk *bq) { if (unlikely(!bq->count)) return; page_pool_put_netmem_bulk(bq->q, bq->count); bq->count = 0; } static __always_inline unsigned int xdp_get_frame_len(const struct xdp_frame *xdpf) { const struct skb_shared_info *sinfo; unsigned int len = xdpf->len; if (likely(!xdp_frame_has_frags(xdpf))) goto out; sinfo = xdp_get_shared_info_from_frame(xdpf); len += sinfo->xdp_frags_size; out: return len; } int __xdp_rxq_info_reg(struct xdp_rxq_info *xdp_rxq, struct net_device *dev, u32 queue_index, unsigned int napi_id, u32 frag_size); static inline int xdp_rxq_info_reg(struct xdp_rxq_info *xdp_rxq, struct net_device *dev, u32 queue_index, unsigned int napi_id) { return __xdp_rxq_info_reg(xdp_rxq, dev, queue_index, napi_id, 0); } void xdp_rxq_info_unreg(struct xdp_rxq_info *xdp_rxq); void xdp_rxq_info_unused(struct xdp_rxq_info *xdp_rxq); bool xdp_rxq_info_is_reg(struct xdp_rxq_info *xdp_rxq); int xdp_rxq_info_reg_mem_model(struct xdp_rxq_info *xdp_rxq, enum xdp_mem_type type, void *allocator); void xdp_rxq_info_unreg_mem_model(struct xdp_rxq_info *xdp_rxq); int xdp_reg_mem_model(struct xdp_mem_info *mem, enum xdp_mem_type type, void *allocator); void xdp_unreg_mem_model(struct xdp_mem_info *mem); int xdp_reg_page_pool(struct page_pool *pool); void xdp_unreg_page_pool(const struct page_pool *pool); void xdp_rxq_info_attach_page_pool(struct xdp_rxq_info *xdp_rxq, const struct page_pool *pool); /** * xdp_rxq_info_attach_mem_model - attach registered mem info to RxQ info * @xdp_rxq: XDP RxQ info to attach the memory info to * @mem: already registered memory info * * If the driver registers its memory providers manually, it must use this * function instead of xdp_rxq_info_reg_mem_model(). */ static inline void xdp_rxq_info_attach_mem_model(struct xdp_rxq_info *xdp_rxq, const struct xdp_mem_info *mem) { xdp_rxq->mem = *mem; } /** * xdp_rxq_info_detach_mem_model - detach registered mem info from RxQ info * @xdp_rxq: XDP RxQ info to detach the memory info from * * If the driver registers its memory providers manually and then attaches it * via xdp_rxq_info_attach_mem_model(), it must call this function before * xdp_rxq_info_unreg(). */ static inline void xdp_rxq_info_detach_mem_model(struct xdp_rxq_info *xdp_rxq) { xdp_rxq->mem = (struct xdp_mem_info){ }; } /* Drivers not supporting XDP metadata can use this helper, which * rejects any room expansion for metadata as a result. */ static __always_inline void xdp_set_data_meta_invalid(struct xdp_buff *xdp) { xdp->data_meta = xdp->data + 1; } static __always_inline bool xdp_data_meta_unsupported(const struct xdp_buff *xdp) { return unlikely(xdp->data_meta > xdp->data); } static inline bool xdp_metalen_invalid(unsigned long metalen) { unsigned long meta_max; meta_max = type_max(typeof_member(struct skb_shared_info, meta_len)); BUILD_BUG_ON(!__builtin_constant_p(meta_max)); return !IS_ALIGNED(metalen, sizeof(u32)) || metalen > meta_max; } struct xdp_attachment_info { struct bpf_prog *prog; u32 flags; }; struct netdev_bpf; void xdp_attachment_setup(struct xdp_attachment_info *info, struct netdev_bpf *bpf); #define DEV_MAP_BULK_SIZE XDP_BULK_QUEUE_SIZE /* Define the relationship between xdp-rx-metadata kfunc and * various other entities: * - xdp_rx_metadata enum * - netdev netlink enum (Documentation/netlink/specs/netdev.yaml) * - kfunc name * - xdp_metadata_ops field */ #define XDP_METADATA_KFUNC_xxx \ XDP_METADATA_KFUNC(XDP_METADATA_KFUNC_RX_TIMESTAMP, \ NETDEV_XDP_RX_METADATA_TIMESTAMP, \ bpf_xdp_metadata_rx_timestamp, \ xmo_rx_timestamp) \ XDP_METADATA_KFUNC(XDP_METADATA_KFUNC_RX_HASH, \ NETDEV_XDP_RX_METADATA_HASH, \ bpf_xdp_metadata_rx_hash, \ xmo_rx_hash) \ XDP_METADATA_KFUNC(XDP_METADATA_KFUNC_RX_VLAN_TAG, \ NETDEV_XDP_RX_METADATA_VLAN_TAG, \ bpf_xdp_metadata_rx_vlan_tag, \ xmo_rx_vlan_tag) \ enum xdp_rx_metadata { #define XDP_METADATA_KFUNC(name, _, __, ___) name, XDP_METADATA_KFUNC_xxx #undef XDP_METADATA_KFUNC MAX_XDP_METADATA_KFUNC, }; enum xdp_rss_hash_type { /* First part: Individual bits for L3/L4 types */ XDP_RSS_L3_IPV4 = BIT(0), XDP_RSS_L3_IPV6 = BIT(1), /* The fixed (L3) IPv4 and IPv6 headers can both be followed by * variable/dynamic headers, IPv4 called Options and IPv6 called * Extension Headers. HW RSS type can contain this info. */ XDP_RSS_L3_DYNHDR = BIT(2), /* When RSS hash covers L4 then drivers MUST set XDP_RSS_L4 bit in * addition to the protocol specific bit. This ease interaction with * SKBs and avoids reserving a fixed mask for future L4 protocol bits. */ XDP_RSS_L4 = BIT(3), /* L4 based hash, proto can be unknown */ XDP_RSS_L4_TCP = BIT(4), XDP_RSS_L4_UDP = BIT(5), XDP_RSS_L4_SCTP = BIT(6), XDP_RSS_L4_IPSEC = BIT(7), /* L4 based hash include IPSEC SPI */ XDP_RSS_L4_ICMP = BIT(8), /* Second part: RSS hash type combinations used for driver HW mapping */ XDP_RSS_TYPE_NONE = 0, XDP_RSS_TYPE_L2 = XDP_RSS_TYPE_NONE, XDP_RSS_TYPE_L3_IPV4 = XDP_RSS_L3_IPV4, XDP_RSS_TYPE_L3_IPV6 = XDP_RSS_L3_IPV6, XDP_RSS_TYPE_L3_IPV4_OPT = XDP_RSS_L3_IPV4 | XDP_RSS_L3_DYNHDR, XDP_RSS_TYPE_L3_IPV6_EX = XDP_RSS_L3_IPV6 | XDP_RSS_L3_DYNHDR, XDP_RSS_TYPE_L4_ANY = XDP_RSS_L4, XDP_RSS_TYPE_L4_IPV4_TCP = XDP_RSS_L3_IPV4 | XDP_RSS_L4 | XDP_RSS_L4_TCP, XDP_RSS_TYPE_L4_IPV4_UDP = XDP_RSS_L3_IPV4 | XDP_RSS_L4 | XDP_RSS_L4_UDP, XDP_RSS_TYPE_L4_IPV4_SCTP = XDP_RSS_L3_IPV4 | XDP_RSS_L4 | XDP_RSS_L4_SCTP, XDP_RSS_TYPE_L4_IPV4_IPSEC = XDP_RSS_L3_IPV4 | XDP_RSS_L4 | XDP_RSS_L4_IPSEC, XDP_RSS_TYPE_L4_IPV4_ICMP = XDP_RSS_L3_IPV4 | XDP_RSS_L4 | XDP_RSS_L4_ICMP, XDP_RSS_TYPE_L4_IPV6_TCP = XDP_RSS_L3_IPV6 | XDP_RSS_L4 | XDP_RSS_L4_TCP, XDP_RSS_TYPE_L4_IPV6_UDP = XDP_RSS_L3_IPV6 | XDP_RSS_L4 | XDP_RSS_L4_UDP, XDP_RSS_TYPE_L4_IPV6_SCTP = XDP_RSS_L3_IPV6 | XDP_RSS_L4 | XDP_RSS_L4_SCTP, XDP_RSS_TYPE_L4_IPV6_IPSEC = XDP_RSS_L3_IPV6 | XDP_RSS_L4 | XDP_RSS_L4_IPSEC, XDP_RSS_TYPE_L4_IPV6_ICMP = XDP_RSS_L3_IPV6 | XDP_RSS_L4 | XDP_RSS_L4_ICMP, XDP_RSS_TYPE_L4_IPV6_TCP_EX = XDP_RSS_TYPE_L4_IPV6_TCP | XDP_RSS_L3_DYNHDR, XDP_RSS_TYPE_L4_IPV6_UDP_EX = XDP_RSS_TYPE_L4_IPV6_UDP | XDP_RSS_L3_DYNHDR, XDP_RSS_TYPE_L4_IPV6_SCTP_EX = XDP_RSS_TYPE_L4_IPV6_SCTP | XDP_RSS_L3_DYNHDR, }; struct xdp_metadata_ops { int (*xmo_rx_timestamp)(const struct xdp_md *ctx, u64 *timestamp); int (*xmo_rx_hash)(const struct xdp_md *ctx, u32 *hash, enum xdp_rss_hash_type *rss_type); int (*xmo_rx_vlan_tag)(const struct xdp_md *ctx, __be16 *vlan_proto, u16 *vlan_tci); }; #ifdef CONFIG_NET u32 bpf_xdp_metadata_kfunc_id(int id); bool bpf_dev_bound_kfunc_id(u32 btf_id); void xdp_set_features_flag(struct net_device *dev, xdp_features_t val); void xdp_set_features_flag_locked(struct net_device *dev, xdp_features_t val); void xdp_features_set_redirect_target(struct net_device *dev, bool support_sg); void xdp_features_set_redirect_target_locked(struct net_device *dev, bool support_sg); void xdp_features_clear_redirect_target(struct net_device *dev); void xdp_features_clear_redirect_target_locked(struct net_device *dev); #else static inline u32 bpf_xdp_metadata_kfunc_id(int id) { return 0; } static inline bool bpf_dev_bound_kfunc_id(u32 btf_id) { return false; } static inline void xdp_set_features_flag(struct net_device *dev, xdp_features_t val) { } static inline void xdp_features_set_redirect_target(struct net_device *dev, bool support_sg) { } static inline void xdp_features_clear_redirect_target(struct net_device *dev) { } #endif static inline void xdp_clear_features_flag(struct net_device *dev) { xdp_set_features_flag(dev, 0); } static __always_inline u32 bpf_prog_run_xdp(const struct bpf_prog *prog, struct xdp_buff *xdp) { /* Driver XDP hooks are invoked within a single NAPI poll cycle and thus * under local_bh_disable(), which provides the needed RCU protection * for accessing map entries. */ u32 act = __bpf_prog_run(prog, xdp, BPF_DISPATCHER_FUNC(xdp)); if (static_branch_unlikely(&bpf_master_redirect_enabled_key)) { if (act == XDP_TX && netif_is_bond_slave(xdp->rxq->dev)) act = xdp_master_redirect(xdp); } return act; } #endif /* __LINUX_NET_XDP_H__ */
210 210 209 210 210 208 210 210 210 210 211 213 211 213 213 212 212 213 212 213 211 185 182 185 185 185 184 183 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 // SPDX-License-Identifier: GPL-2.0-only /* * Copyright (C) 2012-2015 - ARM Ltd * Author: Marc Zyngier <marc.zyngier@arm.com> */ #include <hyp/sysreg-sr.h> #include <linux/compiler.h> #include <linux/kvm_host.h> #include <asm/kprobes.h> #include <asm/kvm_asm.h> #include <asm/kvm_emulate.h> #include <asm/kvm_hyp.h> #include <asm/kvm_nested.h> static void __sysreg_save_vel2_state(struct kvm_vcpu *vcpu) { /* These registers are common with EL1 */ __vcpu_assign_sys_reg(vcpu, PAR_EL1, read_sysreg(par_el1)); __vcpu_assign_sys_reg(vcpu, TPIDR_EL1, read_sysreg(tpidr_el1)); __vcpu_assign_sys_reg(vcpu, ESR_EL2, read_sysreg_el1(SYS_ESR)); __vcpu_assign_sys_reg(vcpu, AFSR0_EL2, read_sysreg_el1(SYS_AFSR0)); __vcpu_assign_sys_reg(vcpu, AFSR1_EL2, read_sysreg_el1(SYS_AFSR1)); __vcpu_assign_sys_reg(vcpu, FAR_EL2, read_sysreg_el1(SYS_FAR)); __vcpu_assign_sys_reg(vcpu, MAIR_EL2, read_sysreg_el1(SYS_MAIR)); __vcpu_assign_sys_reg(vcpu, VBAR_EL2, read_sysreg_el1(SYS_VBAR)); __vcpu_assign_sys_reg(vcpu, CONTEXTIDR_EL2, read_sysreg_el1(SYS_CONTEXTIDR)); __vcpu_assign_sys_reg(vcpu, AMAIR_EL2, read_sysreg_el1(SYS_AMAIR)); /* * In VHE mode those registers are compatible between EL1 and EL2, * and the guest uses the _EL1 versions on the CPU naturally. * So we save them into their _EL2 versions here. * For nVHE mode we trap accesses to those registers, so our * _EL2 copy in sys_regs[] is always up-to-date and we don't need * to save anything here. */ if (vcpu_el2_e2h_is_set(vcpu)) { u64 val; /* * We don't save CPTR_EL2, as accesses to CPACR_EL1 * are always trapped, ensuring that the in-memory * copy is always up-to-date. A small blessing... */ __vcpu_assign_sys_reg(vcpu, SCTLR_EL2, read_sysreg_el1(SYS_SCTLR)); __vcpu_assign_sys_reg(vcpu, TTBR0_EL2, read_sysreg_el1(SYS_TTBR0)); __vcpu_assign_sys_reg(vcpu, TTBR1_EL2, read_sysreg_el1(SYS_TTBR1)); __vcpu_assign_sys_reg(vcpu, TCR_EL2, read_sysreg_el1(SYS_TCR)); if (ctxt_has_tcrx(&vcpu->arch.ctxt)) { __vcpu_assign_sys_reg(vcpu, TCR2_EL2, read_sysreg_el1(SYS_TCR2)); if (ctxt_has_s1pie(&vcpu->arch.ctxt)) { __vcpu_assign_sys_reg(vcpu, PIRE0_EL2, read_sysreg_el1(SYS_PIRE0)); __vcpu_assign_sys_reg(vcpu, PIR_EL2, read_sysreg_el1(SYS_PIR)); } if (ctxt_has_s1poe(&vcpu->arch.ctxt)) __vcpu_assign_sys_reg(vcpu, POR_EL2, read_sysreg_el1(SYS_POR)); } /* * The EL1 view of CNTKCTL_EL1 has a bunch of RES0 bits where * the interesting CNTHCTL_EL2 bits live. So preserve these * bits when reading back the guest-visible value. */ val = read_sysreg_el1(SYS_CNTKCTL); val &= CNTKCTL_VALID_BITS; __vcpu_rmw_sys_reg(vcpu, CNTHCTL_EL2, &=, ~CNTKCTL_VALID_BITS); __vcpu_rmw_sys_reg(vcpu, CNTHCTL_EL2, |=, val); } __vcpu_assign_sys_reg(vcpu, SP_EL2, read_sysreg(sp_el1)); __vcpu_assign_sys_reg(vcpu, ELR_EL2, read_sysreg_el1(SYS_ELR)); __vcpu_assign_sys_reg(vcpu, SPSR_EL2, read_sysreg_el1(SYS_SPSR)); if (ctxt_has_sctlr2(&vcpu->arch.ctxt)) __vcpu_assign_sys_reg(vcpu, SCTLR2_EL2, read_sysreg_el1(SYS_SCTLR2)); } static void __sysreg_restore_vel2_state(struct kvm_vcpu *vcpu) { u64 val; /* These registers are common with EL1 */ write_sysreg(__vcpu_sys_reg(vcpu, PAR_EL1), par_el1); write_sysreg(__vcpu_sys_reg(vcpu, TPIDR_EL1), tpidr_el1); write_sysreg(ctxt_midr_el1(&vcpu->arch.ctxt), vpidr_el2); write_sysreg(__vcpu_sys_reg(vcpu, MPIDR_EL1), vmpidr_el2); write_sysreg_el1(__vcpu_sys_reg(vcpu, MAIR_EL2), SYS_MAIR); write_sysreg_el1(__vcpu_sys_reg(vcpu, VBAR_EL2), SYS_VBAR); write_sysreg_el1(__vcpu_sys_reg(vcpu, CONTEXTIDR_EL2), SYS_CONTEXTIDR); write_sysreg_el1(__vcpu_sys_reg(vcpu, AMAIR_EL2), SYS_AMAIR); if (vcpu_el2_e2h_is_set(vcpu)) { /* * In VHE mode those registers are compatible between * EL1 and EL2. */ write_sysreg_el1(__vcpu_sys_reg(vcpu, SCTLR_EL2), SYS_SCTLR); write_sysreg_el1(__vcpu_sys_reg(vcpu, CPTR_EL2), SYS_CPACR); write_sysreg_el1(__vcpu_sys_reg(vcpu, TTBR0_EL2), SYS_TTBR0); write_sysreg_el1(__vcpu_sys_reg(vcpu, TTBR1_EL2), SYS_TTBR1); write_sysreg_el1(__vcpu_sys_reg(vcpu, TCR_EL2), SYS_TCR); write_sysreg_el1(__vcpu_sys_reg(vcpu, CNTHCTL_EL2), SYS_CNTKCTL); } else { /* * CNTHCTL_EL2 only affects EL1 when running nVHE, so * no need to restore it. */ val = translate_sctlr_el2_to_sctlr_el1(__vcpu_sys_reg(vcpu, SCTLR_EL2)); write_sysreg_el1(val, SYS_SCTLR); val = translate_cptr_el2_to_cpacr_el1(__vcpu_sys_reg(vcpu, CPTR_EL2)); write_sysreg_el1(val, SYS_CPACR); val = translate_ttbr0_el2_to_ttbr0_el1(__vcpu_sys_reg(vcpu, TTBR0_EL2)); write_sysreg_el1(val, SYS_TTBR0); val = translate_tcr_el2_to_tcr_el1(__vcpu_sys_reg(vcpu, TCR_EL2)); write_sysreg_el1(val, SYS_TCR); } if (ctxt_has_tcrx(&vcpu->arch.ctxt)) { write_sysreg_el1(__vcpu_sys_reg(vcpu, TCR2_EL2), SYS_TCR2); if (ctxt_has_s1pie(&vcpu->arch.ctxt)) { write_sysreg_el1(__vcpu_sys_reg(vcpu, PIR_EL2), SYS_PIR); write_sysreg_el1(__vcpu_sys_reg(vcpu, PIRE0_EL2), SYS_PIRE0); } if (ctxt_has_s1poe(&vcpu->arch.ctxt)) write_sysreg_el1(__vcpu_sys_reg(vcpu, POR_EL2), SYS_POR); } write_sysreg_el1(__vcpu_sys_reg(vcpu, ESR_EL2), SYS_ESR); write_sysreg_el1(__vcpu_sys_reg(vcpu, AFSR0_EL2), SYS_AFSR0); write_sysreg_el1(__vcpu_sys_reg(vcpu, AFSR1_EL2), SYS_AFSR1); write_sysreg_el1(__vcpu_sys_reg(vcpu, FAR_EL2), SYS_FAR); write_sysreg(__vcpu_sys_reg(vcpu, SP_EL2), sp_el1); write_sysreg_el1(__vcpu_sys_reg(vcpu, ELR_EL2), SYS_ELR); write_sysreg_el1(__vcpu_sys_reg(vcpu, SPSR_EL2), SYS_SPSR); if (ctxt_has_sctlr2(&vcpu->arch.ctxt)) write_sysreg_el1(__vcpu_sys_reg(vcpu, SCTLR2_EL2), SYS_SCTLR2); } /* * VHE: Host and guest must save mdscr_el1 and sp_el0 (and the PC and * pstate, which are handled as part of the el2 return state) on every * switch (sp_el0 is being dealt with in the assembly code). * tpidr_el0 and tpidrro_el0 only need to be switched when going * to host userspace or a different VCPU. EL1 registers only need to be * switched when potentially going to run a different VCPU. The latter two * classes are handled as part of kvm_arch_vcpu_load and kvm_arch_vcpu_put. */ void sysreg_save_host_state_vhe(struct kvm_cpu_context *ctxt) { __sysreg_save_common_state(ctxt); } NOKPROBE_SYMBOL(sysreg_save_host_state_vhe); void sysreg_save_guest_state_vhe(struct kvm_cpu_context *ctxt) { __sysreg_save_common_state(ctxt); __sysreg_save_el2_return_state(ctxt); } NOKPROBE_SYMBOL(sysreg_save_guest_state_vhe); void sysreg_restore_host_state_vhe(struct kvm_cpu_context *ctxt) { __sysreg_restore_common_state(ctxt); } NOKPROBE_SYMBOL(sysreg_restore_host_state_vhe); void sysreg_restore_guest_state_vhe(struct kvm_cpu_context *ctxt) { __sysreg_restore_common_state(ctxt); __sysreg_restore_el2_return_state(ctxt); } NOKPROBE_SYMBOL(sysreg_restore_guest_state_vhe); /** * __vcpu_load_switch_sysregs - Load guest system registers to the physical CPU * * @vcpu: The VCPU pointer * * Load system registers that do not affect the host's execution, for * example EL1 system registers on a VHE system where the host kernel * runs at EL2. This function is called from KVM's vcpu_load() function * and loading system register state early avoids having to load them on * every entry to the VM. */ void __vcpu_load_switch_sysregs(struct kvm_vcpu *vcpu) { struct kvm_cpu_context *guest_ctxt = &vcpu->arch.ctxt; struct kvm_cpu_context *host_ctxt; u64 midr, mpidr; host_ctxt = host_data_ptr(host_ctxt); __sysreg_save_user_state(host_ctxt); /* * When running a normal EL1 guest, we only load a new vcpu * after a context switch, which imvolves a DSB, so all * speculative EL1&0 walks will have already completed. * If running NV, the vcpu may transition between vEL1 and * vEL2 without a context switch, so make sure we complete * those walks before loading a new context. */ if (vcpu_has_nv(vcpu)) dsb(nsh); /* * Load guest EL1 and user state * * We must restore the 32-bit state before the sysregs, thanks * to erratum #852523 (Cortex-A57) or #853709 (Cortex-A72). */ __sysreg32_restore_state(vcpu); __sysreg_restore_user_state(guest_ctxt); if (unlikely(is_hyp_ctxt(vcpu))) { __sysreg_restore_vel2_state(vcpu); } else { if (vcpu_has_nv(vcpu)) { /* * As we're restoring a nested guest, set the value * provided by the guest hypervisor. */ midr = ctxt_sys_reg(guest_ctxt, VPIDR_EL2); mpidr = ctxt_sys_reg(guest_ctxt, VMPIDR_EL2); } else { midr = ctxt_midr_el1(guest_ctxt); mpidr = ctxt_sys_reg(guest_ctxt, MPIDR_EL1); } __sysreg_restore_el1_state(guest_ctxt, midr, mpidr); } vcpu_set_flag(vcpu, SYSREGS_ON_CPU); } /** * __vcpu_put_switch_sysregs - Restore host system registers to the physical CPU * * @vcpu: The VCPU pointer * * Save guest system registers that do not affect the host's execution, for * example EL1 system registers on a VHE system where the host kernel * runs at EL2. This function is called from KVM's vcpu_put() function * and deferring saving system register state until we're no longer running the * VCPU avoids having to save them on every exit from the VM. */ void __vcpu_put_switch_sysregs(struct kvm_vcpu *vcpu) { struct kvm_cpu_context *guest_ctxt = &vcpu->arch.ctxt; struct kvm_cpu_context *host_ctxt; host_ctxt = host_data_ptr(host_ctxt); if (unlikely(is_hyp_ctxt(vcpu))) __sysreg_save_vel2_state(vcpu); else __sysreg_save_el1_state(guest_ctxt); __sysreg_save_user_state(guest_ctxt); __sysreg32_save_state(vcpu); /* Restore host user state */ __sysreg_restore_user_state(host_ctxt); vcpu_clear_flag(vcpu, SYSREGS_ON_CPU); }
1479 11 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 /* SPDX-License-Identifier: GPL-2.0 */ #ifndef __ASM_PREEMPT_H #define __ASM_PREEMPT_H #include <linux/jump_label.h> #include <linux/thread_info.h> #define PREEMPT_NEED_RESCHED BIT(32) #define PREEMPT_ENABLED (PREEMPT_NEED_RESCHED) static inline int preempt_count(void) { return READ_ONCE(current_thread_info()->preempt.count); } static inline void preempt_count_set(u64 pc) { /* Preserve existing value of PREEMPT_NEED_RESCHED */ WRITE_ONCE(current_thread_info()->preempt.count, pc); } #define init_task_preempt_count(p) do { \ task_thread_info(p)->preempt_count = FORK_PREEMPT_COUNT; \ } while (0) #define init_idle_preempt_count(p, cpu) do { \ task_thread_info(p)->preempt_count = PREEMPT_DISABLED; \ } while (0) static inline void set_preempt_need_resched(void) { current_thread_info()->preempt.need_resched = 0; } static inline void clear_preempt_need_resched(void) { current_thread_info()->preempt.need_resched = 1; } static inline bool test_preempt_need_resched(void) { return !current_thread_info()->preempt.need_resched; } static inline void __preempt_count_add(int val) { u32 pc = READ_ONCE(current_thread_info()->preempt.count); pc += val; WRITE_ONCE(current_thread_info()->preempt.count, pc); } static inline void __preempt_count_sub(int val) { u32 pc = READ_ONCE(current_thread_info()->preempt.count); pc -= val; WRITE_ONCE(current_thread_info()->preempt.count, pc); } static inline bool __preempt_count_dec_and_test(void) { struct thread_info *ti = current_thread_info(); u64 pc = READ_ONCE(ti->preempt_count); /* Update only the count field, leaving need_resched unchanged */ WRITE_ONCE(ti->preempt.count, --pc); /* * If we wrote back all zeroes, then we're preemptible and in * need of a reschedule. Otherwise, we need to reload the * preempt_count in case the need_resched flag was cleared by an * interrupt occurring between the non-atomic READ_ONCE/WRITE_ONCE * pair. */ return !pc || !READ_ONCE(ti->preempt_count); } static inline bool should_resched(int preempt_offset) { u64 pc = READ_ONCE(current_thread_info()->preempt_count); return pc == preempt_offset; } #ifdef CONFIG_PREEMPTION void preempt_schedule(void); void preempt_schedule_notrace(void); #ifdef CONFIG_PREEMPT_DYNAMIC DECLARE_STATIC_KEY_TRUE(sk_dynamic_irqentry_exit_cond_resched); void dynamic_preempt_schedule(void); #define __preempt_schedule() dynamic_preempt_schedule() void dynamic_preempt_schedule_notrace(void); #define __preempt_schedule_notrace() dynamic_preempt_schedule_notrace() #else /* CONFIG_PREEMPT_DYNAMIC */ #define __preempt_schedule() preempt_schedule() #define __preempt_schedule_notrace() preempt_schedule_notrace() #endif /* CONFIG_PREEMPT_DYNAMIC */ #endif /* CONFIG_PREEMPTION */ #endif /* __ASM_PREEMPT_H */
4 4 4 4 3 4 4 4 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 1517 1518 1519 1520 1521 1522 1523 1524 1525 1526 1527 1528 1529 1530 1531 1532 1533 1534 1535 1536 1537 1538 1539 1540 1541 1542 1543 1544 1545 1546 1547 1548 1549 1550 1551 1552 1553 1554 1555 1556 1557 1558 1559 1560 1561 1562 1563 1564 1565 1566 1567 1568 1569 1570 1571 1572 1573 1574 1575 1576 1577 1578 1579 1580 1581 1582 1583 1584 1585 1586 1587 1588 1589 1590 1591 1592 1593 1594 1595 1596 1597 1598 1599 1600 1601 1602 1603 1604 1605 1606 1607 1608 1609 1610 1611 1612 1613 1614 1615 1616 1617 1618 1619 1620 1621 1622 1623 1624 1625 1626 1627 1628 1629 1630 1631 1632 1633 1634 1635 1636 1637 1638 1639 1640 1641 1642 1643 1644 1645 1646 1647 1648 1649 1650 1651 1652 1653 1654 1655 1656 1657 1658 1659 1660 1661 1662 1663 1664 1665 1666 1667 1668 1669 1670 1671 1672 1673 1674 1675 1676 1677 1678 1679 1680 1681 1682 1683 1684 1685 1686 1687 1688 1689 1690 1691 1692 1693 1694 1695 1696 1697 1698 1699 1700 1701 1702 1703 1704 1705 1706 1707 1708 1709 1710 1711 1712 1713 1714 1715 1716 1717 1718 1719 1720 1721 1722 1723 1724 1725 1726 1727 1728 1729 1730 1731 1732 1733 1734 1735 1736 1737 1738 1739 1740 1741 1742 1743 1744 1745 1746 1747 1748 1749 1750 1751 1752 1753 1754 1755 1756 1757 1758 1759 1760 1761 1762 1763 1764 1765 1766 1767 1768 1769 1770 1771 1772 1773 1774 1775 1776 1777 1778 1779 1780 1781 1782 1783 1784 1785 1786 1787 1788 1789 1790 1791 1792 1793 1794 1795 1796 1797 1798 1799 1800 1801 1802 1803 1804 1805 1806 1807 1808 1809 1810 1811 1812 1813 1814 1815 1816 1817 1818 1819 1820 1821 1822 1823 1824 1825 1826 1827 1828 1829 1830 1831 1832 1833 1834 1835 1836 1837 1838 1839 1840 1841 1842 1843 1844 1845 1846 1847 1848 1849 1850 1851 1852 1853 1854 1855 1856 1857 1858 1859 1860 1861 1862 1863 1864 1865 1866 1867 1868 1869 1870 1871 1872 1873 1874 1875 1876 1877 1878 1879 1880 1881 1882 1883 1884 1885 1886 1887 1888 1889 1890 1891 1892 1893 1894 1895 1896 1897 1898 1899 1900 1901 1902 1903 1904 1905 1906 1907 1908 1909 1910 1911 1912 1913 1914 1915 1916 1917 1918 1919 1920 1921 1922 1923 1924 1925 1926 1927 1928 1929 1930 1931 1932 1933 1934 1935 1936 1937 1938 1939 1940 1941 1942 1943 1944 1945 1946 1947 1948 1949 1950 1951 1952 1953 1954 1955 1956 1957 1958 1959 1960 1961 1962 1963 1964 1965 1966 1967 1968 1969 1970 1971 1972 1973 1974 1975 1976 1977 1978 1979 1980 1981 1982 1983 1984 1985 1986 1987 1988 1989 1990 1991 1992 1993 1994 1995 1996 1997 1998 1999 2000 2001 2002 2003 2004 2005 2006 2007 2008 2009 2010 2011 2012 2013 2014 2015 2016 2017 2018 2019 2020 2021 2022 2023 2024 2025 2026 2027 2028 2029 2030 2031 2032 2033 2034 2035 2036 2037 2038 2039 2040 2041 2042 2043 2044 2045 2046 2047 2048 2049 2050 2051 2052 2053 2054 2055 2056 2057 2058 2059 2060 2061 2062 2063 2064 2065 2066 2067 2068 2069 2070 2071 2072 2073 2074 2075 2076 2077 2078 2079 2080 2081 2082 2083 2084 2085 2086 2087 2088 2089 2090 2091 2092 2093 2094 2095 2096 2097 2098 2099 2100 2101 2102 2103 2104 2105 2106 2107 2108 2109 2110 2111 2112 2113 2114 2115 2116 2117 2118 2119 2120 2121 2122 2123 2124 2125 2126 2127 2128 2129 2130 2131 2132 2133 2134 2135 2136 2137 2138 2139 2140 2141 2142 2143 2144 2145 2146 2147 2148 2149 2150 2151 2152 2153 2154 2155 2156 2157 2158 2159 2160 2161 2162 2163 2164 2165 2166 2167 2168 2169 2170 2171 2172 2173 2174 2175 2176 2177 2178 2179 2180 2181 2182 2183 2184 2185 2186 2187 2188 2189 2190 2191 2192 2193 2194 2195 2196 2197 2198 2199 2200 2201 2202 2203 2204 2205 2206 2207 2208 2209 2210 2211 2212 2213 2214 2215 2216 2217 2218 2219 2220 2221 2222 2223 2224 2225 2226 2227 2228 2229 2230 2231 2232 2233 2234 2235 2236 2237 2238 2239 2240 2241 2242 2243 2244 2245 2246 2247 2248 2249 2250 2251 2252 2253 2254 2255 2256 2257 2258 2259 2260 2261 2262 2263 2264 2265 2266 2267 2268 2269 2270 2271 2272 2273 2274 2275 2276 2277 2278 2279 2280 2281 2282 2283 2284 2285 2286 2287 2288 2289 2290 2291 2292 2293 2294 2295 2296 2297 2298 2299 2300 2301 2302 2303 2304 2305 2306 2307 2308 2309 2310 2311 2312 2313 2314 2315 2316 2317 2318 2319 2320 2321 2322 2323 2324 2325 2326 2327 2328 2329 2330 2331 2332 2333 2334 2335 2336 2337 2338 2339 2340 2341 2342 2343 2344 2345 2346 2347 2348 2349 2350 2351 2352 2353 2354 2355 2356 2357 2358 2359 2360 2361 2362 2363 2364 2365 2366 2367 2368 2369 2370 2371 2372 2373 2374 2375 2376 2377 2378 2379 2380 2381 2382 2383 2384 2385 2386 2387 2388 2389 2390 2391 2392 2393 2394 2395 2396 2397 2398 2399 2400 2401 2402 2403 2404 2405 2406 2407 2408 2409 2410 2411 2412 2413 2414 2415 2416 2417 2418 2419 2420 2421 2422 2423 2424 2425 2426 2427 2428 2429 2430 2431 2432 2433 2434 2435 2436 2437 2438 2439 2440 2441 2442 2443 2444 2445 2446 2447 2448 2449 2450 2451 2452 2453 2454 2455 2456 2457 2458 2459 2460 2461 2462 2463 2464 2465 2466 2467 2468 2469 2470 2471 2472 2473 2474 2475 2476 2477 2478 2479 2480 2481 2482 2483 2484 2485 2486 2487 2488 2489 2490 2491 2492 2493 2494 2495 2496 2497 2498 2499 2500 2501 2502 2503 2504 2505 2506 2507 2508 2509 2510 2511 2512 2513 2514 2515 2516 2517 2518 2519 2520 2521 2522 2523 2524 2525 2526 2527 2528 2529 2530 2531 2532 2533 2534 2535 2536 2537 2538 2539 2540 2541 2542 2543 2544 2545 2546 2547 2548 2549 2550 2551 2552 2553 2554 2555 2556 2557 2558 2559 2560 2561 2562 2563 2564 2565 2566 2567 2568 2569 2570 2571 2572 2573 2574 2575 2576 2577 2578 2579 2580 2581 2582 2583 2584 2585 2586 2587 2588 2589 2590 2591 2592 2593 2594 2595 2596 2597 2598 2599 2600 2601 2602 2603 2604 2605 2606 2607 2608 2609 2610 2611 2612 2613 2614 2615 2616 2617 2618 2619 2620 2621 2622 2623 2624 2625 2626 2627 2628 2629 2630 2631 2632 2633 2634 2635 2636 2637 2638 2639 2640 2641 2642 2643 2644 2645 2646 2647 2648 2649 2650 2651 2652 2653 2654 2655 2656 2657 2658 2659 2660 2661 2662 2663 2664 2665 2666 2667 2668 2669 2670 2671 2672 2673 2674 2675 2676 2677 2678 2679 2680 2681 2682 2683 2684 2685 2686 2687 2688 2689 2690 2691 2692 2693 2694 2695 2696 2697 2698 2699 2700 2701 2702 2703 2704 2705 2706 2707 2708 2709 2710 2711 2712 2713 2714 2715 2716 2717 2718 2719 2720 2721 2722 2723 2724 2725 2726 2727 2728 2729 2730 2731 2732 2733 2734 2735 2736 2737 2738 2739 2740 2741 2742 2743 2744 2745 2746 2747 2748 2749 2750 2751 2752 2753 2754 2755 2756 2757 2758 2759 2760 2761 2762 2763 2764 2765 2766 2767 2768 2769 2770 2771 2772 2773 2774 2775 2776 2777 2778 2779 2780 2781 2782 2783 2784 2785 2786 2787 2788 2789 2790 2791 2792 2793 2794 2795 2796 2797 2798 2799 2800 2801 2802 2803 2804 2805 2806 2807 2808 2809 2810 2811 2812 2813 2814 2815 2816 2817 2818 2819 2820 2821 2822 2823 2824 2825 2826 2827 2828 2829 2830 2831 2832 2833 2834 2835 2836 2837 2838 2839 2840 2841 2842 2843 2844 2845 2846 2847 2848 2849 2850 2851 2852 2853 2854 2855 2856 2857 2858 2859 2860 2861 2862 2863 2864 2865 2866 2867 2868 2869 2870 2871 2872 2873 2874 2875 2876 2877 2878 2879 2880 2881 2882 2883 2884 2885 2886 2887 2888 2889 2890 2891 2892 2893 2894 2895 2896 2897 2898 2899 2900 2901 2902 2903 2904 2905 2906 2907 2908 2909 // SPDX-License-Identifier: GPL-2.0-or-later /* * NET3 IP device support routines. * * Derived from the IP parts of dev.c 1.0.19 * Authors: Ross Biro * Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG> * Mark Evans, <evansmp@uhura.aston.ac.uk> * * Additional Authors: * Alan Cox, <gw4pts@gw4pts.ampr.org> * Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru> * * Changes: * Alexey Kuznetsov: pa_* fields are replaced with ifaddr * lists. * Cyrus Durgin: updated for kmod * Matthias Andree: in devinet_ioctl, compare label and * address (4.4BSD alias style support), * fall back to comparing just the label * if no match found. */ #include <linux/uaccess.h> #include <linux/bitops.h> #include <linux/capability.h> #include <linux/module.h> #include <linux/types.h> #include <linux/kernel.h> #include <linux/sched/signal.h> #include <linux/string.h> #include <linux/mm.h> #include <linux/socket.h> #include <linux/sockios.h> #include <linux/in.h> #include <linux/errno.h> #include <linux/interrupt.h> #include <linux/if_addr.h> #include <linux/if_ether.h> #include <linux/inet.h> #include <linux/netdevice.h> #include <linux/etherdevice.h> #include <linux/skbuff.h> #include <linux/init.h> #include <linux/notifier.h> #include <linux/inetdevice.h> #include <linux/igmp.h> #include "igmp_internal.h" #include <linux/slab.h> #include <linux/hash.h> #ifdef CONFIG_SYSCTL #include <linux/sysctl.h> #endif #include <linux/kmod.h> #include <linux/netconf.h> #include <net/arp.h> #include <net/ip.h> #include <net/route.h> #include <net/ip_fib.h> #include <net/rtnetlink.h> #include <net/net_namespace.h> #include <net/addrconf.h> #define IPV6ONLY_FLAGS \ (IFA_F_NODAD | IFA_F_OPTIMISTIC | IFA_F_DADFAILED | \ IFA_F_HOMEADDRESS | IFA_F_TENTATIVE | \ IFA_F_MANAGETEMPADDR | IFA_F_STABLE_PRIVACY) static struct ipv4_devconf ipv4_devconf = { .data = { [IPV4_DEVCONF_ACCEPT_REDIRECTS - 1] = 1, [IPV4_DEVCONF_SEND_REDIRECTS - 1] = 1, [IPV4_DEVCONF_SECURE_REDIRECTS - 1] = 1, [IPV4_DEVCONF_SHARED_MEDIA - 1] = 1, [IPV4_DEVCONF_IGMPV2_UNSOLICITED_REPORT_INTERVAL - 1] = 10000 /*ms*/, [IPV4_DEVCONF_IGMPV3_UNSOLICITED_REPORT_INTERVAL - 1] = 1000 /*ms*/, [IPV4_DEVCONF_ARP_EVICT_NOCARRIER - 1] = 1, }, }; static struct ipv4_devconf ipv4_devconf_dflt = { .data = { [IPV4_DEVCONF_ACCEPT_REDIRECTS - 1] = 1, [IPV4_DEVCONF_SEND_REDIRECTS - 1] = 1, [IPV4_DEVCONF_SECURE_REDIRECTS - 1] = 1, [IPV4_DEVCONF_SHARED_MEDIA - 1] = 1, [IPV4_DEVCONF_ACCEPT_SOURCE_ROUTE - 1] = 1, [IPV4_DEVCONF_IGMPV2_UNSOLICITED_REPORT_INTERVAL - 1] = 10000 /*ms*/, [IPV4_DEVCONF_IGMPV3_UNSOLICITED_REPORT_INTERVAL - 1] = 1000 /*ms*/, [IPV4_DEVCONF_ARP_EVICT_NOCARRIER - 1] = 1, }, }; #define IPV4_DEVCONF_DFLT(net, attr) \ IPV4_DEVCONF((*net->ipv4.devconf_dflt), attr) static const struct nla_policy ifa_ipv4_policy[IFA_MAX+1] = { [IFA_LOCAL] = { .type = NLA_U32 }, [IFA_ADDRESS] = { .type = NLA_U32 }, [IFA_BROADCAST] = { .type = NLA_U32 }, [IFA_LABEL] = { .type = NLA_STRING, .len = IFNAMSIZ - 1 }, [IFA_CACHEINFO] = { .len = sizeof(struct ifa_cacheinfo) }, [IFA_FLAGS] = { .type = NLA_U32 }, [IFA_RT_PRIORITY] = { .type = NLA_U32 }, [IFA_TARGET_NETNSID] = { .type = NLA_S32 }, [IFA_PROTO] = { .type = NLA_U8 }, }; #define IN4_ADDR_HSIZE_SHIFT 8 #define IN4_ADDR_HSIZE (1U << IN4_ADDR_HSIZE_SHIFT) static u32 inet_addr_hash(const struct net *net, __be32 addr) { u32 val = __ipv4_addr_hash(addr, net_hash_mix(net)); return hash_32(val, IN4_ADDR_HSIZE_SHIFT); } static void inet_hash_insert(struct net *net, struct in_ifaddr *ifa) { u32 hash = inet_addr_hash(net, ifa->ifa_local); ASSERT_RTNL(); hlist_add_head_rcu(&ifa->addr_lst, &net->ipv4.inet_addr_lst[hash]); } static void inet_hash_remove(struct in_ifaddr *ifa) { ASSERT_RTNL(); hlist_del_init_rcu(&ifa->addr_lst); } /** * __ip_dev_find - find the first device with a given source address. * @net: the net namespace * @addr: the source address * @devref: if true, take a reference on the found device * * If a caller uses devref=false, it should be protected by RCU, or RTNL */ struct net_device *__ip_dev_find(struct net *net, __be32 addr, bool devref) { struct net_device *result = NULL; struct in_ifaddr *ifa; rcu_read_lock(); ifa = inet_lookup_ifaddr_rcu(net, addr); if (!ifa) { struct flowi4 fl4 = { .daddr = addr }; struct fib_result res = { 0 }; struct fib_table *local; /* Fallback to FIB local table so that communication * over loopback subnets work. */ local = fib_get_table(net, RT_TABLE_LOCAL); if (local && !fib_table_lookup(local, &fl4, &res, FIB_LOOKUP_NOREF) && res.type == RTN_LOCAL) result = FIB_RES_DEV(res); } else { result = ifa->ifa_dev->dev; } if (result && devref) dev_hold(result); rcu_read_unlock(); return result; } EXPORT_SYMBOL(__ip_dev_find); /* called under RCU lock */ struct in_ifaddr *inet_lookup_ifaddr_rcu(struct net *net, __be32 addr) { u32 hash = inet_addr_hash(net, addr); struct in_ifaddr *ifa; hlist_for_each_entry_rcu(ifa, &net->ipv4.inet_addr_lst[hash], addr_lst) if (ifa->ifa_local == addr) return ifa; return NULL; } static void rtmsg_ifa(int event, struct in_ifaddr *, struct nlmsghdr *, u32); static BLOCKING_NOTIFIER_HEAD(inetaddr_chain); static BLOCKING_NOTIFIER_HEAD(inetaddr_validator_chain); static void inet_del_ifa(struct in_device *in_dev, struct in_ifaddr __rcu **ifap, int destroy); #ifdef CONFIG_SYSCTL static int devinet_sysctl_register(struct in_device *idev); static void devinet_sysctl_unregister(struct in_device *idev); #else static int devinet_sysctl_register(struct in_device *idev) { return 0; } static void devinet_sysctl_unregister(struct in_device *idev) { } #endif /* Locks all the inet devices. */ static struct in_ifaddr *inet_alloc_ifa(struct in_device *in_dev) { struct in_ifaddr *ifa; ifa = kzalloc(sizeof(*ifa), GFP_KERNEL_ACCOUNT); if (!ifa) return NULL; in_dev_hold(in_dev); ifa->ifa_dev = in_dev; INIT_HLIST_NODE(&ifa->addr_lst); return ifa; } static void inet_rcu_free_ifa(struct rcu_head *head) { struct in_ifaddr *ifa = container_of(head, struct in_ifaddr, rcu_head); in_dev_put(ifa->ifa_dev); kfree(ifa); } static void inet_free_ifa(struct in_ifaddr *ifa) { /* Our reference to ifa->ifa_dev must be freed ASAP * to release the reference to the netdev the same way. * in_dev_put() -> in_dev_finish_destroy() -> netdev_put() */ call_rcu_hurry(&ifa->rcu_head, inet_rcu_free_ifa); } static void in_dev_free_rcu(struct rcu_head *head) { struct in_device *idev = container_of(head, struct in_device, rcu_head); kfree(rcu_dereference_protected(idev->mc_hash, 1)); kfree(idev); } void in_dev_finish_destroy(struct in_device *idev) { struct net_device *dev = idev->dev; WARN_ON(idev->ifa_list); WARN_ON(idev->mc_list); #ifdef NET_REFCNT_DEBUG pr_debug("%s: %p=%s\n", __func__, idev, dev ? dev->name : "NIL"); #endif netdev_put(dev, &idev->dev_tracker); if (!idev->dead) pr_err("Freeing alive in_device %p\n", idev); else call_rcu(&idev->rcu_head, in_dev_free_rcu); } EXPORT_SYMBOL(in_dev_finish_destroy); static struct in_device *inetdev_init(struct net_device *dev) { struct in_device *in_dev; int err = -ENOMEM; ASSERT_RTNL(); in_dev = kzalloc(sizeof(*in_dev), GFP_KERNEL); if (!in_dev) goto out; memcpy(&in_dev->cnf, dev_net(dev)->ipv4.devconf_dflt, sizeof(in_dev->cnf)); in_dev->cnf.sysctl = NULL; in_dev->dev = dev; in_dev->arp_parms = neigh_parms_alloc(dev, &arp_tbl); if (!in_dev->arp_parms) goto out_kfree; if (IPV4_DEVCONF(in_dev->cnf, FORWARDING)) netif_disable_lro(dev); /* Reference in_dev->dev */ netdev_hold(dev, &in_dev->dev_tracker, GFP_KERNEL); /* Account for reference dev->ip_ptr (below) */ refcount_set(&in_dev->refcnt, 1); if (dev != blackhole_netdev) { err = devinet_sysctl_register(in_dev); if (err) { in_dev->dead = 1; neigh_parms_release(&arp_tbl, in_dev->arp_parms); in_dev_put(in_dev); in_dev = NULL; goto out; } ip_mc_init_dev(in_dev); if (dev->flags & IFF_UP) ip_mc_up(in_dev); } /* we can receive as soon as ip_ptr is set -- do this last */ rcu_assign_pointer(dev->ip_ptr, in_dev); out: return in_dev ?: ERR_PTR(err); out_kfree: kfree(in_dev); in_dev = NULL; goto out; } static void inetdev_destroy(struct in_device *in_dev) { struct net_device *dev; struct in_ifaddr *ifa; ASSERT_RTNL(); dev = in_dev->dev; in_dev->dead = 1; ip_mc_destroy_dev(in_dev); while ((ifa = rtnl_dereference(in_dev->ifa_list)) != NULL) { inet_del_ifa(in_dev, &in_dev->ifa_list, 0); inet_free_ifa(ifa); } RCU_INIT_POINTER(dev->ip_ptr, NULL); devinet_sysctl_unregister(in_dev); neigh_parms_release(&arp_tbl, in_dev->arp_parms); arp_ifdown(dev); in_dev_put(in_dev); } static int __init inet_blackhole_dev_init(void) { int err = 0; rtnl_lock(); if (!inetdev_init(blackhole_netdev)) err = -ENOMEM; rtnl_unlock(); return err; } late_initcall(inet_blackhole_dev_init); int inet_addr_onlink(struct in_device *in_dev, __be32 a, __be32 b) { const struct in_ifaddr *ifa; rcu_read_lock(); in_dev_for_each_ifa_rcu(ifa, in_dev) { if (inet_ifa_match(a, ifa)) { if (!b || inet_ifa_match(b, ifa)) { rcu_read_unlock(); return 1; } } } rcu_read_unlock(); return 0; } static void __inet_del_ifa(struct in_device *in_dev, struct in_ifaddr __rcu **ifap, int destroy, struct nlmsghdr *nlh, u32 portid) { struct in_ifaddr *promote = NULL; struct in_ifaddr *ifa, *ifa1; struct in_ifaddr __rcu **last_prim; struct in_ifaddr *prev_prom = NULL; int do_promote = IN_DEV_PROMOTE_SECONDARIES(in_dev); ASSERT_RTNL(); ifa1 = rtnl_dereference(*ifap); last_prim = ifap; if (in_dev->dead) goto no_promotions; /* 1. Deleting primary ifaddr forces deletion all secondaries * unless alias promotion is set **/ if (!(ifa1->ifa_flags & IFA_F_SECONDARY)) { struct in_ifaddr __rcu **ifap1 = &ifa1->ifa_next; while ((ifa = rtnl_dereference(*ifap1)) != NULL) { if (!(ifa->ifa_flags & IFA_F_SECONDARY) && ifa1->ifa_scope <= ifa->ifa_scope) last_prim = &ifa->ifa_next; if (!(ifa->ifa_flags & IFA_F_SECONDARY) || ifa1->ifa_mask != ifa->ifa_mask || !inet_ifa_match(ifa1->ifa_address, ifa)) { ifap1 = &ifa->ifa_next; prev_prom = ifa; continue; } if (!do_promote) { inet_hash_remove(ifa); *ifap1 = ifa->ifa_next; rtmsg_ifa(RTM_DELADDR, ifa, nlh, portid); blocking_notifier_call_chain(&inetaddr_chain, NETDEV_DOWN, ifa); inet_free_ifa(ifa); } else { promote = ifa; break; } } } /* On promotion all secondaries from subnet are changing * the primary IP, we must remove all their routes silently * and later to add them back with new prefsrc. Do this * while all addresses are on the device list. */ for (ifa = promote; ifa; ifa = rtnl_dereference(ifa->ifa_next)) { if (ifa1->ifa_mask == ifa->ifa_mask && inet_ifa_match(ifa1->ifa_address, ifa)) fib_del_ifaddr(ifa, ifa1); } no_promotions: /* 2. Unlink it */ *ifap = ifa1->ifa_next; inet_hash_remove(ifa1); /* 3. Announce address deletion */ /* Send message first, then call notifier. At first sight, FIB update triggered by notifier will refer to already deleted ifaddr, that could confuse netlink listeners. It is not true: look, gated sees that route deleted and if it still thinks that ifaddr is valid, it will try to restore deleted routes... Grr. So that, this order is correct. */ rtmsg_ifa(RTM_DELADDR, ifa1, nlh, portid); blocking_notifier_call_chain(&inetaddr_chain, NETDEV_DOWN, ifa1); if (promote) { struct in_ifaddr *next_sec; next_sec = rtnl_dereference(promote->ifa_next); if (prev_prom) { struct in_ifaddr *last_sec; rcu_assign_pointer(prev_prom->ifa_next, next_sec); last_sec = rtnl_dereference(*last_prim); rcu_assign_pointer(promote->ifa_next, last_sec); rcu_assign_pointer(*last_prim, promote); } promote->ifa_flags &= ~IFA_F_SECONDARY; rtmsg_ifa(RTM_NEWADDR, promote, nlh, portid); blocking_notifier_call_chain(&inetaddr_chain, NETDEV_UP, promote); for (ifa = next_sec; ifa; ifa = rtnl_dereference(ifa->ifa_next)) { if (ifa1->ifa_mask != ifa->ifa_mask || !inet_ifa_match(ifa1->ifa_address, ifa)) continue; fib_add_ifaddr(ifa); } } if (destroy) inet_free_ifa(ifa1); } static void inet_del_ifa(struct in_device *in_dev, struct in_ifaddr __rcu **ifap, int destroy) { __inet_del_ifa(in_dev, ifap, destroy, NULL, 0); } static int __inet_insert_ifa(struct in_ifaddr *ifa, struct nlmsghdr *nlh, u32 portid, struct netlink_ext_ack *extack) { struct in_ifaddr __rcu **last_primary, **ifap; struct in_device *in_dev = ifa->ifa_dev; struct net *net = dev_net(in_dev->dev); struct in_validator_info ivi; struct in_ifaddr *ifa1; int ret; ASSERT_RTNL(); ifa->ifa_flags &= ~IFA_F_SECONDARY; last_primary = &in_dev->ifa_list; /* Don't set IPv6 only flags to IPv4 addresses */ ifa->ifa_flags &= ~IPV6ONLY_FLAGS; ifap = &in_dev->ifa_list; ifa1 = rtnl_dereference(*ifap); while (ifa1) { if (!(ifa1->ifa_flags & IFA_F_SECONDARY) && ifa->ifa_scope <= ifa1->ifa_scope) last_primary = &ifa1->ifa_next; if (ifa1->ifa_mask == ifa->ifa_mask && inet_ifa_match(ifa1->ifa_address, ifa)) { if (ifa1->ifa_local == ifa->ifa_local) { inet_free_ifa(ifa); return -EEXIST; } if (ifa1->ifa_scope != ifa->ifa_scope) { NL_SET_ERR_MSG(extack, "ipv4: Invalid scope value"); inet_free_ifa(ifa); return -EINVAL; } ifa->ifa_flags |= IFA_F_SECONDARY; } ifap = &ifa1->ifa_next; ifa1 = rtnl_dereference(*ifap); } /* Allow any devices that wish to register ifaddr validtors to weigh * in now, before changes are committed. The rntl lock is serializing * access here, so the state should not change between a validator call * and a final notify on commit. This isn't invoked on promotion under * the assumption that validators are checking the address itself, and * not the flags. */ ivi.ivi_addr = ifa->ifa_address; ivi.ivi_dev = ifa->ifa_dev; ivi.extack = extack; ret = blocking_notifier_call_chain(&inetaddr_validator_chain, NETDEV_UP, &ivi); ret = notifier_to_errno(ret); if (ret) { inet_free_ifa(ifa); return ret; } if (!(ifa->ifa_flags & IFA_F_SECONDARY)) ifap = last_primary; rcu_assign_pointer(ifa->ifa_next, *ifap); rcu_assign_pointer(*ifap, ifa); inet_hash_insert(dev_net(in_dev->dev), ifa); cancel_delayed_work(&net->ipv4.addr_chk_work); queue_delayed_work(system_power_efficient_wq, &net->ipv4.addr_chk_work, 0); /* Send message first, then call notifier. Notifier will trigger FIB update, so that listeners of netlink will know about new ifaddr */ rtmsg_ifa(RTM_NEWADDR, ifa, nlh, portid); blocking_notifier_call_chain(&inetaddr_chain, NETDEV_UP, ifa); return 0; } static int inet_insert_ifa(struct in_ifaddr *ifa) { if (!ifa->ifa_local) { inet_free_ifa(ifa); return 0; } return __inet_insert_ifa(ifa, NULL, 0, NULL); } static int inet_set_ifa(struct net_device *dev, struct in_ifaddr *ifa) { struct in_device *in_dev = __in_dev_get_rtnl_net(dev); ipv4_devconf_setall(in_dev); neigh_parms_data_state_setall(in_dev->arp_parms); if (ipv4_is_loopback(ifa->ifa_local)) ifa->ifa_scope = RT_SCOPE_HOST; return inet_insert_ifa(ifa); } /* Caller must hold RCU or RTNL : * We dont take a reference on found in_device */ struct in_device *inetdev_by_index(struct net *net, int ifindex) { struct net_device *dev; struct in_device *in_dev = NULL; rcu_read_lock(); dev = dev_get_by_index_rcu(net, ifindex); if (dev) in_dev = rcu_dereference_rtnl(dev->ip_ptr); rcu_read_unlock(); return in_dev; } EXPORT_SYMBOL(inetdev_by_index); /* Called only from RTNL semaphored context. No locks. */ struct in_ifaddr *inet_ifa_byprefix(struct in_device *in_dev, __be32 prefix, __be32 mask) { struct in_ifaddr *ifa; ASSERT_RTNL(); in_dev_for_each_ifa_rtnl(ifa, in_dev) { if (ifa->ifa_mask == mask && inet_ifa_match(prefix, ifa)) return ifa; } return NULL; } static int ip_mc_autojoin_config(struct net *net, bool join, const struct in_ifaddr *ifa) { #if defined(CONFIG_IP_MULTICAST) struct ip_mreqn mreq = { .imr_multiaddr.s_addr = ifa->ifa_address, .imr_ifindex = ifa->ifa_dev->dev->ifindex, }; struct sock *sk = net->ipv4.mc_autojoin_sk; int ret; ASSERT_RTNL_NET(net); lock_sock(sk); if (join) ret = ip_mc_join_group(sk, &mreq); else ret = ip_mc_leave_group(sk, &mreq); release_sock(sk); return ret; #else return -EOPNOTSUPP; #endif } static int inet_rtm_deladdr(struct sk_buff *skb, struct nlmsghdr *nlh, struct netlink_ext_ack *extack) { struct net *net = sock_net(skb->sk); struct in_ifaddr __rcu **ifap; struct nlattr *tb[IFA_MAX+1]; struct in_device *in_dev; struct ifaddrmsg *ifm; struct in_ifaddr *ifa; int err; err = nlmsg_parse_deprecated(nlh, sizeof(*ifm), tb, IFA_MAX, ifa_ipv4_policy, extack); if (err < 0) goto out; ifm = nlmsg_data(nlh); rtnl_net_lock(net); in_dev = inetdev_by_index(net, ifm->ifa_index); if (!in_dev) { NL_SET_ERR_MSG(extack, "ipv4: Device not found"); err = -ENODEV; goto unlock; } for (ifap = &in_dev->ifa_list; (ifa = rtnl_net_dereference(net, *ifap)) != NULL; ifap = &ifa->ifa_next) { if (tb[IFA_LOCAL] && ifa->ifa_local != nla_get_in_addr(tb[IFA_LOCAL])) continue; if (tb[IFA_LABEL] && nla_strcmp(tb[IFA_LABEL], ifa->ifa_label)) continue; if (tb[IFA_ADDRESS] && (ifm->ifa_prefixlen != ifa->ifa_prefixlen || !inet_ifa_match(nla_get_in_addr(tb[IFA_ADDRESS]), ifa))) continue; if (ipv4_is_multicast(ifa->ifa_address)) ip_mc_autojoin_config(net, false, ifa); __inet_del_ifa(in_dev, ifap, 1, nlh, NETLINK_CB(skb).portid); goto unlock; } NL_SET_ERR_MSG(extack, "ipv4: Address not found"); err = -EADDRNOTAVAIL; unlock: rtnl_net_unlock(net); out: return err; } static void check_lifetime(struct work_struct *work) { unsigned long now, next, next_sec, next_sched; struct in_ifaddr *ifa; struct hlist_node *n; struct net *net; int i; net = container_of(to_delayed_work(work), struct net, ipv4.addr_chk_work); now = jiffies; next = round_jiffies_up(now + ADDR_CHECK_FREQUENCY); for (i = 0; i < IN4_ADDR_HSIZE; i++) { struct hlist_head *head = &net->ipv4.inet_addr_lst[i]; bool change_needed = false; rcu_read_lock(); hlist_for_each_entry_rcu(ifa, head, addr_lst) { unsigned long age, tstamp; u32 preferred_lft; u32 valid_lft; u32 flags; flags = READ_ONCE(ifa->ifa_flags); if (flags & IFA_F_PERMANENT) continue; preferred_lft = READ_ONCE(ifa->ifa_preferred_lft); valid_lft = READ_ONCE(ifa->ifa_valid_lft); tstamp = READ_ONCE(ifa->ifa_tstamp); /* We try to batch several events at once. */ age = (now - tstamp + ADDRCONF_TIMER_FUZZ_MINUS) / HZ; if (valid_lft != INFINITY_LIFE_TIME && age >= valid_lft) { change_needed = true; } else if (preferred_lft == INFINITY_LIFE_TIME) { continue; } else if (age >= preferred_lft) { if (time_before(tstamp + valid_lft * HZ, next)) next = tstamp + valid_lft * HZ; if (!(flags & IFA_F_DEPRECATED)) change_needed = true; } else if (time_before(tstamp + preferred_lft * HZ, next)) { next = tstamp + preferred_lft * HZ; } } rcu_read_unlock(); if (!change_needed) continue; rtnl_net_lock(net); hlist_for_each_entry_safe(ifa, n, head, addr_lst) { unsigned long age; if (ifa->ifa_flags & IFA_F_PERMANENT) continue; /* We try to batch several events at once. */ age = (now - ifa->ifa_tstamp + ADDRCONF_TIMER_FUZZ_MINUS) / HZ; if (ifa->ifa_valid_lft != INFINITY_LIFE_TIME && age >= ifa->ifa_valid_lft) { struct in_ifaddr __rcu **ifap; struct in_ifaddr *tmp; ifap = &ifa->ifa_dev->ifa_list; tmp = rtnl_net_dereference(net, *ifap); while (tmp) { if (tmp == ifa) { inet_del_ifa(ifa->ifa_dev, ifap, 1); break; } ifap = &tmp->ifa_next; tmp = rtnl_net_dereference(net, *ifap); } } else if (ifa->ifa_preferred_lft != INFINITY_LIFE_TIME && age >= ifa->ifa_preferred_lft && !(ifa->ifa_flags & IFA_F_DEPRECATED)) { ifa->ifa_flags |= IFA_F_DEPRECATED; rtmsg_ifa(RTM_NEWADDR, ifa, NULL, 0); } } rtnl_net_unlock(net); } next_sec = round_jiffies_up(next); next_sched = next; /* If rounded timeout is accurate enough, accept it. */ if (time_before(next_sec, next + ADDRCONF_TIMER_FUZZ)) next_sched = next_sec; now = jiffies; /* And minimum interval is ADDRCONF_TIMER_FUZZ_MAX. */ if (time_before(next_sched, now + ADDRCONF_TIMER_FUZZ_MAX)) next_sched = now + ADDRCONF_TIMER_FUZZ_MAX; queue_delayed_work(system_power_efficient_wq, &net->ipv4.addr_chk_work, next_sched - now); } static void set_ifa_lifetime(struct in_ifaddr *ifa, __u32 valid_lft, __u32 prefered_lft) { unsigned long timeout; u32 flags; flags = ifa->ifa_flags & ~(IFA_F_PERMANENT | IFA_F_DEPRECATED); timeout = addrconf_timeout_fixup(valid_lft, HZ); if (addrconf_finite_timeout(timeout)) WRITE_ONCE(ifa->ifa_valid_lft, timeout); else flags |= IFA_F_PERMANENT; timeout = addrconf_timeout_fixup(prefered_lft, HZ); if (addrconf_finite_timeout(timeout)) { if (timeout == 0) flags |= IFA_F_DEPRECATED; WRITE_ONCE(ifa->ifa_preferred_lft, timeout); } WRITE_ONCE(ifa->ifa_flags, flags); WRITE_ONCE(ifa->ifa_tstamp, jiffies); if (!ifa->ifa_cstamp) WRITE_ONCE(ifa->ifa_cstamp, ifa->ifa_tstamp); } static int inet_validate_rtm(struct nlmsghdr *nlh, struct nlattr **tb, struct netlink_ext_ack *extack, __u32 *valid_lft, __u32 *prefered_lft) { struct ifaddrmsg *ifm = nlmsg_data(nlh); int err; err = nlmsg_parse_deprecated(nlh, sizeof(*ifm), tb, IFA_MAX, ifa_ipv4_policy, extack); if (err < 0) return err; if (ifm->ifa_prefixlen > 32) { NL_SET_ERR_MSG(extack, "ipv4: Invalid prefix length"); return -EINVAL; } if (!tb[IFA_LOCAL]) { NL_SET_ERR_MSG(extack, "ipv4: Local address is not supplied"); return -EINVAL; } if (tb[IFA_CACHEINFO]) { struct ifa_cacheinfo *ci; ci = nla_data(tb[IFA_CACHEINFO]); if (!ci->ifa_valid || ci->ifa_prefered > ci->ifa_valid) { NL_SET_ERR_MSG(extack, "ipv4: address lifetime invalid"); return -EINVAL; } *valid_lft = ci->ifa_valid; *prefered_lft = ci->ifa_prefered; } return 0; } static struct in_ifaddr *inet_rtm_to_ifa(struct net *net, struct nlmsghdr *nlh, struct nlattr **tb, struct netlink_ext_ack *extack) { struct ifaddrmsg *ifm = nlmsg_data(nlh); struct in_device *in_dev; struct net_device *dev; struct in_ifaddr *ifa; int err; dev = __dev_get_by_index(net, ifm->ifa_index); err = -ENODEV; if (!dev) { NL_SET_ERR_MSG(extack, "ipv4: Device not found"); goto errout; } in_dev = __in_dev_get_rtnl_net(dev); err = -ENOBUFS; if (!in_dev) goto errout; ifa = inet_alloc_ifa(in_dev); if (!ifa) /* * A potential indev allocation can be left alive, it stays * assigned to its device and is destroy with it. */ goto errout; ipv4_devconf_setall(in_dev); neigh_parms_data_state_setall(in_dev->arp_parms); if (!tb[IFA_ADDRESS]) tb[IFA_ADDRESS] = tb[IFA_LOCAL]; ifa->ifa_prefixlen = ifm->ifa_prefixlen; ifa->ifa_mask = inet_make_mask(ifm->ifa_prefixlen); ifa->ifa_flags = nla_get_u32_default(tb[IFA_FLAGS], ifm->ifa_flags); ifa->ifa_scope = ifm->ifa_scope; ifa->ifa_local = nla_get_in_addr(tb[IFA_LOCAL]); ifa->ifa_address = nla_get_in_addr(tb[IFA_ADDRESS]); if (tb[IFA_BROADCAST]) ifa->ifa_broadcast = nla_get_in_addr(tb[IFA_BROADCAST]); if (tb[IFA_LABEL]) nla_strscpy(ifa->ifa_label, tb[IFA_LABEL], IFNAMSIZ); else memcpy(ifa->ifa_label, dev->name, IFNAMSIZ); if (tb[IFA_RT_PRIORITY]) ifa->ifa_rt_priority = nla_get_u32(tb[IFA_RT_PRIORITY]); if (tb[IFA_PROTO]) ifa->ifa_proto = nla_get_u8(tb[IFA_PROTO]); return ifa; errout: return ERR_PTR(err); } static struct in_ifaddr *find_matching_ifa(struct net *net, struct in_ifaddr *ifa) { struct in_device *in_dev = ifa->ifa_dev; struct in_ifaddr *ifa1; in_dev_for_each_ifa_rtnl_net(net, ifa1, in_dev) { if (ifa1->ifa_mask == ifa->ifa_mask && inet_ifa_match(ifa1->ifa_address, ifa) && ifa1->ifa_local == ifa->ifa_local) return ifa1; } return NULL; } static int inet_rtm_newaddr(struct sk_buff *skb, struct nlmsghdr *nlh, struct netlink_ext_ack *extack) { __u32 prefered_lft = INFINITY_LIFE_TIME; __u32 valid_lft = INFINITY_LIFE_TIME; struct net *net = sock_net(skb->sk); struct in_ifaddr *ifa_existing; struct nlattr *tb[IFA_MAX + 1]; struct in_ifaddr *ifa; int ret; ret = inet_validate_rtm(nlh, tb, extack, &valid_lft, &prefered_lft); if (ret < 0) return ret; if (!nla_get_in_addr(tb[IFA_LOCAL])) return 0; rtnl_net_lock(net); ifa = inet_rtm_to_ifa(net, nlh, tb, extack); if (IS_ERR(ifa)) { ret = PTR_ERR(ifa); goto unlock; } ifa_existing = find_matching_ifa(net, ifa); if (!ifa_existing) { /* It would be best to check for !NLM_F_CREATE here but * userspace already relies on not having to provide this. */ set_ifa_lifetime(ifa, valid_lft, prefered_lft); if (ifa->ifa_flags & IFA_F_MCAUTOJOIN) { ret = ip_mc_autojoin_config(net, true, ifa); if (ret < 0) { NL_SET_ERR_MSG(extack, "ipv4: Multicast auto join failed"); inet_free_ifa(ifa); goto unlock; } } ret = __inet_insert_ifa(ifa, nlh, NETLINK_CB(skb).portid, extack); } else { u32 new_metric = ifa->ifa_rt_priority; u8 new_proto = ifa->ifa_proto; inet_free_ifa(ifa); if (nlh->nlmsg_flags & NLM_F_EXCL || !(nlh->nlmsg_flags & NLM_F_REPLACE)) { NL_SET_ERR_MSG(extack, "ipv4: Address already assigned"); ret = -EEXIST; goto unlock; } ifa = ifa_existing; if (ifa->ifa_rt_priority != new_metric) { fib_modify_prefix_metric(ifa, new_metric); ifa->ifa_rt_priority = new_metric; } ifa->ifa_proto = new_proto; set_ifa_lifetime(ifa, valid_lft, prefered_lft); cancel_delayed_work(&net->ipv4.addr_chk_work); queue_delayed_work(system_power_efficient_wq, &net->ipv4.addr_chk_work, 0); rtmsg_ifa(RTM_NEWADDR, ifa, nlh, NETLINK_CB(skb).portid); } unlock: rtnl_net_unlock(net); return ret; } /* * Determine a default network mask, based on the IP address. */ static int inet_abc_len(__be32 addr) { int rc = -1; /* Something else, probably a multicast. */ if (ipv4_is_zeronet(addr) || ipv4_is_lbcast(addr)) rc = 0; else { __u32 haddr = ntohl(addr); if (IN_CLASSA(haddr)) rc = 8; else if (IN_CLASSB(haddr)) rc = 16; else if (IN_CLASSC(haddr)) rc = 24; else if (IN_CLASSE(haddr)) rc = 32; } return rc; } int devinet_ioctl(struct net *net, unsigned int cmd, struct ifreq *ifr) { struct sockaddr_in sin_orig; struct sockaddr_in *sin = (struct sockaddr_in *)&ifr->ifr_addr; struct in_ifaddr __rcu **ifap = NULL; struct in_device *in_dev; struct in_ifaddr *ifa = NULL; struct net_device *dev; char *colon; int ret = -EFAULT; int tryaddrmatch = 0; ifr->ifr_name[IFNAMSIZ - 1] = 0; /* save original address for comparison */ memcpy(&sin_orig, sin, sizeof(*sin)); colon = strchr(ifr->ifr_name, ':'); if (colon) *colon = 0; dev_load(net, ifr->ifr_name); switch (cmd) { case SIOCGIFADDR: /* Get interface address */ case SIOCGIFBRDADDR: /* Get the broadcast address */ case SIOCGIFDSTADDR: /* Get the destination address */ case SIOCGIFNETMASK: /* Get the netmask for the interface */ /* Note that these ioctls will not sleep, so that we do not impose a lock. One day we will be forced to put shlock here (I mean SMP) */ tryaddrmatch = (sin_orig.sin_family == AF_INET); memset(sin, 0, sizeof(*sin)); sin->sin_family = AF_INET; break; case SIOCSIFFLAGS: ret = -EPERM; if (!ns_capable(net->user_ns, CAP_NET_ADMIN)) goto out; break; case SIOCSIFADDR: /* Set interface address (and family) */ case SIOCSIFBRDADDR: /* Set the broadcast address */ case SIOCSIFDSTADDR: /* Set the destination address */ case SIOCSIFNETMASK: /* Set the netmask for the interface */ ret = -EPERM; if (!ns_capable(net->user_ns, CAP_NET_ADMIN)) goto out; ret = -EINVAL; if (sin->sin_family != AF_INET) goto out; break; default: ret = -EINVAL; goto out; } rtnl_net_lock(net); ret = -ENODEV; dev = __dev_get_by_name(net, ifr->ifr_name); if (!dev) goto done; if (colon) *colon = ':'; in_dev = __in_dev_get_rtnl_net(dev); if (in_dev) { if (tryaddrmatch) { /* Matthias Andree */ /* compare label and address (4.4BSD style) */ /* note: we only do this for a limited set of ioctls and only if the original address family was AF_INET. This is checked above. */ for (ifap = &in_dev->ifa_list; (ifa = rtnl_net_dereference(net, *ifap)) != NULL; ifap = &ifa->ifa_next) { if (!strcmp(ifr->ifr_name, ifa->ifa_label) && sin_orig.sin_addr.s_addr == ifa->ifa_local) { break; /* found */ } } } /* we didn't get a match, maybe the application is 4.3BSD-style and passed in junk so we fall back to comparing just the label */ if (!ifa) { for (ifap = &in_dev->ifa_list; (ifa = rtnl_net_dereference(net, *ifap)) != NULL; ifap = &ifa->ifa_next) if (!strcmp(ifr->ifr_name, ifa->ifa_label)) break; } } ret = -EADDRNOTAVAIL; if (!ifa && cmd != SIOCSIFADDR && cmd != SIOCSIFFLAGS) goto done; switch (cmd) { case SIOCGIFADDR: /* Get interface address */ ret = 0; sin->sin_addr.s_addr = ifa->ifa_local; break; case SIOCGIFBRDADDR: /* Get the broadcast address */ ret = 0; sin->sin_addr.s_addr = ifa->ifa_broadcast; break; case SIOCGIFDSTADDR: /* Get the destination address */ ret = 0; sin->sin_addr.s_addr = ifa->ifa_address; break; case SIOCGIFNETMASK: /* Get the netmask for the interface */ ret = 0; sin->sin_addr.s_addr = ifa->ifa_mask; break; case SIOCSIFFLAGS: if (colon) { ret = -EADDRNOTAVAIL; if (!ifa) break; ret = 0; if (!(ifr->ifr_flags & IFF_UP)) inet_del_ifa(in_dev, ifap, 1); break; } /* NETDEV_UP/DOWN/CHANGE could touch a peer dev */ ASSERT_RTNL(); ret = dev_change_flags(dev, ifr->ifr_flags, NULL); break; case SIOCSIFADDR: /* Set interface address (and family) */ ret = -EINVAL; if (inet_abc_len(sin->sin_addr.s_addr) < 0) break; if (!ifa) { ret = -ENOBUFS; if (!in_dev) break; ifa = inet_alloc_ifa(in_dev); if (!ifa) break; if (colon) memcpy(ifa->ifa_label, ifr->ifr_name, IFNAMSIZ); else memcpy(ifa->ifa_label, dev->name, IFNAMSIZ); } else { ret = 0; if (ifa->ifa_local == sin->sin_addr.s_addr) break; inet_del_ifa(in_dev, ifap, 0); ifa->ifa_broadcast = 0; ifa->ifa_scope = 0; } ifa->ifa_address = ifa->ifa_local = sin->sin_addr.s_addr; if (!(dev->flags & IFF_POINTOPOINT)) { ifa->ifa_prefixlen = inet_abc_len(ifa->ifa_address); ifa->ifa_mask = inet_make_mask(ifa->ifa_prefixlen); if ((dev->flags & IFF_BROADCAST) && ifa->ifa_prefixlen < 31) ifa->ifa_broadcast = ifa->ifa_address | ~ifa->ifa_mask; } else { ifa->ifa_prefixlen = 32; ifa->ifa_mask = inet_make_mask(32); } set_ifa_lifetime(ifa, INFINITY_LIFE_TIME, INFINITY_LIFE_TIME); ret = inet_set_ifa(dev, ifa); break; case SIOCSIFBRDADDR: /* Set the broadcast address */ ret = 0; if (ifa->ifa_broadcast != sin->sin_addr.s_addr) { inet_del_ifa(in_dev, ifap, 0); ifa->ifa_broadcast = sin->sin_addr.s_addr; inet_insert_ifa(ifa); } break; case SIOCSIFDSTADDR: /* Set the destination address */ ret = 0; if (ifa->ifa_address == sin->sin_addr.s_addr) break; ret = -EINVAL; if (inet_abc_len(sin->sin_addr.s_addr) < 0) break; ret = 0; inet_del_ifa(in_dev, ifap, 0); ifa->ifa_address = sin->sin_addr.s_addr; inet_insert_ifa(ifa); break; case SIOCSIFNETMASK: /* Set the netmask for the interface */ /* * The mask we set must be legal. */ ret = -EINVAL; if (bad_mask(sin->sin_addr.s_addr, 0)) break; ret = 0; if (ifa->ifa_mask != sin->sin_addr.s_addr) { __be32 old_mask = ifa->ifa_mask; inet_del_ifa(in_dev, ifap, 0); ifa->ifa_mask = sin->sin_addr.s_addr; ifa->ifa_prefixlen = inet_mask_len(ifa->ifa_mask); /* See if current broadcast address matches * with current netmask, then recalculate * the broadcast address. Otherwise it's a * funny address, so don't touch it since * the user seems to know what (s)he's doing... */ if ((dev->flags & IFF_BROADCAST) && (ifa->ifa_prefixlen < 31) && (ifa->ifa_broadcast == (ifa->ifa_local|~old_mask))) { ifa->ifa_broadcast = (ifa->ifa_local | ~sin->sin_addr.s_addr); } inet_insert_ifa(ifa); } break; } done: rtnl_net_unlock(net); out: return ret; } int inet_gifconf(struct net_device *dev, char __user *buf, int len, int size) { struct in_device *in_dev = __in_dev_get_rtnl_net(dev); const struct in_ifaddr *ifa; struct ifreq ifr; int done = 0; if (WARN_ON(size > sizeof(struct ifreq))) goto out; if (!in_dev) goto out; in_dev_for_each_ifa_rtnl_net(dev_net(dev), ifa, in_dev) { if (!buf) { done += size; continue; } if (len < size) break; memset(&ifr, 0, sizeof(struct ifreq)); strcpy(ifr.ifr_name, ifa->ifa_label); (*(struct sockaddr_in *)&ifr.ifr_addr).sin_family = AF_INET; (*(struct sockaddr_in *)&ifr.ifr_addr).sin_addr.s_addr = ifa->ifa_local; if (copy_to_user(buf + done, &ifr, size)) { done = -EFAULT; break; } len -= size; done += size; } out: return done; } static __be32 in_dev_select_addr(const struct in_device *in_dev, int scope) { const struct in_ifaddr *ifa; in_dev_for_each_ifa_rcu(ifa, in_dev) { if (READ_ONCE(ifa->ifa_flags) & IFA_F_SECONDARY) continue; if (ifa->ifa_scope != RT_SCOPE_LINK && ifa->ifa_scope <= scope) return ifa->ifa_local; } return 0; } __be32 inet_select_addr(const struct net_device *dev, __be32 dst, int scope) { const struct in_ifaddr *ifa; __be32 addr = 0; unsigned char localnet_scope = RT_SCOPE_HOST; struct in_device *in_dev; struct net *net; int master_idx; rcu_read_lock(); net = dev_net_rcu(dev); in_dev = __in_dev_get_rcu(dev); if (!in_dev) goto no_in_dev; if (unlikely(IN_DEV_ROUTE_LOCALNET(in_dev))) localnet_scope = RT_SCOPE_LINK; in_dev_for_each_ifa_rcu(ifa, in_dev) { if (READ_ONCE(ifa->ifa_flags) & IFA_F_SECONDARY) continue; if (min(ifa->ifa_scope, localnet_scope) > scope) continue; if (!dst || inet_ifa_match(dst, ifa)) { addr = ifa->ifa_local; break; } if (!addr) addr = ifa->ifa_local; } if (addr) goto out_unlock; no_in_dev: master_idx = l3mdev_master_ifindex_rcu(dev); /* For VRFs, the VRF device takes the place of the loopback device, * with addresses on it being preferred. Note in such cases the * loopback device will be among the devices that fail the master_idx * equality check in the loop below. */ if (master_idx && (dev = dev_get_by_index_rcu(net, master_idx)) && (in_dev = __in_dev_get_rcu(dev))) { addr = in_dev_select_addr(in_dev, scope); if (addr) goto out_unlock; } /* Not loopback addresses on loopback should be preferred in this case. It is important that lo is the first interface in dev_base list. */ for_each_netdev_rcu(net, dev) { if (l3mdev_master_ifindex_rcu(dev) != master_idx) continue; in_dev = __in_dev_get_rcu(dev); if (!in_dev) continue; addr = in_dev_select_addr(in_dev, scope); if (addr) goto out_unlock; } out_unlock: rcu_read_unlock(); return addr; } EXPORT_SYMBOL(inet_select_addr); static __be32 confirm_addr_indev(struct in_device *in_dev, __be32 dst, __be32 local, int scope) { unsigned char localnet_scope = RT_SCOPE_HOST; const struct in_ifaddr *ifa; __be32 addr = 0; int same = 0; if (unlikely(IN_DEV_ROUTE_LOCALNET(in_dev))) localnet_scope = RT_SCOPE_LINK; in_dev_for_each_ifa_rcu(ifa, in_dev) { unsigned char min_scope = min(ifa->ifa_scope, localnet_scope); if (!addr && (local == ifa->ifa_local || !local) && min_scope <= scope) { addr = ifa->ifa_local; if (same) break; } if (!same) { same = (!local || inet_ifa_match(local, ifa)) && (!dst || inet_ifa_match(dst, ifa)); if (same && addr) { if (local || !dst) break; /* Is the selected addr into dst subnet? */ if (inet_ifa_match(addr, ifa)) break; /* No, then can we use new local src? */ if (min_scope <= scope) { addr = ifa->ifa_local; break; } /* search for large dst subnet for addr */ same = 0; } } } return same ? addr : 0; } /* * Confirm that local IP address exists using wildcards: * - net: netns to check, cannot be NULL * - in_dev: only on this interface, NULL=any interface * - dst: only in the same subnet as dst, 0=any dst * - local: address, 0=autoselect the local address * - scope: maximum allowed scope value for the local address */ __be32 inet_confirm_addr(struct net *net, struct in_device *in_dev, __be32 dst, __be32 local, int scope) { __be32 addr = 0; struct net_device *dev; if (in_dev) return confirm_addr_indev(in_dev, dst, local, scope); rcu_read_lock(); for_each_netdev_rcu(net, dev) { in_dev = __in_dev_get_rcu(dev); if (in_dev) { addr = confirm_addr_indev(in_dev, dst, local, scope); if (addr) break; } } rcu_read_unlock(); return addr; } EXPORT_SYMBOL(inet_confirm_addr); /* * Device notifier */ int register_inetaddr_notifier(struct notifier_block *nb) { return blocking_notifier_chain_register(&inetaddr_chain, nb); } EXPORT_SYMBOL(register_inetaddr_notifier); int unregister_inetaddr_notifier(struct notifier_block *nb) { return blocking_notifier_chain_unregister(&inetaddr_chain, nb); } EXPORT_SYMBOL(unregister_inetaddr_notifier); int register_inetaddr_validator_notifier(struct notifier_block *nb) { return blocking_notifier_chain_register(&inetaddr_validator_chain, nb); } EXPORT_SYMBOL(register_inetaddr_validator_notifier); int unregister_inetaddr_validator_notifier(struct notifier_block *nb) { return blocking_notifier_chain_unregister(&inetaddr_validator_chain, nb); } EXPORT_SYMBOL(unregister_inetaddr_validator_notifier); /* Rename ifa_labels for a device name change. Make some effort to preserve * existing alias numbering and to create unique labels if possible. */ static void inetdev_changename(struct net_device *dev, struct in_device *in_dev) { struct in_ifaddr *ifa; int named = 0; in_dev_for_each_ifa_rtnl(ifa, in_dev) { char old[IFNAMSIZ], *dot; memcpy(old, ifa->ifa_label, IFNAMSIZ); memcpy(ifa->ifa_label, dev->name, IFNAMSIZ); if (named++ == 0) goto skip; dot = strchr(old, ':'); if (!dot) { sprintf(old, ":%d", named); dot = old; } if (strlen(dot) + strlen(dev->name) < IFNAMSIZ) strcat(ifa->ifa_label, dot); else strcpy(ifa->ifa_label + (IFNAMSIZ - strlen(dot) - 1), dot); skip: rtmsg_ifa(RTM_NEWADDR, ifa, NULL, 0); } } static void inetdev_send_gratuitous_arp(struct net_device *dev, struct in_device *in_dev) { const struct in_ifaddr *ifa; in_dev_for_each_ifa_rtnl(ifa, in_dev) { arp_send(ARPOP_REQUEST, ETH_P_ARP, ifa->ifa_local, dev, ifa->ifa_local, NULL, dev->dev_addr, NULL); } } /* Called only under RTNL semaphore */ static int inetdev_event(struct notifier_block *this, unsigned long event, void *ptr) { struct net_device *dev = netdev_notifier_info_to_dev(ptr); struct in_device *in_dev = __in_dev_get_rtnl(dev); ASSERT_RTNL(); if (!in_dev) { if (event == NETDEV_REGISTER) { in_dev = inetdev_init(dev); if (IS_ERR(in_dev)) return notifier_from_errno(PTR_ERR(in_dev)); if (dev->flags & IFF_LOOPBACK) { IN_DEV_CONF_SET(in_dev, NOXFRM, 1); IN_DEV_CONF_SET(in_dev, NOPOLICY, 1); } } else if (event == NETDEV_CHANGEMTU) { /* Re-enabling IP */ if (inetdev_valid_mtu(dev->mtu)) in_dev = inetdev_init(dev); } goto out; } switch (event) { case NETDEV_REGISTER: pr_debug("%s: bug\n", __func__); RCU_INIT_POINTER(dev->ip_ptr, NULL); break; case NETDEV_UP: if (!inetdev_valid_mtu(dev->mtu)) break; if (dev->flags & IFF_LOOPBACK) { struct in_ifaddr *ifa = inet_alloc_ifa(in_dev); if (ifa) { ifa->ifa_local = ifa->ifa_address = htonl(INADDR_LOOPBACK); ifa->ifa_prefixlen = 8; ifa->ifa_mask = inet_make_mask(8); ifa->ifa_scope = RT_SCOPE_HOST; memcpy(ifa->ifa_label, dev->name, IFNAMSIZ); set_ifa_lifetime(ifa, INFINITY_LIFE_TIME, INFINITY_LIFE_TIME); ipv4_devconf_setall(in_dev); neigh_parms_data_state_setall(in_dev->arp_parms); inet_insert_ifa(ifa); } } ip_mc_up(in_dev); fallthrough; case NETDEV_CHANGEADDR: if (!IN_DEV_ARP_NOTIFY(in_dev)) break; fallthrough; case NETDEV_NOTIFY_PEERS: /* Send gratuitous ARP to notify of link change */ inetdev_send_gratuitous_arp(dev, in_dev); break; case NETDEV_DOWN: ip_mc_down(in_dev); break; case NETDEV_PRE_TYPE_CHANGE: ip_mc_unmap(in_dev); break; case NETDEV_POST_TYPE_CHANGE: ip_mc_remap(in_dev); break; case NETDEV_CHANGEMTU: if (inetdev_valid_mtu(dev->mtu)) break; /* disable IP when MTU is not enough */ fallthrough; case NETDEV_UNREGISTER: inetdev_destroy(in_dev); break; case NETDEV_CHANGENAME: /* Do not notify about label change, this event is * not interesting to applications using netlink. */ inetdev_changename(dev, in_dev); devinet_sysctl_unregister(in_dev); devinet_sysctl_register(in_dev); break; } out: return NOTIFY_DONE; } static struct notifier_block ip_netdev_notifier = { .notifier_call = inetdev_event, }; static size_t inet_nlmsg_size(void) { return NLMSG_ALIGN(sizeof(struct ifaddrmsg)) + nla_total_size(4) /* IFA_ADDRESS */ + nla_total_size(4) /* IFA_LOCAL */ + nla_total_size(4) /* IFA_BROADCAST */ + nla_total_size(IFNAMSIZ) /* IFA_LABEL */ + nla_total_size(4) /* IFA_FLAGS */ + nla_total_size(1) /* IFA_PROTO */ + nla_total_size(4) /* IFA_RT_PRIORITY */ + nla_total_size(sizeof(struct ifa_cacheinfo)); /* IFA_CACHEINFO */ } static inline u32 cstamp_delta(unsigned long cstamp) { return (cstamp - INITIAL_JIFFIES) * 100UL / HZ; } static int put_cacheinfo(struct sk_buff *skb, unsigned long cstamp, unsigned long tstamp, u32 preferred, u32 valid) { struct ifa_cacheinfo ci; ci.cstamp = cstamp_delta(cstamp); ci.tstamp = cstamp_delta(tstamp); ci.ifa_prefered = preferred; ci.ifa_valid = valid; return nla_put(skb, IFA_CACHEINFO, sizeof(ci), &ci); } static int inet_fill_ifaddr(struct sk_buff *skb, const struct in_ifaddr *ifa, struct inet_fill_args *args) { struct ifaddrmsg *ifm; struct nlmsghdr *nlh; unsigned long tstamp; u32 preferred, valid; u32 flags; nlh = nlmsg_put(skb, args->portid, args->seq, args->event, sizeof(*ifm), args->flags); if (!nlh) return -EMSGSIZE; ifm = nlmsg_data(nlh); ifm->ifa_family = AF_INET; ifm->ifa_prefixlen = ifa->ifa_prefixlen; flags = READ_ONCE(ifa->ifa_flags); /* Warning : ifm->ifa_flags is an __u8, it holds only 8 bits. * The 32bit value is given in IFA_FLAGS attribute. */ ifm->ifa_flags = (__u8)flags; ifm->ifa_scope = ifa->ifa_scope; ifm->ifa_index = ifa->ifa_dev->dev->ifindex; if (args->netnsid >= 0 && nla_put_s32(skb, IFA_TARGET_NETNSID, args->netnsid)) goto nla_put_failure; tstamp = READ_ONCE(ifa->ifa_tstamp); if (!(flags & IFA_F_PERMANENT)) { preferred = READ_ONCE(ifa->ifa_preferred_lft); valid = READ_ONCE(ifa->ifa_valid_lft); if (preferred != INFINITY_LIFE_TIME) { long tval = (jiffies - tstamp) / HZ; if (preferred > tval) preferred -= tval; else preferred = 0; if (valid != INFINITY_LIFE_TIME) { if (valid > tval) valid -= tval; else valid = 0; } } } else { preferred = INFINITY_LIFE_TIME; valid = INFINITY_LIFE_TIME; } if ((ifa->ifa_address && nla_put_in_addr(skb, IFA_ADDRESS, ifa->ifa_address)) || (ifa->ifa_local && nla_put_in_addr(skb, IFA_LOCAL, ifa->ifa_local)) || (ifa->ifa_broadcast && nla_put_in_addr(skb, IFA_BROADCAST, ifa->ifa_broadcast)) || (ifa->ifa_label[0] && nla_put_string(skb, IFA_LABEL, ifa->ifa_label)) || (ifa->ifa_proto && nla_put_u8(skb, IFA_PROTO, ifa->ifa_proto)) || nla_put_u32(skb, IFA_FLAGS, flags) || (ifa->ifa_rt_priority && nla_put_u32(skb, IFA_RT_PRIORITY, ifa->ifa_rt_priority)) || put_cacheinfo(skb, READ_ONCE(ifa->ifa_cstamp), tstamp, preferred, valid)) goto nla_put_failure; nlmsg_end(skb, nlh); return 0; nla_put_failure: nlmsg_cancel(skb, nlh); return -EMSGSIZE; } static int inet_valid_dump_ifaddr_req(const struct nlmsghdr *nlh, struct inet_fill_args *fillargs, struct net **tgt_net, struct sock *sk, struct netlink_callback *cb) { struct netlink_ext_ack *extack = cb->extack; struct nlattr *tb[IFA_MAX+1]; struct ifaddrmsg *ifm; int err, i; ifm = nlmsg_payload(nlh, sizeof(*ifm)); if (!ifm) { NL_SET_ERR_MSG(extack, "ipv4: Invalid header for address dump request"); return -EINVAL; } if (ifm->ifa_prefixlen || ifm->ifa_flags || ifm->ifa_scope) { NL_SET_ERR_MSG(extack, "ipv4: Invalid values in header for address dump request"); return -EINVAL; } fillargs->ifindex = ifm->ifa_index; if (fillargs->ifindex) { cb->answer_flags |= NLM_F_DUMP_FILTERED; fillargs->flags |= NLM_F_DUMP_FILTERED; } err = nlmsg_parse_deprecated_strict(nlh, sizeof(*ifm), tb, IFA_MAX, ifa_ipv4_policy, extack); if (err < 0) return err; for (i = 0; i <= IFA_MAX; ++i) { if (!tb[i]) continue; if (i == IFA_TARGET_NETNSID) { struct net *net; fillargs->netnsid = nla_get_s32(tb[i]); net = rtnl_get_net_ns_capable(sk, fillargs->netnsid); if (IS_ERR(net)) { fillargs->netnsid = -1; NL_SET_ERR_MSG(extack, "ipv4: Invalid target network namespace id"); return PTR_ERR(net); } *tgt_net = net; } else { NL_SET_ERR_MSG(extack, "ipv4: Unsupported attribute in dump request"); return -EINVAL; } } return 0; } static int in_dev_dump_ifmcaddr(struct in_device *in_dev, struct sk_buff *skb, struct netlink_callback *cb, int *s_ip_idx, struct inet_fill_args *fillargs) { struct ip_mc_list *im; int ip_idx = 0; int err; for (im = rcu_dereference(in_dev->mc_list); im; im = rcu_dereference(im->next_rcu)) { if (ip_idx < *s_ip_idx) { ip_idx++; continue; } err = inet_fill_ifmcaddr(skb, in_dev->dev, im, fillargs); if (err < 0) goto done; nl_dump_check_consistent(cb, nlmsg_hdr(skb)); ip_idx++; } err = 0; ip_idx = 0; done: *s_ip_idx = ip_idx; return err; } static int in_dev_dump_ifaddr(struct in_device *in_dev, struct sk_buff *skb, struct netlink_callback *cb, int *s_ip_idx, struct inet_fill_args *fillargs) { struct in_ifaddr *ifa; int ip_idx = 0; int err; in_dev_for_each_ifa_rcu(ifa, in_dev) { if (ip_idx < *s_ip_idx) { ip_idx++; continue; } err = inet_fill_ifaddr(skb, ifa, fillargs); if (err < 0) goto done; nl_dump_check_consistent(cb, nlmsg_hdr(skb)); ip_idx++; } err = 0; ip_idx = 0; done: *s_ip_idx = ip_idx; return err; } static int in_dev_dump_addr(struct in_device *in_dev, struct sk_buff *skb, struct netlink_callback *cb, int *s_ip_idx, struct inet_fill_args *fillargs) { switch (fillargs->event) { case RTM_NEWADDR: return in_dev_dump_ifaddr(in_dev, skb, cb, s_ip_idx, fillargs); case RTM_GETMULTICAST: return in_dev_dump_ifmcaddr(in_dev, skb, cb, s_ip_idx, fillargs); default: return -EINVAL; } } /* Combine dev_addr_genid and dev_base_seq to detect changes. */ static u32 inet_base_seq(const struct net *net) { u32 res = atomic_read(&net->ipv4.dev_addr_genid) + READ_ONCE(net->dev_base_seq); /* Must not return 0 (see nl_dump_check_consistent()). * Chose a value far away from 0. */ if (!res) res = 0x80000000; return res; } static int inet_dump_addr(struct sk_buff *skb, struct netlink_callback *cb, int event) { const struct nlmsghdr *nlh = cb->nlh; struct inet_fill_args fillargs = { .portid = NETLINK_CB(cb->skb).portid, .seq = nlh->nlmsg_seq, .event = event, .flags = NLM_F_MULTI, .netnsid = -1, }; struct net *net = sock_net(skb->sk); struct net *tgt_net = net; struct { unsigned long ifindex; int ip_idx; } *ctx = (void *)cb->ctx; struct in_device *in_dev; struct net_device *dev; int err = 0; rcu_read_lock(); if (cb->strict_check) { err = inet_valid_dump_ifaddr_req(nlh, &fillargs, &tgt_net, skb->sk, cb); if (err < 0) goto done; if (fillargs.ifindex) { dev = dev_get_by_index_rcu(tgt_net, fillargs.ifindex); if (!dev) { err = -ENODEV; goto done; } in_dev = __in_dev_get_rcu(dev); if (!in_dev) goto done; err = in_dev_dump_addr(in_dev, skb, cb, &ctx->ip_idx, &fillargs); goto done; } } cb->seq = inet_base_seq(tgt_net); for_each_netdev_dump(tgt_net, dev, ctx->ifindex) { in_dev = __in_dev_get_rcu(dev); if (!in_dev) continue; err = in_dev_dump_addr(in_dev, skb, cb, &ctx->ip_idx, &fillargs); if (err < 0) goto done; } done: if (fillargs.netnsid >= 0) put_net(tgt_net); rcu_read_unlock(); return err; } static int inet_dump_ifaddr(struct sk_buff *skb, struct netlink_callback *cb) { return inet_dump_addr(skb, cb, RTM_NEWADDR); } static int inet_dump_ifmcaddr(struct sk_buff *skb, struct netlink_callback *cb) { return inet_dump_addr(skb, cb, RTM_GETMULTICAST); } static void rtmsg_ifa(int event, struct in_ifaddr *ifa, struct nlmsghdr *nlh, u32 portid) { struct inet_fill_args fillargs = { .portid = portid, .seq = nlh ? nlh->nlmsg_seq : 0, .event = event, .flags = 0, .netnsid = -1, }; struct sk_buff *skb; int err = -ENOBUFS; struct net *net; net = dev_net(ifa->ifa_dev->dev); skb = nlmsg_new(inet_nlmsg_size(), GFP_KERNEL); if (!skb) goto errout; err = inet_fill_ifaddr(skb, ifa, &fillargs); if (err < 0) { /* -EMSGSIZE implies BUG in inet_nlmsg_size() */ WARN_ON(err == -EMSGSIZE); kfree_skb(skb); goto errout; } rtnl_notify(skb, net, portid, RTNLGRP_IPV4_IFADDR, nlh, GFP_KERNEL); return; errout: rtnl_set_sk_err(net, RTNLGRP_IPV4_IFADDR, err); } static size_t inet_get_link_af_size(const struct net_device *dev, u32 ext_filter_mask) { struct in_device *in_dev = rcu_dereference_rtnl(dev->ip_ptr); if (!in_dev) return 0; return nla_total_size(IPV4_DEVCONF_MAX * 4); /* IFLA_INET_CONF */ } static int inet_fill_link_af(struct sk_buff *skb, const struct net_device *dev, u32 ext_filter_mask) { struct in_device *in_dev = rcu_dereference_rtnl(dev->ip_ptr); struct nlattr *nla; int i; if (!in_dev) return -ENODATA; nla = nla_reserve(skb, IFLA_INET_CONF, IPV4_DEVCONF_MAX * 4); if (!nla) return -EMSGSIZE; for (i = 0; i < IPV4_DEVCONF_MAX; i++) ((u32 *) nla_data(nla))[i] = READ_ONCE(in_dev->cnf.data[i]); return 0; } static const struct nla_policy inet_af_policy[IFLA_INET_MAX+1] = { [IFLA_INET_CONF] = { .type = NLA_NESTED }, }; static int inet_validate_link_af(const struct net_device *dev, const struct nlattr *nla, struct netlink_ext_ack *extack) { struct nlattr *a, *tb[IFLA_INET_MAX+1]; int err, rem; if (dev && !__in_dev_get_rtnl(dev)) return -EAFNOSUPPORT; err = nla_parse_nested_deprecated(tb, IFLA_INET_MAX, nla, inet_af_policy, extack); if (err < 0) return err; if (tb[IFLA_INET_CONF]) { nla_for_each_nested(a, tb[IFLA_INET_CONF], rem) { int cfgid = nla_type(a); if (nla_len(a) < 4) return -EINVAL; if (cfgid <= 0 || cfgid > IPV4_DEVCONF_MAX) return -EINVAL; } } return 0; } static int inet_set_link_af(struct net_device *dev, const struct nlattr *nla, struct netlink_ext_ack *extack) { struct in_device *in_dev = __in_dev_get_rtnl(dev); struct nlattr *a, *tb[IFLA_INET_MAX+1]; int rem; if (!in_dev) return -EAFNOSUPPORT; if (nla_parse_nested_deprecated(tb, IFLA_INET_MAX, nla, NULL, NULL) < 0) return -EINVAL; if (tb[IFLA_INET_CONF]) { nla_for_each_nested(a, tb[IFLA_INET_CONF], rem) ipv4_devconf_set(in_dev, nla_type(a), nla_get_u32(a)); } return 0; } static int inet_netconf_msgsize_devconf(int type) { int size = NLMSG_ALIGN(sizeof(struct netconfmsg)) + nla_total_size(4); /* NETCONFA_IFINDEX */ bool all = false; if (type == NETCONFA_ALL) all = true; if (all || type == NETCONFA_FORWARDING) size += nla_total_size(4); if (all || type == NETCONFA_RP_FILTER) size += nla_total_size(4); if (all || type == NETCONFA_MC_FORWARDING) size += nla_total_size(4); if (all || type == NETCONFA_BC_FORWARDING) size += nla_total_size(4); if (all || type == NETCONFA_PROXY_NEIGH) size += nla_total_size(4); if (all || type == NETCONFA_IGNORE_ROUTES_WITH_LINKDOWN) size += nla_total_size(4); return size; } static int inet_netconf_fill_devconf(struct sk_buff *skb, int ifindex, const struct ipv4_devconf *devconf, u32 portid, u32 seq, int event, unsigned int flags, int type) { struct nlmsghdr *nlh; struct netconfmsg *ncm; bool all = false; nlh = nlmsg_put(skb, portid, seq, event, sizeof(struct netconfmsg), flags); if (!nlh) return -EMSGSIZE; if (type == NETCONFA_ALL) all = true; ncm = nlmsg_data(nlh); ncm->ncm_family = AF_INET; if (nla_put_s32(skb, NETCONFA_IFINDEX, ifindex) < 0) goto nla_put_failure; if (!devconf) goto out; if ((all || type == NETCONFA_FORWARDING) && nla_put_s32(skb, NETCONFA_FORWARDING, IPV4_DEVCONF_RO(*devconf, FORWARDING)) < 0) goto nla_put_failure; if ((all || type == NETCONFA_RP_FILTER) && nla_put_s32(skb, NETCONFA_RP_FILTER, IPV4_DEVCONF_RO(*devconf, RP_FILTER)) < 0) goto nla_put_failure; if ((all || type == NETCONFA_MC_FORWARDING) && nla_put_s32(skb, NETCONFA_MC_FORWARDING, IPV4_DEVCONF_RO(*devconf, MC_FORWARDING)) < 0) goto nla_put_failure; if ((all || type == NETCONFA_BC_FORWARDING) && nla_put_s32(skb, NETCONFA_BC_FORWARDING, IPV4_DEVCONF_RO(*devconf, BC_FORWARDING)) < 0) goto nla_put_failure; if ((all || type == NETCONFA_PROXY_NEIGH) && nla_put_s32(skb, NETCONFA_PROXY_NEIGH, IPV4_DEVCONF_RO(*devconf, PROXY_ARP)) < 0) goto nla_put_failure; if ((all || type == NETCONFA_IGNORE_ROUTES_WITH_LINKDOWN) && nla_put_s32(skb, NETCONFA_IGNORE_ROUTES_WITH_LINKDOWN, IPV4_DEVCONF_RO(*devconf, IGNORE_ROUTES_WITH_LINKDOWN)) < 0) goto nla_put_failure; out: nlmsg_end(skb, nlh); return 0; nla_put_failure: nlmsg_cancel(skb, nlh); return -EMSGSIZE; } void inet_netconf_notify_devconf(struct net *net, int event, int type, int ifindex, struct ipv4_devconf *devconf) { struct sk_buff *skb; int err = -ENOBUFS; skb = nlmsg_new(inet_netconf_msgsize_devconf(type), GFP_KERNEL); if (!skb) goto errout; err = inet_netconf_fill_devconf(skb, ifindex, devconf, 0, 0, event, 0, type); if (err < 0) { /* -EMSGSIZE implies BUG in inet_netconf_msgsize_devconf() */ WARN_ON(err == -EMSGSIZE); kfree_skb(skb); goto errout; } rtnl_notify(skb, net, 0, RTNLGRP_IPV4_NETCONF, NULL, GFP_KERNEL); return; errout: rtnl_set_sk_err(net, RTNLGRP_IPV4_NETCONF, err); } static const struct nla_policy devconf_ipv4_policy[NETCONFA_MAX+1] = { [NETCONFA_IFINDEX] = { .len = sizeof(int) }, [NETCONFA_FORWARDING] = { .len = sizeof(int) }, [NETCONFA_RP_FILTER] = { .len = sizeof(int) }, [NETCONFA_PROXY_NEIGH] = { .len = sizeof(int) }, [NETCONFA_IGNORE_ROUTES_WITH_LINKDOWN] = { .len = sizeof(int) }, }; static int inet_netconf_valid_get_req(struct sk_buff *skb, const struct nlmsghdr *nlh, struct nlattr **tb, struct netlink_ext_ack *extack) { int i, err; if (nlh->nlmsg_len < nlmsg_msg_size(sizeof(struct netconfmsg))) { NL_SET_ERR_MSG(extack, "ipv4: Invalid header for netconf get request"); return -EINVAL; } if (!netlink_strict_get_check(skb)) return nlmsg_parse_deprecated(nlh, sizeof(struct netconfmsg), tb, NETCONFA_MAX, devconf_ipv4_policy, extack); err = nlmsg_parse_deprecated_strict(nlh, sizeof(struct netconfmsg), tb, NETCONFA_MAX, devconf_ipv4_policy, extack); if (err) return err; for (i = 0; i <= NETCONFA_MAX; i++) { if (!tb[i]) continue; switch (i) { case NETCONFA_IFINDEX: break; default: NL_SET_ERR_MSG(extack, "ipv4: Unsupported attribute in netconf get request"); return -EINVAL; } } return 0; } static int inet_netconf_get_devconf(struct sk_buff *in_skb, struct nlmsghdr *nlh, struct netlink_ext_ack *extack) { struct net *net = sock_net(in_skb->sk); struct nlattr *tb[NETCONFA_MAX + 1]; const struct ipv4_devconf *devconf; struct in_device *in_dev = NULL; struct net_device *dev = NULL; struct sk_buff *skb; int ifindex; int err; err = inet_netconf_valid_get_req(in_skb, nlh, tb, extack); if (err) return err; if (!tb[NETCONFA_IFINDEX]) return -EINVAL; ifindex = nla_get_s32(tb[NETCONFA_IFINDEX]); switch (ifindex) { case NETCONFA_IFINDEX_ALL: devconf = net->ipv4.devconf_all; break; case NETCONFA_IFINDEX_DEFAULT: devconf = net->ipv4.devconf_dflt; break; default: err = -ENODEV; dev = dev_get_by_index(net, ifindex); if (dev) in_dev = in_dev_get(dev); if (!in_dev) goto errout; devconf = &in_dev->cnf; break; } err = -ENOBUFS; skb = nlmsg_new(inet_netconf_msgsize_devconf(NETCONFA_ALL), GFP_KERNEL); if (!skb) goto errout; err = inet_netconf_fill_devconf(skb, ifindex, devconf, NETLINK_CB(in_skb).portid, nlh->nlmsg_seq, RTM_NEWNETCONF, 0, NETCONFA_ALL); if (err < 0) { /* -EMSGSIZE implies BUG in inet_netconf_msgsize_devconf() */ WARN_ON(err == -EMSGSIZE); kfree_skb(skb); goto errout; } err = rtnl_unicast(skb, net, NETLINK_CB(in_skb).portid); errout: if (in_dev) in_dev_put(in_dev); dev_put(dev); return err; } static int inet_netconf_dump_devconf(struct sk_buff *skb, struct netlink_callback *cb) { const struct nlmsghdr *nlh = cb->nlh; struct net *net = sock_net(skb->sk); struct { unsigned long ifindex; unsigned int all_default; } *ctx = (void *)cb->ctx; const struct in_device *in_dev; struct net_device *dev; int err = 0; if (cb->strict_check) { struct netlink_ext_ack *extack = cb->extack; struct netconfmsg *ncm; if (nlh->nlmsg_len < nlmsg_msg_size(sizeof(*ncm))) { NL_SET_ERR_MSG(extack, "ipv4: Invalid header for netconf dump request"); return -EINVAL; } if (nlmsg_attrlen(nlh, sizeof(*ncm))) { NL_SET_ERR_MSG(extack, "ipv4: Invalid data after header in netconf dump request"); return -EINVAL; } } rcu_read_lock(); for_each_netdev_dump(net, dev, ctx->ifindex) { in_dev = __in_dev_get_rcu(dev); if (!in_dev) continue; err = inet_netconf_fill_devconf(skb, dev->ifindex, &in_dev->cnf, NETLINK_CB(cb->skb).portid, nlh->nlmsg_seq, RTM_NEWNETCONF, NLM_F_MULTI, NETCONFA_ALL); if (err < 0) goto done; } if (ctx->all_default == 0) { err = inet_netconf_fill_devconf(skb, NETCONFA_IFINDEX_ALL, net->ipv4.devconf_all, NETLINK_CB(cb->skb).portid, nlh->nlmsg_seq, RTM_NEWNETCONF, NLM_F_MULTI, NETCONFA_ALL); if (err < 0) goto done; ctx->all_default++; } if (ctx->all_default == 1) { err = inet_netconf_fill_devconf(skb, NETCONFA_IFINDEX_DEFAULT, net->ipv4.devconf_dflt, NETLINK_CB(cb->skb).portid, nlh->nlmsg_seq, RTM_NEWNETCONF, NLM_F_MULTI, NETCONFA_ALL); if (err < 0) goto done; ctx->all_default++; } done: rcu_read_unlock(); return err; } #ifdef CONFIG_SYSCTL static void devinet_copy_dflt_conf(struct net *net, int i) { struct net_device *dev; rcu_read_lock(); for_each_netdev_rcu(net, dev) { struct in_device *in_dev; in_dev = __in_dev_get_rcu(dev); if (in_dev && !test_bit(i, in_dev->cnf.state)) in_dev->cnf.data[i] = net->ipv4.devconf_dflt->data[i]; } rcu_read_unlock(); } /* called with RTNL locked */ static void inet_forward_change(struct net *net) { struct net_device *dev; int on = IPV4_DEVCONF_ALL(net, FORWARDING); IPV4_DEVCONF_ALL(net, ACCEPT_REDIRECTS) = !on; IPV4_DEVCONF_DFLT(net, FORWARDING) = on; inet_netconf_notify_devconf(net, RTM_NEWNETCONF, NETCONFA_FORWARDING, NETCONFA_IFINDEX_ALL, net->ipv4.devconf_all); inet_netconf_notify_devconf(net, RTM_NEWNETCONF, NETCONFA_FORWARDING, NETCONFA_IFINDEX_DEFAULT, net->ipv4.devconf_dflt); for_each_netdev(net, dev) { struct in_device *in_dev; if (on) dev_disable_lro(dev); in_dev = __in_dev_get_rtnl_net(dev); if (in_dev) { IN_DEV_CONF_SET(in_dev, FORWARDING, on); inet_netconf_notify_devconf(net, RTM_NEWNETCONF, NETCONFA_FORWARDING, dev->ifindex, &in_dev->cnf); } } } static int devinet_conf_ifindex(struct net *net, struct ipv4_devconf *cnf) { if (cnf == net->ipv4.devconf_dflt) return NETCONFA_IFINDEX_DEFAULT; else if (cnf == net->ipv4.devconf_all) return NETCONFA_IFINDEX_ALL; else { struct in_device *idev = container_of(cnf, struct in_device, cnf); return idev->dev->ifindex; } } static int devinet_conf_proc(const struct ctl_table *ctl, int write, void *buffer, size_t *lenp, loff_t *ppos) { int old_value = *(int *)ctl->data; int ret = proc_dointvec(ctl, write, buffer, lenp, ppos); int new_value = *(int *)ctl->data; if (write) { struct ipv4_devconf *cnf = ctl->extra1; struct net *net = ctl->extra2; int i = (int *)ctl->data - cnf->data; int ifindex; set_bit(i, cnf->state); if (cnf == net->ipv4.devconf_dflt) devinet_copy_dflt_conf(net, i); if (i == IPV4_DEVCONF_ACCEPT_LOCAL - 1 || i == IPV4_DEVCONF_ROUTE_LOCALNET - 1) if ((new_value == 0) && (old_value != 0)) rt_cache_flush(net); if (i == IPV4_DEVCONF_BC_FORWARDING - 1 && new_value != old_value) rt_cache_flush(net); if (i == IPV4_DEVCONF_RP_FILTER - 1 && new_value != old_value) { ifindex = devinet_conf_ifindex(net, cnf); inet_netconf_notify_devconf(net, RTM_NEWNETCONF, NETCONFA_RP_FILTER, ifindex, cnf); } if (i == IPV4_DEVCONF_PROXY_ARP - 1 && new_value != old_value) { ifindex = devinet_conf_ifindex(net, cnf); inet_netconf_notify_devconf(net, RTM_NEWNETCONF, NETCONFA_PROXY_NEIGH, ifindex, cnf); } if (i == IPV4_DEVCONF_IGNORE_ROUTES_WITH_LINKDOWN - 1 && new_value != old_value) { ifindex = devinet_conf_ifindex(net, cnf); inet_netconf_notify_devconf(net, RTM_NEWNETCONF, NETCONFA_IGNORE_ROUTES_WITH_LINKDOWN, ifindex, cnf); } } return ret; } static int devinet_sysctl_forward(const struct ctl_table *ctl, int write, void *buffer, size_t *lenp, loff_t *ppos) { int *valp = ctl->data; int val = *valp; loff_t pos = *ppos; struct net *net = ctl->extra2; int ret; if (write && !ns_capable(net->user_ns, CAP_NET_ADMIN)) return -EPERM; ret = proc_dointvec(ctl, write, buffer, lenp, ppos); if (write && *valp != val) { if (valp != &IPV4_DEVCONF_DFLT(net, FORWARDING)) { if (!rtnl_net_trylock(net)) { /* Restore the original values before restarting */ *valp = val; *ppos = pos; return restart_syscall(); } if (valp == &IPV4_DEVCONF_ALL(net, FORWARDING)) { inet_forward_change(net); } else { struct ipv4_devconf *cnf = ctl->extra1; struct in_device *idev = container_of(cnf, struct in_device, cnf); if (*valp) dev_disable_lro(idev->dev); inet_netconf_notify_devconf(net, RTM_NEWNETCONF, NETCONFA_FORWARDING, idev->dev->ifindex, cnf); } rtnl_net_unlock(net); rt_cache_flush(net); } else inet_netconf_notify_devconf(net, RTM_NEWNETCONF, NETCONFA_FORWARDING, NETCONFA_IFINDEX_DEFAULT, net->ipv4.devconf_dflt); } return ret; } static int ipv4_doint_and_flush(const struct ctl_table *ctl, int write, void *buffer, size_t *lenp, loff_t *ppos) { int *valp = ctl->data; int val = *valp; int ret = proc_dointvec(ctl, write, buffer, lenp, ppos); struct net *net = ctl->extra2; if (write && *valp != val) rt_cache_flush(net); return ret; } #define DEVINET_SYSCTL_ENTRY(attr, name, mval, proc) \ { \ .procname = name, \ .data = ipv4_devconf.data + \ IPV4_DEVCONF_ ## attr - 1, \ .maxlen = sizeof(int), \ .mode = mval, \ .proc_handler = proc, \ .extra1 = &ipv4_devconf, \ } #define DEVINET_SYSCTL_RW_ENTRY(attr, name) \ DEVINET_SYSCTL_ENTRY(attr, name, 0644, devinet_conf_proc) #define DEVINET_SYSCTL_RO_ENTRY(attr, name) \ DEVINET_SYSCTL_ENTRY(attr, name, 0444, devinet_conf_proc) #define DEVINET_SYSCTL_COMPLEX_ENTRY(attr, name, proc) \ DEVINET_SYSCTL_ENTRY(attr, name, 0644, proc) #define DEVINET_SYSCTL_FLUSHING_ENTRY(attr, name) \ DEVINET_SYSCTL_COMPLEX_ENTRY(attr, name, ipv4_doint_and_flush) static struct devinet_sysctl_table { struct ctl_table_header *sysctl_header; struct ctl_table devinet_vars[IPV4_DEVCONF_MAX]; } devinet_sysctl = { .devinet_vars = { DEVINET_SYSCTL_COMPLEX_ENTRY(FORWARDING, "forwarding", devinet_sysctl_forward), DEVINET_SYSCTL_RO_ENTRY(MC_FORWARDING, "mc_forwarding"), DEVINET_SYSCTL_RW_ENTRY(BC_FORWARDING, "bc_forwarding"), DEVINET_SYSCTL_RW_ENTRY(ACCEPT_REDIRECTS, "accept_redirects"), DEVINET_SYSCTL_RW_ENTRY(SECURE_REDIRECTS, "secure_redirects"), DEVINET_SYSCTL_RW_ENTRY(SHARED_MEDIA, "shared_media"), DEVINET_SYSCTL_RW_ENTRY(RP_FILTER, "rp_filter"), DEVINET_SYSCTL_RW_ENTRY(SEND_REDIRECTS, "send_redirects"), DEVINET_SYSCTL_RW_ENTRY(ACCEPT_SOURCE_ROUTE, "accept_source_route"), DEVINET_SYSCTL_RW_ENTRY(ACCEPT_LOCAL, "accept_local"), DEVINET_SYSCTL_RW_ENTRY(SRC_VMARK, "src_valid_mark"), DEVINET_SYSCTL_RW_ENTRY(PROXY_ARP, "proxy_arp"), DEVINET_SYSCTL_RW_ENTRY(MEDIUM_ID, "medium_id"), DEVINET_SYSCTL_RW_ENTRY(BOOTP_RELAY, "bootp_relay"), DEVINET_SYSCTL_RW_ENTRY(LOG_MARTIANS, "log_martians"), DEVINET_SYSCTL_RW_ENTRY(TAG, "tag"), DEVINET_SYSCTL_RW_ENTRY(ARPFILTER, "arp_filter"), DEVINET_SYSCTL_RW_ENTRY(ARP_ANNOUNCE, "arp_announce"), DEVINET_SYSCTL_RW_ENTRY(ARP_IGNORE, "arp_ignore"), DEVINET_SYSCTL_RW_ENTRY(ARP_ACCEPT, "arp_accept"), DEVINET_SYSCTL_RW_ENTRY(ARP_NOTIFY, "arp_notify"), DEVINET_SYSCTL_RW_ENTRY(ARP_EVICT_NOCARRIER, "arp_evict_nocarrier"), DEVINET_SYSCTL_RW_ENTRY(PROXY_ARP_PVLAN, "proxy_arp_pvlan"), DEVINET_SYSCTL_RW_ENTRY(FORCE_IGMP_VERSION, "force_igmp_version"), DEVINET_SYSCTL_RW_ENTRY(IGMPV2_UNSOLICITED_REPORT_INTERVAL, "igmpv2_unsolicited_report_interval"), DEVINET_SYSCTL_RW_ENTRY(IGMPV3_UNSOLICITED_REPORT_INTERVAL, "igmpv3_unsolicited_report_interval"), DEVINET_SYSCTL_RW_ENTRY(IGNORE_ROUTES_WITH_LINKDOWN, "ignore_routes_with_linkdown"), DEVINET_SYSCTL_RW_ENTRY(DROP_GRATUITOUS_ARP, "drop_gratuitous_arp"), DEVINET_SYSCTL_FLUSHING_ENTRY(NOXFRM, "disable_xfrm"), DEVINET_SYSCTL_FLUSHING_ENTRY(NOPOLICY, "disable_policy"), DEVINET_SYSCTL_FLUSHING_ENTRY(PROMOTE_SECONDARIES, "promote_secondaries"), DEVINET_SYSCTL_FLUSHING_ENTRY(ROUTE_LOCALNET, "route_localnet"), DEVINET_SYSCTL_FLUSHING_ENTRY(DROP_UNICAST_IN_L2_MULTICAST, "drop_unicast_in_l2_multicast"), }, }; static int __devinet_sysctl_register(struct net *net, char *dev_name, int ifindex, struct ipv4_devconf *p) { int i; struct devinet_sysctl_table *t; char path[sizeof("net/ipv4/conf/") + IFNAMSIZ]; t = kmemdup(&devinet_sysctl, sizeof(*t), GFP_KERNEL_ACCOUNT); if (!t) goto out; for (i = 0; i < ARRAY_SIZE(t->devinet_vars); i++) { t->devinet_vars[i].data += (char *)p - (char *)&ipv4_devconf; t->devinet_vars[i].extra1 = p; t->devinet_vars[i].extra2 = net; } snprintf(path, sizeof(path), "net/ipv4/conf/%s", dev_name); t->sysctl_header = register_net_sysctl(net, path, t->devinet_vars); if (!t->sysctl_header) goto free; p->sysctl = t; inet_netconf_notify_devconf(net, RTM_NEWNETCONF, NETCONFA_ALL, ifindex, p); return 0; free: kfree(t); out: return -ENOMEM; } static void __devinet_sysctl_unregister(struct net *net, struct ipv4_devconf *cnf, int ifindex) { struct devinet_sysctl_table *t = cnf->sysctl; if (t) { cnf->sysctl = NULL; unregister_net_sysctl_table(t->sysctl_header); kfree(t); } inet_netconf_notify_devconf(net, RTM_DELNETCONF, 0, ifindex, NULL); } static int devinet_sysctl_register(struct in_device *idev) { int err; if (!sysctl_dev_name_is_allowed(idev->dev->name)) return -EINVAL; err = neigh_sysctl_register(idev->dev, idev->arp_parms, NULL); if (err) return err; err = __devinet_sysctl_register(dev_net(idev->dev), idev->dev->name, idev->dev->ifindex, &idev->cnf); if (err) neigh_sysctl_unregister(idev->arp_parms); return err; } static void devinet_sysctl_unregister(struct in_device *idev) { struct net *net = dev_net(idev->dev); __devinet_sysctl_unregister(net, &idev->cnf, idev->dev->ifindex); neigh_sysctl_unregister(idev->arp_parms); } static struct ctl_table ctl_forward_entry[] = { { .procname = "ip_forward", .data = &ipv4_devconf.data[ IPV4_DEVCONF_FORWARDING - 1], .maxlen = sizeof(int), .mode = 0644, .proc_handler = devinet_sysctl_forward, .extra1 = &ipv4_devconf, .extra2 = &init_net, }, }; #endif static __net_init int devinet_init_net(struct net *net) { #ifdef CONFIG_SYSCTL struct ctl_table_header *forw_hdr; struct ctl_table *tbl; #endif struct ipv4_devconf *all, *dflt; int err; int i; err = -ENOMEM; net->ipv4.inet_addr_lst = kmalloc_array(IN4_ADDR_HSIZE, sizeof(struct hlist_head), GFP_KERNEL); if (!net->ipv4.inet_addr_lst) goto err_alloc_hash; all = kmemdup(&ipv4_devconf, sizeof(ipv4_devconf), GFP_KERNEL); if (!all) goto err_alloc_all; dflt = kmemdup(&ipv4_devconf_dflt, sizeof(ipv4_devconf_dflt), GFP_KERNEL); if (!dflt) goto err_alloc_dflt; #ifdef CONFIG_SYSCTL tbl = kmemdup(ctl_forward_entry, sizeof(ctl_forward_entry), GFP_KERNEL); if (!tbl) goto err_alloc_ctl; tbl[0].data = &all->data[IPV4_DEVCONF_FORWARDING - 1]; tbl[0].extra1 = all; tbl[0].extra2 = net; #endif if (!net_eq(net, &init_net)) { switch (net_inherit_devconf()) { case 3: /* copy from the current netns */ memcpy(all, current->nsproxy->net_ns->ipv4.devconf_all, sizeof(ipv4_devconf)); memcpy(dflt, current->nsproxy->net_ns->ipv4.devconf_dflt, sizeof(ipv4_devconf_dflt)); break; case 0: case 1: /* copy from init_net */ memcpy(all, init_net.ipv4.devconf_all, sizeof(ipv4_devconf)); memcpy(dflt, init_net.ipv4.devconf_dflt, sizeof(ipv4_devconf_dflt)); break; case 2: /* use compiled values */ break; } } #ifdef CONFIG_SYSCTL err = __devinet_sysctl_register(net, "all", NETCONFA_IFINDEX_ALL, all); if (err < 0) goto err_reg_all; err = __devinet_sysctl_register(net, "default", NETCONFA_IFINDEX_DEFAULT, dflt); if (err < 0) goto err_reg_dflt; err = -ENOMEM; forw_hdr = register_net_sysctl_sz(net, "net/ipv4", tbl, ARRAY_SIZE(ctl_forward_entry)); if (!forw_hdr) goto err_reg_ctl; net->ipv4.forw_hdr = forw_hdr; #endif for (i = 0; i < IN4_ADDR_HSIZE; i++) INIT_HLIST_HEAD(&net->ipv4.inet_addr_lst[i]); INIT_DEFERRABLE_WORK(&net->ipv4.addr_chk_work, check_lifetime); net->ipv4.devconf_all = all; net->ipv4.devconf_dflt = dflt; return 0; #ifdef CONFIG_SYSCTL err_reg_ctl: __devinet_sysctl_unregister(net, dflt, NETCONFA_IFINDEX_DEFAULT); err_reg_dflt: __devinet_sysctl_unregister(net, all, NETCONFA_IFINDEX_ALL); err_reg_all: kfree(tbl); err_alloc_ctl: #endif kfree(dflt); err_alloc_dflt: kfree(all); err_alloc_all: kfree(net->ipv4.inet_addr_lst); err_alloc_hash: return err; } static __net_exit void devinet_exit_net(struct net *net) { #ifdef CONFIG_SYSCTL const struct ctl_table *tbl; #endif cancel_delayed_work_sync(&net->ipv4.addr_chk_work); #ifdef CONFIG_SYSCTL tbl = net->ipv4.forw_hdr->ctl_table_arg; unregister_net_sysctl_table(net->ipv4.forw_hdr); __devinet_sysctl_unregister(net, net->ipv4.devconf_dflt, NETCONFA_IFINDEX_DEFAULT); __devinet_sysctl_unregister(net, net->ipv4.devconf_all, NETCONFA_IFINDEX_ALL); kfree(tbl); #endif kfree(net->ipv4.devconf_dflt); kfree(net->ipv4.devconf_all); kfree(net->ipv4.inet_addr_lst); } static __net_initdata struct pernet_operations devinet_ops = { .init = devinet_init_net, .exit = devinet_exit_net, }; static struct rtnl_af_ops inet_af_ops __read_mostly = { .family = AF_INET, .fill_link_af = inet_fill_link_af, .get_link_af_size = inet_get_link_af_size, .validate_link_af = inet_validate_link_af, .set_link_af = inet_set_link_af, }; static const struct rtnl_msg_handler devinet_rtnl_msg_handlers[] __initconst = { {.protocol = PF_INET, .msgtype = RTM_NEWADDR, .doit = inet_rtm_newaddr, .flags = RTNL_FLAG_DOIT_PERNET}, {.protocol = PF_INET, .msgtype = RTM_DELADDR, .doit = inet_rtm_deladdr, .flags = RTNL_FLAG_DOIT_PERNET}, {.protocol = PF_INET, .msgtype = RTM_GETADDR, .dumpit = inet_dump_ifaddr, .flags = RTNL_FLAG_DUMP_UNLOCKED | RTNL_FLAG_DUMP_SPLIT_NLM_DONE}, {.protocol = PF_INET, .msgtype = RTM_GETNETCONF, .doit = inet_netconf_get_devconf, .dumpit = inet_netconf_dump_devconf, .flags = RTNL_FLAG_DOIT_UNLOCKED | RTNL_FLAG_DUMP_UNLOCKED}, {.owner = THIS_MODULE, .protocol = PF_INET, .msgtype = RTM_GETMULTICAST, .dumpit = inet_dump_ifmcaddr, .flags = RTNL_FLAG_DUMP_UNLOCKED}, }; void __init devinet_init(void) { register_pernet_subsys(&devinet_ops); register_netdevice_notifier(&ip_netdev_notifier); if (rtnl_af_register(&inet_af_ops)) panic("Unable to register inet_af_ops\n"); rtnl_register_many(devinet_rtnl_msg_handlers); }
3 3 3 3 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 // SPDX-License-Identifier: GPL-2.0 /* * Workingset detection * * Copyright (C) 2013 Red Hat, Inc., Johannes Weiner */ #include <linux/memcontrol.h> #include <linux/mm_inline.h> #include <linux/writeback.h> #include <linux/shmem_fs.h> #include <linux/pagemap.h> #include <linux/atomic.h> #include <linux/module.h> #include <linux/swap.h> #include <linux/dax.h> #include <linux/fs.h> #include <linux/mm.h> #include "internal.h" /* * Double CLOCK lists * * Per node, two clock lists are maintained for file pages: the * inactive and the active list. Freshly faulted pages start out at * the head of the inactive list and page reclaim scans pages from the * tail. Pages that are accessed multiple times on the inactive list * are promoted to the active list, to protect them from reclaim, * whereas active pages are demoted to the inactive list when the * active list grows too big. * * fault ------------------------+ * | * +--------------+ | +-------------+ * reclaim <- | inactive | <-+-- demotion | active | <--+ * +--------------+ +-------------+ | * | | * +-------------- promotion ------------------+ * * * Access frequency and refault distance * * A workload is thrashing when its pages are frequently used but they * are evicted from the inactive list every time before another access * would have promoted them to the active list. * * In cases where the average access distance between thrashing pages * is bigger than the size of memory there is nothing that can be * done - the thrashing set could never fit into memory under any * circumstance. * * However, the average access distance could be bigger than the * inactive list, yet smaller than the size of memory. In this case, * the set could fit into memory if it weren't for the currently * active pages - which may be used more, hopefully less frequently: * * +-memory available to cache-+ * | | * +-inactive------+-active----+ * a b | c d e f g h i | J K L M N | * +---------------+-----------+ * * It is prohibitively expensive to accurately track access frequency * of pages. But a reasonable approximation can be made to measure * thrashing on the inactive list, after which refaulting pages can be * activated optimistically to compete with the existing active pages. * * Approximating inactive page access frequency - Observations: * * 1. When a page is accessed for the first time, it is added to the * head of the inactive list, slides every existing inactive page * towards the tail by one slot, and pushes the current tail page * out of memory. * * 2. When a page is accessed for the second time, it is promoted to * the active list, shrinking the inactive list by one slot. This * also slides all inactive pages that were faulted into the cache * more recently than the activated page towards the tail of the * inactive list. * * Thus: * * 1. The sum of evictions and activations between any two points in * time indicate the minimum number of inactive pages accessed in * between. * * 2. Moving one inactive page N page slots towards the tail of the * list requires at least N inactive page accesses. * * Combining these: * * 1. When a page is finally evicted from memory, the number of * inactive pages accessed while the page was in cache is at least * the number of page slots on the inactive list. * * 2. In addition, measuring the sum of evictions and activations (E) * at the time of a page's eviction, and comparing it to another * reading (R) at the time the page faults back into memory tells * the minimum number of accesses while the page was not cached. * This is called the refault distance. * * Because the first access of the page was the fault and the second * access the refault, we combine the in-cache distance with the * out-of-cache distance to get the complete minimum access distance * of this page: * * NR_inactive + (R - E) * * And knowing the minimum access distance of a page, we can easily * tell if the page would be able to stay in cache assuming all page * slots in the cache were available: * * NR_inactive + (R - E) <= NR_inactive + NR_active * * If we have swap we should consider about NR_inactive_anon and * NR_active_anon, so for page cache and anonymous respectively: * * NR_inactive_file + (R - E) <= NR_inactive_file + NR_active_file * + NR_inactive_anon + NR_active_anon * * NR_inactive_anon + (R - E) <= NR_inactive_anon + NR_active_anon * + NR_inactive_file + NR_active_file * * Which can be further simplified to: * * (R - E) <= NR_active_file + NR_inactive_anon + NR_active_anon * * (R - E) <= NR_active_anon + NR_inactive_file + NR_active_file * * Put into words, the refault distance (out-of-cache) can be seen as * a deficit in inactive list space (in-cache). If the inactive list * had (R - E) more page slots, the page would not have been evicted * in between accesses, but activated instead. And on a full system, * the only thing eating into inactive list space is active pages. * * * Refaulting inactive pages * * All that is known about the active list is that the pages have been * accessed more than once in the past. This means that at any given * time there is actually a good chance that pages on the active list * are no longer in active use. * * So when a refault distance of (R - E) is observed and there are at * least (R - E) pages in the userspace workingset, the refaulting page * is activated optimistically in the hope that (R - E) pages are actually * used less frequently than the refaulting page - or even not used at * all anymore. * * That means if inactive cache is refaulting with a suitable refault * distance, we assume the cache workingset is transitioning and put * pressure on the current workingset. * * If this is wrong and demotion kicks in, the pages which are truly * used more frequently will be reactivated while the less frequently * used once will be evicted from memory. * * But if this is right, the stale pages will be pushed out of memory * and the used pages get to stay in cache. * * Refaulting active pages * * If on the other hand the refaulting pages have recently been * deactivated, it means that the active list is no longer protecting * actively used cache from reclaim. The cache is NOT transitioning to * a different workingset; the existing workingset is thrashing in the * space allocated to the page cache. * * * Implementation * * For each node's LRU lists, a counter for inactive evictions and * activations is maintained (node->nonresident_age). * * On eviction, a snapshot of this counter (along with some bits to * identify the node) is stored in the now empty page cache * slot of the evicted page. This is called a shadow entry. * * On cache misses for which there are shadow entries, an eligible * refault distance will immediately activate the refaulting page. */ #define WORKINGSET_SHIFT 1 #define EVICTION_SHIFT ((BITS_PER_LONG - BITS_PER_XA_VALUE) + \ WORKINGSET_SHIFT + NODES_SHIFT + \ MEM_CGROUP_ID_SHIFT) #define EVICTION_MASK (~0UL >> EVICTION_SHIFT) /* * Eviction timestamps need to be able to cover the full range of * actionable refaults. However, bits are tight in the xarray * entry, and after storing the identifier for the lruvec there might * not be enough left to represent every single actionable refault. In * that case, we have to sacrifice granularity for distance, and group * evictions into coarser buckets by shaving off lower timestamp bits. */ static unsigned int bucket_order __read_mostly; static void *pack_shadow(int memcgid, pg_data_t *pgdat, unsigned long eviction, bool workingset) { eviction &= EVICTION_MASK; eviction = (eviction << MEM_CGROUP_ID_SHIFT) | memcgid; eviction = (eviction << NODES_SHIFT) | pgdat->node_id; eviction = (eviction << WORKINGSET_SHIFT) | workingset; return xa_mk_value(eviction); } static void unpack_shadow(void *shadow, int *memcgidp, pg_data_t **pgdat, unsigned long *evictionp, bool *workingsetp) { unsigned long entry = xa_to_value(shadow); int memcgid, nid; bool workingset; workingset = entry & ((1UL << WORKINGSET_SHIFT) - 1); entry >>= WORKINGSET_SHIFT; nid = entry & ((1UL << NODES_SHIFT) - 1); entry >>= NODES_SHIFT; memcgid = entry & ((1UL << MEM_CGROUP_ID_SHIFT) - 1); entry >>= MEM_CGROUP_ID_SHIFT; *memcgidp = memcgid; *pgdat = NODE_DATA(nid); *evictionp = entry; *workingsetp = workingset; } #ifdef CONFIG_LRU_GEN static void *lru_gen_eviction(struct folio *folio) { int hist; unsigned long token; unsigned long min_seq; struct lruvec *lruvec; struct lru_gen_folio *lrugen; int type = folio_is_file_lru(folio); int delta = folio_nr_pages(folio); int refs = folio_lru_refs(folio); bool workingset = folio_test_workingset(folio); int tier = lru_tier_from_refs(refs, workingset); struct mem_cgroup *memcg = folio_memcg(folio); struct pglist_data *pgdat = folio_pgdat(folio); BUILD_BUG_ON(LRU_GEN_WIDTH + LRU_REFS_WIDTH > BITS_PER_LONG - EVICTION_SHIFT); lruvec = mem_cgroup_lruvec(memcg, pgdat); lrugen = &lruvec->lrugen; min_seq = READ_ONCE(lrugen->min_seq[type]); token = (min_seq << LRU_REFS_WIDTH) | max(refs - 1, 0); hist = lru_hist_from_seq(min_seq); atomic_long_add(delta, &lrugen->evicted[hist][type][tier]); return pack_shadow(mem_cgroup_id(memcg), pgdat, token, workingset); } /* * Tests if the shadow entry is for a folio that was recently evicted. * Fills in @lruvec, @token, @workingset with the values unpacked from shadow. */ static bool lru_gen_test_recent(void *shadow, struct lruvec **lruvec, unsigned long *token, bool *workingset) { int memcg_id; unsigned long max_seq; struct mem_cgroup *memcg; struct pglist_data *pgdat; unpack_shadow(shadow, &memcg_id, &pgdat, token, workingset); memcg = mem_cgroup_from_id(memcg_id); *lruvec = mem_cgroup_lruvec(memcg, pgdat); max_seq = READ_ONCE((*lruvec)->lrugen.max_seq); max_seq &= EVICTION_MASK >> LRU_REFS_WIDTH; return abs_diff(max_seq, *token >> LRU_REFS_WIDTH) < MAX_NR_GENS; } static void lru_gen_refault(struct folio *folio, void *shadow) { bool recent; int hist, tier, refs; bool workingset; unsigned long token; struct lruvec *lruvec; struct lru_gen_folio *lrugen; int type = folio_is_file_lru(folio); int delta = folio_nr_pages(folio); rcu_read_lock(); recent = lru_gen_test_recent(shadow, &lruvec, &token, &workingset); if (lruvec != folio_lruvec(folio)) goto unlock; mod_lruvec_state(lruvec, WORKINGSET_REFAULT_BASE + type, delta); if (!recent) goto unlock; lrugen = &lruvec->lrugen; hist = lru_hist_from_seq(READ_ONCE(lrugen->min_seq[type])); refs = (token & (BIT(LRU_REFS_WIDTH) - 1)) + 1; tier = lru_tier_from_refs(refs, workingset); atomic_long_add(delta, &lrugen->refaulted[hist][type][tier]); /* see folio_add_lru() where folio_set_active() will be called */ if (lru_gen_in_fault()) mod_lruvec_state(lruvec, WORKINGSET_ACTIVATE_BASE + type, delta); if (workingset) { folio_set_workingset(folio); mod_lruvec_state(lruvec, WORKINGSET_RESTORE_BASE + type, delta); } else set_mask_bits(&folio->flags, LRU_REFS_MASK, (refs - 1UL) << LRU_REFS_PGOFF); unlock: rcu_read_unlock(); } #else /* !CONFIG_LRU_GEN */ static void *lru_gen_eviction(struct folio *folio) { return NULL; } static bool lru_gen_test_recent(void *shadow, struct lruvec **lruvec, unsigned long *token, bool *workingset) { return false; } static void lru_gen_refault(struct folio *folio, void *shadow) { } #endif /* CONFIG_LRU_GEN */ /** * workingset_age_nonresident - age non-resident entries as LRU ages * @lruvec: the lruvec that was aged * @nr_pages: the number of pages to count * * As in-memory pages are aged, non-resident pages need to be aged as * well, in order for the refault distances later on to be comparable * to the in-memory dimensions. This function allows reclaim and LRU * operations to drive the non-resident aging along in parallel. */ void workingset_age_nonresident(struct lruvec *lruvec, unsigned long nr_pages) { /* * Reclaiming a cgroup means reclaiming all its children in a * round-robin fashion. That means that each cgroup has an LRU * order that is composed of the LRU orders of its child * cgroups; and every page has an LRU position not just in the * cgroup that owns it, but in all of that group's ancestors. * * So when the physical inactive list of a leaf cgroup ages, * the virtual inactive lists of all its parents, including * the root cgroup's, age as well. */ do { atomic_long_add(nr_pages, &lruvec->nonresident_age); } while ((lruvec = parent_lruvec(lruvec))); } /** * workingset_eviction - note the eviction of a folio from memory * @target_memcg: the cgroup that is causing the reclaim * @folio: the folio being evicted * * Return: a shadow entry to be stored in @folio->mapping->i_pages in place * of the evicted @folio so that a later refault can be detected. */ void *workingset_eviction(struct folio *folio, struct mem_cgroup *target_memcg) { struct pglist_data *pgdat = folio_pgdat(folio); unsigned long eviction; struct lruvec *lruvec; int memcgid; /* Folio is fully exclusive and pins folio's memory cgroup pointer */ VM_BUG_ON_FOLIO(folio_test_lru(folio), folio); VM_BUG_ON_FOLIO(folio_ref_count(folio), folio); VM_BUG_ON_FOLIO(!folio_test_locked(folio), folio); if (lru_gen_enabled()) return lru_gen_eviction(folio); lruvec = mem_cgroup_lruvec(target_memcg, pgdat); /* XXX: target_memcg can be NULL, go through lruvec */ memcgid = mem_cgroup_id(lruvec_memcg(lruvec)); eviction = atomic_long_read(&lruvec->nonresident_age); eviction >>= bucket_order; workingset_age_nonresident(lruvec, folio_nr_pages(folio)); return pack_shadow(memcgid, pgdat, eviction, folio_test_workingset(folio)); } /** * workingset_test_recent - tests if the shadow entry is for a folio that was * recently evicted. Also fills in @workingset with the value unpacked from * shadow. * @shadow: the shadow entry to be tested. * @file: whether the corresponding folio is from the file lru. * @workingset: where the workingset value unpacked from shadow should * be stored. * @flush: whether to flush cgroup rstat. * * Return: true if the shadow is for a recently evicted folio; false otherwise. */ bool workingset_test_recent(void *shadow, bool file, bool *workingset, bool flush) { struct mem_cgroup *eviction_memcg; struct lruvec *eviction_lruvec; unsigned long refault_distance; unsigned long workingset_size; unsigned long refault; int memcgid; struct pglist_data *pgdat; unsigned long eviction; if (lru_gen_enabled()) { bool recent; rcu_read_lock(); recent = lru_gen_test_recent(shadow, &eviction_lruvec, &eviction, workingset); rcu_read_unlock(); return recent; } rcu_read_lock(); unpack_shadow(shadow, &memcgid, &pgdat, &eviction, workingset); eviction <<= bucket_order; /* * Look up the memcg associated with the stored ID. It might * have been deleted since the folio's eviction. * * Note that in rare events the ID could have been recycled * for a new cgroup that refaults a shared folio. This is * impossible to tell from the available data. However, this * should be a rare and limited disturbance, and activations * are always speculative anyway. Ultimately, it's the aging * algorithm's job to shake out the minimum access frequency * for the active cache. * * XXX: On !CONFIG_MEMCG, this will always return NULL; it * would be better if the root_mem_cgroup existed in all * configurations instead. */ eviction_memcg = mem_cgroup_from_id(memcgid); if (!mem_cgroup_tryget(eviction_memcg)) eviction_memcg = NULL; rcu_read_unlock(); if (!mem_cgroup_disabled() && !eviction_memcg) return false; /* * Flush stats (and potentially sleep) outside the RCU read section. * * Note that workingset_test_recent() itself might be called in RCU read * section (for e.g, in cachestat) - these callers need to skip flushing * stats (via the flush argument). * * XXX: With per-memcg flushing and thresholding, is ratelimiting * still needed here? */ if (flush) mem_cgroup_flush_stats_ratelimited(eviction_memcg); eviction_lruvec = mem_cgroup_lruvec(eviction_memcg, pgdat); refault = atomic_long_read(&eviction_lruvec->nonresident_age); /* * Calculate the refault distance * * The unsigned subtraction here gives an accurate distance * across nonresident_age overflows in most cases. There is a * special case: usually, shadow entries have a short lifetime * and are either refaulted or reclaimed along with the inode * before they get too old. But it is not impossible for the * nonresident_age to lap a shadow entry in the field, which * can then result in a false small refault distance, leading * to a false activation should this old entry actually * refault again. However, earlier kernels used to deactivate * unconditionally with *every* reclaim invocation for the * longest time, so the occasional inappropriate activation * leading to pressure on the active list is not a problem. */ refault_distance = (refault - eviction) & EVICTION_MASK; /* * Compare the distance to the existing workingset size. We * don't activate pages that couldn't stay resident even if * all the memory was available to the workingset. Whether * workingset competition needs to consider anon or not depends * on having free swap space. */ workingset_size = lruvec_page_state(eviction_lruvec, NR_ACTIVE_FILE); if (!file) { workingset_size += lruvec_page_state(eviction_lruvec, NR_INACTIVE_FILE); } if (mem_cgroup_get_nr_swap_pages(eviction_memcg) > 0) { workingset_size += lruvec_page_state(eviction_lruvec, NR_ACTIVE_ANON); if (file) { workingset_size += lruvec_page_state(eviction_lruvec, NR_INACTIVE_ANON); } } mem_cgroup_put(eviction_memcg); return refault_distance <= workingset_size; } /** * workingset_refault - Evaluate the refault of a previously evicted folio. * @folio: The freshly allocated replacement folio. * @shadow: Shadow entry of the evicted folio. * * Calculates and evaluates the refault distance of the previously * evicted folio in the context of the node and the memcg whose memory * pressure caused the eviction. */ void workingset_refault(struct folio *folio, void *shadow) { bool file = folio_is_file_lru(folio); struct pglist_data *pgdat; struct mem_cgroup *memcg; struct lruvec *lruvec; bool workingset; long nr; VM_BUG_ON_FOLIO(!folio_test_locked(folio), folio); if (lru_gen_enabled()) { lru_gen_refault(folio, shadow); return; } /* * The activation decision for this folio is made at the level * where the eviction occurred, as that is where the LRU order * during folio reclaim is being determined. * * However, the cgroup that will own the folio is the one that * is actually experiencing the refault event. Make sure the folio is * locked to guarantee folio_memcg() stability throughout. */ nr = folio_nr_pages(folio); memcg = folio_memcg(folio); pgdat = folio_pgdat(folio); lruvec = mem_cgroup_lruvec(memcg, pgdat); mod_lruvec_state(lruvec, WORKINGSET_REFAULT_BASE + file, nr); if (!workingset_test_recent(shadow, file, &workingset, true)) return; folio_set_active(folio); workingset_age_nonresident(lruvec, nr); mod_lruvec_state(lruvec, WORKINGSET_ACTIVATE_BASE + file, nr); /* Folio was active prior to eviction */ if (workingset) { folio_set_workingset(folio); /* * XXX: Move to folio_add_lru() when it supports new vs * putback */ lru_note_cost_refault(folio); mod_lruvec_state(lruvec, WORKINGSET_RESTORE_BASE + file, nr); } } /** * workingset_activation - note a page activation * @folio: Folio that is being activated. */ void workingset_activation(struct folio *folio) { /* * Filter non-memcg pages here, e.g. unmap can call * mark_page_accessed() on VDSO pages. */ if (mem_cgroup_disabled() || folio_memcg_charged(folio)) workingset_age_nonresident(folio_lruvec(folio), folio_nr_pages(folio)); } /* * Shadow entries reflect the share of the working set that does not * fit into memory, so their number depends on the access pattern of * the workload. In most cases, they will refault or get reclaimed * along with the inode, but a (malicious) workload that streams * through files with a total size several times that of available * memory, while preventing the inodes from being reclaimed, can * create excessive amounts of shadow nodes. To keep a lid on this, * track shadow nodes and reclaim them when they grow way past the * point where they would still be useful. */ struct list_lru shadow_nodes; void workingset_update_node(struct xa_node *node) { struct page *page = virt_to_page(node); /* * Track non-empty nodes that contain only shadow entries; * unlink those that contain pages or are being freed. * * Avoid acquiring the list_lru lock when the nodes are * already where they should be. The list_empty() test is safe * as node->private_list is protected by the i_pages lock. */ lockdep_assert_held(&node->array->xa_lock); if (node->count && node->count == node->nr_values) { if (list_empty(&node->private_list)) { list_lru_add_obj(&shadow_nodes, &node->private_list); __inc_node_page_state(page, WORKINGSET_NODES); } } else { if (!list_empty(&node->private_list)) { list_lru_del_obj(&shadow_nodes, &node->private_list); __dec_node_page_state(page, WORKINGSET_NODES); } } } static unsigned long count_shadow_nodes(struct shrinker *shrinker, struct shrink_control *sc) { unsigned long max_nodes; unsigned long nodes; unsigned long pages; nodes = list_lru_shrink_count(&shadow_nodes, sc); if (!nodes) return SHRINK_EMPTY; /* * Approximate a reasonable limit for the nodes * containing shadow entries. We don't need to keep more * shadow entries than possible pages on the active list, * since refault distances bigger than that are dismissed. * * The size of the active list converges toward 100% of * overall page cache as memory grows, with only a tiny * inactive list. Assume the total cache size for that. * * Nodes might be sparsely populated, with only one shadow * entry in the extreme case. Obviously, we cannot keep one * node for every eligible shadow entry, so compromise on a * worst-case density of 1/8th. Below that, not all eligible * refaults can be detected anymore. * * On 64-bit with 7 xa_nodes per page and 64 slots * each, this will reclaim shadow entries when they consume * ~1.8% of available memory: * * PAGE_SIZE / xa_nodes / node_entries * 8 / PAGE_SIZE */ #ifdef CONFIG_MEMCG if (sc->memcg) { struct lruvec *lruvec; int i; mem_cgroup_flush_stats_ratelimited(sc->memcg); lruvec = mem_cgroup_lruvec(sc->memcg, NODE_DATA(sc->nid)); for (pages = 0, i = 0; i < NR_LRU_LISTS; i++) pages += lruvec_page_state_local(lruvec, NR_LRU_BASE + i); pages += lruvec_page_state_local( lruvec, NR_SLAB_RECLAIMABLE_B) >> PAGE_SHIFT; pages += lruvec_page_state_local( lruvec, NR_SLAB_UNRECLAIMABLE_B) >> PAGE_SHIFT; } else #endif pages = node_present_pages(sc->nid); max_nodes = pages >> (XA_CHUNK_SHIFT - 3); if (nodes <= max_nodes) return 0; return nodes - max_nodes; } static enum lru_status shadow_lru_isolate(struct list_head *item, struct list_lru_one *lru, void *arg) __must_hold(lru->lock) { struct xa_node *node = container_of(item, struct xa_node, private_list); struct address_space *mapping; int ret; /* * Page cache insertions and deletions synchronously maintain * the shadow node LRU under the i_pages lock and the * &lru->lock. Because the page cache tree is emptied before * the inode can be destroyed, holding the &lru->lock pins any * address_space that has nodes on the LRU. * * We can then safely transition to the i_pages lock to * pin only the address_space of the particular node we want * to reclaim, take the node off-LRU, and drop the &lru->lock. */ mapping = container_of(node->array, struct address_space, i_pages); /* Coming from the list, invert the lock order */ if (!xa_trylock(&mapping->i_pages)) { spin_unlock_irq(&lru->lock); ret = LRU_RETRY; goto out; } /* For page cache we need to hold i_lock */ if (mapping->host != NULL) { if (!spin_trylock(&mapping->host->i_lock)) { xa_unlock(&mapping->i_pages); spin_unlock_irq(&lru->lock); ret = LRU_RETRY; goto out; } } list_lru_isolate(lru, item); __dec_node_page_state(virt_to_page(node), WORKINGSET_NODES); spin_unlock(&lru->lock); /* * The nodes should only contain one or more shadow entries, * no pages, so we expect to be able to remove them all and * delete and free the empty node afterwards. */ if (WARN_ON_ONCE(!node->nr_values)) goto out_invalid; if (WARN_ON_ONCE(node->count != node->nr_values)) goto out_invalid; xa_delete_node(node, workingset_update_node); __inc_lruvec_kmem_state(node, WORKINGSET_NODERECLAIM); out_invalid: xa_unlock_irq(&mapping->i_pages); if (mapping->host != NULL) { if (mapping_shrinkable(mapping)) inode_add_lru(mapping->host); spin_unlock(&mapping->host->i_lock); } ret = LRU_REMOVED_RETRY; out: cond_resched(); return ret; } static unsigned long scan_shadow_nodes(struct shrinker *shrinker, struct shrink_control *sc) { /* list_lru lock nests inside the IRQ-safe i_pages lock */ return list_lru_shrink_walk_irq(&shadow_nodes, sc, shadow_lru_isolate, NULL); } /* * Our list_lru->lock is IRQ-safe as it nests inside the IRQ-safe * i_pages lock. */ static struct lock_class_key shadow_nodes_key; static int __init workingset_init(void) { struct shrinker *workingset_shadow_shrinker; unsigned int timestamp_bits; unsigned int max_order; int ret = -ENOMEM; BUILD_BUG_ON(BITS_PER_LONG < EVICTION_SHIFT); /* * Calculate the eviction bucket size to cover the longest * actionable refault distance, which is currently half of * memory (totalram_pages/2). However, memory hotplug may add * some more pages at runtime, so keep working with up to * double the initial memory by using totalram_pages as-is. */ timestamp_bits = BITS_PER_LONG - EVICTION_SHIFT; max_order = fls_long(totalram_pages() - 1); if (max_order > timestamp_bits) bucket_order = max_order - timestamp_bits; pr_info("workingset: timestamp_bits=%d max_order=%d bucket_order=%u\n", timestamp_bits, max_order, bucket_order); workingset_shadow_shrinker = shrinker_alloc(SHRINKER_NUMA_AWARE | SHRINKER_MEMCG_AWARE, "mm-shadow"); if (!workingset_shadow_shrinker) goto err; ret = list_lru_init_memcg_key(&shadow_nodes, workingset_shadow_shrinker, &shadow_nodes_key); if (ret) goto err_list_lru; workingset_shadow_shrinker->count_objects = count_shadow_nodes; workingset_shadow_shrinker->scan_objects = scan_shadow_nodes; /* ->count reports only fully expendable nodes */ workingset_shadow_shrinker->seeks = 0; shrinker_register(workingset_shadow_shrinker); return 0; err_list_lru: shrinker_free(workingset_shadow_shrinker); err: return ret; } module_init(workingset_init);
732 932 29 105 899 161 144 83 1321 1200 1185 740 553 355 125 9 72 79 12 127 3 1248 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 1517 1518 1519 1520 1521 1522 1523 1524 1525 1526 1527 1528 1529 1530 1531 1532 1533 1534 1535 1536 1537 1538 1539 1540 1541 1542 1543 1544 1545 1546 1547 1548 1549 1550 1551 1552 1553 1554 1555 1556 1557 1558 1559 1560 1561 1562 1563 1564 1565 1566 1567 1568 1569 1570 1571 1572 1573 1574 1575 1576 1577 1578 1579 1580 1581 1582 1583 1584 1585 1586 1587 1588 1589 1590 1591 1592 1593 1594 1595 1596 1597 1598 1599 1600 1601 1602 1603 1604 1605 1606 1607 1608 1609 1610 1611 1612 1613 1614 1615 1616 1617 1618 1619 1620 1621 1622 1623 1624 1625 1626 1627 1628 1629 1630 1631 1632 1633 1634 1635 1636 1637 1638 1639 1640 1641 1642 1643 1644 1645 1646 1647 1648 1649 1650 1651 1652 1653 1654 1655 1656 1657 1658 1659 1660 1661 1662 1663 1664 1665 1666 1667 1668 1669 1670 1671 1672 1673 1674 1675 1676 1677 1678 1679 1680 1681 1682 1683 1684 1685 1686 1687 1688 1689 1690 1691 1692 1693 1694 1695 1696 1697 1698 1699 1700 1701 1702 1703 1704 1705 1706 1707 1708 1709 1710 1711 1712 1713 1714 1715 1716 1717 1718 1719 1720 1721 1722 1723 1724 1725 1726 1727 1728 1729 1730 1731 1732 1733 1734 1735 1736 1737 1738 1739 1740 1741 1742 1743 1744 1745 1746 1747 1748 1749 1750 1751 1752 1753 1754 1755 1756 1757 1758 1759 1760 1761 1762 1763 1764 1765 1766 1767 1768 1769 1770 1771 1772 1773 1774 1775 1776 1777 1778 1779 1780 1781 1782 1783 1784 1785 1786 1787 1788 1789 1790 1791 1792 1793 1794 1795 1796 1797 1798 1799 1800 1801 1802 1803 1804 1805 1806 1807 1808 1809 1810 1811 1812 // SPDX-License-Identifier: GPL-2.0 // Generated by scripts/atomic/gen-atomic-long.sh // DO NOT MODIFY THIS FILE DIRECTLY #ifndef _LINUX_ATOMIC_LONG_H #define _LINUX_ATOMIC_LONG_H #include <linux/compiler.h> #include <asm/types.h> #ifdef CONFIG_64BIT typedef atomic64_t atomic_long_t; #define ATOMIC_LONG_INIT(i) ATOMIC64_INIT(i) #define atomic_long_cond_read_acquire atomic64_cond_read_acquire #define atomic_long_cond_read_relaxed atomic64_cond_read_relaxed #else typedef atomic_t atomic_long_t; #define ATOMIC_LONG_INIT(i) ATOMIC_INIT(i) #define atomic_long_cond_read_acquire atomic_cond_read_acquire #define atomic_long_cond_read_relaxed atomic_cond_read_relaxed #endif /** * raw_atomic_long_read() - atomic load with relaxed ordering * @v: pointer to atomic_long_t * * Atomically loads the value of @v with relaxed ordering. * * Safe to use in noinstr code; prefer atomic_long_read() elsewhere. * * Return: The value loaded from @v. */ static __always_inline long raw_atomic_long_read(const atomic_long_t *v) { #ifdef CONFIG_64BIT return raw_atomic64_read(v); #else return raw_atomic_read(v); #endif } /** * raw_atomic_long_read_acquire() - atomic load with acquire ordering * @v: pointer to atomic_long_t * * Atomically loads the value of @v with acquire ordering. * * Safe to use in noinstr code; prefer atomic_long_read_acquire() elsewhere. * * Return: The value loaded from @v. */ static __always_inline long raw_atomic_long_read_acquire(const atomic_long_t *v) { #ifdef CONFIG_64BIT return raw_atomic64_read_acquire(v); #else return raw_atomic_read_acquire(v); #endif } /** * raw_atomic_long_set() - atomic set with relaxed ordering * @v: pointer to atomic_long_t * @i: long value to assign * * Atomically sets @v to @i with relaxed ordering. * * Safe to use in noinstr code; prefer atomic_long_set() elsewhere. * * Return: Nothing. */ static __always_inline void raw_atomic_long_set(atomic_long_t *v, long i) { #ifdef CONFIG_64BIT raw_atomic64_set(v, i); #else raw_atomic_set(v, i); #endif } /** * raw_atomic_long_set_release() - atomic set with release ordering * @v: pointer to atomic_long_t * @i: long value to assign * * Atomically sets @v to @i with release ordering. * * Safe to use in noinstr code; prefer atomic_long_set_release() elsewhere. * * Return: Nothing. */ static __always_inline void raw_atomic_long_set_release(atomic_long_t *v, long i) { #ifdef CONFIG_64BIT raw_atomic64_set_release(v, i); #else raw_atomic_set_release(v, i); #endif } /** * raw_atomic_long_add() - atomic add with relaxed ordering * @i: long value to add * @v: pointer to atomic_long_t * * Atomically updates @v to (@v + @i) with relaxed ordering. * * Safe to use in noinstr code; prefer atomic_long_add() elsewhere. * * Return: Nothing. */ static __always_inline void raw_atomic_long_add(long i, atomic_long_t *v) { #ifdef CONFIG_64BIT raw_atomic64_add(i, v); #else raw_atomic_add(i, v); #endif } /** * raw_atomic_long_add_return() - atomic add with full ordering * @i: long value to add * @v: pointer to atomic_long_t * * Atomically updates @v to (@v + @i) with full ordering. * * Safe to use in noinstr code; prefer atomic_long_add_return() elsewhere. * * Return: The updated value of @v. */ static __always_inline long raw_atomic_long_add_return(long i, atomic_long_t *v) { #ifdef CONFIG_64BIT return raw_atomic64_add_return(i, v); #else return raw_atomic_add_return(i, v); #endif } /** * raw_atomic_long_add_return_acquire() - atomic add with acquire ordering * @i: long value to add * @v: pointer to atomic_long_t * * Atomically updates @v to (@v + @i) with acquire ordering. * * Safe to use in noinstr code; prefer atomic_long_add_return_acquire() elsewhere. * * Return: The updated value of @v. */ static __always_inline long raw_atomic_long_add_return_acquire(long i, atomic_long_t *v) { #ifdef CONFIG_64BIT return raw_atomic64_add_return_acquire(i, v); #else return raw_atomic_add_return_acquire(i, v); #endif } /** * raw_atomic_long_add_return_release() - atomic add with release ordering * @i: long value to add * @v: pointer to atomic_long_t * * Atomically updates @v to (@v + @i) with release ordering. * * Safe to use in noinstr code; prefer atomic_long_add_return_release() elsewhere. * * Return: The updated value of @v. */ static __always_inline long raw_atomic_long_add_return_release(long i, atomic_long_t *v) { #ifdef CONFIG_64BIT return raw_atomic64_add_return_release(i, v); #else return raw_atomic_add_return_release(i, v); #endif } /** * raw_atomic_long_add_return_relaxed() - atomic add with relaxed ordering * @i: long value to add * @v: pointer to atomic_long_t * * Atomically updates @v to (@v + @i) with relaxed ordering. * * Safe to use in noinstr code; prefer atomic_long_add_return_relaxed() elsewhere. * * Return: The updated value of @v. */ static __always_inline long raw_atomic_long_add_return_relaxed(long i, atomic_long_t *v) { #ifdef CONFIG_64BIT return raw_atomic64_add_return_relaxed(i, v); #else return raw_atomic_add_return_relaxed(i, v); #endif } /** * raw_atomic_long_fetch_add() - atomic add with full ordering * @i: long value to add * @v: pointer to atomic_long_t * * Atomically updates @v to (@v + @i) with full ordering. * * Safe to use in noinstr code; prefer atomic_long_fetch_add() elsewhere. * * Return: The original value of @v. */ static __always_inline long raw_atomic_long_fetch_add(long i, atomic_long_t *v) { #ifdef CONFIG_64BIT return raw_atomic64_fetch_add(i, v); #else return raw_atomic_fetch_add(i, v); #endif } /** * raw_atomic_long_fetch_add_acquire() - atomic add with acquire ordering * @i: long value to add * @v: pointer to atomic_long_t * * Atomically updates @v to (@v + @i) with acquire ordering. * * Safe to use in noinstr code; prefer atomic_long_fetch_add_acquire() elsewhere. * * Return: The original value of @v. */ static __always_inline long raw_atomic_long_fetch_add_acquire(long i, atomic_long_t *v) { #ifdef CONFIG_64BIT return raw_atomic64_fetch_add_acquire(i, v); #else return raw_atomic_fetch_add_acquire(i, v); #endif } /** * raw_atomic_long_fetch_add_release() - atomic add with release ordering * @i: long value to add * @v: pointer to atomic_long_t * * Atomically updates @v to (@v + @i) with release ordering. * * Safe to use in noinstr code; prefer atomic_long_fetch_add_release() elsewhere. * * Return: The original value of @v. */ static __always_inline long raw_atomic_long_fetch_add_release(long i, atomic_long_t *v) { #ifdef CONFIG_64BIT return raw_atomic64_fetch_add_release(i, v); #else return raw_atomic_fetch_add_release(i, v); #endif } /** * raw_atomic_long_fetch_add_relaxed() - atomic add with relaxed ordering * @i: long value to add * @v: pointer to atomic_long_t * * Atomically updates @v to (@v + @i) with relaxed ordering. * * Safe to use in noinstr code; prefer atomic_long_fetch_add_relaxed() elsewhere. * * Return: The original value of @v. */ static __always_inline long raw_atomic_long_fetch_add_relaxed(long i, atomic_long_t *v) { #ifdef CONFIG_64BIT return raw_atomic64_fetch_add_relaxed(i, v); #else return raw_atomic_fetch_add_relaxed(i, v); #endif } /** * raw_atomic_long_sub() - atomic subtract with relaxed ordering * @i: long value to subtract * @v: pointer to atomic_long_t * * Atomically updates @v to (@v - @i) with relaxed ordering. * * Safe to use in noinstr code; prefer atomic_long_sub() elsewhere. * * Return: Nothing. */ static __always_inline void raw_atomic_long_sub(long i, atomic_long_t *v) { #ifdef CONFIG_64BIT raw_atomic64_sub(i, v); #else raw_atomic_sub(i, v); #endif } /** * raw_atomic_long_sub_return() - atomic subtract with full ordering * @i: long value to subtract * @v: pointer to atomic_long_t * * Atomically updates @v to (@v - @i) with full ordering. * * Safe to use in noinstr code; prefer atomic_long_sub_return() elsewhere. * * Return: The updated value of @v. */ static __always_inline long raw_atomic_long_sub_return(long i, atomic_long_t *v) { #ifdef CONFIG_64BIT return raw_atomic64_sub_return(i, v); #else return raw_atomic_sub_return(i, v); #endif } /** * raw_atomic_long_sub_return_acquire() - atomic subtract with acquire ordering * @i: long value to subtract * @v: pointer to atomic_long_t * * Atomically updates @v to (@v - @i) with acquire ordering. * * Safe to use in noinstr code; prefer atomic_long_sub_return_acquire() elsewhere. * * Return: The updated value of @v. */ static __always_inline long raw_atomic_long_sub_return_acquire(long i, atomic_long_t *v) { #ifdef CONFIG_64BIT return raw_atomic64_sub_return_acquire(i, v); #else return raw_atomic_sub_return_acquire(i, v); #endif } /** * raw_atomic_long_sub_return_release() - atomic subtract with release ordering * @i: long value to subtract * @v: pointer to atomic_long_t * * Atomically updates @v to (@v - @i) with release ordering. * * Safe to use in noinstr code; prefer atomic_long_sub_return_release() elsewhere. * * Return: The updated value of @v. */ static __always_inline long raw_atomic_long_sub_return_release(long i, atomic_long_t *v) { #ifdef CONFIG_64BIT return raw_atomic64_sub_return_release(i, v); #else return raw_atomic_sub_return_release(i, v); #endif } /** * raw_atomic_long_sub_return_relaxed() - atomic subtract with relaxed ordering * @i: long value to subtract * @v: pointer to atomic_long_t * * Atomically updates @v to (@v - @i) with relaxed ordering. * * Safe to use in noinstr code; prefer atomic_long_sub_return_relaxed() elsewhere. * * Return: The updated value of @v. */ static __always_inline long raw_atomic_long_sub_return_relaxed(long i, atomic_long_t *v) { #ifdef CONFIG_64BIT return raw_atomic64_sub_return_relaxed(i, v); #else return raw_atomic_sub_return_relaxed(i, v); #endif } /** * raw_atomic_long_fetch_sub() - atomic subtract with full ordering * @i: long value to subtract * @v: pointer to atomic_long_t * * Atomically updates @v to (@v - @i) with full ordering. * * Safe to use in noinstr code; prefer atomic_long_fetch_sub() elsewhere. * * Return: The original value of @v. */ static __always_inline long raw_atomic_long_fetch_sub(long i, atomic_long_t *v) { #ifdef CONFIG_64BIT return raw_atomic64_fetch_sub(i, v); #else return raw_atomic_fetch_sub(i, v); #endif } /** * raw_atomic_long_fetch_sub_acquire() - atomic subtract with acquire ordering * @i: long value to subtract * @v: pointer to atomic_long_t * * Atomically updates @v to (@v - @i) with acquire ordering. * * Safe to use in noinstr code; prefer atomic_long_fetch_sub_acquire() elsewhere. * * Return: The original value of @v. */ static __always_inline long raw_atomic_long_fetch_sub_acquire(long i, atomic_long_t *v) { #ifdef CONFIG_64BIT return raw_atomic64_fetch_sub_acquire(i, v); #else return raw_atomic_fetch_sub_acquire(i, v); #endif } /** * raw_atomic_long_fetch_sub_release() - atomic subtract with release ordering * @i: long value to subtract * @v: pointer to atomic_long_t * * Atomically updates @v to (@v - @i) with release ordering. * * Safe to use in noinstr code; prefer atomic_long_fetch_sub_release() elsewhere. * * Return: The original value of @v. */ static __always_inline long raw_atomic_long_fetch_sub_release(long i, atomic_long_t *v) { #ifdef CONFIG_64BIT return raw_atomic64_fetch_sub_release(i, v); #else return raw_atomic_fetch_sub_release(i, v); #endif } /** * raw_atomic_long_fetch_sub_relaxed() - atomic subtract with relaxed ordering * @i: long value to subtract * @v: pointer to atomic_long_t * * Atomically updates @v to (@v - @i) with relaxed ordering. * * Safe to use in noinstr code; prefer atomic_long_fetch_sub_relaxed() elsewhere. * * Return: The original value of @v. */ static __always_inline long raw_atomic_long_fetch_sub_relaxed(long i, atomic_long_t *v) { #ifdef CONFIG_64BIT return raw_atomic64_fetch_sub_relaxed(i, v); #else return raw_atomic_fetch_sub_relaxed(i, v); #endif } /** * raw_atomic_long_inc() - atomic increment with relaxed ordering * @v: pointer to atomic_long_t * * Atomically updates @v to (@v + 1) with relaxed ordering. * * Safe to use in noinstr code; prefer atomic_long_inc() elsewhere. * * Return: Nothing. */ static __always_inline void raw_atomic_long_inc(atomic_long_t *v) { #ifdef CONFIG_64BIT raw_atomic64_inc(v); #else raw_atomic_inc(v); #endif } /** * raw_atomic_long_inc_return() - atomic increment with full ordering * @v: pointer to atomic_long_t * * Atomically updates @v to (@v + 1) with full ordering. * * Safe to use in noinstr code; prefer atomic_long_inc_return() elsewhere. * * Return: The updated value of @v. */ static __always_inline long raw_atomic_long_inc_return(atomic_long_t *v) { #ifdef CONFIG_64BIT return raw_atomic64_inc_return(v); #else return raw_atomic_inc_return(v); #endif } /** * raw_atomic_long_inc_return_acquire() - atomic increment with acquire ordering * @v: pointer to atomic_long_t * * Atomically updates @v to (@v + 1) with acquire ordering. * * Safe to use in noinstr code; prefer atomic_long_inc_return_acquire() elsewhere. * * Return: The updated value of @v. */ static __always_inline long raw_atomic_long_inc_return_acquire(atomic_long_t *v) { #ifdef CONFIG_64BIT return raw_atomic64_inc_return_acquire(v); #else return raw_atomic_inc_return_acquire(v); #endif } /** * raw_atomic_long_inc_return_release() - atomic increment with release ordering * @v: pointer to atomic_long_t * * Atomically updates @v to (@v + 1) with release ordering. * * Safe to use in noinstr code; prefer atomic_long_inc_return_release() elsewhere. * * Return: The updated value of @v. */ static __always_inline long raw_atomic_long_inc_return_release(atomic_long_t *v) { #ifdef CONFIG_64BIT return raw_atomic64_inc_return_release(v); #else return raw_atomic_inc_return_release(v); #endif } /** * raw_atomic_long_inc_return_relaxed() - atomic increment with relaxed ordering * @v: pointer to atomic_long_t * * Atomically updates @v to (@v + 1) with relaxed ordering. * * Safe to use in noinstr code; prefer atomic_long_inc_return_relaxed() elsewhere. * * Return: The updated value of @v. */ static __always_inline long raw_atomic_long_inc_return_relaxed(atomic_long_t *v) { #ifdef CONFIG_64BIT return raw_atomic64_inc_return_relaxed(v); #else return raw_atomic_inc_return_relaxed(v); #endif } /** * raw_atomic_long_fetch_inc() - atomic increment with full ordering * @v: pointer to atomic_long_t * * Atomically updates @v to (@v + 1) with full ordering. * * Safe to use in noinstr code; prefer atomic_long_fetch_inc() elsewhere. * * Return: The original value of @v. */ static __always_inline long raw_atomic_long_fetch_inc(atomic_long_t *v) { #ifdef CONFIG_64BIT return raw_atomic64_fetch_inc(v); #else return raw_atomic_fetch_inc(v); #endif } /** * raw_atomic_long_fetch_inc_acquire() - atomic increment with acquire ordering * @v: pointer to atomic_long_t * * Atomically updates @v to (@v + 1) with acquire ordering. * * Safe to use in noinstr code; prefer atomic_long_fetch_inc_acquire() elsewhere. * * Return: The original value of @v. */ static __always_inline long raw_atomic_long_fetch_inc_acquire(atomic_long_t *v) { #ifdef CONFIG_64BIT return raw_atomic64_fetch_inc_acquire(v); #else return raw_atomic_fetch_inc_acquire(v); #endif } /** * raw_atomic_long_fetch_inc_release() - atomic increment with release ordering * @v: pointer to atomic_long_t * * Atomically updates @v to (@v + 1) with release ordering. * * Safe to use in noinstr code; prefer atomic_long_fetch_inc_release() elsewhere. * * Return: The original value of @v. */ static __always_inline long raw_atomic_long_fetch_inc_release(atomic_long_t *v) { #ifdef CONFIG_64BIT return raw_atomic64_fetch_inc_release(v); #else return raw_atomic_fetch_inc_release(v); #endif } /** * raw_atomic_long_fetch_inc_relaxed() - atomic increment with relaxed ordering * @v: pointer to atomic_long_t * * Atomically updates @v to (@v + 1) with relaxed ordering. * * Safe to use in noinstr code; prefer atomic_long_fetch_inc_relaxed() elsewhere. * * Return: The original value of @v. */ static __always_inline long raw_atomic_long_fetch_inc_relaxed(atomic_long_t *v) { #ifdef CONFIG_64BIT return raw_atomic64_fetch_inc_relaxed(v); #else return raw_atomic_fetch_inc_relaxed(v); #endif } /** * raw_atomic_long_dec() - atomic decrement with relaxed ordering * @v: pointer to atomic_long_t * * Atomically updates @v to (@v - 1) with relaxed ordering. * * Safe to use in noinstr code; prefer atomic_long_dec() elsewhere. * * Return: Nothing. */ static __always_inline void raw_atomic_long_dec(atomic_long_t *v) { #ifdef CONFIG_64BIT raw_atomic64_dec(v); #else raw_atomic_dec(v); #endif } /** * raw_atomic_long_dec_return() - atomic decrement with full ordering * @v: pointer to atomic_long_t * * Atomically updates @v to (@v - 1) with full ordering. * * Safe to use in noinstr code; prefer atomic_long_dec_return() elsewhere. * * Return: The updated value of @v. */ static __always_inline long raw_atomic_long_dec_return(atomic_long_t *v) { #ifdef CONFIG_64BIT return raw_atomic64_dec_return(v); #else return raw_atomic_dec_return(v); #endif } /** * raw_atomic_long_dec_return_acquire() - atomic decrement with acquire ordering * @v: pointer to atomic_long_t * * Atomically updates @v to (@v - 1) with acquire ordering. * * Safe to use in noinstr code; prefer atomic_long_dec_return_acquire() elsewhere. * * Return: The updated value of @v. */ static __always_inline long raw_atomic_long_dec_return_acquire(atomic_long_t *v) { #ifdef CONFIG_64BIT return raw_atomic64_dec_return_acquire(v); #else return raw_atomic_dec_return_acquire(v); #endif } /** * raw_atomic_long_dec_return_release() - atomic decrement with release ordering * @v: pointer to atomic_long_t * * Atomically updates @v to (@v - 1) with release ordering. * * Safe to use in noinstr code; prefer atomic_long_dec_return_release() elsewhere. * * Return: The updated value of @v. */ static __always_inline long raw_atomic_long_dec_return_release(atomic_long_t *v) { #ifdef CONFIG_64BIT return raw_atomic64_dec_return_release(v); #else return raw_atomic_dec_return_release(v); #endif } /** * raw_atomic_long_dec_return_relaxed() - atomic decrement with relaxed ordering * @v: pointer to atomic_long_t * * Atomically updates @v to (@v - 1) with relaxed ordering. * * Safe to use in noinstr code; prefer atomic_long_dec_return_relaxed() elsewhere. * * Return: The updated value of @v. */ static __always_inline long raw_atomic_long_dec_return_relaxed(atomic_long_t *v) { #ifdef CONFIG_64BIT return raw_atomic64_dec_return_relaxed(v); #else return raw_atomic_dec_return_relaxed(v); #endif } /** * raw_atomic_long_fetch_dec() - atomic decrement with full ordering * @v: pointer to atomic_long_t * * Atomically updates @v to (@v - 1) with full ordering. * * Safe to use in noinstr code; prefer atomic_long_fetch_dec() elsewhere. * * Return: The original value of @v. */ static __always_inline long raw_atomic_long_fetch_dec(atomic_long_t *v) { #ifdef CONFIG_64BIT return raw_atomic64_fetch_dec(v); #else return raw_atomic_fetch_dec(v); #endif } /** * raw_atomic_long_fetch_dec_acquire() - atomic decrement with acquire ordering * @v: pointer to atomic_long_t * * Atomically updates @v to (@v - 1) with acquire ordering. * * Safe to use in noinstr code; prefer atomic_long_fetch_dec_acquire() elsewhere. * * Return: The original value of @v. */ static __always_inline long raw_atomic_long_fetch_dec_acquire(atomic_long_t *v) { #ifdef CONFIG_64BIT return raw_atomic64_fetch_dec_acquire(v); #else return raw_atomic_fetch_dec_acquire(v); #endif } /** * raw_atomic_long_fetch_dec_release() - atomic decrement with release ordering * @v: pointer to atomic_long_t * * Atomically updates @v to (@v - 1) with release ordering. * * Safe to use in noinstr code; prefer atomic_long_fetch_dec_release() elsewhere. * * Return: The original value of @v. */ static __always_inline long raw_atomic_long_fetch_dec_release(atomic_long_t *v) { #ifdef CONFIG_64BIT return raw_atomic64_fetch_dec_release(v); #else return raw_atomic_fetch_dec_release(v); #endif } /** * raw_atomic_long_fetch_dec_relaxed() - atomic decrement with relaxed ordering * @v: pointer to atomic_long_t * * Atomically updates @v to (@v - 1) with relaxed ordering. * * Safe to use in noinstr code; prefer atomic_long_fetch_dec_relaxed() elsewhere. * * Return: The original value of @v. */ static __always_inline long raw_atomic_long_fetch_dec_relaxed(atomic_long_t *v) { #ifdef CONFIG_64BIT return raw_atomic64_fetch_dec_relaxed(v); #else return raw_atomic_fetch_dec_relaxed(v); #endif } /** * raw_atomic_long_and() - atomic bitwise AND with relaxed ordering * @i: long value * @v: pointer to atomic_long_t * * Atomically updates @v to (@v & @i) with relaxed ordering. * * Safe to use in noinstr code; prefer atomic_long_and() elsewhere. * * Return: Nothing. */ static __always_inline void raw_atomic_long_and(long i, atomic_long_t *v) { #ifdef CONFIG_64BIT raw_atomic64_and(i, v); #else raw_atomic_and(i, v); #endif } /** * raw_atomic_long_fetch_and() - atomic bitwise AND with full ordering * @i: long value * @v: pointer to atomic_long_t * * Atomically updates @v to (@v & @i) with full ordering. * * Safe to use in noinstr code; prefer atomic_long_fetch_and() elsewhere. * * Return: The original value of @v. */ static __always_inline long raw_atomic_long_fetch_and(long i, atomic_long_t *v) { #ifdef CONFIG_64BIT return raw_atomic64_fetch_and(i, v); #else return raw_atomic_fetch_and(i, v); #endif } /** * raw_atomic_long_fetch_and_acquire() - atomic bitwise AND with acquire ordering * @i: long value * @v: pointer to atomic_long_t * * Atomically updates @v to (@v & @i) with acquire ordering. * * Safe to use in noinstr code; prefer atomic_long_fetch_and_acquire() elsewhere. * * Return: The original value of @v. */ static __always_inline long raw_atomic_long_fetch_and_acquire(long i, atomic_long_t *v) { #ifdef CONFIG_64BIT return raw_atomic64_fetch_and_acquire(i, v); #else return raw_atomic_fetch_and_acquire(i, v); #endif } /** * raw_atomic_long_fetch_and_release() - atomic bitwise AND with release ordering * @i: long value * @v: pointer to atomic_long_t * * Atomically updates @v to (@v & @i) with release ordering. * * Safe to use in noinstr code; prefer atomic_long_fetch_and_release() elsewhere. * * Return: The original value of @v. */ static __always_inline long raw_atomic_long_fetch_and_release(long i, atomic_long_t *v) { #ifdef CONFIG_64BIT return raw_atomic64_fetch_and_release(i, v); #else return raw_atomic_fetch_and_release(i, v); #endif } /** * raw_atomic_long_fetch_and_relaxed() - atomic bitwise AND with relaxed ordering * @i: long value * @v: pointer to atomic_long_t * * Atomically updates @v to (@v & @i) with relaxed ordering. * * Safe to use in noinstr code; prefer atomic_long_fetch_and_relaxed() elsewhere. * * Return: The original value of @v. */ static __always_inline long raw_atomic_long_fetch_and_relaxed(long i, atomic_long_t *v) { #ifdef CONFIG_64BIT return raw_atomic64_fetch_and_relaxed(i, v); #else return raw_atomic_fetch_and_relaxed(i, v); #endif } /** * raw_atomic_long_andnot() - atomic bitwise AND NOT with relaxed ordering * @i: long value * @v: pointer to atomic_long_t * * Atomically updates @v to (@v & ~@i) with relaxed ordering. * * Safe to use in noinstr code; prefer atomic_long_andnot() elsewhere. * * Return: Nothing. */ static __always_inline void raw_atomic_long_andnot(long i, atomic_long_t *v) { #ifdef CONFIG_64BIT raw_atomic64_andnot(i, v); #else raw_atomic_andnot(i, v); #endif } /** * raw_atomic_long_fetch_andnot() - atomic bitwise AND NOT with full ordering * @i: long value * @v: pointer to atomic_long_t * * Atomically updates @v to (@v & ~@i) with full ordering. * * Safe to use in noinstr code; prefer atomic_long_fetch_andnot() elsewhere. * * Return: The original value of @v. */ static __always_inline long raw_atomic_long_fetch_andnot(long i, atomic_long_t *v) { #ifdef CONFIG_64BIT return raw_atomic64_fetch_andnot(i, v); #else return raw_atomic_fetch_andnot(i, v); #endif } /** * raw_atomic_long_fetch_andnot_acquire() - atomic bitwise AND NOT with acquire ordering * @i: long value * @v: pointer to atomic_long_t * * Atomically updates @v to (@v & ~@i) with acquire ordering. * * Safe to use in noinstr code; prefer atomic_long_fetch_andnot_acquire() elsewhere. * * Return: The original value of @v. */ static __always_inline long raw_atomic_long_fetch_andnot_acquire(long i, atomic_long_t *v) { #ifdef CONFIG_64BIT return raw_atomic64_fetch_andnot_acquire(i, v); #else return raw_atomic_fetch_andnot_acquire(i, v); #endif } /** * raw_atomic_long_fetch_andnot_release() - atomic bitwise AND NOT with release ordering * @i: long value * @v: pointer to atomic_long_t * * Atomically updates @v to (@v & ~@i) with release ordering. * * Safe to use in noinstr code; prefer atomic_long_fetch_andnot_release() elsewhere. * * Return: The original value of @v. */ static __always_inline long raw_atomic_long_fetch_andnot_release(long i, atomic_long_t *v) { #ifdef CONFIG_64BIT return raw_atomic64_fetch_andnot_release(i, v); #else return raw_atomic_fetch_andnot_release(i, v); #endif } /** * raw_atomic_long_fetch_andnot_relaxed() - atomic bitwise AND NOT with relaxed ordering * @i: long value * @v: pointer to atomic_long_t * * Atomically updates @v to (@v & ~@i) with relaxed ordering. * * Safe to use in noinstr code; prefer atomic_long_fetch_andnot_relaxed() elsewhere. * * Return: The original value of @v. */ static __always_inline long raw_atomic_long_fetch_andnot_relaxed(long i, atomic_long_t *v) { #ifdef CONFIG_64BIT return raw_atomic64_fetch_andnot_relaxed(i, v); #else return raw_atomic_fetch_andnot_relaxed(i, v); #endif } /** * raw_atomic_long_or() - atomic bitwise OR with relaxed ordering * @i: long value * @v: pointer to atomic_long_t * * Atomically updates @v to (@v | @i) with relaxed ordering. * * Safe to use in noinstr code; prefer atomic_long_or() elsewhere. * * Return: Nothing. */ static __always_inline void raw_atomic_long_or(long i, atomic_long_t *v) { #ifdef CONFIG_64BIT raw_atomic64_or(i, v); #else raw_atomic_or(i, v); #endif } /** * raw_atomic_long_fetch_or() - atomic bitwise OR with full ordering * @i: long value * @v: pointer to atomic_long_t * * Atomically updates @v to (@v | @i) with full ordering. * * Safe to use in noinstr code; prefer atomic_long_fetch_or() elsewhere. * * Return: The original value of @v. */ static __always_inline long raw_atomic_long_fetch_or(long i, atomic_long_t *v) { #ifdef CONFIG_64BIT return raw_atomic64_fetch_or(i, v); #else return raw_atomic_fetch_or(i, v); #endif } /** * raw_atomic_long_fetch_or_acquire() - atomic bitwise OR with acquire ordering * @i: long value * @v: pointer to atomic_long_t * * Atomically updates @v to (@v | @i) with acquire ordering. * * Safe to use in noinstr code; prefer atomic_long_fetch_or_acquire() elsewhere. * * Return: The original value of @v. */ static __always_inline long raw_atomic_long_fetch_or_acquire(long i, atomic_long_t *v) { #ifdef CONFIG_64BIT return raw_atomic64_fetch_or_acquire(i, v); #else return raw_atomic_fetch_or_acquire(i, v); #endif } /** * raw_atomic_long_fetch_or_release() - atomic bitwise OR with release ordering * @i: long value * @v: pointer to atomic_long_t * * Atomically updates @v to (@v | @i) with release ordering. * * Safe to use in noinstr code; prefer atomic_long_fetch_or_release() elsewhere. * * Return: The original value of @v. */ static __always_inline long raw_atomic_long_fetch_or_release(long i, atomic_long_t *v) { #ifdef CONFIG_64BIT return raw_atomic64_fetch_or_release(i, v); #else return raw_atomic_fetch_or_release(i, v); #endif } /** * raw_atomic_long_fetch_or_relaxed() - atomic bitwise OR with relaxed ordering * @i: long value * @v: pointer to atomic_long_t * * Atomically updates @v to (@v | @i) with relaxed ordering. * * Safe to use in noinstr code; prefer atomic_long_fetch_or_relaxed() elsewhere. * * Return: The original value of @v. */ static __always_inline long raw_atomic_long_fetch_or_relaxed(long i, atomic_long_t *v) { #ifdef CONFIG_64BIT return raw_atomic64_fetch_or_relaxed(i, v); #else return raw_atomic_fetch_or_relaxed(i, v); #endif } /** * raw_atomic_long_xor() - atomic bitwise XOR with relaxed ordering * @i: long value * @v: pointer to atomic_long_t * * Atomically updates @v to (@v ^ @i) with relaxed ordering. * * Safe to use in noinstr code; prefer atomic_long_xor() elsewhere. * * Return: Nothing. */ static __always_inline void raw_atomic_long_xor(long i, atomic_long_t *v) { #ifdef CONFIG_64BIT raw_atomic64_xor(i, v); #else raw_atomic_xor(i, v); #endif } /** * raw_atomic_long_fetch_xor() - atomic bitwise XOR with full ordering * @i: long value * @v: pointer to atomic_long_t * * Atomically updates @v to (@v ^ @i) with full ordering. * * Safe to use in noinstr code; prefer atomic_long_fetch_xor() elsewhere. * * Return: The original value of @v. */ static __always_inline long raw_atomic_long_fetch_xor(long i, atomic_long_t *v) { #ifdef CONFIG_64BIT return raw_atomic64_fetch_xor(i, v); #else return raw_atomic_fetch_xor(i, v); #endif } /** * raw_atomic_long_fetch_xor_acquire() - atomic bitwise XOR with acquire ordering * @i: long value * @v: pointer to atomic_long_t * * Atomically updates @v to (@v ^ @i) with acquire ordering. * * Safe to use in noinstr code; prefer atomic_long_fetch_xor_acquire() elsewhere. * * Return: The original value of @v. */ static __always_inline long raw_atomic_long_fetch_xor_acquire(long i, atomic_long_t *v) { #ifdef CONFIG_64BIT return raw_atomic64_fetch_xor_acquire(i, v); #else return raw_atomic_fetch_xor_acquire(i, v); #endif } /** * raw_atomic_long_fetch_xor_release() - atomic bitwise XOR with release ordering * @i: long value * @v: pointer to atomic_long_t * * Atomically updates @v to (@v ^ @i) with release ordering. * * Safe to use in noinstr code; prefer atomic_long_fetch_xor_release() elsewhere. * * Return: The original value of @v. */ static __always_inline long raw_atomic_long_fetch_xor_release(long i, atomic_long_t *v) { #ifdef CONFIG_64BIT return raw_atomic64_fetch_xor_release(i, v); #else return raw_atomic_fetch_xor_release(i, v); #endif } /** * raw_atomic_long_fetch_xor_relaxed() - atomic bitwise XOR with relaxed ordering * @i: long value * @v: pointer to atomic_long_t * * Atomically updates @v to (@v ^ @i) with relaxed ordering. * * Safe to use in noinstr code; prefer atomic_long_fetch_xor_relaxed() elsewhere. * * Return: The original value of @v. */ static __always_inline long raw_atomic_long_fetch_xor_relaxed(long i, atomic_long_t *v) { #ifdef CONFIG_64BIT return raw_atomic64_fetch_xor_relaxed(i, v); #else return raw_atomic_fetch_xor_relaxed(i, v); #endif } /** * raw_atomic_long_xchg() - atomic exchange with full ordering * @v: pointer to atomic_long_t * @new: long value to assign * * Atomically updates @v to @new with full ordering. * * Safe to use in noinstr code; prefer atomic_long_xchg() elsewhere. * * Return: The original value of @v. */ static __always_inline long raw_atomic_long_xchg(atomic_long_t *v, long new) { #ifdef CONFIG_64BIT return raw_atomic64_xchg(v, new); #else return raw_atomic_xchg(v, new); #endif } /** * raw_atomic_long_xchg_acquire() - atomic exchange with acquire ordering * @v: pointer to atomic_long_t * @new: long value to assign * * Atomically updates @v to @new with acquire ordering. * * Safe to use in noinstr code; prefer atomic_long_xchg_acquire() elsewhere. * * Return: The original value of @v. */ static __always_inline long raw_atomic_long_xchg_acquire(atomic_long_t *v, long new) { #ifdef CONFIG_64BIT return raw_atomic64_xchg_acquire(v, new); #else return raw_atomic_xchg_acquire(v, new); #endif } /** * raw_atomic_long_xchg_release() - atomic exchange with release ordering * @v: pointer to atomic_long_t * @new: long value to assign * * Atomically updates @v to @new with release ordering. * * Safe to use in noinstr code; prefer atomic_long_xchg_release() elsewhere. * * Return: The original value of @v. */ static __always_inline long raw_atomic_long_xchg_release(atomic_long_t *v, long new) { #ifdef CONFIG_64BIT return raw_atomic64_xchg_release(v, new); #else return raw_atomic_xchg_release(v, new); #endif } /** * raw_atomic_long_xchg_relaxed() - atomic exchange with relaxed ordering * @v: pointer to atomic_long_t * @new: long value to assign * * Atomically updates @v to @new with relaxed ordering. * * Safe to use in noinstr code; prefer atomic_long_xchg_relaxed() elsewhere. * * Return: The original value of @v. */ static __always_inline long raw_atomic_long_xchg_relaxed(atomic_long_t *v, long new) { #ifdef CONFIG_64BIT return raw_atomic64_xchg_relaxed(v, new); #else return raw_atomic_xchg_relaxed(v, new); #endif } /** * raw_atomic_long_cmpxchg() - atomic compare and exchange with full ordering * @v: pointer to atomic_long_t * @old: long value to compare with * @new: long value to assign * * If (@v == @old), atomically updates @v to @new with full ordering. * Otherwise, @v is not modified and relaxed ordering is provided. * * Safe to use in noinstr code; prefer atomic_long_cmpxchg() elsewhere. * * Return: The original value of @v. */ static __always_inline long raw_atomic_long_cmpxchg(atomic_long_t *v, long old, long new) { #ifdef CONFIG_64BIT return raw_atomic64_cmpxchg(v, old, new); #else return raw_atomic_cmpxchg(v, old, new); #endif } /** * raw_atomic_long_cmpxchg_acquire() - atomic compare and exchange with acquire ordering * @v: pointer to atomic_long_t * @old: long value to compare with * @new: long value to assign * * If (@v == @old), atomically updates @v to @new with acquire ordering. * Otherwise, @v is not modified and relaxed ordering is provided. * * Safe to use in noinstr code; prefer atomic_long_cmpxchg_acquire() elsewhere. * * Return: The original value of @v. */ static __always_inline long raw_atomic_long_cmpxchg_acquire(atomic_long_t *v, long old, long new) { #ifdef CONFIG_64BIT return raw_atomic64_cmpxchg_acquire(v, old, new); #else return raw_atomic_cmpxchg_acquire(v, old, new); #endif } /** * raw_atomic_long_cmpxchg_release() - atomic compare and exchange with release ordering * @v: pointer to atomic_long_t * @old: long value to compare with * @new: long value to assign * * If (@v == @old), atomically updates @v to @new with release ordering. * Otherwise, @v is not modified and relaxed ordering is provided. * * Safe to use in noinstr code; prefer atomic_long_cmpxchg_release() elsewhere. * * Return: The original value of @v. */ static __always_inline long raw_atomic_long_cmpxchg_release(atomic_long_t *v, long old, long new) { #ifdef CONFIG_64BIT return raw_atomic64_cmpxchg_release(v, old, new); #else return raw_atomic_cmpxchg_release(v, old, new); #endif } /** * raw_atomic_long_cmpxchg_relaxed() - atomic compare and exchange with relaxed ordering * @v: pointer to atomic_long_t * @old: long value to compare with * @new: long value to assign * * If (@v == @old), atomically updates @v to @new with relaxed ordering. * Otherwise, @v is not modified and relaxed ordering is provided. * * Safe to use in noinstr code; prefer atomic_long_cmpxchg_relaxed() elsewhere. * * Return: The original value of @v. */ static __always_inline long raw_atomic_long_cmpxchg_relaxed(atomic_long_t *v, long old, long new) { #ifdef CONFIG_64BIT return raw_atomic64_cmpxchg_relaxed(v, old, new); #else return raw_atomic_cmpxchg_relaxed(v, old, new); #endif } /** * raw_atomic_long_try_cmpxchg() - atomic compare and exchange with full ordering * @v: pointer to atomic_long_t * @old: pointer to long value to compare with * @new: long value to assign * * If (@v == @old), atomically updates @v to @new with full ordering. * Otherwise, @v is not modified, @old is updated to the current value of @v, * and relaxed ordering is provided. * * Safe to use in noinstr code; prefer atomic_long_try_cmpxchg() elsewhere. * * Return: @true if the exchange occured, @false otherwise. */ static __always_inline bool raw_atomic_long_try_cmpxchg(atomic_long_t *v, long *old, long new) { #ifdef CONFIG_64BIT return raw_atomic64_try_cmpxchg(v, (s64 *)old, new); #else return raw_atomic_try_cmpxchg(v, (int *)old, new); #endif } /** * raw_atomic_long_try_cmpxchg_acquire() - atomic compare and exchange with acquire ordering * @v: pointer to atomic_long_t * @old: pointer to long value to compare with * @new: long value to assign * * If (@v == @old), atomically updates @v to @new with acquire ordering. * Otherwise, @v is not modified, @old is updated to the current value of @v, * and relaxed ordering is provided. * * Safe to use in noinstr code; prefer atomic_long_try_cmpxchg_acquire() elsewhere. * * Return: @true if the exchange occured, @false otherwise. */ static __always_inline bool raw_atomic_long_try_cmpxchg_acquire(atomic_long_t *v, long *old, long new) { #ifdef CONFIG_64BIT return raw_atomic64_try_cmpxchg_acquire(v, (s64 *)old, new); #else return raw_atomic_try_cmpxchg_acquire(v, (int *)old, new); #endif } /** * raw_atomic_long_try_cmpxchg_release() - atomic compare and exchange with release ordering * @v: pointer to atomic_long_t * @old: pointer to long value to compare with * @new: long value to assign * * If (@v == @old), atomically updates @v to @new with release ordering. * Otherwise, @v is not modified, @old is updated to the current value of @v, * and relaxed ordering is provided. * * Safe to use in noinstr code; prefer atomic_long_try_cmpxchg_release() elsewhere. * * Return: @true if the exchange occured, @false otherwise. */ static __always_inline bool raw_atomic_long_try_cmpxchg_release(atomic_long_t *v, long *old, long new) { #ifdef CONFIG_64BIT return raw_atomic64_try_cmpxchg_release(v, (s64 *)old, new); #else return raw_atomic_try_cmpxchg_release(v, (int *)old, new); #endif } /** * raw_atomic_long_try_cmpxchg_relaxed() - atomic compare and exchange with relaxed ordering * @v: pointer to atomic_long_t * @old: pointer to long value to compare with * @new: long value to assign * * If (@v == @old), atomically updates @v to @new with relaxed ordering. * Otherwise, @v is not modified, @old is updated to the current value of @v, * and relaxed ordering is provided. * * Safe to use in noinstr code; prefer atomic_long_try_cmpxchg_relaxed() elsewhere. * * Return: @true if the exchange occured, @false otherwise. */ static __always_inline bool raw_atomic_long_try_cmpxchg_relaxed(atomic_long_t *v, long *old, long new) { #ifdef CONFIG_64BIT return raw_atomic64_try_cmpxchg_relaxed(v, (s64 *)old, new); #else return raw_atomic_try_cmpxchg_relaxed(v, (int *)old, new); #endif } /** * raw_atomic_long_sub_and_test() - atomic subtract and test if zero with full ordering * @i: long value to subtract * @v: pointer to atomic_long_t * * Atomically updates @v to (@v - @i) with full ordering. * * Safe to use in noinstr code; prefer atomic_long_sub_and_test() elsewhere. * * Return: @true if the resulting value of @v is zero, @false otherwise. */ static __always_inline bool raw_atomic_long_sub_and_test(long i, atomic_long_t *v) { #ifdef CONFIG_64BIT return raw_atomic64_sub_and_test(i, v); #else return raw_atomic_sub_and_test(i, v); #endif } /** * raw_atomic_long_dec_and_test() - atomic decrement and test if zero with full ordering * @v: pointer to atomic_long_t * * Atomically updates @v to (@v - 1) with full ordering. * * Safe to use in noinstr code; prefer atomic_long_dec_and_test() elsewhere. * * Return: @true if the resulting value of @v is zero, @false otherwise. */ static __always_inline bool raw_atomic_long_dec_and_test(atomic_long_t *v) { #ifdef CONFIG_64BIT return raw_atomic64_dec_and_test(v); #else return raw_atomic_dec_and_test(v); #endif } /** * raw_atomic_long_inc_and_test() - atomic increment and test if zero with full ordering * @v: pointer to atomic_long_t * * Atomically updates @v to (@v + 1) with full ordering. * * Safe to use in noinstr code; prefer atomic_long_inc_and_test() elsewhere. * * Return: @true if the resulting value of @v is zero, @false otherwise. */ static __always_inline bool raw_atomic_long_inc_and_test(atomic_long_t *v) { #ifdef CONFIG_64BIT return raw_atomic64_inc_and_test(v); #else return raw_atomic_inc_and_test(v); #endif } /** * raw_atomic_long_add_negative() - atomic add and test if negative with full ordering * @i: long value to add * @v: pointer to atomic_long_t * * Atomically updates @v to (@v + @i) with full ordering. * * Safe to use in noinstr code; prefer atomic_long_add_negative() elsewhere. * * Return: @true if the resulting value of @v is negative, @false otherwise. */ static __always_inline bool raw_atomic_long_add_negative(long i, atomic_long_t *v) { #ifdef CONFIG_64BIT return raw_atomic64_add_negative(i, v); #else return raw_atomic_add_negative(i, v); #endif } /** * raw_atomic_long_add_negative_acquire() - atomic add and test if negative with acquire ordering * @i: long value to add * @v: pointer to atomic_long_t * * Atomically updates @v to (@v + @i) with acquire ordering. * * Safe to use in noinstr code; prefer atomic_long_add_negative_acquire() elsewhere. * * Return: @true if the resulting value of @v is negative, @false otherwise. */ static __always_inline bool raw_atomic_long_add_negative_acquire(long i, atomic_long_t *v) { #ifdef CONFIG_64BIT return raw_atomic64_add_negative_acquire(i, v); #else return raw_atomic_add_negative_acquire(i, v); #endif } /** * raw_atomic_long_add_negative_release() - atomic add and test if negative with release ordering * @i: long value to add * @v: pointer to atomic_long_t * * Atomically updates @v to (@v + @i) with release ordering. * * Safe to use in noinstr code; prefer atomic_long_add_negative_release() elsewhere. * * Return: @true if the resulting value of @v is negative, @false otherwise. */ static __always_inline bool raw_atomic_long_add_negative_release(long i, atomic_long_t *v) { #ifdef CONFIG_64BIT return raw_atomic64_add_negative_release(i, v); #else return raw_atomic_add_negative_release(i, v); #endif } /** * raw_atomic_long_add_negative_relaxed() - atomic add and test if negative with relaxed ordering * @i: long value to add * @v: pointer to atomic_long_t * * Atomically updates @v to (@v + @i) with relaxed ordering. * * Safe to use in noinstr code; prefer atomic_long_add_negative_relaxed() elsewhere. * * Return: @true if the resulting value of @v is negative, @false otherwise. */ static __always_inline bool raw_atomic_long_add_negative_relaxed(long i, atomic_long_t *v) { #ifdef CONFIG_64BIT return raw_atomic64_add_negative_relaxed(i, v); #else return raw_atomic_add_negative_relaxed(i, v); #endif } /** * raw_atomic_long_fetch_add_unless() - atomic add unless value with full ordering * @v: pointer to atomic_long_t * @a: long value to add * @u: long value to compare with * * If (@v != @u), atomically updates @v to (@v + @a) with full ordering. * Otherwise, @v is not modified and relaxed ordering is provided. * * Safe to use in noinstr code; prefer atomic_long_fetch_add_unless() elsewhere. * * Return: The original value of @v. */ static __always_inline long raw_atomic_long_fetch_add_unless(atomic_long_t *v, long a, long u) { #ifdef CONFIG_64BIT return raw_atomic64_fetch_add_unless(v, a, u); #else return raw_atomic_fetch_add_unless(v, a, u); #endif } /** * raw_atomic_long_add_unless() - atomic add unless value with full ordering * @v: pointer to atomic_long_t * @a: long value to add * @u: long value to compare with * * If (@v != @u), atomically updates @v to (@v + @a) with full ordering. * Otherwise, @v is not modified and relaxed ordering is provided. * * Safe to use in noinstr code; prefer atomic_long_add_unless() elsewhere. * * Return: @true if @v was updated, @false otherwise. */ static __always_inline bool raw_atomic_long_add_unless(atomic_long_t *v, long a, long u) { #ifdef CONFIG_64BIT return raw_atomic64_add_unless(v, a, u); #else return raw_atomic_add_unless(v, a, u); #endif } /** * raw_atomic_long_inc_not_zero() - atomic increment unless zero with full ordering * @v: pointer to atomic_long_t * * If (@v != 0), atomically updates @v to (@v + 1) with full ordering. * Otherwise, @v is not modified and relaxed ordering is provided. * * Safe to use in noinstr code; prefer atomic_long_inc_not_zero() elsewhere. * * Return: @true if @v was updated, @false otherwise. */ static __always_inline bool raw_atomic_long_inc_not_zero(atomic_long_t *v) { #ifdef CONFIG_64BIT return raw_atomic64_inc_not_zero(v); #else return raw_atomic_inc_not_zero(v); #endif } /** * raw_atomic_long_inc_unless_negative() - atomic increment unless negative with full ordering * @v: pointer to atomic_long_t * * If (@v >= 0), atomically updates @v to (@v + 1) with full ordering. * Otherwise, @v is not modified and relaxed ordering is provided. * * Safe to use in noinstr code; prefer atomic_long_inc_unless_negative() elsewhere. * * Return: @true if @v was updated, @false otherwise. */ static __always_inline bool raw_atomic_long_inc_unless_negative(atomic_long_t *v) { #ifdef CONFIG_64BIT return raw_atomic64_inc_unless_negative(v); #else return raw_atomic_inc_unless_negative(v); #endif } /** * raw_atomic_long_dec_unless_positive() - atomic decrement unless positive with full ordering * @v: pointer to atomic_long_t * * If (@v <= 0), atomically updates @v to (@v - 1) with full ordering. * Otherwise, @v is not modified and relaxed ordering is provided. * * Safe to use in noinstr code; prefer atomic_long_dec_unless_positive() elsewhere. * * Return: @true if @v was updated, @false otherwise. */ static __always_inline bool raw_atomic_long_dec_unless_positive(atomic_long_t *v) { #ifdef CONFIG_64BIT return raw_atomic64_dec_unless_positive(v); #else return raw_atomic_dec_unless_positive(v); #endif } /** * raw_atomic_long_dec_if_positive() - atomic decrement if positive with full ordering * @v: pointer to atomic_long_t * * If (@v > 0), atomically updates @v to (@v - 1) with full ordering. * Otherwise, @v is not modified and relaxed ordering is provided. * * Safe to use in noinstr code; prefer atomic_long_dec_if_positive() elsewhere. * * Return: The old value of (@v - 1), regardless of whether @v was updated. */ static __always_inline long raw_atomic_long_dec_if_positive(atomic_long_t *v) { #ifdef CONFIG_64BIT return raw_atomic64_dec_if_positive(v); #else return raw_atomic_dec_if_positive(v); #endif } #endif /* _LINUX_ATOMIC_LONG_H */ // eadf183c3600b8b92b91839dd3be6bcc560c752d
4 4 4 4 108 109 107 109 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 // SPDX-License-Identifier: GPL-2.0 /* * Implementation of the multi-level security (MLS) policy. * * Author : Stephen Smalley, <stephen.smalley.work@gmail.com> */ /* * Updated: Trusted Computer Solutions, Inc. <dgoeddel@trustedcs.com> * Support for enhanced MLS infrastructure. * Copyright (C) 2004-2006 Trusted Computer Solutions, Inc. * * Updated: Hewlett-Packard <paul@paul-moore.com> * Added support to import/export the MLS label from NetLabel * Copyright (C) Hewlett-Packard Development Company, L.P., 2006 */ #include <linux/kernel.h> #include <linux/slab.h> #include <linux/string.h> #include <linux/errno.h> #include <net/netlabel.h> #include "sidtab.h" #include "mls.h" #include "policydb.h" #include "services.h" /* * Return the length in bytes for the MLS fields of the * security context string representation of `context'. */ int mls_compute_context_len(struct policydb *p, struct context *context) { int i, l, len, head, prev; char *nm; struct ebitmap *e; struct ebitmap_node *node; if (!p->mls_enabled) return 0; len = 1; /* for the beginning ":" */ for (l = 0; l < 2; l++) { u32 index_sens = context->range.level[l].sens; len += strlen(sym_name(p, SYM_LEVELS, index_sens - 1)); /* categories */ head = -2; prev = -2; e = &context->range.level[l].cat; ebitmap_for_each_positive_bit(e, node, i) { if (i - prev > 1) { /* one or more negative bits are skipped */ if (head != prev) { nm = sym_name(p, SYM_CATS, prev); len += strlen(nm) + 1; } nm = sym_name(p, SYM_CATS, i); len += strlen(nm) + 1; head = i; } prev = i; } if (prev != head) { nm = sym_name(p, SYM_CATS, prev); len += strlen(nm) + 1; } if (l == 0) { if (mls_level_eq(&context->range.level[0], &context->range.level[1])) break; else len++; } } return len; } /* * Write the security context string representation of * the MLS fields of `context' into the string `*scontext'. * Update `*scontext' to point to the end of the MLS fields. */ void mls_sid_to_context(struct policydb *p, struct context *context, char **scontext) { char *scontextp, *nm; int i, l, head, prev; struct ebitmap *e; struct ebitmap_node *node; if (!p->mls_enabled) return; scontextp = *scontext; *scontextp = ':'; scontextp++; for (l = 0; l < 2; l++) { strcpy(scontextp, sym_name(p, SYM_LEVELS, context->range.level[l].sens - 1)); scontextp += strlen(scontextp); /* categories */ head = -2; prev = -2; e = &context->range.level[l].cat; ebitmap_for_each_positive_bit(e, node, i) { if (i - prev > 1) { /* one or more negative bits are skipped */ if (prev != head) { if (prev - head > 1) *scontextp++ = '.'; else *scontextp++ = ','; nm = sym_name(p, SYM_CATS, prev); strcpy(scontextp, nm); scontextp += strlen(nm); } if (prev < 0) *scontextp++ = ':'; else *scontextp++ = ','; nm = sym_name(p, SYM_CATS, i); strcpy(scontextp, nm); scontextp += strlen(nm); head = i; } prev = i; } if (prev != head) { if (prev - head > 1) *scontextp++ = '.'; else *scontextp++ = ','; nm = sym_name(p, SYM_CATS, prev); strcpy(scontextp, nm); scontextp += strlen(nm); } if (l == 0) { if (mls_level_eq(&context->range.level[0], &context->range.level[1])) break; else *scontextp++ = '-'; } } *scontext = scontextp; } int mls_level_isvalid(struct policydb *p, struct mls_level *l) { struct level_datum *levdatum; if (!l->sens || l->sens > p->p_levels.nprim) return 0; levdatum = symtab_search(&p->p_levels, sym_name(p, SYM_LEVELS, l->sens - 1)); if (!levdatum) return 0; /* * Return 1 iff all the bits set in l->cat are also be set in * levdatum->level->cat and no bit in l->cat is larger than * p->p_cats.nprim. */ return ebitmap_contains(&levdatum->level.cat, &l->cat, p->p_cats.nprim); } int mls_range_isvalid(struct policydb *p, struct mls_range *r) { return (mls_level_isvalid(p, &r->level[0]) && mls_level_isvalid(p, &r->level[1]) && mls_level_dom(&r->level[1], &r->level[0])); } /* * Return 1 if the MLS fields in the security context * structure `c' are valid. Return 0 otherwise. */ int mls_context_isvalid(struct policydb *p, struct context *c) { struct user_datum *usrdatum; if (!p->mls_enabled) return 1; if (!mls_range_isvalid(p, &c->range)) return 0; if (c->role == OBJECT_R_VAL) return 1; /* * User must be authorized for the MLS range. */ if (!c->user || c->user > p->p_users.nprim) return 0; usrdatum = p->user_val_to_struct[c->user - 1]; if (!mls_range_contains(usrdatum->range, c->range)) return 0; /* user may not be associated with range */ return 1; } /* * Set the MLS fields in the security context structure * `context' based on the string representation in * the string `scontext'. * * This function modifies the string in place, inserting * NULL characters to terminate the MLS fields. * * If a def_sid is provided and no MLS field is present, * copy the MLS field of the associated default context. * Used for upgraded to MLS systems where objects may lack * MLS fields. * * Policy read-lock must be held for sidtab lookup. * */ int mls_context_to_sid(struct policydb *pol, char oldc, char *scontext, struct context *context, struct sidtab *s, u32 def_sid) { char *sensitivity, *cur_cat, *next_cat, *rngptr; struct level_datum *levdatum; struct cat_datum *catdatum, *rngdatum; u32 i; int l, rc; char *rangep[2]; if (!pol->mls_enabled) { /* * With no MLS, only return -EINVAL if there is a MLS field * and it did not come from an xattr. */ if (oldc && def_sid == SECSID_NULL) return -EINVAL; return 0; } /* * No MLS component to the security context, try and map to * default if provided. */ if (!oldc) { struct context *defcon; if (def_sid == SECSID_NULL) return -EINVAL; defcon = sidtab_search(s, def_sid); if (!defcon) return -EINVAL; return mls_context_cpy(context, defcon); } /* * If we're dealing with a range, figure out where the two parts * of the range begin. */ rangep[0] = scontext; rangep[1] = strchr(scontext, '-'); if (rangep[1]) { rangep[1][0] = '\0'; rangep[1]++; } /* For each part of the range: */ for (l = 0; l < 2; l++) { /* Split sensitivity and category set. */ sensitivity = rangep[l]; if (sensitivity == NULL) break; next_cat = strchr(sensitivity, ':'); if (next_cat) *(next_cat++) = '\0'; /* Parse sensitivity. */ levdatum = symtab_search(&pol->p_levels, sensitivity); if (!levdatum) return -EINVAL; context->range.level[l].sens = levdatum->level.sens; /* Extract category set. */ while (next_cat != NULL) { cur_cat = next_cat; next_cat = strchr(next_cat, ','); if (next_cat != NULL) *(next_cat++) = '\0'; /* Separate into range if exists */ rngptr = strchr(cur_cat, '.'); if (rngptr != NULL) { /* Remove '.' */ *rngptr++ = '\0'; } catdatum = symtab_search(&pol->p_cats, cur_cat); if (!catdatum) return -EINVAL; rc = ebitmap_set_bit(&context->range.level[l].cat, catdatum->value - 1, 1); if (rc) return rc; /* If range, set all categories in range */ if (rngptr == NULL) continue; rngdatum = symtab_search(&pol->p_cats, rngptr); if (!rngdatum) return -EINVAL; if (catdatum->value >= rngdatum->value) return -EINVAL; for (i = catdatum->value; i < rngdatum->value; i++) { rc = ebitmap_set_bit( &context->range.level[l].cat, i, 1); if (rc) return rc; } } } /* If we didn't see a '-', the range start is also the range end. */ if (rangep[1] == NULL) { context->range.level[1].sens = context->range.level[0].sens; rc = ebitmap_cpy(&context->range.level[1].cat, &context->range.level[0].cat); if (rc) return rc; } return 0; } /* * Set the MLS fields in the security context structure * `context' based on the string representation in * the string `str'. This function will allocate temporary memory with the * given constraints of gfp_mask. */ int mls_from_string(struct policydb *p, char *str, struct context *context, gfp_t gfp_mask) { char *tmpstr; int rc; if (!p->mls_enabled) return -EINVAL; tmpstr = kstrdup(str, gfp_mask); if (!tmpstr) { rc = -ENOMEM; } else { rc = mls_context_to_sid(p, ':', tmpstr, context, NULL, SECSID_NULL); kfree(tmpstr); } return rc; } /* * Copies the MLS range `range' into `context'. */ int mls_range_set(struct context *context, struct mls_range *range) { int l, rc = 0; /* Copy the MLS range into the context */ for (l = 0; l < 2; l++) { context->range.level[l].sens = range->level[l].sens; rc = ebitmap_cpy(&context->range.level[l].cat, &range->level[l].cat); if (rc) break; } return rc; } int mls_setup_user_range(struct policydb *p, struct context *fromcon, struct user_datum *user, struct context *usercon) { if (p->mls_enabled) { struct mls_level *fromcon_sen = &(fromcon->range.level[0]); struct mls_level *fromcon_clr = &(fromcon->range.level[1]); struct mls_level *user_low = &(user->range.level[0]); struct mls_level *user_clr = &(user->range.level[1]); struct mls_level *user_def = &(user->dfltlevel); struct mls_level *usercon_sen = &(usercon->range.level[0]); struct mls_level *usercon_clr = &(usercon->range.level[1]); /* Honor the user's default level if we can */ if (mls_level_between(user_def, fromcon_sen, fromcon_clr)) *usercon_sen = *user_def; else if (mls_level_between(fromcon_sen, user_def, user_clr)) *usercon_sen = *fromcon_sen; else if (mls_level_between(fromcon_clr, user_low, user_def)) *usercon_sen = *user_low; else return -EINVAL; /* Lower the clearance of available contexts if the clearance of "fromcon" is lower than that of the user's default clearance (but only if the "fromcon" clearance dominates the user's computed sensitivity level) */ if (mls_level_dom(user_clr, fromcon_clr)) *usercon_clr = *fromcon_clr; else if (mls_level_dom(fromcon_clr, user_clr)) *usercon_clr = *user_clr; else return -EINVAL; } return 0; } /* * Convert the MLS fields in the security context * structure `oldc' from the values specified in the * policy `oldp' to the values specified in the policy `newp', * storing the resulting context in `newc'. */ int mls_convert_context(struct policydb *oldp, struct policydb *newp, struct context *oldc, struct context *newc) { struct level_datum *levdatum; struct cat_datum *catdatum; struct ebitmap_node *node; u32 i; int l; if (!oldp->mls_enabled || !newp->mls_enabled) return 0; for (l = 0; l < 2; l++) { char *name = sym_name(oldp, SYM_LEVELS, oldc->range.level[l].sens - 1); levdatum = symtab_search(&newp->p_levels, name); if (!levdatum) return -EINVAL; newc->range.level[l].sens = levdatum->level.sens; ebitmap_for_each_positive_bit(&oldc->range.level[l].cat, node, i) { int rc; catdatum = symtab_search(&newp->p_cats, sym_name(oldp, SYM_CATS, i)); if (!catdatum) return -EINVAL; rc = ebitmap_set_bit(&newc->range.level[l].cat, catdatum->value - 1, 1); if (rc) return rc; } } return 0; } int mls_compute_sid(struct policydb *p, struct context *scontext, struct context *tcontext, u16 tclass, u32 specified, struct context *newcontext, bool sock) { struct range_trans rtr; struct mls_range *r; struct class_datum *cladatum; char default_range = 0; if (!p->mls_enabled) return 0; switch (specified) { case AVTAB_TRANSITION: /* Look for a range transition rule. */ rtr.source_type = scontext->type; rtr.target_type = tcontext->type; rtr.target_class = tclass; r = policydb_rangetr_search(p, &rtr); if (r) return mls_range_set(newcontext, r); if (tclass && tclass <= p->p_classes.nprim) { cladatum = p->class_val_to_struct[tclass - 1]; if (cladatum) default_range = cladatum->default_range; } switch (default_range) { case DEFAULT_SOURCE_LOW: return mls_context_cpy_low(newcontext, scontext); case DEFAULT_SOURCE_HIGH: return mls_context_cpy_high(newcontext, scontext); case DEFAULT_SOURCE_LOW_HIGH: return mls_context_cpy(newcontext, scontext); case DEFAULT_TARGET_LOW: return mls_context_cpy_low(newcontext, tcontext); case DEFAULT_TARGET_HIGH: return mls_context_cpy_high(newcontext, tcontext); case DEFAULT_TARGET_LOW_HIGH: return mls_context_cpy(newcontext, tcontext); case DEFAULT_GLBLUB: return mls_context_glblub(newcontext, scontext, tcontext); } fallthrough; case AVTAB_CHANGE: if ((tclass == p->process_class) || sock) /* Use the process MLS attributes. */ return mls_context_cpy(newcontext, scontext); else /* Use the process effective MLS attributes. */ return mls_context_cpy_low(newcontext, scontext); case AVTAB_MEMBER: /* Use the process effective MLS attributes. */ return mls_context_cpy_low(newcontext, scontext); } return -EINVAL; } #ifdef CONFIG_NETLABEL /** * mls_export_netlbl_lvl - Export the MLS sensitivity levels to NetLabel * @p: the policy * @context: the security context * @secattr: the NetLabel security attributes * * Description: * Given the security context copy the low MLS sensitivity level into the * NetLabel MLS sensitivity level field. * */ void mls_export_netlbl_lvl(struct policydb *p, struct context *context, struct netlbl_lsm_secattr *secattr) { if (!p->mls_enabled) return; secattr->attr.mls.lvl = context->range.level[0].sens - 1; secattr->flags |= NETLBL_SECATTR_MLS_LVL; } /** * mls_import_netlbl_lvl - Import the NetLabel MLS sensitivity levels * @p: the policy * @context: the security context * @secattr: the NetLabel security attributes * * Description: * Given the security context and the NetLabel security attributes, copy the * NetLabel MLS sensitivity level into the context. * */ void mls_import_netlbl_lvl(struct policydb *p, struct context *context, struct netlbl_lsm_secattr *secattr) { if (!p->mls_enabled) return; context->range.level[0].sens = secattr->attr.mls.lvl + 1; context->range.level[1].sens = context->range.level[0].sens; } /** * mls_export_netlbl_cat - Export the MLS categories to NetLabel * @p: the policy * @context: the security context * @secattr: the NetLabel security attributes * * Description: * Given the security context copy the low MLS categories into the NetLabel * MLS category field. Returns zero on success, negative values on failure. * */ int mls_export_netlbl_cat(struct policydb *p, struct context *context, struct netlbl_lsm_secattr *secattr) { int rc; if (!p->mls_enabled) return 0; rc = ebitmap_netlbl_export(&context->range.level[0].cat, &secattr->attr.mls.cat); if (rc == 0 && secattr->attr.mls.cat != NULL) secattr->flags |= NETLBL_SECATTR_MLS_CAT; return rc; } /** * mls_import_netlbl_cat - Import the MLS categories from NetLabel * @p: the policy * @context: the security context * @secattr: the NetLabel security attributes * * Description: * Copy the NetLabel security attributes into the SELinux context; since the * NetLabel security attribute only contains a single MLS category use it for * both the low and high categories of the context. Returns zero on success, * negative values on failure. * */ int mls_import_netlbl_cat(struct policydb *p, struct context *context, struct netlbl_lsm_secattr *secattr) { int rc; if (!p->mls_enabled) return 0; rc = ebitmap_netlbl_import(&context->range.level[0].cat, secattr->attr.mls.cat); if (rc) goto import_netlbl_cat_failure; memcpy(&context->range.level[1].cat, &context->range.level[0].cat, sizeof(context->range.level[0].cat)); return 0; import_netlbl_cat_failure: ebitmap_destroy(&context->range.level[0].cat); return rc; } #endif /* CONFIG_NETLABEL */
29 1 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 /* SPDX-License-Identifier: GPL-2.0 */ #ifndef _LINUX_USER_NAMESPACE_H #define _LINUX_USER_NAMESPACE_H #include <linux/kref.h> #include <linux/nsproxy.h> #include <linux/ns_common.h> #include <linux/rculist_nulls.h> #include <linux/sched.h> #include <linux/workqueue.h> #include <linux/rcuref.h> #include <linux/rwsem.h> #include <linux/sysctl.h> #include <linux/err.h> #define UID_GID_MAP_MAX_BASE_EXTENTS 5 #define UID_GID_MAP_MAX_EXTENTS 340 struct uid_gid_extent { u32 first; u32 lower_first; u32 count; }; struct uid_gid_map { /* 64 bytes -- 1 cache line */ union { struct { struct uid_gid_extent extent[UID_GID_MAP_MAX_BASE_EXTENTS]; u32 nr_extents; }; struct { struct uid_gid_extent *forward; struct uid_gid_extent *reverse; }; }; }; #define USERNS_SETGROUPS_ALLOWED 1UL #define USERNS_INIT_FLAGS USERNS_SETGROUPS_ALLOWED struct ucounts; enum ucount_type { UCOUNT_USER_NAMESPACES, UCOUNT_PID_NAMESPACES, UCOUNT_UTS_NAMESPACES, UCOUNT_IPC_NAMESPACES, UCOUNT_NET_NAMESPACES, UCOUNT_MNT_NAMESPACES, UCOUNT_CGROUP_NAMESPACES, UCOUNT_TIME_NAMESPACES, #ifdef CONFIG_INOTIFY_USER UCOUNT_INOTIFY_INSTANCES, UCOUNT_INOTIFY_WATCHES, #endif #ifdef CONFIG_FANOTIFY UCOUNT_FANOTIFY_GROUPS, UCOUNT_FANOTIFY_MARKS, #endif UCOUNT_COUNTS, }; enum rlimit_type { UCOUNT_RLIMIT_NPROC, UCOUNT_RLIMIT_MSGQUEUE, UCOUNT_RLIMIT_SIGPENDING, UCOUNT_RLIMIT_MEMLOCK, UCOUNT_RLIMIT_COUNTS, }; #if IS_ENABLED(CONFIG_BINFMT_MISC) struct binfmt_misc; #endif struct user_namespace { struct uid_gid_map uid_map; struct uid_gid_map gid_map; struct uid_gid_map projid_map; struct user_namespace *parent; int level; kuid_t owner; kgid_t group; struct ns_common ns; unsigned long flags; /* parent_could_setfcap: true if the creator if this ns had CAP_SETFCAP * in its effective capability set at the child ns creation time. */ bool parent_could_setfcap; #ifdef CONFIG_KEYS /* List of joinable keyrings in this namespace. Modification access of * these pointers is controlled by keyring_sem. Once * user_keyring_register is set, it won't be changed, so it can be * accessed directly with READ_ONCE(). */ struct list_head keyring_name_list; struct key *user_keyring_register; struct rw_semaphore keyring_sem; #endif /* Register of per-UID persistent keyrings for this namespace */ #ifdef CONFIG_PERSISTENT_KEYRINGS struct key *persistent_keyring_register; #endif struct work_struct work; #ifdef CONFIG_SYSCTL struct ctl_table_set set; struct ctl_table_header *sysctls; #endif struct ucounts *ucounts; long ucount_max[UCOUNT_COUNTS]; long rlimit_max[UCOUNT_RLIMIT_COUNTS]; #if IS_ENABLED(CONFIG_BINFMT_MISC) struct binfmt_misc *binfmt_misc; #endif } __randomize_layout; struct ucounts { struct hlist_nulls_node node; struct user_namespace *ns; kuid_t uid; struct rcu_head rcu; rcuref_t count; atomic_long_t ucount[UCOUNT_COUNTS]; atomic_long_t rlimit[UCOUNT_RLIMIT_COUNTS]; }; extern struct user_namespace init_user_ns; extern struct ucounts init_ucounts; bool setup_userns_sysctls(struct user_namespace *ns); void retire_userns_sysctls(struct user_namespace *ns); struct ucounts *inc_ucount(struct user_namespace *ns, kuid_t uid, enum ucount_type type); void dec_ucount(struct ucounts *ucounts, enum ucount_type type); struct ucounts *alloc_ucounts(struct user_namespace *ns, kuid_t uid); void put_ucounts(struct ucounts *ucounts); static inline struct ucounts * __must_check get_ucounts(struct ucounts *ucounts) { if (rcuref_get(&ucounts->count)) return ucounts; return NULL; } static inline long get_rlimit_value(struct ucounts *ucounts, enum rlimit_type type) { return atomic_long_read(&ucounts->rlimit[type]); } long inc_rlimit_ucounts(struct ucounts *ucounts, enum rlimit_type type, long v); bool dec_rlimit_ucounts(struct ucounts *ucounts, enum rlimit_type type, long v); long inc_rlimit_get_ucounts(struct ucounts *ucounts, enum rlimit_type type, bool override_rlimit); void dec_rlimit_put_ucounts(struct ucounts *ucounts, enum rlimit_type type); bool is_rlimit_overlimit(struct ucounts *ucounts, enum rlimit_type type, unsigned long max); static inline long get_userns_rlimit_max(struct user_namespace *ns, enum rlimit_type type) { return READ_ONCE(ns->rlimit_max[type]); } static inline void set_userns_rlimit_max(struct user_namespace *ns, enum rlimit_type type, unsigned long max) { ns->rlimit_max[type] = max <= LONG_MAX ? max : LONG_MAX; } #ifdef CONFIG_USER_NS static inline struct user_namespace *get_user_ns(struct user_namespace *ns) { if (ns) refcount_inc(&ns->ns.count); return ns; } extern int create_user_ns(struct cred *new); extern int unshare_userns(unsigned long unshare_flags, struct cred **new_cred); extern void __put_user_ns(struct user_namespace *ns); static inline void put_user_ns(struct user_namespace *ns) { if (ns && refcount_dec_and_test(&ns->ns.count)) __put_user_ns(ns); } struct seq_operations; extern const struct seq_operations proc_uid_seq_operations; extern const struct seq_operations proc_gid_seq_operations; extern const struct seq_operations proc_projid_seq_operations; extern ssize_t proc_uid_map_write(struct file *, const char __user *, size_t, loff_t *); extern ssize_t proc_gid_map_write(struct file *, const char __user *, size_t, loff_t *); extern ssize_t proc_projid_map_write(struct file *, const char __user *, size_t, loff_t *); extern ssize_t proc_setgroups_write(struct file *, const char __user *, size_t, loff_t *); extern int proc_setgroups_show(struct seq_file *m, void *v); extern bool userns_may_setgroups(const struct user_namespace *ns); extern bool in_userns(const struct user_namespace *ancestor, const struct user_namespace *child); extern bool current_in_userns(const struct user_namespace *target_ns); struct ns_common *ns_get_owner(struct ns_common *ns); #else static inline struct user_namespace *get_user_ns(struct user_namespace *ns) { return &init_user_ns; } static inline int create_user_ns(struct cred *new) { return -EINVAL; } static inline int unshare_userns(unsigned long unshare_flags, struct cred **new_cred) { if (unshare_flags & CLONE_NEWUSER) return -EINVAL; return 0; } static inline void put_user_ns(struct user_namespace *ns) { } static inline bool userns_may_setgroups(const struct user_namespace *ns) { return true; } static inline bool in_userns(const struct user_namespace *ancestor, const struct user_namespace *child) { return true; } static inline bool current_in_userns(const struct user_namespace *target_ns) { return true; } static inline struct ns_common *ns_get_owner(struct ns_common *ns) { return ERR_PTR(-EPERM); } #endif #endif /* _LINUX_USER_H */
233 238 23 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 /* SPDX-License-Identifier: GPL-2.0 */ #ifndef _MM_PERCPU_INTERNAL_H #define _MM_PERCPU_INTERNAL_H #include <linux/types.h> #include <linux/percpu.h> #include <linux/memcontrol.h> /* * pcpu_block_md is the metadata block struct. * Each chunk's bitmap is split into a number of full blocks. * All units are in terms of bits. * * The scan hint is the largest known contiguous area before the contig hint. * It is not necessarily the actual largest contig hint though. There is an * invariant that the scan_hint_start > contig_hint_start iff * scan_hint == contig_hint. This is necessary because when scanning forward, * we don't know if a new contig hint would be better than the current one. */ struct pcpu_block_md { int scan_hint; /* scan hint for block */ int scan_hint_start; /* block relative starting position of the scan hint */ int contig_hint; /* contig hint for block */ int contig_hint_start; /* block relative starting position of the contig hint */ int left_free; /* size of free space along the left side of the block */ int right_free; /* size of free space along the right side of the block */ int first_free; /* block position of first free */ int nr_bits; /* total bits responsible for */ }; struct pcpuobj_ext { #ifdef CONFIG_MEMCG struct obj_cgroup *cgroup; #endif #ifdef CONFIG_MEM_ALLOC_PROFILING union codetag_ref tag; #endif }; #if defined(CONFIG_MEMCG) || defined(CONFIG_MEM_ALLOC_PROFILING) #define NEED_PCPUOBJ_EXT #endif struct pcpu_chunk { #ifdef CONFIG_PERCPU_STATS int nr_alloc; /* # of allocations */ size_t max_alloc_size; /* largest allocation size */ #endif struct list_head list; /* linked to pcpu_slot lists */ int free_bytes; /* free bytes in the chunk */ struct pcpu_block_md chunk_md; unsigned long *bound_map; /* boundary map */ /* * base_addr is the base address of this chunk. * To reduce false sharing, current layout is optimized to make sure * base_addr locate in the different cacheline with free_bytes and * chunk_md. */ void *base_addr ____cacheline_aligned_in_smp; unsigned long *alloc_map; /* allocation map */ struct pcpu_block_md *md_blocks; /* metadata blocks */ void *data; /* chunk data */ bool immutable; /* no [de]population allowed */ bool isolated; /* isolated from active chunk slots */ int start_offset; /* the overlap with the previous region to have a page aligned base_addr */ int end_offset; /* additional area required to have the region end page aligned */ #ifdef NEED_PCPUOBJ_EXT struct pcpuobj_ext *obj_exts; /* vector of object cgroups */ #endif int nr_pages; /* # of pages served by this chunk */ int nr_populated; /* # of populated pages */ int nr_empty_pop_pages; /* # of empty populated pages */ unsigned long populated[]; /* populated bitmap */ }; static inline bool need_pcpuobj_ext(void) { if (IS_ENABLED(CONFIG_MEM_ALLOC_PROFILING)) return true; if (!mem_cgroup_kmem_disabled()) return true; return false; } extern spinlock_t pcpu_lock; extern struct list_head *pcpu_chunk_lists; extern int pcpu_nr_slots; extern int pcpu_sidelined_slot; extern int pcpu_to_depopulate_slot; extern int pcpu_nr_empty_pop_pages; extern struct pcpu_chunk *pcpu_first_chunk; extern struct pcpu_chunk *pcpu_reserved_chunk; /** * pcpu_chunk_nr_blocks - converts nr_pages to # of md_blocks * @chunk: chunk of interest * * This conversion is from the number of physical pages that the chunk * serves to the number of bitmap blocks used. */ static inline int pcpu_chunk_nr_blocks(struct pcpu_chunk *chunk) { return chunk->nr_pages * PAGE_SIZE / PCPU_BITMAP_BLOCK_SIZE; } /** * pcpu_nr_pages_to_map_bits - converts the pages to size of bitmap * @pages: number of physical pages * * This conversion is from physical pages to the number of bits * required in the bitmap. */ static inline int pcpu_nr_pages_to_map_bits(int pages) { return pages * PAGE_SIZE / PCPU_MIN_ALLOC_SIZE; } /** * pcpu_chunk_map_bits - helper to convert nr_pages to size of bitmap * @chunk: chunk of interest * * This conversion is from the number of physical pages that the chunk * serves to the number of bits in the bitmap. */ static inline int pcpu_chunk_map_bits(struct pcpu_chunk *chunk) { return pcpu_nr_pages_to_map_bits(chunk->nr_pages); } /** * pcpu_obj_full_size - helper to calculate size of each accounted object * @size: size of area to allocate in bytes * * For each accounted object there is an extra space which is used to store * obj_cgroup membership if kmemcg is not disabled. Charge it too. */ static inline size_t pcpu_obj_full_size(size_t size) { size_t extra_size = 0; #ifdef CONFIG_MEMCG if (!mem_cgroup_kmem_disabled()) extra_size += size / PCPU_MIN_ALLOC_SIZE * sizeof(struct obj_cgroup *); #endif return size * num_possible_cpus() + extra_size; } #ifdef CONFIG_PERCPU_STATS #include <linux/spinlock.h> struct percpu_stats { u64 nr_alloc; /* lifetime # of allocations */ u64 nr_dealloc; /* lifetime # of deallocations */ u64 nr_cur_alloc; /* current # of allocations */ u64 nr_max_alloc; /* max # of live allocations */ u32 nr_chunks; /* current # of live chunks */ u32 nr_max_chunks; /* max # of live chunks */ size_t min_alloc_size; /* min allocation size */ size_t max_alloc_size; /* max allocation size */ }; extern struct percpu_stats pcpu_stats; extern struct pcpu_alloc_info pcpu_stats_ai; /* * For debug purposes. We don't care about the flexible array. */ static inline void pcpu_stats_save_ai(const struct pcpu_alloc_info *ai) { memcpy(&pcpu_stats_ai, ai, sizeof(struct pcpu_alloc_info)); /* initialize min_alloc_size to unit_size */ pcpu_stats.min_alloc_size = pcpu_stats_ai.unit_size; } /* * pcpu_stats_area_alloc - increment area allocation stats * @chunk: the location of the area being allocated * @size: size of area to allocate in bytes * * CONTEXT: * pcpu_lock. */ static inline void pcpu_stats_area_alloc(struct pcpu_chunk *chunk, size_t size) { lockdep_assert_held(&pcpu_lock); pcpu_stats.nr_alloc++; pcpu_stats.nr_cur_alloc++; pcpu_stats.nr_max_alloc = max(pcpu_stats.nr_max_alloc, pcpu_stats.nr_cur_alloc); pcpu_stats.min_alloc_size = min(pcpu_stats.min_alloc_size, size); pcpu_stats.max_alloc_size = max(pcpu_stats.max_alloc_size, size); chunk->nr_alloc++; chunk->max_alloc_size = max(chunk->max_alloc_size, size); } /* * pcpu_stats_area_dealloc - decrement allocation stats * @chunk: the location of the area being deallocated * * CONTEXT: * pcpu_lock. */ static inline void pcpu_stats_area_dealloc(struct pcpu_chunk *chunk) { lockdep_assert_held(&pcpu_lock); pcpu_stats.nr_dealloc++; pcpu_stats.nr_cur_alloc--; chunk->nr_alloc--; } /* * pcpu_stats_chunk_alloc - increment chunk stats */ static inline void pcpu_stats_chunk_alloc(void) { unsigned long flags; spin_lock_irqsave(&pcpu_lock, flags); pcpu_stats.nr_chunks++; pcpu_stats.nr_max_chunks = max(pcpu_stats.nr_max_chunks, pcpu_stats.nr_chunks); spin_unlock_irqrestore(&pcpu_lock, flags); } /* * pcpu_stats_chunk_dealloc - decrement chunk stats */ static inline void pcpu_stats_chunk_dealloc(void) { unsigned long flags; spin_lock_irqsave(&pcpu_lock, flags); pcpu_stats.nr_chunks--; spin_unlock_irqrestore(&pcpu_lock, flags); } #else static inline void pcpu_stats_save_ai(const struct pcpu_alloc_info *ai) { } static inline void pcpu_stats_area_alloc(struct pcpu_chunk *chunk, size_t size) { } static inline void pcpu_stats_area_dealloc(struct pcpu_chunk *chunk) { } static inline void pcpu_stats_chunk_alloc(void) { } static inline void pcpu_stats_chunk_dealloc(void) { } #endif /* !CONFIG_PERCPU_STATS */ #endif
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 /* SPDX-License-Identifier: GPL-2.0-or-later */ /* * Definitions for the 'struct ptr_ring' datastructure. * * Author: * Michael S. Tsirkin <mst@redhat.com> * * Copyright (C) 2016 Red Hat, Inc. * * This is a limited-size FIFO maintaining pointers in FIFO order, with * one CPU producing entries and another consuming entries from a FIFO. * * This implementation tries to minimize cache-contention when there is a * single producer and a single consumer CPU. */ #ifndef _LINUX_PTR_RING_H #define _LINUX_PTR_RING_H 1 #ifdef __KERNEL__ #include <linux/spinlock.h> #include <linux/cache.h> #include <linux/types.h> #include <linux/compiler.h> #include <linux/slab.h> #include <linux/mm.h> #include <asm/errno.h> #endif struct ptr_ring { int producer ____cacheline_aligned_in_smp; spinlock_t producer_lock; int consumer_head ____cacheline_aligned_in_smp; /* next valid entry */ int consumer_tail; /* next entry to invalidate */ spinlock_t consumer_lock; /* Shared consumer/producer data */ /* Read-only by both the producer and the consumer */ int size ____cacheline_aligned_in_smp; /* max entries in queue */ int batch; /* number of entries to consume in a batch */ void **queue; }; /* Note: callers invoking this in a loop must use a compiler barrier, * for example cpu_relax(). * * NB: this is unlike __ptr_ring_empty in that callers must hold producer_lock: * see e.g. ptr_ring_full. */ static inline bool __ptr_ring_full(struct ptr_ring *r) { return r->queue[r->producer]; } static inline bool ptr_ring_full(struct ptr_ring *r) { bool ret; spin_lock(&r->producer_lock); ret = __ptr_ring_full(r); spin_unlock(&r->producer_lock); return ret; } static inline bool ptr_ring_full_irq(struct ptr_ring *r) { bool ret; spin_lock_irq(&r->producer_lock); ret = __ptr_ring_full(r); spin_unlock_irq(&r->producer_lock); return ret; } static inline bool ptr_ring_full_any(struct ptr_ring *r) { unsigned long flags; bool ret; spin_lock_irqsave(&r->producer_lock, flags); ret = __ptr_ring_full(r); spin_unlock_irqrestore(&r->producer_lock, flags); return ret; } static inline bool ptr_ring_full_bh(struct ptr_ring *r) { bool ret; spin_lock_bh(&r->producer_lock); ret = __ptr_ring_full(r); spin_unlock_bh(&r->producer_lock); return ret; } /* Note: callers invoking this in a loop must use a compiler barrier, * for example cpu_relax(). Callers must hold producer_lock. * Callers are responsible for making sure pointer that is being queued * points to a valid data. */ static inline int __ptr_ring_produce(struct ptr_ring *r, void *ptr) { if (unlikely(!r->size) || r->queue[r->producer]) return -ENOSPC; /* Make sure the pointer we are storing points to a valid data. */ /* Pairs with the dependency ordering in __ptr_ring_consume. */ smp_wmb(); WRITE_ONCE(r->queue[r->producer++], ptr); if (unlikely(r->producer >= r->size)) r->producer = 0; return 0; } /* * Note: resize (below) nests producer lock within consumer lock, so if you * consume in interrupt or BH context, you must disable interrupts/BH when * calling this. */ static inline int ptr_ring_produce(struct ptr_ring *r, void *ptr) { int ret; spin_lock(&r->producer_lock); ret = __ptr_ring_produce(r, ptr); spin_unlock(&r->producer_lock); return ret; } static inline int ptr_ring_produce_irq(struct ptr_ring *r, void *ptr) { int ret; spin_lock_irq(&r->producer_lock); ret = __ptr_ring_produce(r, ptr); spin_unlock_irq(&r->producer_lock); return ret; } static inline int ptr_ring_produce_any(struct ptr_ring *r, void *ptr) { unsigned long flags; int ret; spin_lock_irqsave(&r->producer_lock, flags); ret = __ptr_ring_produce(r, ptr); spin_unlock_irqrestore(&r->producer_lock, flags); return ret; } static inline int ptr_ring_produce_bh(struct ptr_ring *r, void *ptr) { int ret; spin_lock_bh(&r->producer_lock); ret = __ptr_ring_produce(r, ptr); spin_unlock_bh(&r->producer_lock); return ret; } static inline void *__ptr_ring_peek(struct ptr_ring *r) { if (likely(r->size)) return READ_ONCE(r->queue[r->consumer_head]); return NULL; } /* * Test ring empty status without taking any locks. * * NB: This is only safe to call if ring is never resized. * * However, if some other CPU consumes ring entries at the same time, the value * returned is not guaranteed to be correct. * * In this case - to avoid incorrectly detecting the ring * as empty - the CPU consuming the ring entries is responsible * for either consuming all ring entries until the ring is empty, * or synchronizing with some other CPU and causing it to * re-test __ptr_ring_empty and/or consume the ring enteries * after the synchronization point. * * Note: callers invoking this in a loop must use a compiler barrier, * for example cpu_relax(). */ static inline bool __ptr_ring_empty(struct ptr_ring *r) { if (likely(r->size)) return !r->queue[READ_ONCE(r->consumer_head)]; return true; } static inline bool ptr_ring_empty(struct ptr_ring *r) { bool ret; spin_lock(&r->consumer_lock); ret = __ptr_ring_empty(r); spin_unlock(&r->consumer_lock); return ret; } static inline bool ptr_ring_empty_irq(struct ptr_ring *r) { bool ret; spin_lock_irq(&r->consumer_lock); ret = __ptr_ring_empty(r); spin_unlock_irq(&r->consumer_lock); return ret; } static inline bool ptr_ring_empty_any(struct ptr_ring *r) { unsigned long flags; bool ret; spin_lock_irqsave(&r->consumer_lock, flags); ret = __ptr_ring_empty(r); spin_unlock_irqrestore(&r->consumer_lock, flags); return ret; } static inline bool ptr_ring_empty_bh(struct ptr_ring *r) { bool ret; spin_lock_bh(&r->consumer_lock); ret = __ptr_ring_empty(r); spin_unlock_bh(&r->consumer_lock); return ret; } /* Must only be called after __ptr_ring_peek returned !NULL */ static inline void __ptr_ring_discard_one(struct ptr_ring *r) { /* Fundamentally, what we want to do is update consumer * index and zero out the entry so producer can reuse it. * Doing it naively at each consume would be as simple as: * consumer = r->consumer; * r->queue[consumer++] = NULL; * if (unlikely(consumer >= r->size)) * consumer = 0; * r->consumer = consumer; * but that is suboptimal when the ring is full as producer is writing * out new entries in the same cache line. Defer these updates until a * batch of entries has been consumed. */ /* Note: we must keep consumer_head valid at all times for __ptr_ring_empty * to work correctly. */ int consumer_head = r->consumer_head; int head = consumer_head++; /* Once we have processed enough entries invalidate them in * the ring all at once so producer can reuse their space in the ring. * We also do this when we reach end of the ring - not mandatory * but helps keep the implementation simple. */ if (unlikely(consumer_head - r->consumer_tail >= r->batch || consumer_head >= r->size)) { /* Zero out entries in the reverse order: this way we touch the * cache line that producer might currently be reading the last; * producer won't make progress and touch other cache lines * besides the first one until we write out all entries. */ while (likely(head >= r->consumer_tail)) r->queue[head--] = NULL; r->consumer_tail = consumer_head; } if (unlikely(consumer_head >= r->size)) { consumer_head = 0; r->consumer_tail = 0; } /* matching READ_ONCE in __ptr_ring_empty for lockless tests */ WRITE_ONCE(r->consumer_head, consumer_head); } static inline void *__ptr_ring_consume(struct ptr_ring *r) { void *ptr; /* The READ_ONCE in __ptr_ring_peek guarantees that anyone * accessing data through the pointer is up to date. Pairs * with smp_wmb in __ptr_ring_produce. */ ptr = __ptr_ring_peek(r); if (ptr) __ptr_ring_discard_one(r); return ptr; } static inline int __ptr_ring_consume_batched(struct ptr_ring *r, void **array, int n) { void *ptr; int i; for (i = 0; i < n; i++) { ptr = __ptr_ring_consume(r); if (!ptr) break; array[i] = ptr; } return i; } /* * Note: resize (below) nests producer lock within consumer lock, so if you * call this in interrupt or BH context, you must disable interrupts/BH when * producing. */ static inline void *ptr_ring_consume(struct ptr_ring *r) { void *ptr; spin_lock(&r->consumer_lock); ptr = __ptr_ring_consume(r); spin_unlock(&r->consumer_lock); return ptr; } static inline void *ptr_ring_consume_irq(struct ptr_ring *r) { void *ptr; spin_lock_irq(&r->consumer_lock); ptr = __ptr_ring_consume(r); spin_unlock_irq(&r->consumer_lock); return ptr; } static inline void *ptr_ring_consume_any(struct ptr_ring *r) { unsigned long flags; void *ptr; spin_lock_irqsave(&r->consumer_lock, flags); ptr = __ptr_ring_consume(r); spin_unlock_irqrestore(&r->consumer_lock, flags); return ptr; } static inline void *ptr_ring_consume_bh(struct ptr_ring *r) { void *ptr; spin_lock_bh(&r->consumer_lock); ptr = __ptr_ring_consume(r); spin_unlock_bh(&r->consumer_lock); return ptr; } static inline int ptr_ring_consume_batched(struct ptr_ring *r, void **array, int n) { int ret; spin_lock(&r->consumer_lock); ret = __ptr_ring_consume_batched(r, array, n); spin_unlock(&r->consumer_lock); return ret; } static inline int ptr_ring_consume_batched_irq(struct ptr_ring *r, void **array, int n) { int ret; spin_lock_irq(&r->consumer_lock); ret = __ptr_ring_consume_batched(r, array, n); spin_unlock_irq(&r->consumer_lock); return ret; } static inline int ptr_ring_consume_batched_any(struct ptr_ring *r, void **array, int n) { unsigned long flags; int ret; spin_lock_irqsave(&r->consumer_lock, flags); ret = __ptr_ring_consume_batched(r, array, n); spin_unlock_irqrestore(&r->consumer_lock, flags); return ret; } static inline int ptr_ring_consume_batched_bh(struct ptr_ring *r, void **array, int n) { int ret; spin_lock_bh(&r->consumer_lock); ret = __ptr_ring_consume_batched(r, array, n); spin_unlock_bh(&r->consumer_lock); return ret; } /* Cast to structure type and call a function without discarding from FIFO. * Function must return a value. * Callers must take consumer_lock. */ #define __PTR_RING_PEEK_CALL(r, f) ((f)(__ptr_ring_peek(r))) #define PTR_RING_PEEK_CALL(r, f) ({ \ typeof((f)(NULL)) __PTR_RING_PEEK_CALL_v; \ \ spin_lock(&(r)->consumer_lock); \ __PTR_RING_PEEK_CALL_v = __PTR_RING_PEEK_CALL(r, f); \ spin_unlock(&(r)->consumer_lock); \ __PTR_RING_PEEK_CALL_v; \ }) #define PTR_RING_PEEK_CALL_IRQ(r, f) ({ \ typeof((f)(NULL)) __PTR_RING_PEEK_CALL_v; \ \ spin_lock_irq(&(r)->consumer_lock); \ __PTR_RING_PEEK_CALL_v = __PTR_RING_PEEK_CALL(r, f); \ spin_unlock_irq(&(r)->consumer_lock); \ __PTR_RING_PEEK_CALL_v; \ }) #define PTR_RING_PEEK_CALL_BH(r, f) ({ \ typeof((f)(NULL)) __PTR_RING_PEEK_CALL_v; \ \ spin_lock_bh(&(r)->consumer_lock); \ __PTR_RING_PEEK_CALL_v = __PTR_RING_PEEK_CALL(r, f); \ spin_unlock_bh(&(r)->consumer_lock); \ __PTR_RING_PEEK_CALL_v; \ }) #define PTR_RING_PEEK_CALL_ANY(r, f) ({ \ typeof((f)(NULL)) __PTR_RING_PEEK_CALL_v; \ unsigned long __PTR_RING_PEEK_CALL_f;\ \ spin_lock_irqsave(&(r)->consumer_lock, __PTR_RING_PEEK_CALL_f); \ __PTR_RING_PEEK_CALL_v = __PTR_RING_PEEK_CALL(r, f); \ spin_unlock_irqrestore(&(r)->consumer_lock, __PTR_RING_PEEK_CALL_f); \ __PTR_RING_PEEK_CALL_v; \ }) /* Not all gfp_t flags (besides GFP_KERNEL) are allowed. See * documentation for vmalloc for which of them are legal. */ static inline void **__ptr_ring_init_queue_alloc_noprof(unsigned int size, gfp_t gfp) { if (size > KMALLOC_MAX_SIZE / sizeof(void *)) return NULL; return kvmalloc_array_noprof(size, sizeof(void *), gfp | __GFP_ZERO); } static inline void __ptr_ring_set_size(struct ptr_ring *r, int size) { r->size = size; r->batch = SMP_CACHE_BYTES * 2 / sizeof(*(r->queue)); /* We need to set batch at least to 1 to make logic * in __ptr_ring_discard_one work correctly. * Batching too much (because ring is small) would cause a lot of * burstiness. Needs tuning, for now disable batching. */ if (r->batch > r->size / 2 || !r->batch) r->batch = 1; } static inline int ptr_ring_init_noprof(struct ptr_ring *r, int size, gfp_t gfp) { r->queue = __ptr_ring_init_queue_alloc_noprof(size, gfp); if (!r->queue) return -ENOMEM; __ptr_ring_set_size(r, size); r->producer = r->consumer_head = r->consumer_tail = 0; spin_lock_init(&r->producer_lock); spin_lock_init(&r->consumer_lock); return 0; } #define ptr_ring_init(...) alloc_hooks(ptr_ring_init_noprof(__VA_ARGS__)) /* * Return entries into ring. Destroy entries that don't fit. * * Note: this is expected to be a rare slow path operation. * * Note: producer lock is nested within consumer lock, so if you * resize you must make sure all uses nest correctly. * In particular if you consume ring in interrupt or BH context, you must * disable interrupts/BH when doing so. */ static inline void ptr_ring_unconsume(struct ptr_ring *r, void **batch, int n, void (*destroy)(void *)) { unsigned long flags; int head; spin_lock_irqsave(&r->consumer_lock, flags); spin_lock(&r->producer_lock); if (!r->size) goto done; /* * Clean out buffered entries (for simplicity). This way following code * can test entries for NULL and if not assume they are valid. */ head = r->consumer_head - 1; while (likely(head >= r->consumer_tail)) r->queue[head--] = NULL; r->consumer_tail = r->consumer_head; /* * Go over entries in batch, start moving head back and copy entries. * Stop when we run into previously unconsumed entries. */ while (n) { head = r->consumer_head - 1; if (head < 0) head = r->size - 1; if (r->queue[head]) { /* This batch entry will have to be destroyed. */ goto done; } r->queue[head] = batch[--n]; r->consumer_tail = head; /* matching READ_ONCE in __ptr_ring_empty for lockless tests */ WRITE_ONCE(r->consumer_head, head); } done: /* Destroy all entries left in the batch. */ while (n) destroy(batch[--n]); spin_unlock(&r->producer_lock); spin_unlock_irqrestore(&r->consumer_lock, flags); } static inline void **__ptr_ring_swap_queue(struct ptr_ring *r, void **queue, int size, gfp_t gfp, void (*destroy)(void *)) { int producer = 0; void **old; void *ptr; while ((ptr = __ptr_ring_consume(r))) if (producer < size) queue[producer++] = ptr; else if (destroy) destroy(ptr); if (producer >= size) producer = 0; __ptr_ring_set_size(r, size); r->producer = producer; r->consumer_head = 0; r->consumer_tail = 0; old = r->queue; r->queue = queue; return old; } /* * Note: producer lock is nested within consumer lock, so if you * resize you must make sure all uses nest correctly. * In particular if you consume ring in interrupt or BH context, you must * disable interrupts/BH when doing so. */ static inline int ptr_ring_resize_noprof(struct ptr_ring *r, int size, gfp_t gfp, void (*destroy)(void *)) { unsigned long flags; void **queue = __ptr_ring_init_queue_alloc_noprof(size, gfp); void **old; if (!queue) return -ENOMEM; spin_lock_irqsave(&(r)->consumer_lock, flags); spin_lock(&(r)->producer_lock); old = __ptr_ring_swap_queue(r, queue, size, gfp, destroy); spin_unlock(&(r)->producer_lock); spin_unlock_irqrestore(&(r)->consumer_lock, flags); kvfree(old); return 0; } #define ptr_ring_resize(...) alloc_hooks(ptr_ring_resize_noprof(__VA_ARGS__)) /* * Note: producer lock is nested within consumer lock, so if you * resize you must make sure all uses nest correctly. * In particular if you consume ring in BH context, you must * disable BH when doing so. */ static inline int ptr_ring_resize_multiple_bh_noprof(struct ptr_ring **rings, unsigned int nrings, int size, gfp_t gfp, void (*destroy)(void *)) { void ***queues; int i; queues = kmalloc_array_noprof(nrings, sizeof(*queues), gfp); if (!queues) goto noqueues; for (i = 0; i < nrings; ++i) { queues[i] = __ptr_ring_init_queue_alloc_noprof(size, gfp); if (!queues[i]) goto nomem; } for (i = 0; i < nrings; ++i) { spin_lock_bh(&(rings[i])->consumer_lock); spin_lock(&(rings[i])->producer_lock); queues[i] = __ptr_ring_swap_queue(rings[i], queues[i], size, gfp, destroy); spin_unlock(&(rings[i])->producer_lock); spin_unlock_bh(&(rings[i])->consumer_lock); } for (i = 0; i < nrings; ++i) kvfree(queues[i]); kfree(queues); return 0; nomem: while (--i >= 0) kvfree(queues[i]); kfree(queues); noqueues: return -ENOMEM; } #define ptr_ring_resize_multiple_bh(...) \ alloc_hooks(ptr_ring_resize_multiple_bh_noprof(__VA_ARGS__)) static inline void ptr_ring_cleanup(struct ptr_ring *r, void (*destroy)(void *)) { void *ptr; if (destroy) while ((ptr = ptr_ring_consume(r))) destroy(ptr); kvfree(r->queue); } #endif /* _LINUX_PTR_RING_H */
3 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 // SPDX-License-Identifier: GPL-2.0 /* Copyright 2011-2014 Autronica Fire and Security AS * * Author(s): * 2011-2014 Arvid Brodin, arvid.brodin@alten.se * * Frame handler other utility functions for HSR and PRP. */ #include "hsr_slave.h" #include <linux/etherdevice.h> #include <linux/if_arp.h> #include <linux/if_vlan.h> #include "hsr_main.h" #include "hsr_device.h" #include "hsr_forward.h" #include "hsr_framereg.h" bool hsr_invalid_dan_ingress_frame(__be16 protocol) { return (protocol != htons(ETH_P_PRP) && protocol != htons(ETH_P_HSR)); } static rx_handler_result_t hsr_handle_frame(struct sk_buff **pskb) { struct sk_buff *skb = *pskb; struct hsr_port *port; struct hsr_priv *hsr; __be16 protocol; /* Packets from dev_loopback_xmit() do not have L2 header, bail out */ if (unlikely(skb->pkt_type == PACKET_LOOPBACK)) return RX_HANDLER_PASS; if (!skb_mac_header_was_set(skb)) { WARN_ONCE(1, "%s: skb invalid", __func__); return RX_HANDLER_PASS; } port = hsr_port_get_rcu(skb->dev); if (!port) goto finish_pass; hsr = port->hsr; if (hsr_addr_is_self(port->hsr, eth_hdr(skb)->h_source)) { /* Directly kill frames sent by ourselves */ kfree_skb(skb); goto finish_consume; } /* For HSR, only tagged frames are expected (unless the device offloads * HSR tag removal), but for PRP there could be non tagged frames as * well from Single attached nodes (SANs). */ protocol = eth_hdr(skb)->h_proto; if (!(port->dev->features & NETIF_F_HW_HSR_TAG_RM) && port->type != HSR_PT_INTERLINK && hsr->proto_ops->invalid_dan_ingress_frame && hsr->proto_ops->invalid_dan_ingress_frame(protocol)) goto finish_pass; skb_push(skb, ETH_HLEN); skb_reset_mac_header(skb); if ((!hsr->prot_version && protocol == htons(ETH_P_PRP)) || protocol == htons(ETH_P_HSR)) { if (!pskb_may_pull(skb, ETH_HLEN + HSR_HLEN)) { kfree_skb(skb); goto finish_consume; } skb_set_network_header(skb, ETH_HLEN + HSR_HLEN); } skb_reset_mac_len(skb); /* Only the frames received over the interlink port will assign a * sequence number and require synchronisation vs other sender. */ if (port->type == HSR_PT_INTERLINK) { spin_lock_bh(&hsr->seqnr_lock); hsr_forward_skb(skb, port); spin_unlock_bh(&hsr->seqnr_lock); } else { hsr_forward_skb(skb, port); } finish_consume: return RX_HANDLER_CONSUMED; finish_pass: return RX_HANDLER_PASS; } bool hsr_port_exists(const struct net_device *dev) { return rcu_access_pointer(dev->rx_handler) == hsr_handle_frame; } static int hsr_check_dev_ok(struct net_device *dev, struct netlink_ext_ack *extack) { /* Don't allow HSR on non-ethernet like devices */ if ((dev->flags & IFF_LOOPBACK) || dev->type != ARPHRD_ETHER || dev->addr_len != ETH_ALEN) { NL_SET_ERR_MSG_MOD(extack, "Cannot use loopback or non-ethernet device as HSR slave."); return -EINVAL; } /* Don't allow enslaving hsr devices */ if (is_hsr_master(dev)) { NL_SET_ERR_MSG_MOD(extack, "Cannot create trees of HSR devices."); return -EINVAL; } if (hsr_port_exists(dev)) { NL_SET_ERR_MSG_MOD(extack, "This device is already a HSR slave."); return -EINVAL; } if (is_vlan_dev(dev)) { NL_SET_ERR_MSG_MOD(extack, "HSR on top of VLAN is not yet supported in this driver."); return -EINVAL; } if (dev->priv_flags & IFF_DONT_BRIDGE) { NL_SET_ERR_MSG_MOD(extack, "This device does not support bridging."); return -EOPNOTSUPP; } /* HSR over bonded devices has not been tested, but I'm not sure it * won't work... */ return 0; } /* Setup device to be added to the HSR bridge. */ static int hsr_portdev_setup(struct hsr_priv *hsr, struct net_device *dev, struct hsr_port *port, struct netlink_ext_ack *extack) { struct net_device *hsr_dev; struct hsr_port *master; int res; /* Don't use promiscuous mode for offload since L2 frame forward * happens at the offloaded hardware. */ if (!port->hsr->fwd_offloaded) { res = dev_set_promiscuity(dev, 1); if (res) return res; } master = hsr_port_get_hsr(hsr, HSR_PT_MASTER); hsr_dev = master->dev; res = netdev_upper_dev_link(dev, hsr_dev, extack); if (res) goto fail_upper_dev_link; res = netdev_rx_handler_register(dev, hsr_handle_frame, port); if (res) goto fail_rx_handler; dev_disable_lro(dev); return 0; fail_rx_handler: netdev_upper_dev_unlink(dev, hsr_dev); fail_upper_dev_link: if (!port->hsr->fwd_offloaded) dev_set_promiscuity(dev, -1); return res; } int hsr_add_port(struct hsr_priv *hsr, struct net_device *dev, enum hsr_port_type type, struct netlink_ext_ack *extack) { struct hsr_port *port, *master; int res; if (type != HSR_PT_MASTER) { res = hsr_check_dev_ok(dev, extack); if (res) return res; } port = hsr_port_get_hsr(hsr, type); if (port) return -EBUSY; /* This port already exists */ port = kzalloc(sizeof(*port), GFP_KERNEL); if (!port) return -ENOMEM; port->hsr = hsr; port->dev = dev; port->type = type; ether_addr_copy(port->original_macaddress, dev->dev_addr); if (type != HSR_PT_MASTER) { res = hsr_portdev_setup(hsr, dev, port, extack); if (res) goto fail_dev_setup; } list_add_tail_rcu(&port->port_list, &hsr->ports); master = hsr_port_get_hsr(hsr, HSR_PT_MASTER); netdev_update_features(master->dev); dev_set_mtu(master->dev, hsr_get_max_mtu(hsr)); return 0; fail_dev_setup: kfree(port); return res; } void hsr_del_port(struct hsr_port *port) { struct hsr_priv *hsr; struct hsr_port *master; hsr = port->hsr; master = hsr_port_get_hsr(hsr, HSR_PT_MASTER); list_del_rcu(&port->port_list); if (port != master) { netdev_update_features(master->dev); dev_set_mtu(master->dev, hsr_get_max_mtu(hsr)); netdev_rx_handler_unregister(port->dev); if (!port->hsr->fwd_offloaded) dev_set_promiscuity(port->dev, -1); netdev_upper_dev_unlink(port->dev, master->dev); eth_hw_addr_set(port->dev, port->original_macaddress); } kfree_rcu(port, rcu); }
46 49 10 11 257 257 256 40 262 257 12 10 1 1 10 1 9 1 9 9 9 8 2 1 1 3 50 43 5 1 1 39 1 39 1 1 1 1 1 1 1 47 47 47 47 1 42 1 42 42 1 42 1 1 1 1 62 24 24 5 20 4 36 36 1 1 1 38 36 2 1 1 1 1 41 41 36 5 2 2 1 1 3 2 1 1 2 2 2 2 7 9 2 1 1 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 // SPDX-License-Identifier: GPL-2.0-only /* * VGICv3 MMIO handling functions */ #include <linux/bitfield.h> #include <linux/irqchip/arm-gic-v3.h> #include <linux/kvm.h> #include <linux/kvm_host.h> #include <linux/interrupt.h> #include <kvm/iodev.h> #include <kvm/arm_vgic.h> #include <asm/kvm_emulate.h> #include <asm/kvm_arm.h> #include <asm/kvm_mmu.h> #include "vgic.h" #include "vgic-mmio.h" /* extract @num bytes at @offset bytes offset in data */ unsigned long extract_bytes(u64 data, unsigned int offset, unsigned int num) { return (data >> (offset * 8)) & GENMASK_ULL(num * 8 - 1, 0); } /* allows updates of any half of a 64-bit register (or the whole thing) */ u64 update_64bit_reg(u64 reg, unsigned int offset, unsigned int len, unsigned long val) { int lower = (offset & 4) * 8; int upper = lower + 8 * len - 1; reg &= ~GENMASK_ULL(upper, lower); val &= GENMASK_ULL(len * 8 - 1, 0); return reg | ((u64)val << lower); } bool vgic_has_its(struct kvm *kvm) { struct vgic_dist *dist = &kvm->arch.vgic; if (dist->vgic_model != KVM_DEV_TYPE_ARM_VGIC_V3) return false; return dist->has_its; } bool vgic_supports_direct_msis(struct kvm *kvm) { /* * Deliberately conflate vLPI and vSGI support on GICv4.1 hardware, * indirectly allowing userspace to control whether or not vPEs are * allocated for the VM. */ if (system_supports_direct_sgis() && !vgic_supports_direct_sgis(kvm)) return false; return kvm_vgic_global_state.has_gicv4 && vgic_has_its(kvm); } bool system_supports_direct_sgis(void) { return kvm_vgic_global_state.has_gicv4_1 && gic_cpuif_has_vsgi(); } bool vgic_supports_direct_sgis(struct kvm *kvm) { return kvm->arch.vgic.nassgicap; } /* * The Revision field in the IIDR have the following meanings: * * Revision 2: Interrupt groups are guest-configurable and signaled using * their configured groups. */ static unsigned long vgic_mmio_read_v3_misc(struct kvm_vcpu *vcpu, gpa_t addr, unsigned int len) { struct vgic_dist *vgic = &vcpu->kvm->arch.vgic; u32 value = 0; switch (addr & 0x0c) { case GICD_CTLR: if (vgic->enabled) value |= GICD_CTLR_ENABLE_SS_G1; value |= GICD_CTLR_ARE_NS | GICD_CTLR_DS; if (vgic->nassgireq) value |= GICD_CTLR_nASSGIreq; break; case GICD_TYPER: value = vgic->nr_spis + VGIC_NR_PRIVATE_IRQS; value = (value >> 5) - 1; if (vgic_has_its(vcpu->kvm)) { value |= (INTERRUPT_ID_BITS_ITS - 1) << 19; value |= GICD_TYPER_LPIS; } else { value |= (INTERRUPT_ID_BITS_SPIS - 1) << 19; } break; case GICD_TYPER2: if (vgic_supports_direct_sgis(vcpu->kvm)) value = GICD_TYPER2_nASSGIcap; break; case GICD_IIDR: value = (PRODUCT_ID_KVM << GICD_IIDR_PRODUCT_ID_SHIFT) | (vgic->implementation_rev << GICD_IIDR_REVISION_SHIFT) | (IMPLEMENTER_ARM << GICD_IIDR_IMPLEMENTER_SHIFT); break; default: return 0; } return value; } static void vgic_mmio_write_v3_misc(struct kvm_vcpu *vcpu, gpa_t addr, unsigned int len, unsigned long val) { struct vgic_dist *dist = &vcpu->kvm->arch.vgic; switch (addr & 0x0c) { case GICD_CTLR: { bool was_enabled, is_hwsgi; mutex_lock(&vcpu->kvm->arch.config_lock); was_enabled = dist->enabled; is_hwsgi = dist->nassgireq; dist->enabled = val & GICD_CTLR_ENABLE_SS_G1; /* Not a GICv4.1? No HW SGIs */ if (!vgic_supports_direct_sgis(vcpu->kvm)) val &= ~GICD_CTLR_nASSGIreq; /* Dist stays enabled? nASSGIreq is RO */ if (was_enabled && dist->enabled) { val &= ~GICD_CTLR_nASSGIreq; val |= FIELD_PREP(GICD_CTLR_nASSGIreq, is_hwsgi); } /* Switching HW SGIs? */ dist->nassgireq = val & GICD_CTLR_nASSGIreq; if (is_hwsgi != dist->nassgireq) vgic_v4_configure_vsgis(vcpu->kvm); if (vgic_supports_direct_sgis(vcpu->kvm) && was_enabled != dist->enabled) kvm_make_all_cpus_request(vcpu->kvm, KVM_REQ_RELOAD_GICv4); else if (!was_enabled && dist->enabled) vgic_kick_vcpus(vcpu->kvm); mutex_unlock(&vcpu->kvm->arch.config_lock); break; } case GICD_TYPER: case GICD_TYPER2: case GICD_IIDR: /* This is at best for documentation purposes... */ return; } } static int vgic_mmio_uaccess_write_v3_misc(struct kvm_vcpu *vcpu, gpa_t addr, unsigned int len, unsigned long val) { struct vgic_dist *dist = &vcpu->kvm->arch.vgic; u32 reg; switch (addr & 0x0c) { case GICD_TYPER2: reg = vgic_mmio_read_v3_misc(vcpu, addr, len); if (reg == val) return 0; if (vgic_initialized(vcpu->kvm)) return -EBUSY; if ((reg ^ val) & ~GICD_TYPER2_nASSGIcap) return -EINVAL; if (!system_supports_direct_sgis() && val) return -EINVAL; dist->nassgicap = val & GICD_TYPER2_nASSGIcap; return 0; case GICD_IIDR: reg = vgic_mmio_read_v3_misc(vcpu, addr, len); if ((reg ^ val) & ~GICD_IIDR_REVISION_MASK) return -EINVAL; reg = FIELD_GET(GICD_IIDR_REVISION_MASK, reg); switch (reg) { case KVM_VGIC_IMP_REV_2: case KVM_VGIC_IMP_REV_3: dist->implementation_rev = reg; return 0; default: return -EINVAL; } case GICD_CTLR: /* Not a GICv4.1? No HW SGIs */ if (!vgic_supports_direct_sgis(vcpu->kvm)) val &= ~GICD_CTLR_nASSGIreq; dist->enabled = val & GICD_CTLR_ENABLE_SS_G1; dist->nassgireq = val & GICD_CTLR_nASSGIreq; return 0; } vgic_mmio_write_v3_misc(vcpu, addr, len, val); return 0; } static unsigned long vgic_mmio_read_irouter(struct kvm_vcpu *vcpu, gpa_t addr, unsigned int len) { int intid = VGIC_ADDR_TO_INTID(addr, 64); struct vgic_irq *irq = vgic_get_irq(vcpu->kvm, intid); unsigned long ret = 0; if (!irq) return 0; /* The upper word is RAZ for us. */ if (!(addr & 4)) ret = extract_bytes(READ_ONCE(irq->mpidr), addr & 7, len); vgic_put_irq(vcpu->kvm, irq); return ret; } static void vgic_mmio_write_irouter(struct kvm_vcpu *vcpu, gpa_t addr, unsigned int len, unsigned long val) { int intid = VGIC_ADDR_TO_INTID(addr, 64); struct vgic_irq *irq; unsigned long flags; /* The upper word is WI for us since we don't implement Aff3. */ if (addr & 4) return; irq = vgic_get_irq(vcpu->kvm, intid); if (!irq) return; raw_spin_lock_irqsave(&irq->irq_lock, flags); /* We only care about and preserve Aff0, Aff1 and Aff2. */ irq->mpidr = val & GENMASK(23, 0); irq->target_vcpu = kvm_mpidr_to_vcpu(vcpu->kvm, irq->mpidr); raw_spin_unlock_irqrestore(&irq->irq_lock, flags); vgic_put_irq(vcpu->kvm, irq); } bool vgic_lpis_enabled(struct kvm_vcpu *vcpu) { struct vgic_cpu *vgic_cpu = &vcpu->arch.vgic_cpu; return atomic_read(&vgic_cpu->ctlr) == GICR_CTLR_ENABLE_LPIS; } static unsigned long vgic_mmio_read_v3r_ctlr(struct kvm_vcpu *vcpu, gpa_t addr, unsigned int len) { struct vgic_cpu *vgic_cpu = &vcpu->arch.vgic_cpu; unsigned long val; val = atomic_read(&vgic_cpu->ctlr); if (vgic_get_implementation_rev(vcpu) >= KVM_VGIC_IMP_REV_3) val |= GICR_CTLR_IR | GICR_CTLR_CES; return val; } static void vgic_mmio_write_v3r_ctlr(struct kvm_vcpu *vcpu, gpa_t addr, unsigned int len, unsigned long val) { struct vgic_cpu *vgic_cpu = &vcpu->arch.vgic_cpu; u32 ctlr; if (!vgic_has_its(vcpu->kvm)) return; if (!(val & GICR_CTLR_ENABLE_LPIS)) { /* * Don't disable if RWP is set, as there already an * ongoing disable. Funky guest... */ ctlr = atomic_cmpxchg_acquire(&vgic_cpu->ctlr, GICR_CTLR_ENABLE_LPIS, GICR_CTLR_RWP); if (ctlr != GICR_CTLR_ENABLE_LPIS) return; vgic_flush_pending_lpis(vcpu); vgic_its_invalidate_all_caches(vcpu->kvm); atomic_set_release(&vgic_cpu->ctlr, 0); } else { ctlr = atomic_cmpxchg_acquire(&vgic_cpu->ctlr, 0, GICR_CTLR_ENABLE_LPIS); if (ctlr != 0) return; vgic_enable_lpis(vcpu); } } static bool vgic_mmio_vcpu_rdist_is_last(struct kvm_vcpu *vcpu) { struct vgic_dist *vgic = &vcpu->kvm->arch.vgic; struct vgic_cpu *vgic_cpu = &vcpu->arch.vgic_cpu; struct vgic_redist_region *iter, *rdreg = vgic_cpu->rdreg; if (!rdreg) return false; if (vgic_cpu->rdreg_index < rdreg->free_index - 1) { return false; } else if (rdreg->count && vgic_cpu->rdreg_index == (rdreg->count - 1)) { struct list_head *rd_regions = &vgic->rd_regions; gpa_t end = rdreg->base + rdreg->count * KVM_VGIC_V3_REDIST_SIZE; /* * the rdist is the last one of the redist region, * check whether there is no other contiguous rdist region */ list_for_each_entry(iter, rd_regions, list) { if (iter->base == end && iter->free_index > 0) return false; } } return true; } static unsigned long vgic_mmio_read_v3r_typer(struct kvm_vcpu *vcpu, gpa_t addr, unsigned int len) { unsigned long mpidr = kvm_vcpu_get_mpidr_aff(vcpu); int target_vcpu_id = vcpu->vcpu_id; u64 value; value = (u64)(mpidr & GENMASK(23, 0)) << 32; value |= ((target_vcpu_id & 0xffff) << 8); if (vgic_has_its(vcpu->kvm)) value |= GICR_TYPER_PLPIS; if (vgic_mmio_vcpu_rdist_is_last(vcpu)) value |= GICR_TYPER_LAST; return extract_bytes(value, addr & 7, len); } static unsigned long vgic_mmio_read_v3r_iidr(struct kvm_vcpu *vcpu, gpa_t addr, unsigned int len) { return (PRODUCT_ID_KVM << 24) | (IMPLEMENTER_ARM << 0); } static unsigned long vgic_mmio_read_v3_idregs(struct kvm_vcpu *vcpu, gpa_t addr, unsigned int len) { switch (addr & 0xffff) { case GICD_PIDR2: /* report a GICv3 compliant implementation */ return 0x3b; } return 0; } static int vgic_v3_uaccess_write_pending(struct kvm_vcpu *vcpu, gpa_t addr, unsigned int len, unsigned long val) { int ret; ret = vgic_uaccess_write_spending(vcpu, addr, len, val); if (ret) return ret; return vgic_uaccess_write_cpending(vcpu, addr, len, ~val); } /* We want to avoid outer shareable. */ u64 vgic_sanitise_shareability(u64 field) { switch (field) { case GIC_BASER_OuterShareable: return GIC_BASER_InnerShareable; default: return field; } } /* Avoid any inner non-cacheable mapping. */ u64 vgic_sanitise_inner_cacheability(u64 field) { switch (field) { case GIC_BASER_CACHE_nCnB: case GIC_BASER_CACHE_nC: return GIC_BASER_CACHE_RaWb; default: return field; } } /* Non-cacheable or same-as-inner are OK. */ u64 vgic_sanitise_outer_cacheability(u64 field) { switch (field) { case GIC_BASER_CACHE_SameAsInner: case GIC_BASER_CACHE_nC: return field; default: return GIC_BASER_CACHE_SameAsInner; } } u64 vgic_sanitise_field(u64 reg, u64 field_mask, int field_shift, u64 (*sanitise_fn)(u64)) { u64 field = (reg & field_mask) >> field_shift; field = sanitise_fn(field) << field_shift; return (reg & ~field_mask) | field; } #define PROPBASER_RES0_MASK \ (GENMASK_ULL(63, 59) | GENMASK_ULL(55, 52) | GENMASK_ULL(6, 5)) #define PENDBASER_RES0_MASK \ (BIT_ULL(63) | GENMASK_ULL(61, 59) | GENMASK_ULL(55, 52) | \ GENMASK_ULL(15, 12) | GENMASK_ULL(6, 0)) static u64 vgic_sanitise_pendbaser(u64 reg) { reg = vgic_sanitise_field(reg, GICR_PENDBASER_SHAREABILITY_MASK, GICR_PENDBASER_SHAREABILITY_SHIFT, vgic_sanitise_shareability); reg = vgic_sanitise_field(reg, GICR_PENDBASER_INNER_CACHEABILITY_MASK, GICR_PENDBASER_INNER_CACHEABILITY_SHIFT, vgic_sanitise_inner_cacheability); reg = vgic_sanitise_field(reg, GICR_PENDBASER_OUTER_CACHEABILITY_MASK, GICR_PENDBASER_OUTER_CACHEABILITY_SHIFT, vgic_sanitise_outer_cacheability); reg &= ~PENDBASER_RES0_MASK; return reg; } static u64 vgic_sanitise_propbaser(u64 reg) { reg = vgic_sanitise_field(reg, GICR_PROPBASER_SHAREABILITY_MASK, GICR_PROPBASER_SHAREABILITY_SHIFT, vgic_sanitise_shareability); reg = vgic_sanitise_field(reg, GICR_PROPBASER_INNER_CACHEABILITY_MASK, GICR_PROPBASER_INNER_CACHEABILITY_SHIFT, vgic_sanitise_inner_cacheability); reg = vgic_sanitise_field(reg, GICR_PROPBASER_OUTER_CACHEABILITY_MASK, GICR_PROPBASER_OUTER_CACHEABILITY_SHIFT, vgic_sanitise_outer_cacheability); reg &= ~PROPBASER_RES0_MASK; return reg; } static unsigned long vgic_mmio_read_propbase(struct kvm_vcpu *vcpu, gpa_t addr, unsigned int len) { struct vgic_dist *dist = &vcpu->kvm->arch.vgic; return extract_bytes(dist->propbaser, addr & 7, len); } static void vgic_mmio_write_propbase(struct kvm_vcpu *vcpu, gpa_t addr, unsigned int len, unsigned long val) { struct vgic_dist *dist = &vcpu->kvm->arch.vgic; u64 old_propbaser, propbaser; /* Storing a value with LPIs already enabled is undefined */ if (vgic_lpis_enabled(vcpu)) return; do { old_propbaser = READ_ONCE(dist->propbaser); propbaser = old_propbaser; propbaser = update_64bit_reg(propbaser, addr & 4, len, val); propbaser = vgic_sanitise_propbaser(propbaser); } while (cmpxchg64(&dist->propbaser, old_propbaser, propbaser) != old_propbaser); } static unsigned long vgic_mmio_read_pendbase(struct kvm_vcpu *vcpu, gpa_t addr, unsigned int len) { struct vgic_cpu *vgic_cpu = &vcpu->arch.vgic_cpu; u64 value = vgic_cpu->pendbaser; value &= ~GICR_PENDBASER_PTZ; return extract_bytes(value, addr & 7, len); } static void vgic_mmio_write_pendbase(struct kvm_vcpu *vcpu, gpa_t addr, unsigned int len, unsigned long val) { struct vgic_cpu *vgic_cpu = &vcpu->arch.vgic_cpu; u64 old_pendbaser, pendbaser; /* Storing a value with LPIs already enabled is undefined */ if (vgic_lpis_enabled(vcpu)) return; do { old_pendbaser = READ_ONCE(vgic_cpu->pendbaser); pendbaser = old_pendbaser; pendbaser = update_64bit_reg(pendbaser, addr & 4, len, val); pendbaser = vgic_sanitise_pendbaser(pendbaser); } while (cmpxchg64(&vgic_cpu->pendbaser, old_pendbaser, pendbaser) != old_pendbaser); } static unsigned long vgic_mmio_read_sync(struct kvm_vcpu *vcpu, gpa_t addr, unsigned int len) { return !!atomic_read(&vcpu->arch.vgic_cpu.syncr_busy); } static void vgic_set_rdist_busy(struct kvm_vcpu *vcpu, bool busy) { if (busy) { atomic_inc(&vcpu->arch.vgic_cpu.syncr_busy); smp_mb__after_atomic(); } else { smp_mb__before_atomic(); atomic_dec(&vcpu->arch.vgic_cpu.syncr_busy); } } static void vgic_mmio_write_invlpi(struct kvm_vcpu *vcpu, gpa_t addr, unsigned int len, unsigned long val) { struct vgic_irq *irq; u32 intid; /* * If the guest wrote only to the upper 32bit part of the * register, drop the write on the floor, as it is only for * vPEs (which we don't support for obvious reasons). * * Also discard the access if LPIs are not enabled. */ if ((addr & 4) || !vgic_lpis_enabled(vcpu)) return; intid = lower_32_bits(val); if (intid < VGIC_MIN_LPI) return; vgic_set_rdist_busy(vcpu, true); irq = vgic_get_irq(vcpu->kvm, intid); if (irq) { vgic_its_inv_lpi(vcpu->kvm, irq); vgic_put_irq(vcpu->kvm, irq); } vgic_set_rdist_busy(vcpu, false); } static void vgic_mmio_write_invall(struct kvm_vcpu *vcpu, gpa_t addr, unsigned int len, unsigned long val) { /* See vgic_mmio_write_invlpi() for the early return rationale */ if ((addr & 4) || !vgic_lpis_enabled(vcpu)) return; vgic_set_rdist_busy(vcpu, true); vgic_its_invall(vcpu); vgic_set_rdist_busy(vcpu, false); } /* * The GICv3 per-IRQ registers are split to control PPIs and SGIs in the * redistributors, while SPIs are covered by registers in the distributor * block. Trying to set private IRQs in this block gets ignored. * We take some special care here to fix the calculation of the register * offset. */ #define REGISTER_DESC_WITH_BITS_PER_IRQ_SHARED(off, rd, wr, ur, uw, bpi, acc) \ { \ .reg_offset = off, \ .bits_per_irq = bpi, \ .len = (bpi * VGIC_NR_PRIVATE_IRQS) / 8, \ .access_flags = acc, \ .read = vgic_mmio_read_raz, \ .write = vgic_mmio_write_wi, \ }, { \ .reg_offset = off + (bpi * VGIC_NR_PRIVATE_IRQS) / 8, \ .bits_per_irq = bpi, \ .len = (bpi * (1024 - VGIC_NR_PRIVATE_IRQS)) / 8, \ .access_flags = acc, \ .read = rd, \ .write = wr, \ .uaccess_read = ur, \ .uaccess_write = uw, \ } static const struct vgic_register_region vgic_v3_dist_registers[] = { REGISTER_DESC_WITH_LENGTH_UACCESS(GICD_CTLR, vgic_mmio_read_v3_misc, vgic_mmio_write_v3_misc, NULL, vgic_mmio_uaccess_write_v3_misc, 16, VGIC_ACCESS_32bit), REGISTER_DESC_WITH_LENGTH(GICD_STATUSR, vgic_mmio_read_rao, vgic_mmio_write_wi, 4, VGIC_ACCESS_32bit), REGISTER_DESC_WITH_BITS_PER_IRQ_SHARED(GICD_IGROUPR, vgic_mmio_read_group, vgic_mmio_write_group, NULL, NULL, 1, VGIC_ACCESS_32bit), REGISTER_DESC_WITH_BITS_PER_IRQ_SHARED(GICD_ISENABLER, vgic_mmio_read_enable, vgic_mmio_write_senable, NULL, vgic_uaccess_write_senable, 1, VGIC_ACCESS_32bit), REGISTER_DESC_WITH_BITS_PER_IRQ_SHARED(GICD_ICENABLER, vgic_mmio_read_enable, vgic_mmio_write_cenable, NULL, vgic_uaccess_write_cenable, 1, VGIC_ACCESS_32bit), REGISTER_DESC_WITH_BITS_PER_IRQ_SHARED(GICD_ISPENDR, vgic_mmio_read_pending, vgic_mmio_write_spending, vgic_uaccess_read_pending, vgic_v3_uaccess_write_pending, 1, VGIC_ACCESS_32bit), REGISTER_DESC_WITH_BITS_PER_IRQ_SHARED(GICD_ICPENDR, vgic_mmio_read_pending, vgic_mmio_write_cpending, vgic_mmio_read_raz, vgic_mmio_uaccess_write_wi, 1, VGIC_ACCESS_32bit), REGISTER_DESC_WITH_BITS_PER_IRQ_SHARED(GICD_ISACTIVER, vgic_mmio_read_active, vgic_mmio_write_sactive, vgic_uaccess_read_active, vgic_mmio_uaccess_write_sactive, 1, VGIC_ACCESS_32bit), REGISTER_DESC_WITH_BITS_PER_IRQ_SHARED(GICD_ICACTIVER, vgic_mmio_read_active, vgic_mmio_write_cactive, vgic_uaccess_read_active, vgic_mmio_uaccess_write_cactive, 1, VGIC_ACCESS_32bit), REGISTER_DESC_WITH_BITS_PER_IRQ_SHARED(GICD_IPRIORITYR, vgic_mmio_read_priority, vgic_mmio_write_priority, NULL, NULL, 8, VGIC_ACCESS_32bit | VGIC_ACCESS_8bit), REGISTER_DESC_WITH_BITS_PER_IRQ_SHARED(GICD_ITARGETSR, vgic_mmio_read_raz, vgic_mmio_write_wi, NULL, NULL, 8, VGIC_ACCESS_32bit | VGIC_ACCESS_8bit), REGISTER_DESC_WITH_BITS_PER_IRQ_SHARED(GICD_ICFGR, vgic_mmio_read_config, vgic_mmio_write_config, NULL, NULL, 2, VGIC_ACCESS_32bit), REGISTER_DESC_WITH_BITS_PER_IRQ_SHARED(GICD_IGRPMODR, vgic_mmio_read_raz, vgic_mmio_write_wi, NULL, NULL, 1, VGIC_ACCESS_32bit), REGISTER_DESC_WITH_BITS_PER_IRQ_SHARED(GICD_IROUTER, vgic_mmio_read_irouter, vgic_mmio_write_irouter, NULL, NULL, 64, VGIC_ACCESS_64bit | VGIC_ACCESS_32bit), REGISTER_DESC_WITH_LENGTH(GICD_IDREGS, vgic_mmio_read_v3_idregs, vgic_mmio_write_wi, 48, VGIC_ACCESS_32bit), }; static const struct vgic_register_region vgic_v3_rd_registers[] = { /* RD_base registers */ REGISTER_DESC_WITH_LENGTH(GICR_CTLR, vgic_mmio_read_v3r_ctlr, vgic_mmio_write_v3r_ctlr, 4, VGIC_ACCESS_32bit), REGISTER_DESC_WITH_LENGTH(GICR_STATUSR, vgic_mmio_read_raz, vgic_mmio_write_wi, 4, VGIC_ACCESS_32bit), REGISTER_DESC_WITH_LENGTH(GICR_IIDR, vgic_mmio_read_v3r_iidr, vgic_mmio_write_wi, 4, VGIC_ACCESS_32bit), REGISTER_DESC_WITH_LENGTH_UACCESS(GICR_TYPER, vgic_mmio_read_v3r_typer, vgic_mmio_write_wi, NULL, vgic_mmio_uaccess_write_wi, 8, VGIC_ACCESS_64bit | VGIC_ACCESS_32bit), REGISTER_DESC_WITH_LENGTH(GICR_WAKER, vgic_mmio_read_raz, vgic_mmio_write_wi, 4, VGIC_ACCESS_32bit), REGISTER_DESC_WITH_LENGTH(GICR_PROPBASER, vgic_mmio_read_propbase, vgic_mmio_write_propbase, 8, VGIC_ACCESS_64bit | VGIC_ACCESS_32bit), REGISTER_DESC_WITH_LENGTH(GICR_PENDBASER, vgic_mmio_read_pendbase, vgic_mmio_write_pendbase, 8, VGIC_ACCESS_64bit | VGIC_ACCESS_32bit), REGISTER_DESC_WITH_LENGTH(GICR_INVLPIR, vgic_mmio_read_raz, vgic_mmio_write_invlpi, 8, VGIC_ACCESS_64bit | VGIC_ACCESS_32bit), REGISTER_DESC_WITH_LENGTH(GICR_INVALLR, vgic_mmio_read_raz, vgic_mmio_write_invall, 8, VGIC_ACCESS_64bit | VGIC_ACCESS_32bit), REGISTER_DESC_WITH_LENGTH(GICR_SYNCR, vgic_mmio_read_sync, vgic_mmio_write_wi, 4, VGIC_ACCESS_32bit), REGISTER_DESC_WITH_LENGTH(GICR_IDREGS, vgic_mmio_read_v3_idregs, vgic_mmio_write_wi, 48, VGIC_ACCESS_32bit), /* SGI_base registers */ REGISTER_DESC_WITH_LENGTH(SZ_64K + GICR_IGROUPR0, vgic_mmio_read_group, vgic_mmio_write_group, 4, VGIC_ACCESS_32bit), REGISTER_DESC_WITH_LENGTH_UACCESS(SZ_64K + GICR_ISENABLER0, vgic_mmio_read_enable, vgic_mmio_write_senable, NULL, vgic_uaccess_write_senable, 4, VGIC_ACCESS_32bit), REGISTER_DESC_WITH_LENGTH_UACCESS(SZ_64K + GICR_ICENABLER0, vgic_mmio_read_enable, vgic_mmio_write_cenable, NULL, vgic_uaccess_write_cenable, 4, VGIC_ACCESS_32bit), REGISTER_DESC_WITH_LENGTH_UACCESS(SZ_64K + GICR_ISPENDR0, vgic_mmio_read_pending, vgic_mmio_write_spending, vgic_uaccess_read_pending, vgic_v3_uaccess_write_pending, 4, VGIC_ACCESS_32bit), REGISTER_DESC_WITH_LENGTH_UACCESS(SZ_64K + GICR_ICPENDR0, vgic_mmio_read_pending, vgic_mmio_write_cpending, vgic_mmio_read_raz, vgic_mmio_uaccess_write_wi, 4, VGIC_ACCESS_32bit), REGISTER_DESC_WITH_LENGTH_UACCESS(SZ_64K + GICR_ISACTIVER0, vgic_mmio_read_active, vgic_mmio_write_sactive, vgic_uaccess_read_active, vgic_mmio_uaccess_write_sactive, 4, VGIC_ACCESS_32bit), REGISTER_DESC_WITH_LENGTH_UACCESS(SZ_64K + GICR_ICACTIVER0, vgic_mmio_read_active, vgic_mmio_write_cactive, vgic_uaccess_read_active, vgic_mmio_uaccess_write_cactive, 4, VGIC_ACCESS_32bit), REGISTER_DESC_WITH_LENGTH(SZ_64K + GICR_IPRIORITYR0, vgic_mmio_read_priority, vgic_mmio_write_priority, 32, VGIC_ACCESS_32bit | VGIC_ACCESS_8bit), REGISTER_DESC_WITH_LENGTH(SZ_64K + GICR_ICFGR0, vgic_mmio_read_config, vgic_mmio_write_config, 8, VGIC_ACCESS_32bit), REGISTER_DESC_WITH_LENGTH(SZ_64K + GICR_IGRPMODR0, vgic_mmio_read_raz, vgic_mmio_write_wi, 4, VGIC_ACCESS_32bit), REGISTER_DESC_WITH_LENGTH(SZ_64K + GICR_NSACR, vgic_mmio_read_raz, vgic_mmio_write_wi, 4, VGIC_ACCESS_32bit), }; unsigned int vgic_v3_init_dist_iodev(struct vgic_io_device *dev) { dev->regions = vgic_v3_dist_registers; dev->nr_regions = ARRAY_SIZE(vgic_v3_dist_registers); kvm_iodevice_init(&dev->dev, &kvm_io_gic_ops); return SZ_64K; } /** * vgic_register_redist_iodev - register a single redist iodev * @vcpu: The VCPU to which the redistributor belongs * * Register a KVM iodev for this VCPU's redistributor using the address * provided. * * Return 0 on success, -ERRNO otherwise. */ int vgic_register_redist_iodev(struct kvm_vcpu *vcpu) { struct kvm *kvm = vcpu->kvm; struct vgic_dist *vgic = &kvm->arch.vgic; struct vgic_cpu *vgic_cpu = &vcpu->arch.vgic_cpu; struct vgic_io_device *rd_dev = &vcpu->arch.vgic_cpu.rd_iodev; struct vgic_redist_region *rdreg; gpa_t rd_base; int ret = 0; lockdep_assert_held(&kvm->slots_lock); mutex_lock(&kvm->arch.config_lock); if (!IS_VGIC_ADDR_UNDEF(vgic_cpu->rd_iodev.base_addr)) goto out_unlock; /* * We may be creating VCPUs before having set the base address for the * redistributor region, in which case we will come back to this * function for all VCPUs when the base address is set. Just return * without doing any work for now. */ rdreg = vgic_v3_rdist_free_slot(&vgic->rd_regions); if (!rdreg) goto out_unlock; if (!vgic_v3_check_base(kvm)) { ret = -EINVAL; goto out_unlock; } vgic_cpu->rdreg = rdreg; vgic_cpu->rdreg_index = rdreg->free_index; rd_base = rdreg->base + rdreg->free_index * KVM_VGIC_V3_REDIST_SIZE; kvm_iodevice_init(&rd_dev->dev, &kvm_io_gic_ops); rd_dev->base_addr = rd_base; rd_dev->iodev_type = IODEV_REDIST; rd_dev->regions = vgic_v3_rd_registers; rd_dev->nr_regions = ARRAY_SIZE(vgic_v3_rd_registers); rd_dev->redist_vcpu = vcpu; mutex_unlock(&kvm->arch.config_lock); ret = kvm_io_bus_register_dev(kvm, KVM_MMIO_BUS, rd_base, 2 * SZ_64K, &rd_dev->dev); if (ret) return ret; /* Protected by slots_lock */ rdreg->free_index++; return 0; out_unlock: mutex_unlock(&kvm->arch.config_lock); return ret; } void vgic_unregister_redist_iodev(struct kvm_vcpu *vcpu) { struct vgic_io_device *rd_dev = &vcpu->arch.vgic_cpu.rd_iodev; kvm_io_bus_unregister_dev(vcpu->kvm, KVM_MMIO_BUS, &rd_dev->dev); } static int vgic_register_all_redist_iodevs(struct kvm *kvm) { struct kvm_vcpu *vcpu; unsigned long c; int ret = 0; lockdep_assert_held(&kvm->slots_lock); kvm_for_each_vcpu(c, vcpu, kvm) { ret = vgic_register_redist_iodev(vcpu); if (ret) break; } if (ret) { /* The current c failed, so iterate over the previous ones. */ int i; for (i = 0; i < c; i++) { vcpu = kvm_get_vcpu(kvm, i); vgic_unregister_redist_iodev(vcpu); } } return ret; } /** * vgic_v3_alloc_redist_region - Allocate a new redistributor region * * Performs various checks before inserting the rdist region in the list. * Those tests depend on whether the size of the rdist region is known * (ie. count != 0). The list is sorted by rdist region index. * * @kvm: kvm handle * @index: redist region index * @base: base of the new rdist region * @count: number of redistributors the region is made of (0 in the old style * single region, whose size is induced from the number of vcpus) * * Return 0 on success, < 0 otherwise */ static int vgic_v3_alloc_redist_region(struct kvm *kvm, uint32_t index, gpa_t base, uint32_t count) { struct vgic_dist *d = &kvm->arch.vgic; struct vgic_redist_region *rdreg; struct list_head *rd_regions = &d->rd_regions; int nr_vcpus = atomic_read(&kvm->online_vcpus); size_t size = count ? count * KVM_VGIC_V3_REDIST_SIZE : nr_vcpus * KVM_VGIC_V3_REDIST_SIZE; int ret; /* cross the end of memory ? */ if (base + size < base) return -EINVAL; if (list_empty(rd_regions)) { if (index != 0) return -EINVAL; } else { rdreg = list_last_entry(rd_regions, struct vgic_redist_region, list); /* Don't mix single region and discrete redist regions */ if (!count && rdreg->count) return -EINVAL; if (!count) return -EEXIST; if (index != rdreg->index + 1) return -EINVAL; } /* * For legacy single-region redistributor regions (!count), * check that the redistributor region does not overlap with the * distributor's address space. */ if (!count && !IS_VGIC_ADDR_UNDEF(d->vgic_dist_base) && vgic_dist_overlap(kvm, base, size)) return -EINVAL; /* collision with any other rdist region? */ if (vgic_v3_rdist_overlap(kvm, base, size)) return -EINVAL; rdreg = kzalloc(sizeof(*rdreg), GFP_KERNEL_ACCOUNT); if (!rdreg) return -ENOMEM; rdreg->base = VGIC_ADDR_UNDEF; ret = vgic_check_iorange(kvm, rdreg->base, base, SZ_64K, size); if (ret) goto free; rdreg->base = base; rdreg->count = count; rdreg->free_index = 0; rdreg->index = index; list_add_tail(&rdreg->list, rd_regions); return 0; free: kfree(rdreg); return ret; } void vgic_v3_free_redist_region(struct kvm *kvm, struct vgic_redist_region *rdreg) { struct kvm_vcpu *vcpu; unsigned long c; lockdep_assert_held(&kvm->arch.config_lock); /* Garbage collect the region */ kvm_for_each_vcpu(c, vcpu, kvm) { if (vcpu->arch.vgic_cpu.rdreg == rdreg) vcpu->arch.vgic_cpu.rdreg = NULL; } list_del(&rdreg->list); kfree(rdreg); } int vgic_v3_set_redist_base(struct kvm *kvm, u32 index, u64 addr, u32 count) { int ret; mutex_lock(&kvm->arch.config_lock); ret = vgic_v3_alloc_redist_region(kvm, index, addr, count); mutex_unlock(&kvm->arch.config_lock); if (ret) return ret; /* * Register iodevs for each existing VCPU. Adding more VCPUs * afterwards will register the iodevs when needed. */ ret = vgic_register_all_redist_iodevs(kvm); if (ret) { struct vgic_redist_region *rdreg; mutex_lock(&kvm->arch.config_lock); rdreg = vgic_v3_rdist_region_from_index(kvm, index); vgic_v3_free_redist_region(kvm, rdreg); mutex_unlock(&kvm->arch.config_lock); return ret; } return 0; } int vgic_v3_has_attr_regs(struct kvm_device *dev, struct kvm_device_attr *attr) { const struct vgic_register_region *region; struct vgic_io_device iodev; struct vgic_reg_attr reg_attr; struct kvm_vcpu *vcpu; gpa_t addr; int ret; ret = vgic_v3_parse_attr(dev, attr, &reg_attr); if (ret) return ret; vcpu = reg_attr.vcpu; addr = reg_attr.addr; switch (attr->group) { case KVM_DEV_ARM_VGIC_GRP_DIST_REGS: iodev.regions = vgic_v3_dist_registers; iodev.nr_regions = ARRAY_SIZE(vgic_v3_dist_registers); iodev.base_addr = 0; break; case KVM_DEV_ARM_VGIC_GRP_REDIST_REGS:{ iodev.regions = vgic_v3_rd_registers; iodev.nr_regions = ARRAY_SIZE(vgic_v3_rd_registers); iodev.base_addr = 0; break; } case KVM_DEV_ARM_VGIC_GRP_CPU_SYSREGS: return vgic_v3_has_cpu_sysregs_attr(vcpu, attr); default: return -ENXIO; } /* We only support aligned 32-bit accesses. */ if (addr & 3) return -ENXIO; region = vgic_get_mmio_region(vcpu, &iodev, addr, sizeof(u32)); if (!region) return -ENXIO; return 0; } /* * The ICC_SGI* registers encode the affinity differently from the MPIDR, * so provide a wrapper to use the existing defines to isolate a certain * affinity level. */ #define SGI_AFFINITY_LEVEL(reg, level) \ ((((reg) & ICC_SGI1R_AFFINITY_## level ##_MASK) \ >> ICC_SGI1R_AFFINITY_## level ##_SHIFT) << MPIDR_LEVEL_SHIFT(level)) static void vgic_v3_queue_sgi(struct kvm_vcpu *vcpu, u32 sgi, bool allow_group1) { struct vgic_irq *irq = vgic_get_vcpu_irq(vcpu, sgi); unsigned long flags; raw_spin_lock_irqsave(&irq->irq_lock, flags); /* * An access targeting Group0 SGIs can only generate * those, while an access targeting Group1 SGIs can * generate interrupts of either group. */ if (!irq->group || allow_group1) { if (!irq->hw) { irq->pending_latch = true; vgic_queue_irq_unlock(vcpu->kvm, irq, flags); } else { /* HW SGI? Ask the GIC to inject it */ int err; err = irq_set_irqchip_state(irq->host_irq, IRQCHIP_STATE_PENDING, true); WARN_RATELIMIT(err, "IRQ %d", irq->host_irq); raw_spin_unlock_irqrestore(&irq->irq_lock, flags); } } else { raw_spin_unlock_irqrestore(&irq->irq_lock, flags); } vgic_put_irq(vcpu->kvm, irq); } /** * vgic_v3_dispatch_sgi - handle SGI requests from VCPUs * @vcpu: The VCPU requesting a SGI * @reg: The value written into ICC_{ASGI1,SGI0,SGI1}R by that VCPU * @allow_group1: Does the sysreg access allow generation of G1 SGIs * * With GICv3 (and ARE=1) CPUs trigger SGIs by writing to a system register. * This will trap in sys_regs.c and call this function. * This ICC_SGI1R_EL1 register contains the upper three affinity levels of the * target processors as well as a bitmask of 16 Aff0 CPUs. * * If the interrupt routing mode bit is not set, we iterate over the Aff0 * bits and signal the VCPUs matching the provided Aff{3,2,1}. * * If this bit is set, we signal all, but not the calling VCPU. */ void vgic_v3_dispatch_sgi(struct kvm_vcpu *vcpu, u64 reg, bool allow_group1) { struct kvm *kvm = vcpu->kvm; struct kvm_vcpu *c_vcpu; unsigned long target_cpus; u64 mpidr; u32 sgi, aff0; unsigned long c; sgi = FIELD_GET(ICC_SGI1R_SGI_ID_MASK, reg); /* Broadcast */ if (unlikely(reg & BIT_ULL(ICC_SGI1R_IRQ_ROUTING_MODE_BIT))) { kvm_for_each_vcpu(c, c_vcpu, kvm) { /* Don't signal the calling VCPU */ if (c_vcpu == vcpu) continue; vgic_v3_queue_sgi(c_vcpu, sgi, allow_group1); } return; } /* We iterate over affinities to find the corresponding vcpus */ mpidr = SGI_AFFINITY_LEVEL(reg, 3); mpidr |= SGI_AFFINITY_LEVEL(reg, 2); mpidr |= SGI_AFFINITY_LEVEL(reg, 1); target_cpus = FIELD_GET(ICC_SGI1R_TARGET_LIST_MASK, reg); for_each_set_bit(aff0, &target_cpus, hweight_long(ICC_SGI1R_TARGET_LIST_MASK)) { c_vcpu = kvm_mpidr_to_vcpu(kvm, mpidr | aff0); if (c_vcpu) vgic_v3_queue_sgi(c_vcpu, sgi, allow_group1); } } int vgic_v3_dist_uaccess(struct kvm_vcpu *vcpu, bool is_write, int offset, u32 *val) { struct vgic_io_device dev = { .regions = vgic_v3_dist_registers, .nr_regions = ARRAY_SIZE(vgic_v3_dist_registers), }; return vgic_uaccess(vcpu, &dev, is_write, offset, val); } int vgic_v3_redist_uaccess(struct kvm_vcpu *vcpu, bool is_write, int offset, u32 *val) { struct vgic_io_device rd_dev = { .regions = vgic_v3_rd_registers, .nr_regions = ARRAY_SIZE(vgic_v3_rd_registers), }; return vgic_uaccess(vcpu, &rd_dev, is_write, offset, val); } int vgic_v3_line_level_info_uaccess(struct kvm_vcpu *vcpu, bool is_write, u32 intid, u32 *val) { if (intid % 32) return -EINVAL; if (is_write) vgic_write_irq_line_level_info(vcpu, intid, *val); else *val = vgic_read_irq_line_level_info(vcpu, intid); return 0; }
108 107 109 109 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 1517 1518 1519 1520 1521 1522 1523 1524 1525 1526 1527 1528 1529 1530 1531 1532 1533 1534 1535 1536 1537 1538 1539 1540 1541 1542 1543 1544 1545 1546 1547 1548 1549 1550 1551 1552 1553 1554 1555 1556 1557 1558 1559 1560 1561 1562 1563 1564 1565 1566 1567 1568 1569 1570 1571 1572 1573 1574 1575 1576 1577 1578 1579 1580 1581 1582 1583 1584 1585 1586 1587 1588 1589 1590 1591 1592 1593 1594 1595 1596 1597 1598 1599 1600 1601 1602 1603 1604 1605 1606 1607 1608 1609 1610 1611 1612 1613 1614 1615 1616 1617 1618 1619 1620 1621 1622 1623 1624 1625 1626 1627 1628 1629 1630 1631 1632 1633 1634 1635 1636 1637 1638 1639 1640 1641 1642 1643 1644 1645 1646 1647 1648 1649 1650 1651 1652 1653 1654 1655 1656 1657 1658 1659 1660 1661 1662 1663 1664 1665 1666 1667 1668 1669 1670 1671 1672 1673 1674 1675 1676 1677 1678 1679 1680 1681 1682 1683 1684 1685 1686 1687 1688 1689 1690 1691 1692 1693 1694 1695 1696 1697 1698 1699 1700 1701 1702 1703 1704 1705 1706 1707 1708 1709 1710 1711 1712 1713 1714 1715 1716 1717 1718 1719 1720 1721 1722 1723 1724 1725 1726 1727 1728 1729 1730 1731 1732 1733 1734 1735 1736 1737 1738 1739 1740 1741 1742 1743 1744 1745 1746 1747 1748 1749 1750 1751 1752 1753 1754 1755 1756 1757 1758 1759 1760 1761 1762 1763 1764 1765 1766 1767 1768 1769 1770 1771 1772 1773 1774 1775 1776 1777 1778 1779 1780 1781 1782 1783 1784 1785 1786 1787 1788 1789 1790 1791 1792 1793 1794 1795 1796 1797 1798 1799 1800 1801 1802 1803 1804 1805 1806 1807 1808 1809 1810 1811 1812 1813 1814 1815 1816 1817 1818 1819 1820 1821 1822 1823 1824 1825 1826 1827 1828 1829 1830 1831 1832 1833 1834 1835 1836 1837 1838 1839 1840 1841 1842 1843 1844 1845 1846 1847 1848 1849 1850 1851 1852 1853 1854 1855 1856 1857 1858 1859 1860 1861 1862 1863 1864 1865 1866 1867 1868 1869 1870 1871 1872 1873 1874 1875 1876 1877 1878 1879 1880 1881 1882 1883 1884 1885 1886 1887 1888 1889 1890 1891 1892 1893 1894 1895 1896 1897 1898 1899 1900 1901 1902 1903 1904 1905 1906 1907 1908 1909 1910 1911 1912 1913 1914 1915 1916 1917 1918 1919 1920 1921 1922 1923 1924 1925 1926 1927 1928 1929 1930 1931 1932 1933 1934 1935 1936 1937 1938 1939 1940 1941 1942 1943 1944 1945 1946 1947 1948 1949 1950 1951 1952 1953 1954 1955 1956 1957 1958 1959 1960 1961 1962 1963 1964 1965 1966 1967 1968 1969 1970 1971 1972 1973 1974 1975 1976 1977 1978 1979 1980 1981 1982 1983 1984 1985 1986 1987 1988 1989 1990 1991 1992 1993 1994 1995 1996 1997 1998 1999 2000 2001 2002 2003 2004 2005 2006 2007 2008 2009 2010 2011 2012 2013 2014 2015 2016 2017 2018 2019 2020 2021 2022 2023 2024 2025 2026 2027 2028 2029 2030 2031 2032 2033 2034 2035 2036 2037 2038 2039 2040 2041 2042 2043 2044 2045 2046 2047 2048 2049 2050 2051 2052 2053 2054 2055 2056 2057 2058 2059 2060 2061 2062 2063 2064 2065 2066 2067 2068 2069 2070 2071 2072 2073 2074 2075 2076 2077 2078 2079 2080 2081 2082 2083 2084 2085 2086 2087 2088 2089 2090 2091 2092 2093 2094 2095 2096 2097 2098 2099 2100 2101 2102 2103 2104 2105 2106 2107 2108 2109 2110 2111 2112 2113 2114 2115 2116 2117 2118 2119 2120 2121 2122 2123 2124 2125 2126 2127 2128 2129 2130 2131 2132 2133 2134 2135 2136 2137 2138 2139 2140 2141 2142 2143 2144 2145 2146 2147 2148 2149 2150 2151 2152 2153 2154 2155 2156 2157 2158 2159 2160 2161 2162 2163 2164 2165 2166 2167 2168 2169 2170 2171 2172 2173 2174 2175 2176 2177 2178 2179 2180 2181 2182 2183 2184 2185 2186 2187 2188 2189 2190 2191 2192 2193 2194 2195 2196 2197 2198 2199 2200 2201 2202 2203 2204 2205 2206 2207 2208 2209 2210 2211 2212 2213 2214 2215 2216 2217 2218 2219 2220 2221 2222 2223 2224 2225 2226 2227 2228 2229 2230 2231 2232 2233 2234 2235 2236 2237 2238 2239 2240 2241 2242 2243 2244 2245 2246 2247 2248 2249 2250 2251 2252 2253 2254 2255 2256 2257 2258 2259 2260 2261 2262 2263 2264 2265 2266 2267 2268 2269 2270 2271 2272 2273 2274 2275 2276 2277 2278 2279 2280 2281 2282 2283 2284 2285 2286 2287 2288 2289 2290 2291 2292 2293 2294 2295 2296 2297 2298 2299 2300 2301 2302 2303 2304 2305 2306 2307 2308 2309 2310 2311 2312 2313 2314 2315 2316 2317 2318 2319 2320 2321 2322 2323 2324 2325 2326 2327 2328 2329 2330 2331 2332 2333 2334 2335 2336 2337 2338 2339 2340 2341 2342 2343 2344 2345 2346 2347 2348 2349 2350 2351 2352 2353 2354 2355 2356 2357 2358 2359 2360 2361 2362 2363 2364 2365 2366 2367 2368 2369 2370 2371 2372 2373 2374 2375 2376 2377 2378 2379 2380 2381 2382 2383 2384 2385 2386 2387 2388 2389 2390 2391 2392 2393 2394 2395 2396 2397 2398 2399 2400 2401 2402 2403 2404 2405 2406 2407 2408 2409 2410 2411 2412 2413 2414 2415 2416 2417 2418 2419 2420 2421 2422 2423 2424 2425 2426 2427 2428 2429 2430 2431 2432 2433 2434 2435 2436 2437 2438 2439 2440 2441 2442 2443 2444 2445 2446 2447 2448 2449 2450 2451 2452 2453 2454 2455 2456 2457 2458 2459 2460 2461 2462 2463 2464 2465 2466 2467 2468 2469 2470 2471 2472 2473 2474 2475 2476 2477 2478 2479 2480 2481 2482 2483 2484 2485 2486 2487 2488 2489 2490 2491 2492 2493 2494 2495 2496 2497 2498 2499 2500 2501 2502 2503 2504 2505 2506 2507 2508 2509 2510 2511 2512 2513 2514 2515 2516 2517 2518 2519 2520 2521 2522 2523 2524 2525 2526 2527 2528 2529 2530 2531 2532 2533 2534 2535 2536 2537 2538 2539 2540 2541 2542 2543 2544 2545 2546 2547 2548 2549 2550 2551 2552 2553 2554 2555 2556 2557 2558 2559 2560 2561 2562 2563 2564 2565 2566 2567 2568 2569 2570 2571 2572 2573 2574 2575 2576 2577 2578 2579 2580 2581 2582 2583 2584 2585 2586 2587 2588 2589 2590 2591 2592 2593 2594 2595 2596 2597 2598 2599 2600 2601 2602 2603 2604 2605 2606 2607 2608 2609 2610 2611 2612 2613 2614 2615 2616 2617 2618 2619 2620 2621 2622 2623 2624 2625 2626 2627 2628 2629 2630 2631 2632 2633 2634 2635 2636 2637 2638 2639 2640 2641 2642 2643 2644 2645 2646 2647 2648 2649 2650 2651 2652 2653 2654 2655 2656 2657 2658 2659 2660 2661 2662 2663 2664 2665 2666 2667 2668 2669 2670 2671 2672 2673 2674 2675 2676 2677 2678 2679 2680 2681 2682 2683 2684 2685 2686 2687 2688 2689 2690 2691 2692 2693 2694 2695 2696 2697 2698 2699 2700 2701 2702 2703 2704 2705 2706 2707 2708 2709 2710 2711 2712 2713 2714 2715 2716 2717 2718 2719 2720 2721 2722 2723 2724 2725 2726 2727 2728 2729 2730 2731 2732 2733 2734 2735 2736 2737 2738 2739 2740 2741 2742 2743 2744 2745 2746 2747 2748 2749 2750 2751 2752 2753 2754 2755 2756 2757 2758 2759 2760 2761 2762 2763 2764 2765 2766 2767 2768 2769 2770 2771 2772 2773 2774 2775 2776 2777 2778 2779 2780 2781 2782 2783 2784 2785 2786 2787 2788 2789 2790 2791 2792 2793 2794 2795 2796 2797 2798 2799 2800 2801 2802 2803 2804 2805 2806 2807 2808 2809 2810 2811 2812 2813 2814 2815 2816 2817 2818 2819 2820 2821 2822 2823 2824 2825 2826 2827 2828 2829 2830 2831 2832 2833 2834 2835 2836 2837 2838 2839 2840 2841 2842 2843 2844 2845 2846 2847 2848 2849 2850 2851 2852 2853 2854 2855 2856 2857 2858 2859 2860 2861 2862 2863 2864 2865 2866 2867 2868 2869 2870 2871 2872 2873 2874 2875 2876 2877 2878 2879 2880 2881 2882 2883 2884 2885 2886 2887 2888 2889 2890 2891 2892 2893 2894 2895 2896 2897 2898 2899 2900 2901 2902 2903 2904 2905 2906 2907 2908 2909 2910 2911 2912 2913 2914 2915 2916 2917 2918 2919 2920 2921 2922 2923 2924 2925 2926 2927 2928 2929 2930 2931 2932 2933 2934 2935 2936 2937 2938 2939 2940 2941 2942 2943 2944 2945 2946 2947 2948 2949 2950 2951 2952 2953 2954 2955 2956 2957 2958 2959 2960 2961 2962 2963 2964 2965 2966 2967 2968 2969 2970 2971 2972 2973 2974 2975 2976 2977 2978 2979 2980 2981 2982 2983 2984 2985 2986 2987 2988 2989 2990 2991 2992 2993 2994 2995 2996 2997 2998 2999 3000 3001 3002 3003 3004 3005 3006 3007 3008 3009 3010 3011 3012 3013 3014 3015 3016 3017 3018 3019 3020 3021 3022 3023 3024 3025 3026 3027 3028 3029 3030 3031 3032 3033 3034 3035 3036 3037 3038 3039 3040 3041 3042 3043 3044 3045 3046 3047 3048 3049 3050 3051 3052 3053 3054 3055 3056 3057 3058 3059 3060 3061 3062 3063 3064 3065 3066 3067 3068 3069 3070 3071 3072 3073 3074 3075 3076 3077 3078 3079 3080 3081 3082 3083 3084 3085 3086 3087 3088 3089 3090 3091 3092 3093 3094 3095 3096 3097 3098 3099 3100 3101 3102 3103 3104 3105 3106 3107 3108 3109 3110 3111 3112 3113 3114 3115 3116 3117 3118 3119 3120 3121 3122 3123 3124 3125 3126 3127 3128 3129 3130 3131 3132 3133 3134 3135 3136 3137 3138 3139 3140 3141 3142 3143 3144 3145 3146 3147 3148 3149 3150 3151 3152 3153 3154 3155 3156 3157 3158 3159 3160 3161 3162 3163 3164 3165 3166 3167 3168 3169 3170 3171 3172 3173 3174 3175 3176 3177 3178 3179 3180 3181 3182 3183 3184 3185 3186 3187 3188 3189 3190 3191 3192 3193 3194 3195 3196 3197 3198 3199 3200 3201 3202 3203 3204 3205 3206 3207 3208 3209 3210 3211 3212 3213 3214 3215 3216 3217 3218 3219 3220 3221 3222 3223 3224 3225 3226 3227 3228 3229 3230 3231 3232 3233 3234 3235 3236 3237 3238 3239 3240 3241 3242 3243 3244 3245 3246 3247 3248 3249 3250 3251 3252 3253 3254 3255 3256 3257 3258 3259 3260 3261 3262 3263 3264 3265 3266 3267 3268 3269 3270 3271 3272 3273 3274 3275 3276 3277 3278 3279 3280 3281 3282 3283 3284 3285 3286 3287 3288 3289 3290 3291 3292 3293 3294 3295 3296 3297 3298 3299 3300 3301 3302 3303 3304 3305 3306 3307 3308 3309 3310 3311 3312 3313 3314 3315 3316 3317 3318 3319 3320 3321 3322 3323 3324 3325 3326 3327 3328 3329 3330 3331 3332 3333 3334 3335 3336 3337 3338 3339 3340 3341 3342 3343 3344 3345 3346 3347 3348 3349 3350 3351 3352 3353 3354 3355 3356 3357 3358 3359 3360 3361 3362 3363 3364 3365 3366 3367 3368 3369 3370 3371 3372 3373 3374 3375 3376 3377 3378 3379 3380 3381 3382 3383 3384 3385 3386 3387 3388 3389 3390 3391 3392 3393 3394 3395 3396 3397 3398 3399 3400 3401 3402 3403 3404 3405 3406 3407 3408 3409 3410 3411 3412 3413 3414 3415 3416 3417 3418 3419 3420 3421 3422 3423 3424 3425 3426 3427 3428 3429 3430 3431 3432 3433 3434 3435 3436 3437 3438 3439 3440 3441 3442 3443 3444 3445 3446 3447 3448 3449 3450 3451 3452 3453 3454 3455 3456 3457 3458 3459 3460 3461 3462 3463 3464 3465 3466 3467 3468 3469 3470 3471 3472 3473 3474 3475 3476 3477 3478 3479 3480 3481 3482 3483 3484 3485 3486 3487 3488 3489 3490 3491 3492 3493 3494 3495 3496 3497 3498 3499 3500 3501 3502 3503 3504 3505 3506 3507 3508 3509 3510 3511 3512 3513 3514 3515 3516 3517 3518 3519 3520 3521 3522 3523 3524 3525 3526 3527 3528 3529 3530 3531 3532 3533 3534 3535 3536 3537 3538 3539 3540 3541 3542 3543 3544 3545 3546 3547 3548 3549 3550 3551 3552 3553 3554 3555 3556 3557 3558 3559 3560 3561 3562 3563 3564 3565 3566 3567 3568 3569 3570 3571 3572 3573 3574 3575 3576 3577 3578 3579 3580 3581 3582 3583 3584 3585 3586 3587 3588 3589 3590 3591 3592 3593 3594 3595 3596 3597 3598 3599 3600 3601 3602 3603 3604 3605 3606 3607 3608 3609 3610 3611 3612 3613 3614 3615 3616 3617 3618 3619 3620 3621 3622 3623 3624 3625 3626 3627 3628 3629 3630 3631 3632 3633 3634 3635 3636 3637 3638 3639 3640 3641 3642 3643 3644 3645 3646 3647 3648 3649 3650 3651 3652 3653 3654 3655 3656 3657 3658 3659 3660 3661 3662 3663 3664 3665 3666 3667 3668 3669 3670 3671 3672 3673 3674 3675 3676 3677 3678 3679 3680 3681 3682 3683 3684 3685 3686 3687 3688 3689 3690 3691 3692 3693 3694 3695 3696 3697 3698 3699 3700 3701 3702 3703 3704 3705 3706 3707 3708 3709 3710 3711 3712 3713 3714 3715 3716 3717 3718 3719 3720 3721 3722 3723 3724 3725 3726 3727 3728 3729 3730 3731 3732 3733 3734 3735 3736 3737 3738 3739 3740 3741 3742 3743 3744 3745 3746 3747 3748 3749 3750 3751 3752 3753 3754 3755 3756 3757 3758 3759 3760 3761 3762 3763 3764 3765 3766 3767 3768 3769 3770 3771 3772 3773 3774 3775 3776 3777 3778 3779 3780 3781 3782 3783 3784 3785 3786 3787 3788 3789 3790 3791 3792 3793 3794 3795 3796 3797 3798 3799 3800 3801 3802 3803 3804 // SPDX-License-Identifier: GPL-2.0-only /* * Implementation of the policy database. * * Author : Stephen Smalley, <stephen.smalley.work@gmail.com> */ /* * Updated: Trusted Computer Solutions, Inc. <dgoeddel@trustedcs.com> * Support for enhanced MLS infrastructure. * Copyright (C) 2004-2005 Trusted Computer Solutions, Inc. * * Updated: Frank Mayer <mayerf@tresys.com> and * Karl MacMillan <kmacmillan@tresys.com> * Added conditional policy language extensions * Copyright (C) 2003-2004 Tresys Technology, LLC * * Updated: Hewlett-Packard <paul@paul-moore.com> * Added support for the policy capability bitmap * Copyright (C) 2007 Hewlett-Packard Development Company, L.P. * * Update: Mellanox Techonologies * Added Infiniband support * Copyright (C) 2016 Mellanox Techonologies */ #include <linux/kernel.h> #include <linux/sched.h> #include <linux/slab.h> #include <linux/string.h> #include <linux/errno.h> #include <linux/audit.h> #include "security.h" #include "policydb.h" #include "conditional.h" #include "mls.h" #include "services.h" #ifdef CONFIG_SECURITY_SELINUX_DEBUG /* clang-format off */ static const char *const symtab_name[SYM_NUM] = { "common prefixes", "classes", "roles", "types", "users", "bools", "levels", "categories", }; /* clang-format off */ #endif struct policydb_compat_info { unsigned int version; unsigned int sym_num; unsigned int ocon_num; }; /* These need to be updated if SYM_NUM or OCON_NUM changes */ static const struct policydb_compat_info policydb_compat[] = { { .version = POLICYDB_VERSION_BASE, .sym_num = SYM_NUM - 3, .ocon_num = OCON_NUM - 3, }, { .version = POLICYDB_VERSION_BOOL, .sym_num = SYM_NUM - 2, .ocon_num = OCON_NUM - 3, }, { .version = POLICYDB_VERSION_IPV6, .sym_num = SYM_NUM - 2, .ocon_num = OCON_NUM - 2, }, { .version = POLICYDB_VERSION_NLCLASS, .sym_num = SYM_NUM - 2, .ocon_num = OCON_NUM - 2, }, { .version = POLICYDB_VERSION_MLS, .sym_num = SYM_NUM, .ocon_num = OCON_NUM - 2, }, { .version = POLICYDB_VERSION_AVTAB, .sym_num = SYM_NUM, .ocon_num = OCON_NUM - 2, }, { .version = POLICYDB_VERSION_RANGETRANS, .sym_num = SYM_NUM, .ocon_num = OCON_NUM - 2, }, { .version = POLICYDB_VERSION_POLCAP, .sym_num = SYM_NUM, .ocon_num = OCON_NUM - 2, }, { .version = POLICYDB_VERSION_PERMISSIVE, .sym_num = SYM_NUM, .ocon_num = OCON_NUM - 2, }, { .version = POLICYDB_VERSION_BOUNDARY, .sym_num = SYM_NUM, .ocon_num = OCON_NUM - 2, }, { .version = POLICYDB_VERSION_FILENAME_TRANS, .sym_num = SYM_NUM, .ocon_num = OCON_NUM - 2, }, { .version = POLICYDB_VERSION_ROLETRANS, .sym_num = SYM_NUM, .ocon_num = OCON_NUM - 2, }, { .version = POLICYDB_VERSION_NEW_OBJECT_DEFAULTS, .sym_num = SYM_NUM, .ocon_num = OCON_NUM - 2, }, { .version = POLICYDB_VERSION_DEFAULT_TYPE, .sym_num = SYM_NUM, .ocon_num = OCON_NUM - 2, }, { .version = POLICYDB_VERSION_CONSTRAINT_NAMES, .sym_num = SYM_NUM, .ocon_num = OCON_NUM - 2, }, { .version = POLICYDB_VERSION_XPERMS_IOCTL, .sym_num = SYM_NUM, .ocon_num = OCON_NUM - 2, }, { .version = POLICYDB_VERSION_INFINIBAND, .sym_num = SYM_NUM, .ocon_num = OCON_NUM, }, { .version = POLICYDB_VERSION_GLBLUB, .sym_num = SYM_NUM, .ocon_num = OCON_NUM, }, { .version = POLICYDB_VERSION_COMP_FTRANS, .sym_num = SYM_NUM, .ocon_num = OCON_NUM, }, { .version = POLICYDB_VERSION_COND_XPERMS, .sym_num = SYM_NUM, .ocon_num = OCON_NUM, }, { .version = POLICYDB_VERSION_NEVERAUDIT, .sym_num = SYM_NUM, .ocon_num = OCON_NUM, }, }; static const struct policydb_compat_info * policydb_lookup_compat(unsigned int version) { unsigned int i; for (i = 0; i < ARRAY_SIZE(policydb_compat); i++) { if (policydb_compat[i].version == version) return &policydb_compat[i]; } return NULL; } /* * The following *_destroy functions are used to * free any memory allocated for each kind of * symbol data in the policy database. */ static int perm_destroy(void *key, void *datum, void *p) { kfree(key); kfree(datum); return 0; } static int common_destroy(void *key, void *datum, void *p) { struct common_datum *comdatum; kfree(key); if (datum) { comdatum = datum; hashtab_map(&comdatum->permissions.table, perm_destroy, NULL); hashtab_destroy(&comdatum->permissions.table); } kfree(datum); return 0; } static void constraint_expr_destroy(struct constraint_expr *expr) { if (expr) { ebitmap_destroy(&expr->names); if (expr->type_names) { ebitmap_destroy(&expr->type_names->types); ebitmap_destroy(&expr->type_names->negset); kfree(expr->type_names); } kfree(expr); } } static int cls_destroy(void *key, void *datum, void *p) { struct class_datum *cladatum; struct constraint_node *constraint, *ctemp; struct constraint_expr *e, *etmp; kfree(key); if (datum) { cladatum = datum; hashtab_map(&cladatum->permissions.table, perm_destroy, NULL); hashtab_destroy(&cladatum->permissions.table); constraint = cladatum->constraints; while (constraint) { e = constraint->expr; while (e) { etmp = e; e = e->next; constraint_expr_destroy(etmp); } ctemp = constraint; constraint = constraint->next; kfree(ctemp); } constraint = cladatum->validatetrans; while (constraint) { e = constraint->expr; while (e) { etmp = e; e = e->next; constraint_expr_destroy(etmp); } ctemp = constraint; constraint = constraint->next; kfree(ctemp); } kfree(cladatum->comkey); } kfree(datum); return 0; } static int role_destroy(void *key, void *datum, void *p) { struct role_datum *role; kfree(key); if (datum) { role = datum; ebitmap_destroy(&role->dominates); ebitmap_destroy(&role->types); } kfree(datum); return 0; } static int type_destroy(void *key, void *datum, void *p) { kfree(key); kfree(datum); return 0; } static int user_destroy(void *key, void *datum, void *p) { struct user_datum *usrdatum; kfree(key); if (datum) { usrdatum = datum; ebitmap_destroy(&usrdatum->roles); ebitmap_destroy(&usrdatum->range.level[0].cat); ebitmap_destroy(&usrdatum->range.level[1].cat); ebitmap_destroy(&usrdatum->dfltlevel.cat); } kfree(datum); return 0; } static int sens_destroy(void *key, void *datum, void *p) { struct level_datum *levdatum; kfree(key); if (datum) { levdatum = datum; ebitmap_destroy(&levdatum->level.cat); } kfree(datum); return 0; } static int cat_destroy(void *key, void *datum, void *p) { kfree(key); kfree(datum); return 0; } /* clang-format off */ static int (*const destroy_f[SYM_NUM])(void *key, void *datum, void *datap) = { common_destroy, cls_destroy, role_destroy, type_destroy, user_destroy, cond_destroy_bool, sens_destroy, cat_destroy, }; /* clang-format on */ static int filenametr_destroy(void *key, void *datum, void *p) { struct filename_trans_key *ft = key; struct filename_trans_datum *next, *d = datum; kfree(ft->name); kfree(key); do { ebitmap_destroy(&d->stypes); next = d->next; kfree(d); d = next; } while (unlikely(d)); cond_resched(); return 0; } static int range_tr_destroy(void *key, void *datum, void *p) { struct mls_range *rt = datum; kfree(key); ebitmap_destroy(&rt->level[0].cat); ebitmap_destroy(&rt->level[1].cat); kfree(datum); cond_resched(); return 0; } static int role_tr_destroy(void *key, void *datum, void *p) { kfree(key); kfree(datum); return 0; } static void ocontext_destroy(struct ocontext *c, unsigned int i) { if (!c) return; context_destroy(&c->context[0]); context_destroy(&c->context[1]); if (i == OCON_ISID || i == OCON_FS || i == OCON_NETIF || i == OCON_FSUSE) kfree(c->u.name); kfree(c); } /* * Initialize the role table. */ static int roles_init(struct policydb *p) { char *key = NULL; int rc; struct role_datum *role; role = kzalloc(sizeof(*role), GFP_KERNEL); if (!role) return -ENOMEM; rc = -EINVAL; role->value = ++p->p_roles.nprim; if (role->value != OBJECT_R_VAL) goto out; rc = -ENOMEM; key = kstrdup(OBJECT_R, GFP_KERNEL); if (!key) goto out; rc = symtab_insert(&p->p_roles, key, role); if (rc) goto out; return 0; out: kfree(key); kfree(role); return rc; } static u32 filenametr_hash(const void *k) { const struct filename_trans_key *ft = k; unsigned long salt = ft->ttype ^ ft->tclass; return full_name_hash((void *)salt, ft->name, strlen(ft->name)); } static int filenametr_cmp(const void *k1, const void *k2) { const struct filename_trans_key *ft1 = k1; const struct filename_trans_key *ft2 = k2; int v; v = ft1->ttype - ft2->ttype; if (v) return v; v = ft1->tclass - ft2->tclass; if (v) return v; return strcmp(ft1->name, ft2->name); } static const struct hashtab_key_params filenametr_key_params = { .hash = filenametr_hash, .cmp = filenametr_cmp, }; struct filename_trans_datum * policydb_filenametr_search(struct policydb *p, struct filename_trans_key *key) { return hashtab_search(&p->filename_trans, key, filenametr_key_params); } static u32 rangetr_hash(const void *k) { const struct range_trans *key = k; return key->source_type + (key->target_type << 3) + (key->target_class << 5); } static int rangetr_cmp(const void *k1, const void *k2) { const struct range_trans *key1 = k1, *key2 = k2; int v; v = key1->source_type - key2->source_type; if (v) return v; v = key1->target_type - key2->target_type; if (v) return v; v = key1->target_class - key2->target_class; return v; } static const struct hashtab_key_params rangetr_key_params = { .hash = rangetr_hash, .cmp = rangetr_cmp, }; struct mls_range *policydb_rangetr_search(struct policydb *p, struct range_trans *key) { return hashtab_search(&p->range_tr, key, rangetr_key_params); } static u32 role_trans_hash(const void *k) { const struct role_trans_key *key = k; return jhash_3words(key->role, key->type, (u32)key->tclass << 16 | key->tclass, 0); } static int role_trans_cmp(const void *k1, const void *k2) { const struct role_trans_key *key1 = k1, *key2 = k2; int v; v = key1->role - key2->role; if (v) return v; v = key1->type - key2->type; if (v) return v; return key1->tclass - key2->tclass; } static const struct hashtab_key_params roletr_key_params = { .hash = role_trans_hash, .cmp = role_trans_cmp, }; struct role_trans_datum *policydb_roletr_search(struct policydb *p, struct role_trans_key *key) { return hashtab_search(&p->role_tr, key, roletr_key_params); } /* * Initialize a policy database structure. */ static void policydb_init(struct policydb *p) { memset(p, 0, sizeof(*p)); avtab_init(&p->te_avtab); cond_policydb_init(p); ebitmap_init(&p->filename_trans_ttypes); ebitmap_init(&p->policycaps); ebitmap_init(&p->permissive_map); ebitmap_init(&p->neveraudit_map); } /* * The following *_index functions are used to * define the val_to_name and val_to_struct arrays * in a policy database structure. The val_to_name * arrays are used when converting security context * structures into string representations. The * val_to_struct arrays are used when the attributes * of a class, role, or user are needed. */ static int common_index(void *key, void *datum, void *datap) { struct policydb *p; struct common_datum *comdatum; comdatum = datum; p = datap; if (!comdatum->value || comdatum->value > p->p_commons.nprim) return -EINVAL; p->sym_val_to_name[SYM_COMMONS][comdatum->value - 1] = key; return 0; } static int class_index(void *key, void *datum, void *datap) { struct policydb *p; struct class_datum *cladatum; cladatum = datum; p = datap; if (!cladatum->value || cladatum->value > p->p_classes.nprim) return -EINVAL; p->sym_val_to_name[SYM_CLASSES][cladatum->value - 1] = key; p->class_val_to_struct[cladatum->value - 1] = cladatum; return 0; } static int role_index(void *key, void *datum, void *datap) { struct policydb *p; struct role_datum *role; role = datum; p = datap; if (!role->value || role->value > p->p_roles.nprim || role->bounds > p->p_roles.nprim) return -EINVAL; p->sym_val_to_name[SYM_ROLES][role->value - 1] = key; p->role_val_to_struct[role->value - 1] = role; return 0; } static int type_index(void *key, void *datum, void *datap) { struct policydb *p; struct type_datum *typdatum; typdatum = datum; p = datap; if (typdatum->primary) { if (!typdatum->value || typdatum->value > p->p_types.nprim || typdatum->bounds > p->p_types.nprim) return -EINVAL; p->sym_val_to_name[SYM_TYPES][typdatum->value - 1] = key; p->type_val_to_struct[typdatum->value - 1] = typdatum; } return 0; } static int user_index(void *key, void *datum, void *datap) { struct policydb *p; struct user_datum *usrdatum; usrdatum = datum; p = datap; if (!usrdatum->value || usrdatum->value > p->p_users.nprim || usrdatum->bounds > p->p_users.nprim) return -EINVAL; p->sym_val_to_name[SYM_USERS][usrdatum->value - 1] = key; p->user_val_to_struct[usrdatum->value - 1] = usrdatum; return 0; } static int sens_index(void *key, void *datum, void *datap) { struct policydb *p; struct level_datum *levdatum; levdatum = datum; p = datap; if (!levdatum->isalias) { if (!levdatum->level.sens || levdatum->level.sens > p->p_levels.nprim) return -EINVAL; p->sym_val_to_name[SYM_LEVELS][levdatum->level.sens - 1] = key; } return 0; } static int cat_index(void *key, void *datum, void *datap) { struct policydb *p; struct cat_datum *catdatum; catdatum = datum; p = datap; if (!catdatum->isalias) { if (!catdatum->value || catdatum->value > p->p_cats.nprim) return -EINVAL; p->sym_val_to_name[SYM_CATS][catdatum->value - 1] = key; } return 0; } /* clang-format off */ static int (*const index_f[SYM_NUM])(void *key, void *datum, void *datap) = { common_index, class_index, role_index, type_index, user_index, cond_index_bool, sens_index, cat_index, }; /* clang-format on */ #ifdef CONFIG_SECURITY_SELINUX_DEBUG static void hash_eval(struct hashtab *h, const char *hash_name, const char *hash_details) { struct hashtab_info info; hashtab_stat(h, &info); pr_debug( "SELinux: %s%s%s: %d entries and %d/%d buckets used, longest chain length %d, sum of chain length^2 %llu\n", hash_name, hash_details ? "@" : "", hash_details ?: "", h->nel, info.slots_used, h->size, info.max_chain_len, info.chain2_len_sum); } static void symtab_hash_eval(struct symtab *s) { int i; for (i = 0; i < SYM_NUM; i++) hash_eval(&s[i].table, symtab_name[i], NULL); } #else static inline void hash_eval(struct hashtab *h, const char *hash_name, const char *hash_details) { } static inline void symtab_hash_eval(struct symtab *s) { } #endif /* CONFIG_SECURITY_SELINUX_DEBUG */ /* * Define the other val_to_name and val_to_struct arrays * in a policy database structure. * * Caller must clean up on failure. */ static int policydb_index(struct policydb *p) { int i, rc; if (p->mls_enabled) pr_debug( "SELinux: %d users, %d roles, %d types, %d bools, %d sens, %d cats\n", p->p_users.nprim, p->p_roles.nprim, p->p_types.nprim, p->p_bools.nprim, p->p_levels.nprim, p->p_cats.nprim); else pr_debug("SELinux: %d users, %d roles, %d types, %d bools\n", p->p_users.nprim, p->p_roles.nprim, p->p_types.nprim, p->p_bools.nprim); pr_debug("SELinux: %d classes, %d rules\n", p->p_classes.nprim, p->te_avtab.nel); avtab_hash_eval(&p->te_avtab, "rules"); symtab_hash_eval(p->symtab); p->class_val_to_struct = kcalloc(p->p_classes.nprim, sizeof(*p->class_val_to_struct), GFP_KERNEL); if (!p->class_val_to_struct) return -ENOMEM; p->role_val_to_struct = kcalloc( p->p_roles.nprim, sizeof(*p->role_val_to_struct), GFP_KERNEL); if (!p->role_val_to_struct) return -ENOMEM; p->user_val_to_struct = kcalloc( p->p_users.nprim, sizeof(*p->user_val_to_struct), GFP_KERNEL); if (!p->user_val_to_struct) return -ENOMEM; p->type_val_to_struct = kvcalloc( p->p_types.nprim, sizeof(*p->type_val_to_struct), GFP_KERNEL); if (!p->type_val_to_struct) return -ENOMEM; rc = cond_init_bool_indexes(p); if (rc) goto out; for (i = 0; i < SYM_NUM; i++) { p->sym_val_to_name[i] = kvcalloc(p->symtab[i].nprim, sizeof(char *), GFP_KERNEL); if (!p->sym_val_to_name[i]) return -ENOMEM; rc = hashtab_map(&p->symtab[i].table, index_f[i], p); if (rc) goto out; } rc = 0; out: return rc; } /* * Free any memory allocated by a policy database structure. */ void policydb_destroy(struct policydb *p) { struct ocontext *c, *ctmp; struct genfs *g, *gtmp; u32 i; struct role_allow *ra, *lra = NULL; for (i = 0; i < SYM_NUM; i++) { cond_resched(); hashtab_map(&p->symtab[i].table, destroy_f[i], NULL); hashtab_destroy(&p->symtab[i].table); } for (i = 0; i < SYM_NUM; i++) kvfree(p->sym_val_to_name[i]); kfree(p->class_val_to_struct); kfree(p->role_val_to_struct); kfree(p->user_val_to_struct); kvfree(p->type_val_to_struct); avtab_destroy(&p->te_avtab); for (i = 0; i < OCON_NUM; i++) { cond_resched(); c = p->ocontexts[i]; while (c) { ctmp = c; c = c->next; ocontext_destroy(ctmp, i); } p->ocontexts[i] = NULL; } g = p->genfs; while (g) { cond_resched(); kfree(g->fstype); c = g->head; while (c) { ctmp = c; c = c->next; ocontext_destroy(ctmp, OCON_FSUSE); } gtmp = g; g = g->next; kfree(gtmp); } p->genfs = NULL; cond_policydb_destroy(p); hashtab_map(&p->role_tr, role_tr_destroy, NULL); hashtab_destroy(&p->role_tr); for (ra = p->role_allow; ra; ra = ra->next) { cond_resched(); kfree(lra); lra = ra; } kfree(lra); hashtab_map(&p->filename_trans, filenametr_destroy, NULL); hashtab_destroy(&p->filename_trans); hashtab_map(&p->range_tr, range_tr_destroy, NULL); hashtab_destroy(&p->range_tr); if (p->type_attr_map_array) { for (i = 0; i < p->p_types.nprim; i++) ebitmap_destroy(&p->type_attr_map_array[i]); kvfree(p->type_attr_map_array); } ebitmap_destroy(&p->filename_trans_ttypes); ebitmap_destroy(&p->policycaps); ebitmap_destroy(&p->permissive_map); ebitmap_destroy(&p->neveraudit_map); } /* * Load the initial SIDs specified in a policy database * structure into a SID table. */ int policydb_load_isids(struct policydb *p, struct sidtab *s) { struct ocontext *head, *c; bool isid_init; int rc; rc = sidtab_init(s); if (rc) { pr_err("SELinux: out of memory on SID table init\n"); return rc; } isid_init = ebitmap_get_bit(&p->policycaps, POLICYDB_CAP_USERSPACE_INITIAL_CONTEXT); head = p->ocontexts[OCON_ISID]; for (c = head; c; c = c->next) { u32 sid = c->sid[0]; const char *name = security_get_initial_sid_context(sid); if (sid == SECSID_NULL) { pr_err("SELinux: SID 0 was assigned a context.\n"); sidtab_destroy(s); return -EINVAL; } /* Ignore initial SIDs unused by this kernel. */ if (!name) continue; /* * Also ignore SECINITSID_INIT if the policy doesn't declare * support for it */ if (sid == SECINITSID_INIT && !isid_init) continue; rc = sidtab_set_initial(s, sid, &c->context[0]); if (rc) { pr_err("SELinux: unable to load initial SID %s.\n", name); sidtab_destroy(s); return rc; } /* * If the policy doesn't support the "userspace_initial_context" * capability, set SECINITSID_INIT to the same context as * SECINITSID_KERNEL. This ensures the same behavior as before * the reintroduction of SECINITSID_INIT, where all tasks * started before policy load would initially get the context * corresponding to SECINITSID_KERNEL. */ if (sid == SECINITSID_KERNEL && !isid_init) { rc = sidtab_set_initial(s, SECINITSID_INIT, &c->context[0]); if (rc) { pr_err("SELinux: unable to load initial SID %s.\n", name); sidtab_destroy(s); return rc; } } } return 0; } int policydb_class_isvalid(struct policydb *p, unsigned int class) { if (!class || class > p->p_classes.nprim) return 0; return 1; } int policydb_role_isvalid(struct policydb *p, unsigned int role) { if (!role || role > p->p_roles.nprim) return 0; return 1; } int policydb_type_isvalid(struct policydb *p, unsigned int type) { if (!type || type > p->p_types.nprim) return 0; return 1; } /* * Return 1 if the fields in the security context * structure `c' are valid. Return 0 otherwise. */ int policydb_context_isvalid(struct policydb *p, struct context *c) { struct role_datum *role; struct user_datum *usrdatum; if (!c->role || c->role > p->p_roles.nprim) return 0; if (!c->user || c->user > p->p_users.nprim) return 0; if (!c->type || c->type > p->p_types.nprim) return 0; if (c->role != OBJECT_R_VAL) { /* * Role must be authorized for the type. */ role = p->role_val_to_struct[c->role - 1]; if (!role || !ebitmap_get_bit(&role->types, c->type - 1)) /* role may not be associated with type */ return 0; /* * User must be authorized for the role. */ usrdatum = p->user_val_to_struct[c->user - 1]; if (!usrdatum) return 0; if (!ebitmap_get_bit(&usrdatum->roles, c->role - 1)) /* user may not be associated with role */ return 0; } if (!mls_context_isvalid(p, c)) return 0; return 1; } /* * Read a MLS range structure from a policydb binary * representation file. */ static int mls_read_range_helper(struct mls_range *r, struct policy_file *fp) { __le32 buf[2]; u32 items; int rc; rc = next_entry(buf, fp, sizeof(u32)); if (rc) goto out; rc = -EINVAL; items = le32_to_cpu(buf[0]); if (items > ARRAY_SIZE(buf)) { pr_err("SELinux: mls: range overflow\n"); goto out; } rc = next_entry(buf, fp, sizeof(u32) * items); if (rc) { pr_err("SELinux: mls: truncated range\n"); goto out; } r->level[0].sens = le32_to_cpu(buf[0]); if (items > 1) r->level[1].sens = le32_to_cpu(buf[1]); else r->level[1].sens = r->level[0].sens; rc = ebitmap_read(&r->level[0].cat, fp); if (rc) { pr_err("SELinux: mls: error reading low categories\n"); goto out; } if (items > 1) { rc = ebitmap_read(&r->level[1].cat, fp); if (rc) { pr_err("SELinux: mls: error reading high categories\n"); goto bad_high; } } else { rc = ebitmap_cpy(&r->level[1].cat, &r->level[0].cat); if (rc) { pr_err("SELinux: mls: out of memory\n"); goto bad_high; } } return 0; bad_high: ebitmap_destroy(&r->level[0].cat); out: return rc; } /* * Read and validate a security context structure * from a policydb binary representation file. */ static int context_read_and_validate(struct context *c, struct policydb *p, struct policy_file *fp) { __le32 buf[3]; int rc; rc = next_entry(buf, fp, sizeof buf); if (rc) { pr_err("SELinux: context truncated\n"); goto out; } c->user = le32_to_cpu(buf[0]); c->role = le32_to_cpu(buf[1]); c->type = le32_to_cpu(buf[2]); if (p->policyvers >= POLICYDB_VERSION_MLS) { rc = mls_read_range_helper(&c->range, fp); if (rc) { pr_err("SELinux: error reading MLS range of context\n"); goto out; } } rc = -EINVAL; if (!policydb_context_isvalid(p, c)) { pr_err("SELinux: invalid security context\n"); context_destroy(c); goto out; } rc = 0; out: return rc; } /* * The following *_read functions are used to * read the symbol data from a policy database * binary representation file. */ int str_read(char **strp, gfp_t flags, struct policy_file *fp, u32 len) { int rc; char *str; if ((len == 0) || (len == (u32)-1)) return -EINVAL; str = kmalloc(len + 1, flags | __GFP_NOWARN); if (!str) return -ENOMEM; rc = next_entry(str, fp, len); if (rc) { kfree(str); return rc; } str[len] = '\0'; *strp = str; return 0; } static int perm_read(struct policydb *p, struct symtab *s, struct policy_file *fp) { char *key = NULL; struct perm_datum *perdatum; int rc; __le32 buf[2]; u32 len; perdatum = kzalloc(sizeof(*perdatum), GFP_KERNEL); if (!perdatum) return -ENOMEM; rc = next_entry(buf, fp, sizeof buf); if (rc) goto bad; len = le32_to_cpu(buf[0]); perdatum->value = le32_to_cpu(buf[1]); rc = str_read(&key, GFP_KERNEL, fp, len); if (rc) goto bad; rc = symtab_insert(s, key, perdatum); if (rc) goto bad; return 0; bad: perm_destroy(key, perdatum, NULL); return rc; } static int common_read(struct policydb *p, struct symtab *s, struct policy_file *fp) { char *key = NULL; struct common_datum *comdatum; __le32 buf[4]; u32 i, len, nel; int rc; comdatum = kzalloc(sizeof(*comdatum), GFP_KERNEL); if (!comdatum) return -ENOMEM; rc = next_entry(buf, fp, sizeof buf); if (rc) goto bad; len = le32_to_cpu(buf[0]); comdatum->value = le32_to_cpu(buf[1]); nel = le32_to_cpu(buf[3]); rc = symtab_init(&comdatum->permissions, nel); if (rc) goto bad; comdatum->permissions.nprim = le32_to_cpu(buf[2]); rc = str_read(&key, GFP_KERNEL, fp, len); if (rc) goto bad; for (i = 0; i < nel; i++) { rc = perm_read(p, &comdatum->permissions, fp); if (rc) goto bad; } hash_eval(&comdatum->permissions.table, "common_permissions", key); rc = symtab_insert(s, key, comdatum); if (rc) goto bad; return 0; bad: common_destroy(key, comdatum, NULL); return rc; } static void type_set_init(struct type_set *t) { ebitmap_init(&t->types); ebitmap_init(&t->negset); } static int type_set_read(struct type_set *t, struct policy_file *fp) { __le32 buf[1]; int rc; if (ebitmap_read(&t->types, fp)) return -EINVAL; if (ebitmap_read(&t->negset, fp)) return -EINVAL; rc = next_entry(buf, fp, sizeof(u32)); if (rc < 0) return -EINVAL; t->flags = le32_to_cpu(buf[0]); return 0; } static int read_cons_helper(struct policydb *p, struct constraint_node **nodep, u32 ncons, int allowxtarget, struct policy_file *fp) { struct constraint_node *c, *lc; struct constraint_expr *e, *le; __le32 buf[3]; u32 i, j, nexpr; int rc, depth; lc = NULL; for (i = 0; i < ncons; i++) { c = kzalloc(sizeof(*c), GFP_KERNEL); if (!c) return -ENOMEM; if (lc) lc->next = c; else *nodep = c; rc = next_entry(buf, fp, (sizeof(u32) * 2)); if (rc) return rc; c->permissions = le32_to_cpu(buf[0]); nexpr = le32_to_cpu(buf[1]); le = NULL; depth = -1; for (j = 0; j < nexpr; j++) { e = kzalloc(sizeof(*e), GFP_KERNEL); if (!e) return -ENOMEM; if (le) le->next = e; else c->expr = e; rc = next_entry(buf, fp, (sizeof(u32) * 3)); if (rc) return rc; e->expr_type = le32_to_cpu(buf[0]); e->attr = le32_to_cpu(buf[1]); e->op = le32_to_cpu(buf[2]); switch (e->expr_type) { case CEXPR_NOT: if (depth < 0) return -EINVAL; break; case CEXPR_AND: case CEXPR_OR: if (depth < 1) return -EINVAL; depth--; break; case CEXPR_ATTR: if (depth == (CEXPR_MAXDEPTH - 1)) return -EINVAL; depth++; break; case CEXPR_NAMES: if (!allowxtarget && (e->attr & CEXPR_XTARGET)) return -EINVAL; if (depth == (CEXPR_MAXDEPTH - 1)) return -EINVAL; depth++; rc = ebitmap_read(&e->names, fp); if (rc) return rc; if (p->policyvers >= POLICYDB_VERSION_CONSTRAINT_NAMES) { e->type_names = kzalloc(sizeof(*e->type_names), GFP_KERNEL); if (!e->type_names) return -ENOMEM; type_set_init(e->type_names); rc = type_set_read(e->type_names, fp); if (rc) return rc; } break; default: return -EINVAL; } le = e; } if (depth != 0) return -EINVAL; lc = c; } return 0; } static int class_read(struct policydb *p, struct symtab *s, struct policy_file *fp) { char *key = NULL; struct class_datum *cladatum; __le32 buf[6]; u32 i, len, len2, ncons, nel; int rc; cladatum = kzalloc(sizeof(*cladatum), GFP_KERNEL); if (!cladatum) return -ENOMEM; rc = next_entry(buf, fp, sizeof(u32) * 6); if (rc) goto bad; len = le32_to_cpu(buf[0]); len2 = le32_to_cpu(buf[1]); cladatum->value = le32_to_cpu(buf[2]); nel = le32_to_cpu(buf[4]); rc = symtab_init(&cladatum->permissions, nel); if (rc) goto bad; cladatum->permissions.nprim = le32_to_cpu(buf[3]); ncons = le32_to_cpu(buf[5]); rc = str_read(&key, GFP_KERNEL, fp, len); if (rc) goto bad; if (len2) { rc = str_read(&cladatum->comkey, GFP_KERNEL, fp, len2); if (rc) goto bad; rc = -EINVAL; cladatum->comdatum = symtab_search(&p->p_commons, cladatum->comkey); if (!cladatum->comdatum) { pr_err("SELinux: unknown common %s\n", cladatum->comkey); goto bad; } } for (i = 0; i < nel; i++) { rc = perm_read(p, &cladatum->permissions, fp); if (rc) goto bad; } hash_eval(&cladatum->permissions.table, "class_permissions", key); rc = read_cons_helper(p, &cladatum->constraints, ncons, 0, fp); if (rc) goto bad; if (p->policyvers >= POLICYDB_VERSION_VALIDATETRANS) { /* grab the validatetrans rules */ rc = next_entry(buf, fp, sizeof(u32)); if (rc) goto bad; ncons = le32_to_cpu(buf[0]); rc = read_cons_helper(p, &cladatum->validatetrans, ncons, 1, fp); if (rc) goto bad; } if (p->policyvers >= POLICYDB_VERSION_NEW_OBJECT_DEFAULTS) { rc = next_entry(buf, fp, sizeof(u32) * 3); if (rc) goto bad; cladatum->default_user = le32_to_cpu(buf[0]); cladatum->default_role = le32_to_cpu(buf[1]); cladatum->default_range = le32_to_cpu(buf[2]); } if (p->policyvers >= POLICYDB_VERSION_DEFAULT_TYPE) { rc = next_entry(buf, fp, sizeof(u32) * 1); if (rc) goto bad; cladatum->default_type = le32_to_cpu(buf[0]); } rc = symtab_insert(s, key, cladatum); if (rc) goto bad; return 0; bad: cls_destroy(key, cladatum, NULL); return rc; } static int role_read(struct policydb *p, struct symtab *s, struct policy_file *fp) { char *key = NULL; struct role_datum *role; int rc; unsigned int to_read = 2; __le32 buf[3]; u32 len; role = kzalloc(sizeof(*role), GFP_KERNEL); if (!role) return -ENOMEM; if (p->policyvers >= POLICYDB_VERSION_BOUNDARY) to_read = 3; rc = next_entry(buf, fp, sizeof(buf[0]) * to_read); if (rc) goto bad; len = le32_to_cpu(buf[0]); role->value = le32_to_cpu(buf[1]); if (p->policyvers >= POLICYDB_VERSION_BOUNDARY) role->bounds = le32_to_cpu(buf[2]); rc = str_read(&key, GFP_KERNEL, fp, len); if (rc) goto bad; rc = ebitmap_read(&role->dominates, fp); if (rc) goto bad; rc = ebitmap_read(&role->types, fp); if (rc) goto bad; if (strcmp(key, OBJECT_R) == 0) { rc = -EINVAL; if (role->value != OBJECT_R_VAL) { pr_err("SELinux: Role %s has wrong value %d\n", OBJECT_R, role->value); goto bad; } rc = 0; goto bad; } rc = symtab_insert(s, key, role); if (rc) goto bad; return 0; bad: role_destroy(key, role, NULL); return rc; } static int type_read(struct policydb *p, struct symtab *s, struct policy_file *fp) { char *key = NULL; struct type_datum *typdatum; int rc; unsigned int to_read = 3; __le32 buf[4]; u32 len; typdatum = kzalloc(sizeof(*typdatum), GFP_KERNEL); if (!typdatum) return -ENOMEM; if (p->policyvers >= POLICYDB_VERSION_BOUNDARY) to_read = 4; rc = next_entry(buf, fp, sizeof(buf[0]) * to_read); if (rc) goto bad; len = le32_to_cpu(buf[0]); typdatum->value = le32_to_cpu(buf[1]); if (p->policyvers >= POLICYDB_VERSION_BOUNDARY) { u32 prop = le32_to_cpu(buf[2]); if (prop & TYPEDATUM_PROPERTY_PRIMARY) typdatum->primary = 1; if (prop & TYPEDATUM_PROPERTY_ATTRIBUTE) typdatum->attribute = 1; typdatum->bounds = le32_to_cpu(buf[3]); } else { typdatum->primary = le32_to_cpu(buf[2]); } rc = str_read(&key, GFP_KERNEL, fp, len); if (rc) goto bad; rc = symtab_insert(s, key, typdatum); if (rc) goto bad; return 0; bad: type_destroy(key, typdatum, NULL); return rc; } /* * Read a MLS level structure from a policydb binary * representation file. */ static int mls_read_level(struct mls_level *lp, struct policy_file *fp) { __le32 buf[1]; int rc; memset(lp, 0, sizeof(*lp)); rc = next_entry(buf, fp, sizeof buf); if (rc) { pr_err("SELinux: mls: truncated level\n"); return rc; } lp->sens = le32_to_cpu(buf[0]); rc = ebitmap_read(&lp->cat, fp); if (rc) { pr_err("SELinux: mls: error reading level categories\n"); return rc; } return 0; } static int user_read(struct policydb *p, struct symtab *s, struct policy_file *fp) { char *key = NULL; struct user_datum *usrdatum; int rc; unsigned int to_read = 2; __le32 buf[3]; u32 len; usrdatum = kzalloc(sizeof(*usrdatum), GFP_KERNEL); if (!usrdatum) return -ENOMEM; if (p->policyvers >= POLICYDB_VERSION_BOUNDARY) to_read = 3; rc = next_entry(buf, fp, sizeof(buf[0]) * to_read); if (rc) goto bad; len = le32_to_cpu(buf[0]); usrdatum->value = le32_to_cpu(buf[1]); if (p->policyvers >= POLICYDB_VERSION_BOUNDARY) usrdatum->bounds = le32_to_cpu(buf[2]); rc = str_read(&key, GFP_KERNEL, fp, len); if (rc) goto bad; rc = ebitmap_read(&usrdatum->roles, fp); if (rc) goto bad; if (p->policyvers >= POLICYDB_VERSION_MLS) { rc = mls_read_range_helper(&usrdatum->range, fp); if (rc) goto bad; rc = mls_read_level(&usrdatum->dfltlevel, fp); if (rc) goto bad; } rc = symtab_insert(s, key, usrdatum); if (rc) goto bad; return 0; bad: user_destroy(key, usrdatum, NULL); return rc; } static int sens_read(struct policydb *p, struct symtab *s, struct policy_file *fp) { char *key = NULL; struct level_datum *levdatum; int rc; __le32 buf[2]; u32 len; levdatum = kzalloc(sizeof(*levdatum), GFP_KERNEL); if (!levdatum) return -ENOMEM; rc = next_entry(buf, fp, sizeof buf); if (rc) goto bad; len = le32_to_cpu(buf[0]); levdatum->isalias = le32_to_cpu(buf[1]); rc = str_read(&key, GFP_KERNEL, fp, len); if (rc) goto bad; rc = mls_read_level(&levdatum->level, fp); if (rc) goto bad; rc = symtab_insert(s, key, levdatum); if (rc) goto bad; return 0; bad: sens_destroy(key, levdatum, NULL); return rc; } static int cat_read(struct policydb *p, struct symtab *s, struct policy_file *fp) { char *key = NULL; struct cat_datum *catdatum; int rc; __le32 buf[3]; u32 len; catdatum = kzalloc(sizeof(*catdatum), GFP_KERNEL); if (!catdatum) return -ENOMEM; rc = next_entry(buf, fp, sizeof buf); if (rc) goto bad; len = le32_to_cpu(buf[0]); catdatum->value = le32_to_cpu(buf[1]); catdatum->isalias = le32_to_cpu(buf[2]); rc = str_read(&key, GFP_KERNEL, fp, len); if (rc) goto bad; rc = symtab_insert(s, key, catdatum); if (rc) goto bad; return 0; bad: cat_destroy(key, catdatum, NULL); return rc; } /* clang-format off */ static int (*const read_f[SYM_NUM])(struct policydb *p, struct symtab *s, struct policy_file *fp) = { common_read, class_read, role_read, type_read, user_read, cond_read_bool, sens_read, cat_read, }; /* clang-format on */ static int user_bounds_sanity_check(void *key, void *datum, void *datap) { struct user_datum *upper, *user; struct policydb *p = datap; int depth = 0; upper = user = datum; while (upper->bounds) { struct ebitmap_node *node; u32 bit; if (++depth == POLICYDB_BOUNDS_MAXDEPTH) { pr_err("SELinux: user %s: " "too deep or looped boundary\n", (char *)key); return -EINVAL; } upper = p->user_val_to_struct[upper->bounds - 1]; ebitmap_for_each_positive_bit(&user->roles, node, bit) { if (ebitmap_get_bit(&upper->roles, bit)) continue; pr_err("SELinux: boundary violated policy: " "user=%s role=%s bounds=%s\n", sym_name(p, SYM_USERS, user->value - 1), sym_name(p, SYM_ROLES, bit), sym_name(p, SYM_USERS, upper->value - 1)); return -EINVAL; } } return 0; } static int role_bounds_sanity_check(void *key, void *datum, void *datap) { struct role_datum *upper, *role; struct policydb *p = datap; int depth = 0; upper = role = datum; while (upper->bounds) { struct ebitmap_node *node; u32 bit; if (++depth == POLICYDB_BOUNDS_MAXDEPTH) { pr_err("SELinux: role %s: " "too deep or looped bounds\n", (char *)key); return -EINVAL; } upper = p->role_val_to_struct[upper->bounds - 1]; ebitmap_for_each_positive_bit(&role->types, node, bit) { if (ebitmap_get_bit(&upper->types, bit)) continue; pr_err("SELinux: boundary violated policy: " "role=%s type=%s bounds=%s\n", sym_name(p, SYM_ROLES, role->value - 1), sym_name(p, SYM_TYPES, bit), sym_name(p, SYM_ROLES, upper->value - 1)); return -EINVAL; } } return 0; } static int type_bounds_sanity_check(void *key, void *datum, void *datap) { struct type_datum *upper; struct policydb *p = datap; int depth = 0; upper = datum; while (upper->bounds) { if (++depth == POLICYDB_BOUNDS_MAXDEPTH) { pr_err("SELinux: type %s: " "too deep or looped boundary\n", (char *)key); return -EINVAL; } upper = p->type_val_to_struct[upper->bounds - 1]; BUG_ON(!upper); if (upper->attribute) { pr_err("SELinux: type %s: " "bounded by attribute %s\n", (char *)key, sym_name(p, SYM_TYPES, upper->value - 1)); return -EINVAL; } } return 0; } static int policydb_bounds_sanity_check(struct policydb *p) { int rc; if (p->policyvers < POLICYDB_VERSION_BOUNDARY) return 0; rc = hashtab_map(&p->p_users.table, user_bounds_sanity_check, p); if (rc) return rc; rc = hashtab_map(&p->p_roles.table, role_bounds_sanity_check, p); if (rc) return rc; rc = hashtab_map(&p->p_types.table, type_bounds_sanity_check, p); if (rc) return rc; return 0; } u16 string_to_security_class(struct policydb *p, const char *name) { struct class_datum *cladatum; cladatum = symtab_search(&p->p_classes, name); if (!cladatum) return 0; return cladatum->value; } u32 string_to_av_perm(struct policydb *p, u16 tclass, const char *name) { struct class_datum *cladatum; struct perm_datum *perdatum = NULL; struct common_datum *comdatum; if (!tclass || tclass > p->p_classes.nprim) return 0; cladatum = p->class_val_to_struct[tclass - 1]; comdatum = cladatum->comdatum; if (comdatum) perdatum = symtab_search(&comdatum->permissions, name); if (!perdatum) perdatum = symtab_search(&cladatum->permissions, name); if (!perdatum) return 0; return 1U << (perdatum->value - 1); } static int range_read(struct policydb *p, struct policy_file *fp) { struct range_trans *rt = NULL; struct mls_range *r = NULL; int rc; __le32 buf[2]; u32 i, nel; if (p->policyvers < POLICYDB_VERSION_MLS) return 0; rc = next_entry(buf, fp, sizeof(u32)); if (rc) return rc; nel = le32_to_cpu(buf[0]); rc = hashtab_init(&p->range_tr, nel); if (rc) return rc; for (i = 0; i < nel; i++) { rc = -ENOMEM; rt = kzalloc(sizeof(*rt), GFP_KERNEL); if (!rt) goto out; rc = next_entry(buf, fp, (sizeof(u32) * 2)); if (rc) goto out; rt->source_type = le32_to_cpu(buf[0]); rt->target_type = le32_to_cpu(buf[1]); if (p->policyvers >= POLICYDB_VERSION_RANGETRANS) { rc = next_entry(buf, fp, sizeof(u32)); if (rc) goto out; rt->target_class = le32_to_cpu(buf[0]); } else rt->target_class = p->process_class; rc = -EINVAL; if (!policydb_type_isvalid(p, rt->source_type) || !policydb_type_isvalid(p, rt->target_type) || !policydb_class_isvalid(p, rt->target_class)) goto out; rc = -ENOMEM; r = kzalloc(sizeof(*r), GFP_KERNEL); if (!r) goto out; rc = mls_read_range_helper(r, fp); if (rc) goto out; rc = -EINVAL; if (!mls_range_isvalid(p, r)) { pr_warn("SELinux: rangetrans: invalid range\n"); goto out; } rc = hashtab_insert(&p->range_tr, rt, r, rangetr_key_params); if (rc) goto out; rt = NULL; r = NULL; } hash_eval(&p->range_tr, "rangetr", NULL); rc = 0; out: kfree(rt); kfree(r); return rc; } static int filename_trans_read_helper_compat(struct policydb *p, struct policy_file *fp) { struct filename_trans_key key, *ft = NULL; struct filename_trans_datum *last, *datum = NULL; char *name = NULL; u32 len, stype, otype; __le32 buf[4]; int rc; /* length of the path component string */ rc = next_entry(buf, fp, sizeof(u32)); if (rc) return rc; len = le32_to_cpu(buf[0]); /* path component string */ rc = str_read(&name, GFP_KERNEL, fp, len); if (rc) return rc; rc = next_entry(buf, fp, sizeof(u32) * 4); if (rc) goto out; stype = le32_to_cpu(buf[0]); key.ttype = le32_to_cpu(buf[1]); key.tclass = le32_to_cpu(buf[2]); key.name = name; otype = le32_to_cpu(buf[3]); last = NULL; datum = policydb_filenametr_search(p, &key); while (datum) { if (unlikely(ebitmap_get_bit(&datum->stypes, stype - 1))) { /* conflicting/duplicate rules are ignored */ datum = NULL; rc = 0; goto out; } if (likely(datum->otype == otype)) break; last = datum; datum = datum->next; } if (!datum) { rc = -ENOMEM; datum = kmalloc(sizeof(*datum), GFP_KERNEL); if (!datum) goto out; ebitmap_init(&datum->stypes); datum->otype = otype; datum->next = NULL; if (unlikely(last)) { last->next = datum; } else { rc = -ENOMEM; ft = kmemdup(&key, sizeof(key), GFP_KERNEL); if (!ft) goto out; rc = hashtab_insert(&p->filename_trans, ft, datum, filenametr_key_params); if (rc) goto out; name = NULL; rc = ebitmap_set_bit(&p->filename_trans_ttypes, key.ttype, 1); if (rc) return rc; } } kfree(name); return ebitmap_set_bit(&datum->stypes, stype - 1, 1); out: kfree(ft); kfree(name); kfree(datum); return rc; } static int filename_trans_read_helper(struct policydb *p, struct policy_file *fp) { struct filename_trans_key *ft = NULL; struct filename_trans_datum **dst, *datum, *first = NULL; char *name = NULL; u32 len, ttype, tclass, ndatum, i; __le32 buf[3]; int rc; /* length of the path component string */ rc = next_entry(buf, fp, sizeof(u32)); if (rc) return rc; len = le32_to_cpu(buf[0]); /* path component string */ rc = str_read(&name, GFP_KERNEL, fp, len); if (rc) return rc; rc = next_entry(buf, fp, sizeof(u32) * 3); if (rc) goto out; ttype = le32_to_cpu(buf[0]); tclass = le32_to_cpu(buf[1]); ndatum = le32_to_cpu(buf[2]); if (ndatum == 0) { pr_err("SELinux: Filename transition key with no datum\n"); rc = -ENOENT; goto out; } dst = &first; for (i = 0; i < ndatum; i++) { rc = -ENOMEM; datum = kmalloc(sizeof(*datum), GFP_KERNEL); if (!datum) goto out; datum->next = NULL; *dst = datum; /* ebitmap_read() will at least init the bitmap */ rc = ebitmap_read(&datum->stypes, fp); if (rc) goto out; rc = next_entry(buf, fp, sizeof(u32)); if (rc) goto out; datum->otype = le32_to_cpu(buf[0]); dst = &datum->next; } rc = -ENOMEM; ft = kmalloc(sizeof(*ft), GFP_KERNEL); if (!ft) goto out; ft->ttype = ttype; ft->tclass = tclass; ft->name = name; rc = hashtab_insert(&p->filename_trans, ft, first, filenametr_key_params); if (rc == -EEXIST) pr_err("SELinux: Duplicate filename transition key\n"); if (rc) goto out; return ebitmap_set_bit(&p->filename_trans_ttypes, ttype, 1); out: kfree(ft); kfree(name); while (first) { datum = first; first = first->next; ebitmap_destroy(&datum->stypes); kfree(datum); } return rc; } static int filename_trans_read(struct policydb *p, struct policy_file *fp) { u32 nel, i; __le32 buf[1]; int rc; if (p->policyvers < POLICYDB_VERSION_FILENAME_TRANS) return 0; rc = next_entry(buf, fp, sizeof(u32)); if (rc) return rc; nel = le32_to_cpu(buf[0]); if (p->policyvers < POLICYDB_VERSION_COMP_FTRANS) { p->compat_filename_trans_count = nel; rc = hashtab_init(&p->filename_trans, (1 << 11)); if (rc) return rc; for (i = 0; i < nel; i++) { rc = filename_trans_read_helper_compat(p, fp); if (rc) return rc; } } else { rc = hashtab_init(&p->filename_trans, nel); if (rc) return rc; for (i = 0; i < nel; i++) { rc = filename_trans_read_helper(p, fp); if (rc) return rc; } } hash_eval(&p->filename_trans, "filenametr", NULL); return 0; } static int genfs_read(struct policydb *p, struct policy_file *fp) { int rc; u32 i, j, nel, nel2, len, len2; __le32 buf[1]; struct ocontext *l, *c; struct ocontext *newc = NULL; struct genfs *genfs_p, *genfs; struct genfs *newgenfs = NULL; rc = next_entry(buf, fp, sizeof(u32)); if (rc) return rc; nel = le32_to_cpu(buf[0]); for (i = 0; i < nel; i++) { rc = next_entry(buf, fp, sizeof(u32)); if (rc) goto out; len = le32_to_cpu(buf[0]); rc = -ENOMEM; newgenfs = kzalloc(sizeof(*newgenfs), GFP_KERNEL); if (!newgenfs) goto out; rc = str_read(&newgenfs->fstype, GFP_KERNEL, fp, len); if (rc) goto out; for (genfs_p = NULL, genfs = p->genfs; genfs; genfs_p = genfs, genfs = genfs->next) { rc = -EINVAL; if (strcmp(newgenfs->fstype, genfs->fstype) == 0) { pr_err("SELinux: dup genfs fstype %s\n", newgenfs->fstype); goto out; } if (strcmp(newgenfs->fstype, genfs->fstype) < 0) break; } newgenfs->next = genfs; if (genfs_p) genfs_p->next = newgenfs; else p->genfs = newgenfs; genfs = newgenfs; newgenfs = NULL; rc = next_entry(buf, fp, sizeof(u32)); if (rc) goto out; nel2 = le32_to_cpu(buf[0]); for (j = 0; j < nel2; j++) { rc = next_entry(buf, fp, sizeof(u32)); if (rc) goto out; len = le32_to_cpu(buf[0]); rc = -ENOMEM; newc = kzalloc(sizeof(*newc), GFP_KERNEL); if (!newc) goto out; rc = str_read(&newc->u.name, GFP_KERNEL, fp, len); if (rc) goto out; rc = next_entry(buf, fp, sizeof(u32)); if (rc) goto out; newc->v.sclass = le32_to_cpu(buf[0]); rc = context_read_and_validate(&newc->context[0], p, fp); if (rc) goto out; for (l = NULL, c = genfs->head; c; l = c, c = c->next) { rc = -EINVAL; if (!strcmp(newc->u.name, c->u.name) && (!c->v.sclass || !newc->v.sclass || newc->v.sclass == c->v.sclass)) { pr_err("SELinux: dup genfs entry (%s,%s)\n", genfs->fstype, c->u.name); goto out; } len = strlen(newc->u.name); len2 = strlen(c->u.name); if (len > len2) break; } newc->next = c; if (l) l->next = newc; else genfs->head = newc; newc = NULL; } } rc = 0; out: if (newgenfs) { kfree(newgenfs->fstype); kfree(newgenfs); } ocontext_destroy(newc, OCON_FSUSE); return rc; } static int ocontext_read(struct policydb *p, const struct policydb_compat_info *info, struct policy_file *fp) { int rc; unsigned int i; u32 j, nel, len; __be64 prefixbuf[1]; __le32 buf[3]; struct ocontext *l, *c; u32 nodebuf[8]; for (i = 0; i < info->ocon_num; i++) { rc = next_entry(buf, fp, sizeof(u32)); if (rc) goto out; nel = le32_to_cpu(buf[0]); l = NULL; for (j = 0; j < nel; j++) { rc = -ENOMEM; c = kzalloc(sizeof(*c), GFP_KERNEL); if (!c) goto out; if (l) l->next = c; else p->ocontexts[i] = c; l = c; switch (i) { case OCON_ISID: rc = next_entry(buf, fp, sizeof(u32)); if (rc) goto out; c->sid[0] = le32_to_cpu(buf[0]); rc = context_read_and_validate(&c->context[0], p, fp); if (rc) goto out; break; case OCON_FS: case OCON_NETIF: rc = next_entry(buf, fp, sizeof(u32)); if (rc) goto out; len = le32_to_cpu(buf[0]); rc = str_read(&c->u.name, GFP_KERNEL, fp, len); if (rc) goto out; if (i == OCON_FS) pr_warn("SELinux: void and deprecated fs ocon %s\n", c->u.name); rc = context_read_and_validate(&c->context[0], p, fp); if (rc) goto out; rc = context_read_and_validate(&c->context[1], p, fp); if (rc) goto out; break; case OCON_PORT: rc = next_entry(buf, fp, sizeof(u32) * 3); if (rc) goto out; c->u.port.protocol = le32_to_cpu(buf[0]); c->u.port.low_port = le32_to_cpu(buf[1]); c->u.port.high_port = le32_to_cpu(buf[2]); rc = context_read_and_validate(&c->context[0], p, fp); if (rc) goto out; break; case OCON_NODE: rc = next_entry(nodebuf, fp, sizeof(u32) * 2); if (rc) goto out; c->u.node.addr = nodebuf[0]; /* network order */ c->u.node.mask = nodebuf[1]; /* network order */ rc = context_read_and_validate(&c->context[0], p, fp); if (rc) goto out; break; case OCON_FSUSE: rc = next_entry(buf, fp, sizeof(u32) * 2); if (rc) goto out; rc = -EINVAL; c->v.behavior = le32_to_cpu(buf[0]); /* Determined at runtime, not in policy DB. */ if (c->v.behavior == SECURITY_FS_USE_MNTPOINT) goto out; if (c->v.behavior > SECURITY_FS_USE_MAX) goto out; len = le32_to_cpu(buf[1]); rc = str_read(&c->u.name, GFP_KERNEL, fp, len); if (rc) goto out; rc = context_read_and_validate(&c->context[0], p, fp); if (rc) goto out; break; case OCON_NODE6: { int k; rc = next_entry(nodebuf, fp, sizeof(u32) * 8); if (rc) goto out; for (k = 0; k < 4; k++) c->u.node6.addr[k] = nodebuf[k]; for (k = 0; k < 4; k++) c->u.node6.mask[k] = nodebuf[k + 4]; rc = context_read_and_validate(&c->context[0], p, fp); if (rc) goto out; break; } case OCON_IBPKEY: { u32 pkey_lo, pkey_hi; rc = next_entry(prefixbuf, fp, sizeof(u64)); if (rc) goto out; /* we need to have subnet_prefix in CPU order */ c->u.ibpkey.subnet_prefix = be64_to_cpu(prefixbuf[0]); rc = next_entry(buf, fp, sizeof(u32) * 2); if (rc) goto out; pkey_lo = le32_to_cpu(buf[0]); pkey_hi = le32_to_cpu(buf[1]); if (pkey_lo > U16_MAX || pkey_hi > U16_MAX) { rc = -EINVAL; goto out; } c->u.ibpkey.low_pkey = pkey_lo; c->u.ibpkey.high_pkey = pkey_hi; rc = context_read_and_validate(&c->context[0], p, fp); if (rc) goto out; break; } case OCON_IBENDPORT: { u32 port; rc = next_entry(buf, fp, sizeof(u32) * 2); if (rc) goto out; len = le32_to_cpu(buf[0]); rc = str_read(&c->u.ibendport.dev_name, GFP_KERNEL, fp, len); if (rc) goto out; port = le32_to_cpu(buf[1]); if (port > U8_MAX || port == 0) { rc = -EINVAL; goto out; } c->u.ibendport.port = port; rc = context_read_and_validate(&c->context[0], p, fp); if (rc) goto out; break; } /* end case */ } /* end switch */ } } rc = 0; out: return rc; } /* * Read the configuration data from a policy database binary * representation file into a policy database structure. */ int policydb_read(struct policydb *p, struct policy_file *fp) { struct role_allow *ra, *lra; struct role_trans_key *rtk = NULL; struct role_trans_datum *rtd = NULL; int rc; __le32 buf[4]; u32 i, j, len, nprim, nel, perm; char *policydb_str; const struct policydb_compat_info *info; policydb_init(p); /* Read the magic number and string length. */ rc = next_entry(buf, fp, sizeof(u32) * 2); if (rc) goto bad; rc = -EINVAL; if (le32_to_cpu(buf[0]) != POLICYDB_MAGIC) { pr_err("SELinux: policydb magic number 0x%x does " "not match expected magic number 0x%x\n", le32_to_cpu(buf[0]), POLICYDB_MAGIC); goto bad; } rc = -EINVAL; len = le32_to_cpu(buf[1]); if (len != strlen(POLICYDB_STRING)) { pr_err("SELinux: policydb string length %d does not " "match expected length %zu\n", len, strlen(POLICYDB_STRING)); goto bad; } rc = str_read(&policydb_str, GFP_KERNEL, fp, len); if (rc) { if (rc == -ENOMEM) { pr_err("SELinux: unable to allocate memory for policydb string of length %d\n", len); } else { pr_err("SELinux: truncated policydb string identifier\n"); } goto bad; } rc = -EINVAL; if (strcmp(policydb_str, POLICYDB_STRING)) { pr_err("SELinux: policydb string %s does not match " "my string %s\n", policydb_str, POLICYDB_STRING); kfree(policydb_str); goto bad; } /* Done with policydb_str. */ kfree(policydb_str); policydb_str = NULL; /* Read the version and table sizes. */ rc = next_entry(buf, fp, sizeof(u32) * 4); if (rc) goto bad; rc = -EINVAL; p->policyvers = le32_to_cpu(buf[0]); if (p->policyvers < POLICYDB_VERSION_MIN || p->policyvers > POLICYDB_VERSION_MAX) { pr_err("SELinux: policydb version %d does not match " "my version range %d-%d\n", le32_to_cpu(buf[0]), POLICYDB_VERSION_MIN, POLICYDB_VERSION_MAX); goto bad; } if ((le32_to_cpu(buf[1]) & POLICYDB_CONFIG_MLS)) { p->mls_enabled = 1; rc = -EINVAL; if (p->policyvers < POLICYDB_VERSION_MLS) { pr_err("SELinux: security policydb version %d " "(MLS) not backwards compatible\n", p->policyvers); goto bad; } } p->reject_unknown = !!(le32_to_cpu(buf[1]) & REJECT_UNKNOWN); p->allow_unknown = !!(le32_to_cpu(buf[1]) & ALLOW_UNKNOWN); if (p->policyvers >= POLICYDB_VERSION_POLCAP) { rc = ebitmap_read(&p->policycaps, fp); if (rc) goto bad; } if (p->policyvers >= POLICYDB_VERSION_PERMISSIVE) { rc = ebitmap_read(&p->permissive_map, fp); if (rc) goto bad; } if (p->policyvers >= POLICYDB_VERSION_NEVERAUDIT) { rc = ebitmap_read(&p->neveraudit_map, fp); if (rc) goto bad; } rc = -EINVAL; info = policydb_lookup_compat(p->policyvers); if (!info) { pr_err("SELinux: unable to find policy compat info " "for version %d\n", p->policyvers); goto bad; } rc = -EINVAL; if (le32_to_cpu(buf[2]) != info->sym_num || le32_to_cpu(buf[3]) != info->ocon_num) { pr_err("SELinux: policydb table sizes (%d,%d) do " "not match mine (%d,%d)\n", le32_to_cpu(buf[2]), le32_to_cpu(buf[3]), info->sym_num, info->ocon_num); goto bad; } for (i = 0; i < info->sym_num; i++) { rc = next_entry(buf, fp, sizeof(u32) * 2); if (rc) goto bad; nprim = le32_to_cpu(buf[0]); nel = le32_to_cpu(buf[1]); rc = symtab_init(&p->symtab[i], nel); if (rc) goto out; if (i == SYM_ROLES) { rc = roles_init(p); if (rc) goto out; } for (j = 0; j < nel; j++) { rc = read_f[i](p, &p->symtab[i], fp); if (rc) goto bad; } p->symtab[i].nprim = nprim; } rc = -EINVAL; p->process_class = string_to_security_class(p, "process"); if (!p->process_class) { pr_err("SELinux: process class is required, not defined in policy\n"); goto bad; } rc = avtab_read(&p->te_avtab, fp, p); if (rc) goto bad; if (p->policyvers >= POLICYDB_VERSION_BOOL) { rc = cond_read_list(p, fp); if (rc) goto bad; } rc = next_entry(buf, fp, sizeof(u32)); if (rc) goto bad; nel = le32_to_cpu(buf[0]); rc = hashtab_init(&p->role_tr, nel); if (rc) goto bad; for (i = 0; i < nel; i++) { rc = -ENOMEM; rtk = kmalloc(sizeof(*rtk), GFP_KERNEL); if (!rtk) goto bad; rc = -ENOMEM; rtd = kmalloc(sizeof(*rtd), GFP_KERNEL); if (!rtd) goto bad; rc = next_entry(buf, fp, sizeof(u32) * 3); if (rc) goto bad; rtk->role = le32_to_cpu(buf[0]); rtk->type = le32_to_cpu(buf[1]); rtd->new_role = le32_to_cpu(buf[2]); if (p->policyvers >= POLICYDB_VERSION_ROLETRANS) { rc = next_entry(buf, fp, sizeof(u32)); if (rc) goto bad; rtk->tclass = le32_to_cpu(buf[0]); } else rtk->tclass = p->process_class; rc = -EINVAL; if (!policydb_role_isvalid(p, rtk->role) || !policydb_type_isvalid(p, rtk->type) || !policydb_class_isvalid(p, rtk->tclass) || !policydb_role_isvalid(p, rtd->new_role)) goto bad; rc = hashtab_insert(&p->role_tr, rtk, rtd, roletr_key_params); if (rc) goto bad; rtk = NULL; rtd = NULL; } hash_eval(&p->role_tr, "roletr", NULL); rc = next_entry(buf, fp, sizeof(u32)); if (rc) goto bad; nel = le32_to_cpu(buf[0]); lra = NULL; for (i = 0; i < nel; i++) { rc = -ENOMEM; ra = kzalloc(sizeof(*ra), GFP_KERNEL); if (!ra) goto bad; if (lra) lra->next = ra; else p->role_allow = ra; rc = next_entry(buf, fp, sizeof(u32) * 2); if (rc) goto bad; rc = -EINVAL; ra->role = le32_to_cpu(buf[0]); ra->new_role = le32_to_cpu(buf[1]); if (!policydb_role_isvalid(p, ra->role) || !policydb_role_isvalid(p, ra->new_role)) goto bad; lra = ra; } rc = filename_trans_read(p, fp); if (rc) goto bad; rc = policydb_index(p); if (rc) goto bad; rc = -EINVAL; perm = string_to_av_perm(p, p->process_class, "transition"); if (!perm) { pr_err("SELinux: process transition permission is required, not defined in policy\n"); goto bad; } p->process_trans_perms = perm; perm = string_to_av_perm(p, p->process_class, "dyntransition"); if (!perm) { pr_err("SELinux: process dyntransition permission is required, not defined in policy\n"); goto bad; } p->process_trans_perms |= perm; rc = ocontext_read(p, info, fp); if (rc) goto bad; rc = genfs_read(p, fp); if (rc) goto bad; rc = range_read(p, fp); if (rc) goto bad; rc = -ENOMEM; p->type_attr_map_array = kvcalloc( p->p_types.nprim, sizeof(*p->type_attr_map_array), GFP_KERNEL); if (!p->type_attr_map_array) goto bad; /* just in case ebitmap_init() becomes more than just a memset(0): */ for (i = 0; i < p->p_types.nprim; i++) ebitmap_init(&p->type_attr_map_array[i]); for (i = 0; i < p->p_types.nprim; i++) { struct ebitmap *e = &p->type_attr_map_array[i]; if (p->policyvers >= POLICYDB_VERSION_AVTAB) { rc = ebitmap_read(e, fp); if (rc) goto bad; } /* add the type itself as the degenerate case */ rc = ebitmap_set_bit(e, i, 1); if (rc) goto bad; } rc = policydb_bounds_sanity_check(p); if (rc) goto bad; rc = 0; out: return rc; bad: kfree(rtk); kfree(rtd); policydb_destroy(p); goto out; } /* * Write a MLS level structure to a policydb binary * representation file. */ static int mls_write_level(struct mls_level *l, struct policy_file *fp) { __le32 buf[1]; int rc; buf[0] = cpu_to_le32(l->sens); rc = put_entry(buf, sizeof(u32), 1, fp); if (rc) return rc; rc = ebitmap_write(&l->cat, fp); if (rc) return rc; return 0; } /* * Write a MLS range structure to a policydb binary * representation file. */ static int mls_write_range_helper(struct mls_range *r, struct policy_file *fp) { __le32 buf[3]; size_t items; int rc, eq; eq = mls_level_eq(&r->level[1], &r->level[0]); if (eq) items = 2; else items = 3; buf[0] = cpu_to_le32(items - 1); buf[1] = cpu_to_le32(r->level[0].sens); if (!eq) buf[2] = cpu_to_le32(r->level[1].sens); BUG_ON(items > ARRAY_SIZE(buf)); rc = put_entry(buf, sizeof(u32), items, fp); if (rc) return rc; rc = ebitmap_write(&r->level[0].cat, fp); if (rc) return rc; if (!eq) { rc = ebitmap_write(&r->level[1].cat, fp); if (rc) return rc; } return 0; } static int sens_write(void *vkey, void *datum, void *ptr) { char *key = vkey; struct level_datum *levdatum = datum; struct policy_data *pd = ptr; struct policy_file *fp = pd->fp; __le32 buf[2]; size_t len; int rc; len = strlen(key); buf[0] = cpu_to_le32(len); buf[1] = cpu_to_le32(levdatum->isalias); rc = put_entry(buf, sizeof(u32), 2, fp); if (rc) return rc; rc = put_entry(key, 1, len, fp); if (rc) return rc; rc = mls_write_level(&levdatum->level, fp); if (rc) return rc; return 0; } static int cat_write(void *vkey, void *datum, void *ptr) { char *key = vkey; struct cat_datum *catdatum = datum; struct policy_data *pd = ptr; struct policy_file *fp = pd->fp; __le32 buf[3]; size_t len; int rc; len = strlen(key); buf[0] = cpu_to_le32(len); buf[1] = cpu_to_le32(catdatum->value); buf[2] = cpu_to_le32(catdatum->isalias); rc = put_entry(buf, sizeof(u32), 3, fp); if (rc) return rc; rc = put_entry(key, 1, len, fp); if (rc) return rc; return 0; } static int role_trans_write_one(void *key, void *datum, void *ptr) { struct role_trans_key *rtk = key; struct role_trans_datum *rtd = datum; struct policy_data *pd = ptr; struct policy_file *fp = pd->fp; struct policydb *p = pd->p; __le32 buf[3]; int rc; buf[0] = cpu_to_le32(rtk->role); buf[1] = cpu_to_le32(rtk->type); buf[2] = cpu_to_le32(rtd->new_role); rc = put_entry(buf, sizeof(u32), 3, fp); if (rc) return rc; if (p->policyvers >= POLICYDB_VERSION_ROLETRANS) { buf[0] = cpu_to_le32(rtk->tclass); rc = put_entry(buf, sizeof(u32), 1, fp); if (rc) return rc; } return 0; } static int role_trans_write(struct policydb *p, struct policy_file *fp) { struct policy_data pd = { .p = p, .fp = fp }; __le32 buf[1]; int rc; buf[0] = cpu_to_le32(p->role_tr.nel); rc = put_entry(buf, sizeof(u32), 1, fp); if (rc) return rc; return hashtab_map(&p->role_tr, role_trans_write_one, &pd); } static int role_allow_write(struct role_allow *r, struct policy_file *fp) { struct role_allow *ra; __le32 buf[2]; size_t nel; int rc; nel = 0; for (ra = r; ra; ra = ra->next) nel++; buf[0] = cpu_to_le32(nel); rc = put_entry(buf, sizeof(u32), 1, fp); if (rc) return rc; for (ra = r; ra; ra = ra->next) { buf[0] = cpu_to_le32(ra->role); buf[1] = cpu_to_le32(ra->new_role); rc = put_entry(buf, sizeof(u32), 2, fp); if (rc) return rc; } return 0; } /* * Write a security context structure * to a policydb binary representation file. */ static int context_write(struct policydb *p, struct context *c, struct policy_file *fp) { int rc; __le32 buf[3]; buf[0] = cpu_to_le32(c->user); buf[1] = cpu_to_le32(c->role); buf[2] = cpu_to_le32(c->type); rc = put_entry(buf, sizeof(u32), 3, fp); if (rc) return rc; rc = mls_write_range_helper(&c->range, fp); if (rc) return rc; return 0; } /* * The following *_write functions are used to * write the symbol data to a policy database * binary representation file. */ static int perm_write(void *vkey, void *datum, void *fp) { char *key = vkey; struct perm_datum *perdatum = datum; __le32 buf[2]; size_t len; int rc; len = strlen(key); buf[0] = cpu_to_le32(len); buf[1] = cpu_to_le32(perdatum->value); rc = put_entry(buf, sizeof(u32), 2, fp); if (rc) return rc; rc = put_entry(key, 1, len, fp); if (rc) return rc; return 0; } static int common_write(void *vkey, void *datum, void *ptr) { char *key = vkey; struct common_datum *comdatum = datum; struct policy_data *pd = ptr; struct policy_file *fp = pd->fp; __le32 buf[4]; size_t len; int rc; len = strlen(key); buf[0] = cpu_to_le32(len); buf[1] = cpu_to_le32(comdatum->value); buf[2] = cpu_to_le32(comdatum->permissions.nprim); buf[3] = cpu_to_le32(comdatum->permissions.table.nel); rc = put_entry(buf, sizeof(u32), 4, fp); if (rc) return rc; rc = put_entry(key, 1, len, fp); if (rc) return rc; rc = hashtab_map(&comdatum->permissions.table, perm_write, fp); if (rc) return rc; return 0; } static int type_set_write(struct type_set *t, struct policy_file *fp) { int rc; __le32 buf[1]; if (ebitmap_write(&t->types, fp)) return -EINVAL; if (ebitmap_write(&t->negset, fp)) return -EINVAL; buf[0] = cpu_to_le32(t->flags); rc = put_entry(buf, sizeof(u32), 1, fp); if (rc) return -EINVAL; return 0; } static int write_cons_helper(struct policydb *p, struct constraint_node *node, struct policy_file *fp) { struct constraint_node *c; struct constraint_expr *e; __le32 buf[3]; u32 nel; int rc; for (c = node; c; c = c->next) { nel = 0; for (e = c->expr; e; e = e->next) nel++; buf[0] = cpu_to_le32(c->permissions); buf[1] = cpu_to_le32(nel); rc = put_entry(buf, sizeof(u32), 2, fp); if (rc) return rc; for (e = c->expr; e; e = e->next) { buf[0] = cpu_to_le32(e->expr_type); buf[1] = cpu_to_le32(e->attr); buf[2] = cpu_to_le32(e->op); rc = put_entry(buf, sizeof(u32), 3, fp); if (rc) return rc; switch (e->expr_type) { case CEXPR_NAMES: rc = ebitmap_write(&e->names, fp); if (rc) return rc; if (p->policyvers >= POLICYDB_VERSION_CONSTRAINT_NAMES) { rc = type_set_write(e->type_names, fp); if (rc) return rc; } break; default: break; } } } return 0; } static int class_write(void *vkey, void *datum, void *ptr) { char *key = vkey; struct class_datum *cladatum = datum; struct policy_data *pd = ptr; struct policy_file *fp = pd->fp; struct policydb *p = pd->p; struct constraint_node *c; __le32 buf[6]; u32 ncons; size_t len, len2; int rc; len = strlen(key); if (cladatum->comkey) len2 = strlen(cladatum->comkey); else len2 = 0; ncons = 0; for (c = cladatum->constraints; c; c = c->next) ncons++; buf[0] = cpu_to_le32(len); buf[1] = cpu_to_le32(len2); buf[2] = cpu_to_le32(cladatum->value); buf[3] = cpu_to_le32(cladatum->permissions.nprim); buf[4] = cpu_to_le32(cladatum->permissions.table.nel); buf[5] = cpu_to_le32(ncons); rc = put_entry(buf, sizeof(u32), 6, fp); if (rc) return rc; rc = put_entry(key, 1, len, fp); if (rc) return rc; if (cladatum->comkey) { rc = put_entry(cladatum->comkey, 1, len2, fp); if (rc) return rc; } rc = hashtab_map(&cladatum->permissions.table, perm_write, fp); if (rc) return rc; rc = write_cons_helper(p, cladatum->constraints, fp); if (rc) return rc; /* write out the validatetrans rule */ ncons = 0; for (c = cladatum->validatetrans; c; c = c->next) ncons++; buf[0] = cpu_to_le32(ncons); rc = put_entry(buf, sizeof(u32), 1, fp); if (rc) return rc; rc = write_cons_helper(p, cladatum->validatetrans, fp); if (rc) return rc; if (p->policyvers >= POLICYDB_VERSION_NEW_OBJECT_DEFAULTS) { buf[0] = cpu_to_le32(cladatum->default_user); buf[1] = cpu_to_le32(cladatum->default_role); buf[2] = cpu_to_le32(cladatum->default_range); rc = put_entry(buf, sizeof(uint32_t), 3, fp); if (rc) return rc; } if (p->policyvers >= POLICYDB_VERSION_DEFAULT_TYPE) { buf[0] = cpu_to_le32(cladatum->default_type); rc = put_entry(buf, sizeof(uint32_t), 1, fp); if (rc) return rc; } return 0; } static int role_write(void *vkey, void *datum, void *ptr) { char *key = vkey; struct role_datum *role = datum; struct policy_data *pd = ptr; struct policy_file *fp = pd->fp; struct policydb *p = pd->p; __le32 buf[3]; size_t items, len; int rc; len = strlen(key); items = 0; buf[items++] = cpu_to_le32(len); buf[items++] = cpu_to_le32(role->value); if (p->policyvers >= POLICYDB_VERSION_BOUNDARY) buf[items++] = cpu_to_le32(role->bounds); BUG_ON(items > ARRAY_SIZE(buf)); rc = put_entry(buf, sizeof(u32), items, fp); if (rc) return rc; rc = put_entry(key, 1, len, fp); if (rc) return rc; rc = ebitmap_write(&role->dominates, fp); if (rc) return rc; rc = ebitmap_write(&role->types, fp); if (rc) return rc; return 0; } static int type_write(void *vkey, void *datum, void *ptr) { char *key = vkey; struct type_datum *typdatum = datum; struct policy_data *pd = ptr; struct policydb *p = pd->p; struct policy_file *fp = pd->fp; __le32 buf[4]; int rc; size_t items, len; len = strlen(key); items = 0; buf[items++] = cpu_to_le32(len); buf[items++] = cpu_to_le32(typdatum->value); if (p->policyvers >= POLICYDB_VERSION_BOUNDARY) { u32 properties = 0; if (typdatum->primary) properties |= TYPEDATUM_PROPERTY_PRIMARY; if (typdatum->attribute) properties |= TYPEDATUM_PROPERTY_ATTRIBUTE; buf[items++] = cpu_to_le32(properties); buf[items++] = cpu_to_le32(typdatum->bounds); } else { buf[items++] = cpu_to_le32(typdatum->primary); } BUG_ON(items > ARRAY_SIZE(buf)); rc = put_entry(buf, sizeof(u32), items, fp); if (rc) return rc; rc = put_entry(key, 1, len, fp); if (rc) return rc; return 0; } static int user_write(void *vkey, void *datum, void *ptr) { char *key = vkey; struct user_datum *usrdatum = datum; struct policy_data *pd = ptr; struct policydb *p = pd->p; struct policy_file *fp = pd->fp; __le32 buf[3]; size_t items, len; int rc; len = strlen(key); items = 0; buf[items++] = cpu_to_le32(len); buf[items++] = cpu_to_le32(usrdatum->value); if (p->policyvers >= POLICYDB_VERSION_BOUNDARY) buf[items++] = cpu_to_le32(usrdatum->bounds); BUG_ON(items > ARRAY_SIZE(buf)); rc = put_entry(buf, sizeof(u32), items, fp); if (rc) return rc; rc = put_entry(key, 1, len, fp); if (rc) return rc; rc = ebitmap_write(&usrdatum->roles, fp); if (rc) return rc; rc = mls_write_range_helper(&usrdatum->range, fp); if (rc) return rc; rc = mls_write_level(&usrdatum->dfltlevel, fp); if (rc) return rc; return 0; } /* clang-format off */ static int (*const write_f[SYM_NUM])(void *key, void *datum, void *datap) = { common_write, class_write, role_write, type_write, user_write, cond_write_bool, sens_write, cat_write, }; /* clang-format on */ static int ocontext_write(struct policydb *p, const struct policydb_compat_info *info, struct policy_file *fp) { unsigned int i, j; int rc; size_t nel, len; __be64 prefixbuf[1]; __le32 buf[3]; u32 nodebuf[8]; struct ocontext *c; for (i = 0; i < info->ocon_num; i++) { nel = 0; for (c = p->ocontexts[i]; c; c = c->next) nel++; buf[0] = cpu_to_le32(nel); rc = put_entry(buf, sizeof(u32), 1, fp); if (rc) return rc; for (c = p->ocontexts[i]; c; c = c->next) { switch (i) { case OCON_ISID: buf[0] = cpu_to_le32(c->sid[0]); rc = put_entry(buf, sizeof(u32), 1, fp); if (rc) return rc; rc = context_write(p, &c->context[0], fp); if (rc) return rc; break; case OCON_FS: case OCON_NETIF: len = strlen(c->u.name); buf[0] = cpu_to_le32(len); rc = put_entry(buf, sizeof(u32), 1, fp); if (rc) return rc; rc = put_entry(c->u.name, 1, len, fp); if (rc) return rc; rc = context_write(p, &c->context[0], fp); if (rc) return rc; rc = context_write(p, &c->context[1], fp); if (rc) return rc; break; case OCON_PORT: buf[0] = cpu_to_le32(c->u.port.protocol); buf[1] = cpu_to_le32(c->u.port.low_port); buf[2] = cpu_to_le32(c->u.port.high_port); rc = put_entry(buf, sizeof(u32), 3, fp); if (rc) return rc; rc = context_write(p, &c->context[0], fp); if (rc) return rc; break; case OCON_NODE: nodebuf[0] = c->u.node.addr; /* network order */ nodebuf[1] = c->u.node.mask; /* network order */ rc = put_entry(nodebuf, sizeof(u32), 2, fp); if (rc) return rc; rc = context_write(p, &c->context[0], fp); if (rc) return rc; break; case OCON_FSUSE: buf[0] = cpu_to_le32(c->v.behavior); len = strlen(c->u.name); buf[1] = cpu_to_le32(len); rc = put_entry(buf, sizeof(u32), 2, fp); if (rc) return rc; rc = put_entry(c->u.name, 1, len, fp); if (rc) return rc; rc = context_write(p, &c->context[0], fp); if (rc) return rc; break; case OCON_NODE6: for (j = 0; j < 4; j++) nodebuf[j] = c->u.node6.addr [j]; /* network order */ for (j = 0; j < 4; j++) nodebuf[j + 4] = c->u.node6.mask [j]; /* network order */ rc = put_entry(nodebuf, sizeof(u32), 8, fp); if (rc) return rc; rc = context_write(p, &c->context[0], fp); if (rc) return rc; break; case OCON_IBPKEY: /* subnet_prefix is in CPU order */ prefixbuf[0] = cpu_to_be64(c->u.ibpkey.subnet_prefix); rc = put_entry(prefixbuf, sizeof(u64), 1, fp); if (rc) return rc; buf[0] = cpu_to_le32(c->u.ibpkey.low_pkey); buf[1] = cpu_to_le32(c->u.ibpkey.high_pkey); rc = put_entry(buf, sizeof(u32), 2, fp); if (rc) return rc; rc = context_write(p, &c->context[0], fp); if (rc) return rc; break; case OCON_IBENDPORT: len = strlen(c->u.ibendport.dev_name); buf[0] = cpu_to_le32(len); buf[1] = cpu_to_le32(c->u.ibendport.port); rc = put_entry(buf, sizeof(u32), 2, fp); if (rc) return rc; rc = put_entry(c->u.ibendport.dev_name, 1, len, fp); if (rc) return rc; rc = context_write(p, &c->context[0], fp); if (rc) return rc; break; } } } return 0; } static int genfs_write(struct policydb *p, struct policy_file *fp) { struct genfs *genfs; struct ocontext *c; size_t len; __le32 buf[1]; int rc; len = 0; for (genfs = p->genfs; genfs; genfs = genfs->next) len++; buf[0] = cpu_to_le32(len); rc = put_entry(buf, sizeof(u32), 1, fp); if (rc) return rc; for (genfs = p->genfs; genfs; genfs = genfs->next) { len = strlen(genfs->fstype); buf[0] = cpu_to_le32(len); rc = put_entry(buf, sizeof(u32), 1, fp); if (rc) return rc; rc = put_entry(genfs->fstype, 1, len, fp); if (rc) return rc; len = 0; for (c = genfs->head; c; c = c->next) len++; buf[0] = cpu_to_le32(len); rc = put_entry(buf, sizeof(u32), 1, fp); if (rc) return rc; for (c = genfs->head; c; c = c->next) { len = strlen(c->u.name); buf[0] = cpu_to_le32(len); rc = put_entry(buf, sizeof(u32), 1, fp); if (rc) return rc; rc = put_entry(c->u.name, 1, len, fp); if (rc) return rc; buf[0] = cpu_to_le32(c->v.sclass); rc = put_entry(buf, sizeof(u32), 1, fp); if (rc) return rc; rc = context_write(p, &c->context[0], fp); if (rc) return rc; } } return 0; } static int range_write_helper(void *key, void *data, void *ptr) { __le32 buf[2]; struct range_trans *rt = key; struct mls_range *r = data; struct policy_data *pd = ptr; struct policy_file *fp = pd->fp; struct policydb *p = pd->p; int rc; buf[0] = cpu_to_le32(rt->source_type); buf[1] = cpu_to_le32(rt->target_type); rc = put_entry(buf, sizeof(u32), 2, fp); if (rc) return rc; if (p->policyvers >= POLICYDB_VERSION_RANGETRANS) { buf[0] = cpu_to_le32(rt->target_class); rc = put_entry(buf, sizeof(u32), 1, fp); if (rc) return rc; } rc = mls_write_range_helper(r, fp); if (rc) return rc; return 0; } static int range_write(struct policydb *p, struct policy_file *fp) { __le32 buf[1]; int rc; struct policy_data pd; pd.p = p; pd.fp = fp; buf[0] = cpu_to_le32(p->range_tr.nel); rc = put_entry(buf, sizeof(u32), 1, fp); if (rc) return rc; /* actually write all of the entries */ rc = hashtab_map(&p->range_tr, range_write_helper, &pd); if (rc) return rc; return 0; } static int filename_write_helper_compat(void *key, void *data, void *ptr) { struct filename_trans_key *ft = key; struct filename_trans_datum *datum = data; struct ebitmap_node *node; struct policy_file *fp = ptr; __le32 buf[4]; int rc; u32 bit, len = strlen(ft->name); do { ebitmap_for_each_positive_bit(&datum->stypes, node, bit) { buf[0] = cpu_to_le32(len); rc = put_entry(buf, sizeof(u32), 1, fp); if (rc) return rc; rc = put_entry(ft->name, sizeof(char), len, fp); if (rc) return rc; buf[0] = cpu_to_le32(bit + 1); buf[1] = cpu_to_le32(ft->ttype); buf[2] = cpu_to_le32(ft->tclass); buf[3] = cpu_to_le32(datum->otype); rc = put_entry(buf, sizeof(u32), 4, fp); if (rc) return rc; } datum = datum->next; } while (unlikely(datum)); return 0; } static int filename_write_helper(void *key, void *data, void *ptr) { struct filename_trans_key *ft = key; struct filename_trans_datum *datum; struct policy_file *fp = ptr; __le32 buf[3]; int rc; u32 ndatum, len = strlen(ft->name); buf[0] = cpu_to_le32(len); rc = put_entry(buf, sizeof(u32), 1, fp); if (rc) return rc; rc = put_entry(ft->name, sizeof(char), len, fp); if (rc) return rc; ndatum = 0; datum = data; do { ndatum++; datum = datum->next; } while (unlikely(datum)); buf[0] = cpu_to_le32(ft->ttype); buf[1] = cpu_to_le32(ft->tclass); buf[2] = cpu_to_le32(ndatum); rc = put_entry(buf, sizeof(u32), 3, fp); if (rc) return rc; datum = data; do { rc = ebitmap_write(&datum->stypes, fp); if (rc) return rc; buf[0] = cpu_to_le32(datum->otype); rc = put_entry(buf, sizeof(u32), 1, fp); if (rc) return rc; datum = datum->next; } while (unlikely(datum)); return 0; } static int filename_trans_write(struct policydb *p, struct policy_file *fp) { __le32 buf[1]; int rc; if (p->policyvers < POLICYDB_VERSION_FILENAME_TRANS) return 0; if (p->policyvers < POLICYDB_VERSION_COMP_FTRANS) { buf[0] = cpu_to_le32(p->compat_filename_trans_count); rc = put_entry(buf, sizeof(u32), 1, fp); if (rc) return rc; rc = hashtab_map(&p->filename_trans, filename_write_helper_compat, fp); } else { buf[0] = cpu_to_le32(p->filename_trans.nel); rc = put_entry(buf, sizeof(u32), 1, fp); if (rc) return rc; rc = hashtab_map(&p->filename_trans, filename_write_helper, fp); } return rc; } /* * Write the configuration data in a policy database * structure to a policy database binary representation * file. */ int policydb_write(struct policydb *p, struct policy_file *fp) { unsigned int num_syms; int rc; __le32 buf[4]; u32 config, i; size_t len; const struct policydb_compat_info *info; /* * refuse to write policy older than compressed avtab * to simplify the writer. There are other tests dropped * since we assume this throughout the writer code. Be * careful if you ever try to remove this restriction */ if (p->policyvers < POLICYDB_VERSION_AVTAB) { pr_err("SELinux: refusing to write policy version %d." " Because it is less than version %d\n", p->policyvers, POLICYDB_VERSION_AVTAB); return -EINVAL; } config = 0; if (p->mls_enabled) config |= POLICYDB_CONFIG_MLS; if (p->reject_unknown) config |= REJECT_UNKNOWN; if (p->allow_unknown) config |= ALLOW_UNKNOWN; /* Write the magic number and string identifiers. */ buf[0] = cpu_to_le32(POLICYDB_MAGIC); len = strlen(POLICYDB_STRING); buf[1] = cpu_to_le32(len); rc = put_entry(buf, sizeof(u32), 2, fp); if (rc) return rc; rc = put_entry(POLICYDB_STRING, 1, len, fp); if (rc) return rc; /* Write the version, config, and table sizes. */ info = policydb_lookup_compat(p->policyvers); if (!info) { pr_err("SELinux: compatibility lookup failed for policy " "version %d\n", p->policyvers); return -EINVAL; } buf[0] = cpu_to_le32(p->policyvers); buf[1] = cpu_to_le32(config); buf[2] = cpu_to_le32(info->sym_num); buf[3] = cpu_to_le32(info->ocon_num); rc = put_entry(buf, sizeof(u32), 4, fp); if (rc) return rc; if (p->policyvers >= POLICYDB_VERSION_POLCAP) { rc = ebitmap_write(&p->policycaps, fp); if (rc) return rc; } if (p->policyvers >= POLICYDB_VERSION_PERMISSIVE) { rc = ebitmap_write(&p->permissive_map, fp); if (rc) return rc; } if (p->policyvers >= POLICYDB_VERSION_NEVERAUDIT) { rc = ebitmap_write(&p->neveraudit_map, fp); if (rc) return rc; } num_syms = info->sym_num; for (i = 0; i < num_syms; i++) { struct policy_data pd; pd.fp = fp; pd.p = p; buf[0] = cpu_to_le32(p->symtab[i].nprim); buf[1] = cpu_to_le32(p->symtab[i].table.nel); rc = put_entry(buf, sizeof(u32), 2, fp); if (rc) return rc; rc = hashtab_map(&p->symtab[i].table, write_f[i], &pd); if (rc) return rc; } rc = avtab_write(p, &p->te_avtab, fp); if (rc) return rc; rc = cond_write_list(p, fp); if (rc) return rc; rc = role_trans_write(p, fp); if (rc) return rc; rc = role_allow_write(p->role_allow, fp); if (rc) return rc; rc = filename_trans_write(p, fp); if (rc) return rc; rc = ocontext_write(p, info, fp); if (rc) return rc; rc = genfs_write(p, fp); if (rc) return rc; rc = range_write(p, fp); if (rc) return rc; for (i = 0; i < p->p_types.nprim; i++) { struct ebitmap *e = &p->type_attr_map_array[i]; rc = ebitmap_write(e, fp); if (rc) return rc; } return 0; }
4 4 9 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 /* SPDX-License-Identifier: GPL-2.0 */ #ifndef __LINUX_BITMAP_H #define __LINUX_BITMAP_H #ifndef __ASSEMBLY__ #include <linux/align.h> #include <linux/bitops.h> #include <linux/cleanup.h> #include <linux/errno.h> #include <linux/find.h> #include <linux/limits.h> #include <linux/string.h> #include <linux/types.h> #include <linux/bitmap-str.h> struct device; /* * bitmaps provide bit arrays that consume one or more unsigned * longs. The bitmap interface and available operations are listed * here, in bitmap.h * * Function implementations generic to all architectures are in * lib/bitmap.c. Functions implementations that are architecture * specific are in various arch/<arch>/include/asm/bitops.h headers * and other arch/<arch> specific files. * * See lib/bitmap.c for more details. */ /** * DOC: bitmap overview * * The available bitmap operations and their rough meaning in the * case that the bitmap is a single unsigned long are thus: * * The generated code is more efficient when nbits is known at * compile-time and at most BITS_PER_LONG. * * :: * * bitmap_zero(dst, nbits) *dst = 0UL * bitmap_fill(dst, nbits) *dst = ~0UL * bitmap_copy(dst, src, nbits) *dst = *src * bitmap_and(dst, src1, src2, nbits) *dst = *src1 & *src2 * bitmap_or(dst, src1, src2, nbits) *dst = *src1 | *src2 * bitmap_xor(dst, src1, src2, nbits) *dst = *src1 ^ *src2 * bitmap_andnot(dst, src1, src2, nbits) *dst = *src1 & ~(*src2) * bitmap_complement(dst, src, nbits) *dst = ~(*src) * bitmap_equal(src1, src2, nbits) Are *src1 and *src2 equal? * bitmap_intersects(src1, src2, nbits) Do *src1 and *src2 overlap? * bitmap_subset(src1, src2, nbits) Is *src1 a subset of *src2? * bitmap_empty(src, nbits) Are all bits zero in *src? * bitmap_full(src, nbits) Are all bits set in *src? * bitmap_weight(src, nbits) Hamming Weight: number set bits * bitmap_weight_and(src1, src2, nbits) Hamming Weight of and'ed bitmap * bitmap_weight_andnot(src1, src2, nbits) Hamming Weight of andnot'ed bitmap * bitmap_set(dst, pos, nbits) Set specified bit area * bitmap_clear(dst, pos, nbits) Clear specified bit area * bitmap_find_next_zero_area(buf, len, pos, n, mask) Find bit free area * bitmap_find_next_zero_area_off(buf, len, pos, n, mask, mask_off) as above * bitmap_shift_right(dst, src, n, nbits) *dst = *src >> n * bitmap_shift_left(dst, src, n, nbits) *dst = *src << n * bitmap_cut(dst, src, first, n, nbits) Cut n bits from first, copy rest * bitmap_replace(dst, old, new, mask, nbits) *dst = (*old & ~(*mask)) | (*new & *mask) * bitmap_scatter(dst, src, mask, nbits) *dst = map(dense, sparse)(src) * bitmap_gather(dst, src, mask, nbits) *dst = map(sparse, dense)(src) * bitmap_remap(dst, src, old, new, nbits) *dst = map(old, new)(src) * bitmap_bitremap(oldbit, old, new, nbits) newbit = map(old, new)(oldbit) * bitmap_onto(dst, orig, relmap, nbits) *dst = orig relative to relmap * bitmap_fold(dst, orig, sz, nbits) dst bits = orig bits mod sz * bitmap_parse(buf, buflen, dst, nbits) Parse bitmap dst from kernel buf * bitmap_parse_user(ubuf, ulen, dst, nbits) Parse bitmap dst from user buf * bitmap_parselist(buf, dst, nbits) Parse bitmap dst from kernel buf * bitmap_parselist_user(buf, dst, nbits) Parse bitmap dst from user buf * bitmap_find_free_region(bitmap, bits, order) Find and allocate bit region * bitmap_release_region(bitmap, pos, order) Free specified bit region * bitmap_allocate_region(bitmap, pos, order) Allocate specified bit region * bitmap_from_arr32(dst, buf, nbits) Copy nbits from u32[] buf to dst * bitmap_from_arr64(dst, buf, nbits) Copy nbits from u64[] buf to dst * bitmap_to_arr32(buf, src, nbits) Copy nbits from buf to u32[] dst * bitmap_to_arr64(buf, src, nbits) Copy nbits from buf to u64[] dst * bitmap_get_value8(map, start) Get 8bit value from map at start * bitmap_set_value8(map, value, start) Set 8bit value to map at start * bitmap_read(map, start, nbits) Read an nbits-sized value from * map at start * bitmap_write(map, value, start, nbits) Write an nbits-sized value to * map at start * * Note, bitmap_zero() and bitmap_fill() operate over the region of * unsigned longs, that is, bits behind bitmap till the unsigned long * boundary will be zeroed or filled as well. Consider to use * bitmap_clear() or bitmap_set() to make explicit zeroing or filling * respectively. */ /** * DOC: bitmap bitops * * Also the following operations in asm/bitops.h apply to bitmaps.:: * * set_bit(bit, addr) *addr |= bit * clear_bit(bit, addr) *addr &= ~bit * change_bit(bit, addr) *addr ^= bit * test_bit(bit, addr) Is bit set in *addr? * test_and_set_bit(bit, addr) Set bit and return old value * test_and_clear_bit(bit, addr) Clear bit and return old value * test_and_change_bit(bit, addr) Change bit and return old value * find_first_zero_bit(addr, nbits) Position first zero bit in *addr * find_first_bit(addr, nbits) Position first set bit in *addr * find_next_zero_bit(addr, nbits, bit) * Position next zero bit in *addr >= bit * find_next_bit(addr, nbits, bit) Position next set bit in *addr >= bit * find_next_and_bit(addr1, addr2, nbits, bit) * Same as find_next_bit, but in * (*addr1 & *addr2) * */ /** * DOC: declare bitmap * The DECLARE_BITMAP(name,bits) macro, in linux/types.h, can be used * to declare an array named 'name' of just enough unsigned longs to * contain all bit positions from 0 to 'bits' - 1. */ /* * Allocation and deallocation of bitmap. * Provided in lib/bitmap.c to avoid circular dependency. */ unsigned long *bitmap_alloc(unsigned int nbits, gfp_t flags); unsigned long *bitmap_zalloc(unsigned int nbits, gfp_t flags); unsigned long *bitmap_alloc_node(unsigned int nbits, gfp_t flags, int node); unsigned long *bitmap_zalloc_node(unsigned int nbits, gfp_t flags, int node); void bitmap_free(const unsigned long *bitmap); DEFINE_FREE(bitmap, unsigned long *, if (_T) bitmap_free(_T)) /* Managed variants of the above. */ unsigned long *devm_bitmap_alloc(struct device *dev, unsigned int nbits, gfp_t flags); unsigned long *devm_bitmap_zalloc(struct device *dev, unsigned int nbits, gfp_t flags); /* * lib/bitmap.c provides these functions: */ bool __bitmap_equal(const unsigned long *bitmap1, const unsigned long *bitmap2, unsigned int nbits); bool __pure __bitmap_or_equal(const unsigned long *src1, const unsigned long *src2, const unsigned long *src3, unsigned int nbits); void __bitmap_complement(unsigned long *dst, const unsigned long *src, unsigned int nbits); void __bitmap_shift_right(unsigned long *dst, const unsigned long *src, unsigned int shift, unsigned int nbits); void __bitmap_shift_left(unsigned long *dst, const unsigned long *src, unsigned int shift, unsigned int nbits); void bitmap_cut(unsigned long *dst, const unsigned long *src, unsigned int first, unsigned int cut, unsigned int nbits); bool __bitmap_and(unsigned long *dst, const unsigned long *bitmap1, const unsigned long *bitmap2, unsigned int nbits); void __bitmap_or(unsigned long *dst, const unsigned long *bitmap1, const unsigned long *bitmap2, unsigned int nbits); void __bitmap_xor(unsigned long *dst, const unsigned long *bitmap1, const unsigned long *bitmap2, unsigned int nbits); bool __bitmap_andnot(unsigned long *dst, const unsigned long *bitmap1, const unsigned long *bitmap2, unsigned int nbits); void __bitmap_replace(unsigned long *dst, const unsigned long *old, const unsigned long *new, const unsigned long *mask, unsigned int nbits); bool __bitmap_intersects(const unsigned long *bitmap1, const unsigned long *bitmap2, unsigned int nbits); bool __bitmap_subset(const unsigned long *bitmap1, const unsigned long *bitmap2, unsigned int nbits); unsigned int __bitmap_weight(const unsigned long *bitmap, unsigned int nbits); unsigned int __bitmap_weight_and(const unsigned long *bitmap1, const unsigned long *bitmap2, unsigned int nbits); unsigned int __bitmap_weight_andnot(const unsigned long *bitmap1, const unsigned long *bitmap2, unsigned int nbits); void __bitmap_set(unsigned long *map, unsigned int start, int len); void __bitmap_clear(unsigned long *map, unsigned int start, int len); unsigned long bitmap_find_next_zero_area_off(unsigned long *map, unsigned long size, unsigned long start, unsigned int nr, unsigned long align_mask, unsigned long align_offset); /** * bitmap_find_next_zero_area - find a contiguous aligned zero area * @map: The address to base the search on * @size: The bitmap size in bits * @start: The bitnumber to start searching at * @nr: The number of zeroed bits we're looking for * @align_mask: Alignment mask for zero area * * The @align_mask should be one less than a power of 2; the effect is that * the bit offset of all zero areas this function finds is multiples of that * power of 2. A @align_mask of 0 means no alignment is required. */ static __always_inline unsigned long bitmap_find_next_zero_area(unsigned long *map, unsigned long size, unsigned long start, unsigned int nr, unsigned long align_mask) { return bitmap_find_next_zero_area_off(map, size, start, nr, align_mask, 0); } void bitmap_remap(unsigned long *dst, const unsigned long *src, const unsigned long *old, const unsigned long *new, unsigned int nbits); int bitmap_bitremap(int oldbit, const unsigned long *old, const unsigned long *new, int bits); void bitmap_onto(unsigned long *dst, const unsigned long *orig, const unsigned long *relmap, unsigned int bits); void bitmap_fold(unsigned long *dst, const unsigned long *orig, unsigned int sz, unsigned int nbits); #define BITMAP_FIRST_WORD_MASK(start) (~0UL << ((start) & (BITS_PER_LONG - 1))) #define BITMAP_LAST_WORD_MASK(nbits) (~0UL >> (-(nbits) & (BITS_PER_LONG - 1))) #define bitmap_size(nbits) (ALIGN(nbits, BITS_PER_LONG) / BITS_PER_BYTE) static __always_inline void bitmap_zero(unsigned long *dst, unsigned int nbits) { unsigned int len = bitmap_size(nbits); if (small_const_nbits(nbits)) *dst = 0; else memset(dst, 0, len); } static __always_inline void bitmap_fill(unsigned long *dst, unsigned int nbits) { unsigned int len = bitmap_size(nbits); if (small_const_nbits(nbits)) *dst = ~0UL; else memset(dst, 0xff, len); } static __always_inline void bitmap_copy(unsigned long *dst, const unsigned long *src, unsigned int nbits) { unsigned int len = bitmap_size(nbits); if (small_const_nbits(nbits)) *dst = *src; else memcpy(dst, src, len); } /* * Copy bitmap and clear tail bits in last word. */ static __always_inline void bitmap_copy_clear_tail(unsigned long *dst, const unsigned long *src, unsigned int nbits) { bitmap_copy(dst, src, nbits); if (nbits % BITS_PER_LONG) dst[nbits / BITS_PER_LONG] &= BITMAP_LAST_WORD_MASK(nbits); } static inline void bitmap_copy_and_extend(unsigned long *to, const unsigned long *from, unsigned int count, unsigned int size) { unsigned int copy = BITS_TO_LONGS(count); memcpy(to, from, copy * sizeof(long)); if (count % BITS_PER_LONG) to[copy - 1] &= BITMAP_LAST_WORD_MASK(count); memset(to + copy, 0, bitmap_size(size) - copy * sizeof(long)); } /* * On 32-bit systems bitmaps are represented as u32 arrays internally. On LE64 * machines the order of hi and lo parts of numbers match the bitmap structure. * In both cases conversion is not needed when copying data from/to arrays of * u32. But in LE64 case, typecast in bitmap_copy_clear_tail() may lead * to out-of-bound access. To avoid that, both LE and BE variants of 64-bit * architectures are not using bitmap_copy_clear_tail(). */ #if BITS_PER_LONG == 64 void bitmap_from_arr32(unsigned long *bitmap, const u32 *buf, unsigned int nbits); void bitmap_to_arr32(u32 *buf, const unsigned long *bitmap, unsigned int nbits); #else #define bitmap_from_arr32(bitmap, buf, nbits) \ bitmap_copy_clear_tail((unsigned long *) (bitmap), \ (const unsigned long *) (buf), (nbits)) #define bitmap_to_arr32(buf, bitmap, nbits) \ bitmap_copy_clear_tail((unsigned long *) (buf), \ (const unsigned long *) (bitmap), (nbits)) #endif /* * On 64-bit systems bitmaps are represented as u64 arrays internally. So, * the conversion is not needed when copying data from/to arrays of u64. */ #if BITS_PER_LONG == 32 void bitmap_from_arr64(unsigned long *bitmap, const u64 *buf, unsigned int nbits); void bitmap_to_arr64(u64 *buf, const unsigned long *bitmap, unsigned int nbits); #else #define bitmap_from_arr64(bitmap, buf, nbits) \ bitmap_copy_clear_tail((unsigned long *)(bitmap), (const unsigned long *)(buf), (nbits)) #define bitmap_to_arr64(buf, bitmap, nbits) \ bitmap_copy_clear_tail((unsigned long *)(buf), (const unsigned long *)(bitmap), (nbits)) #endif static __always_inline bool bitmap_and(unsigned long *dst, const unsigned long *src1, const unsigned long *src2, unsigned int nbits) { if (small_const_nbits(nbits)) return (*dst = *src1 & *src2 & BITMAP_LAST_WORD_MASK(nbits)) != 0; return __bitmap_and(dst, src1, src2, nbits); } static __always_inline void bitmap_or(unsigned long *dst, const unsigned long *src1, const unsigned long *src2, unsigned int nbits) { if (small_const_nbits(nbits)) *dst = *src1 | *src2; else __bitmap_or(dst, src1, src2, nbits); } static __always_inline void bitmap_xor(unsigned long *dst, const unsigned long *src1, const unsigned long *src2, unsigned int nbits) { if (small_const_nbits(nbits)) *dst = *src1 ^ *src2; else __bitmap_xor(dst, src1, src2, nbits); } static __always_inline bool bitmap_andnot(unsigned long *dst, const unsigned long *src1, const unsigned long *src2, unsigned int nbits) { if (small_const_nbits(nbits)) return (*dst = *src1 & ~(*src2) & BITMAP_LAST_WORD_MASK(nbits)) != 0; return __bitmap_andnot(dst, src1, src2, nbits); } static __always_inline void bitmap_complement(unsigned long *dst, const unsigned long *src, unsigned int nbits) { if (small_const_nbits(nbits)) *dst = ~(*src); else __bitmap_complement(dst, src, nbits); } #ifdef __LITTLE_ENDIAN #define BITMAP_MEM_ALIGNMENT 8 #else #define BITMAP_MEM_ALIGNMENT (8 * sizeof(unsigned long)) #endif #define BITMAP_MEM_MASK (BITMAP_MEM_ALIGNMENT - 1) static __always_inline bool bitmap_equal(const unsigned long *src1, const unsigned long *src2, unsigned int nbits) { if (small_const_nbits(nbits)) return !((*src1 ^ *src2) & BITMAP_LAST_WORD_MASK(nbits)); if (__builtin_constant_p(nbits & BITMAP_MEM_MASK) && IS_ALIGNED(nbits, BITMAP_MEM_ALIGNMENT)) return !memcmp(src1, src2, nbits / 8); return __bitmap_equal(src1, src2, nbits); } /** * bitmap_or_equal - Check whether the or of two bitmaps is equal to a third * @src1: Pointer to bitmap 1 * @src2: Pointer to bitmap 2 will be or'ed with bitmap 1 * @src3: Pointer to bitmap 3. Compare to the result of *@src1 | *@src2 * @nbits: number of bits in each of these bitmaps * * Returns: True if (*@src1 | *@src2) == *@src3, false otherwise */ static __always_inline bool bitmap_or_equal(const unsigned long *src1, const unsigned long *src2, const unsigned long *src3, unsigned int nbits) { if (!small_const_nbits(nbits)) return __bitmap_or_equal(src1, src2, src3, nbits); return !(((*src1 | *src2) ^ *src3) & BITMAP_LAST_WORD_MASK(nbits)); } static __always_inline bool bitmap_intersects(const unsigned long *src1, const unsigned long *src2, unsigned int nbits) { if (small_const_nbits(nbits)) return ((*src1 & *src2) & BITMAP_LAST_WORD_MASK(nbits)) != 0; else return __bitmap_intersects(src1, src2, nbits); } static __always_inline bool bitmap_subset(const unsigned long *src1, const unsigned long *src2, unsigned int nbits) { if (small_const_nbits(nbits)) return ! ((*src1 & ~(*src2)) & BITMAP_LAST_WORD_MASK(nbits)); else return __bitmap_subset(src1, src2, nbits); } static __always_inline bool bitmap_empty(const unsigned long *src, unsigned nbits) { if (small_const_nbits(nbits)) return ! (*src & BITMAP_LAST_WORD_MASK(nbits)); return find_first_bit(src, nbits) == nbits; } static __always_inline bool bitmap_full(const unsigned long *src, unsigned int nbits) { if (small_const_nbits(nbits)) return ! (~(*src) & BITMAP_LAST_WORD_MASK(nbits)); return find_first_zero_bit(src, nbits) == nbits; } static __always_inline unsigned int bitmap_weight(const unsigned long *src, unsigned int nbits) { if (small_const_nbits(nbits)) return hweight_long(*src & BITMAP_LAST_WORD_MASK(nbits)); return __bitmap_weight(src, nbits); } static __always_inline unsigned long bitmap_weight_and(const unsigned long *src1, const unsigned long *src2, unsigned int nbits) { if (small_const_nbits(nbits)) return hweight_long(*src1 & *src2 & BITMAP_LAST_WORD_MASK(nbits)); return __bitmap_weight_and(src1, src2, nbits); } static __always_inline unsigned long bitmap_weight_andnot(const unsigned long *src1, const unsigned long *src2, unsigned int nbits) { if (small_const_nbits(nbits)) return hweight_long(*src1 & ~(*src2) & BITMAP_LAST_WORD_MASK(nbits)); return __bitmap_weight_andnot(src1, src2, nbits); } static __always_inline void bitmap_set(unsigned long *map, unsigned int start, unsigned int nbits) { if (__builtin_constant_p(nbits) && nbits == 1) __set_bit(start, map); else if (small_const_nbits(start + nbits)) *map |= GENMASK(start + nbits - 1, start); else if (__builtin_constant_p(start & BITMAP_MEM_MASK) && IS_ALIGNED(start, BITMAP_MEM_ALIGNMENT) && __builtin_constant_p(nbits & BITMAP_MEM_MASK) && IS_ALIGNED(nbits, BITMAP_MEM_ALIGNMENT)) memset((char *)map + start / 8, 0xff, nbits / 8); else __bitmap_set(map, start, nbits); } static __always_inline void bitmap_clear(unsigned long *map, unsigned int start, unsigned int nbits) { if (__builtin_constant_p(nbits) && nbits == 1) __clear_bit(start, map); else if (small_const_nbits(start + nbits)) *map &= ~GENMASK(start + nbits - 1, start); else if (__builtin_constant_p(start & BITMAP_MEM_MASK) && IS_ALIGNED(start, BITMAP_MEM_ALIGNMENT) && __builtin_constant_p(nbits & BITMAP_MEM_MASK) && IS_ALIGNED(nbits, BITMAP_MEM_ALIGNMENT)) memset((char *)map + start / 8, 0, nbits / 8); else __bitmap_clear(map, start, nbits); } static __always_inline void bitmap_shift_right(unsigned long *dst, const unsigned long *src, unsigned int shift, unsigned int nbits) { if (small_const_nbits(nbits)) *dst = (*src & BITMAP_LAST_WORD_MASK(nbits)) >> shift; else __bitmap_shift_right(dst, src, shift, nbits); } static __always_inline void bitmap_shift_left(unsigned long *dst, const unsigned long *src, unsigned int shift, unsigned int nbits) { if (small_const_nbits(nbits)) *dst = (*src << shift) & BITMAP_LAST_WORD_MASK(nbits); else __bitmap_shift_left(dst, src, shift, nbits); } static __always_inline void bitmap_replace(unsigned long *dst, const unsigned long *old, const unsigned long *new, const unsigned long *mask, unsigned int nbits) { if (small_const_nbits(nbits)) *dst = (*old & ~(*mask)) | (*new & *mask); else __bitmap_replace(dst, old, new, mask, nbits); } /** * bitmap_scatter - Scatter a bitmap according to the given mask * @dst: scattered bitmap * @src: gathered bitmap * @mask: mask representing bits to assign to in the scattered bitmap * @nbits: number of bits in each of these bitmaps * * Scatters bitmap with sequential bits according to the given @mask. * * Example: * If @src bitmap = 0x005a, with @mask = 0x1313, @dst will be 0x0302. * * Or in binary form * @src @mask @dst * 0000000001011010 0001001100010011 0000001100000010 * * (Bits 0, 1, 2, 3, 4, 5 are copied to the bits 0, 1, 4, 8, 9, 12) * * A more 'visual' description of the operation:: * * src: 0000000001011010 * |||||| * +------+||||| * | +----+|||| * | |+----+||| * | || +-+|| * | || | || * mask: ...v..vv...v..vv * ...0..11...0..10 * dst: 0000001100000010 * * A relationship exists between bitmap_scatter() and bitmap_gather(). See * bitmap_gather() for the bitmap gather detailed operations. TL;DR: * bitmap_gather() can be seen as the 'reverse' bitmap_scatter() operation. */ static __always_inline void bitmap_scatter(unsigned long *dst, const unsigned long *src, const unsigned long *mask, unsigned int nbits) { unsigned int n = 0; unsigned int bit; bitmap_zero(dst, nbits); for_each_set_bit(bit, mask, nbits) __assign_bit(bit, dst, test_bit(n++, src)); } /** * bitmap_gather - Gather a bitmap according to given mask * @dst: gathered bitmap * @src: scattered bitmap * @mask: mask representing bits to extract from in the scattered bitmap * @nbits: number of bits in each of these bitmaps * * Gathers bitmap with sparse bits according to the given @mask. * * Example: * If @src bitmap = 0x0302, with @mask = 0x1313, @dst will be 0x001a. * * Or in binary form * @src @mask @dst * 0000001100000010 0001001100010011 0000000000011010 * * (Bits 0, 1, 4, 8, 9, 12 are copied to the bits 0, 1, 2, 3, 4, 5) * * A more 'visual' description of the operation:: * * mask: ...v..vv...v..vv * src: 0000001100000010 * ^ ^^ ^ 0 * | || | 10 * | || > 010 * | |+--> 1010 * | +--> 11010 * +----> 011010 * dst: 0000000000011010 * * A relationship exists between bitmap_gather() and bitmap_scatter(). See * bitmap_scatter() for the bitmap scatter detailed operations. TL;DR: * bitmap_scatter() can be seen as the 'reverse' bitmap_gather() operation. * * Suppose scattered computed using bitmap_scatter(scattered, src, mask, n). * The operation bitmap_gather(result, scattered, mask, n) leads to a result * equal or equivalent to src. * * The result can be 'equivalent' because bitmap_scatter() and bitmap_gather() * are not bijective. * The result and src values are equivalent in that sense that a call to * bitmap_scatter(res, src, mask, n) and a call to * bitmap_scatter(res, result, mask, n) will lead to the same res value. */ static __always_inline void bitmap_gather(unsigned long *dst, const unsigned long *src, const unsigned long *mask, unsigned int nbits) { unsigned int n = 0; unsigned int bit; bitmap_zero(dst, nbits); for_each_set_bit(bit, mask, nbits) __assign_bit(n++, dst, test_bit(bit, src)); } static __always_inline void bitmap_next_set_region(unsigned long *bitmap, unsigned int *rs, unsigned int *re, unsigned int end) { *rs = find_next_bit(bitmap, end, *rs); *re = find_next_zero_bit(bitmap, end, *rs + 1); } /** * bitmap_release_region - release allocated bitmap region * @bitmap: array of unsigned longs corresponding to the bitmap * @pos: beginning of bit region to release * @order: region size (log base 2 of number of bits) to release * * This is the complement to __bitmap_find_free_region() and releases * the found region (by clearing it in the bitmap). */ static __always_inline void bitmap_release_region(unsigned long *bitmap, unsigned int pos, int order) { bitmap_clear(bitmap, pos, BIT(order)); } /** * bitmap_allocate_region - allocate bitmap region * @bitmap: array of unsigned longs corresponding to the bitmap * @pos: beginning of bit region to allocate * @order: region size (log base 2 of number of bits) to allocate * * Allocate (set bits in) a specified region of a bitmap. * * Returns: 0 on success, or %-EBUSY if specified region wasn't * free (not all bits were zero). */ static __always_inline int bitmap_allocate_region(unsigned long *bitmap, unsigned int pos, int order) { unsigned int len = BIT(order); if (find_next_bit(bitmap, pos + len, pos) < pos + len) return -EBUSY; bitmap_set(bitmap, pos, len); return 0; } /** * bitmap_find_free_region - find a contiguous aligned mem region * @bitmap: array of unsigned longs corresponding to the bitmap * @bits: number of bits in the bitmap * @order: region size (log base 2 of number of bits) to find * * Find a region of free (zero) bits in a @bitmap of @bits bits and * allocate them (set them to one). Only consider regions of length * a power (@order) of two, aligned to that power of two, which * makes the search algorithm much faster. * * Returns: the bit offset in bitmap of the allocated region, * or -errno on failure. */ static __always_inline int bitmap_find_free_region(unsigned long *bitmap, unsigned int bits, int order) { unsigned int pos, end; /* scans bitmap by regions of size order */ for (pos = 0; (end = pos + BIT(order)) <= bits; pos = end) { if (!bitmap_allocate_region(bitmap, pos, order)) return pos; } return -ENOMEM; } /** * BITMAP_FROM_U64() - Represent u64 value in the format suitable for bitmap. * @n: u64 value * * Linux bitmaps are internally arrays of unsigned longs, i.e. 32-bit * integers in 32-bit environment, and 64-bit integers in 64-bit one. * * There are four combinations of endianness and length of the word in linux * ABIs: LE64, BE64, LE32 and BE32. * * On 64-bit kernels 64-bit LE and BE numbers are naturally ordered in * bitmaps and therefore don't require any special handling. * * On 32-bit kernels 32-bit LE ABI orders lo word of 64-bit number in memory * prior to hi, and 32-bit BE orders hi word prior to lo. The bitmap on the * other hand is represented as an array of 32-bit words and the position of * bit N may therefore be calculated as: word #(N/32) and bit #(N%32) in that * word. For example, bit #42 is located at 10th position of 2nd word. * It matches 32-bit LE ABI, and we can simply let the compiler store 64-bit * values in memory as it usually does. But for BE we need to swap hi and lo * words manually. * * With all that, the macro BITMAP_FROM_U64() does explicit reordering of hi and * lo parts of u64. For LE32 it does nothing, and for BE environment it swaps * hi and lo words, as is expected by bitmap. */ #if __BITS_PER_LONG == 64 #define BITMAP_FROM_U64(n) (n) #else #define BITMAP_FROM_U64(n) ((unsigned long) ((u64)(n) & ULONG_MAX)), \ ((unsigned long) ((u64)(n) >> 32)) #endif /** * bitmap_from_u64 - Check and swap words within u64. * @mask: source bitmap * @dst: destination bitmap * * In 32-bit Big Endian kernel, when using ``(u32 *)(&val)[*]`` * to read u64 mask, we will get the wrong word. * That is ``(u32 *)(&val)[0]`` gets the upper 32 bits, * but we expect the lower 32-bits of u64. */ static __always_inline void bitmap_from_u64(unsigned long *dst, u64 mask) { bitmap_from_arr64(dst, &mask, 64); } /** * bitmap_read - read a value of n-bits from the memory region * @map: address to the bitmap memory region * @start: bit offset of the n-bit value * @nbits: size of value in bits, nonzero, up to BITS_PER_LONG * * Returns: value of @nbits bits located at the @start bit offset within the * @map memory region. For @nbits = 0 and @nbits > BITS_PER_LONG the return * value is undefined. */ static __always_inline unsigned long bitmap_read(const unsigned long *map, unsigned long start, unsigned long nbits) { size_t index = BIT_WORD(start); unsigned long offset = start % BITS_PER_LONG; unsigned long space = BITS_PER_LONG - offset; unsigned long value_low, value_high; if (unlikely(!nbits || nbits > BITS_PER_LONG)) return 0; if (space >= nbits) return (map[index] >> offset) & BITMAP_LAST_WORD_MASK(nbits); value_low = map[index] & BITMAP_FIRST_WORD_MASK(start); value_high = map[index + 1] & BITMAP_LAST_WORD_MASK(start + nbits); return (value_low >> offset) | (value_high << space); } /** * bitmap_write - write n-bit value within a memory region * @map: address to the bitmap memory region * @value: value to write, clamped to nbits * @start: bit offset of the n-bit value * @nbits: size of value in bits, nonzero, up to BITS_PER_LONG. * * bitmap_write() behaves as-if implemented as @nbits calls of __assign_bit(), * i.e. bits beyond @nbits are ignored: * * for (bit = 0; bit < nbits; bit++) * __assign_bit(start + bit, bitmap, val & BIT(bit)); * * For @nbits == 0 and @nbits > BITS_PER_LONG no writes are performed. */ static __always_inline void bitmap_write(unsigned long *map, unsigned long value, unsigned long start, unsigned long nbits) { size_t index; unsigned long offset; unsigned long space; unsigned long mask; bool fit; if (unlikely(!nbits || nbits > BITS_PER_LONG)) return; mask = BITMAP_LAST_WORD_MASK(nbits); value &= mask; offset = start % BITS_PER_LONG; space = BITS_PER_LONG - offset; fit = space >= nbits; index = BIT_WORD(start); map[index] &= (fit ? (~(mask << offset)) : ~BITMAP_FIRST_WORD_MASK(start)); map[index] |= value << offset; if (fit) return; map[index + 1] &= BITMAP_FIRST_WORD_MASK(start + nbits); map[index + 1] |= (value >> space); } #define bitmap_get_value8(map, start) \ bitmap_read(map, start, BITS_PER_BYTE) #define bitmap_set_value8(map, value, start) \ bitmap_write(map, value, start, BITS_PER_BYTE) #endif /* __ASSEMBLY__ */ #endif /* __LINUX_BITMAP_H */
92 92 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 // SPDX-License-Identifier: GPL-2.0 /* * SafeSetID Linux Security Module * * Author: Micah Morton <mortonm@chromium.org> * * Copyright (C) 2018 The Chromium OS Authors. * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License version 2, as * published by the Free Software Foundation. * */ #define pr_fmt(fmt) "SafeSetID: " fmt #include <linux/lsm_hooks.h> #include <linux/module.h> #include <linux/ptrace.h> #include <linux/sched/task_stack.h> #include <linux/security.h> #include <uapi/linux/lsm.h> #include "lsm.h" /* Flag indicating whether initialization completed */ int safesetid_initialized __initdata; struct setid_ruleset __rcu *safesetid_setuid_rules; struct setid_ruleset __rcu *safesetid_setgid_rules; /* Compute a decision for a transition from @src to @dst under @policy. */ enum sid_policy_type _setid_policy_lookup(struct setid_ruleset *policy, kid_t src, kid_t dst) { struct setid_rule *rule; enum sid_policy_type result = SIDPOL_DEFAULT; if (policy->type == UID) { hash_for_each_possible(policy->rules, rule, next, __kuid_val(src.uid)) { if (!uid_eq(rule->src_id.uid, src.uid)) continue; if (uid_eq(rule->dst_id.uid, dst.uid)) return SIDPOL_ALLOWED; result = SIDPOL_CONSTRAINED; } } else if (policy->type == GID) { hash_for_each_possible(policy->rules, rule, next, __kgid_val(src.gid)) { if (!gid_eq(rule->src_id.gid, src.gid)) continue; if (gid_eq(rule->dst_id.gid, dst.gid)){ return SIDPOL_ALLOWED; } result = SIDPOL_CONSTRAINED; } } else { /* Should not reach here, report the ID as contrainsted */ result = SIDPOL_CONSTRAINED; } return result; } /* * Compute a decision for a transition from @src to @dst under the active * policy. */ static enum sid_policy_type setid_policy_lookup(kid_t src, kid_t dst, enum setid_type new_type) { enum sid_policy_type result = SIDPOL_DEFAULT; struct setid_ruleset *pol; rcu_read_lock(); if (new_type == UID) pol = rcu_dereference(safesetid_setuid_rules); else if (new_type == GID) pol = rcu_dereference(safesetid_setgid_rules); else { /* Should not reach here */ result = SIDPOL_CONSTRAINED; rcu_read_unlock(); return result; } if (pol) { pol->type = new_type; result = _setid_policy_lookup(pol, src, dst); } rcu_read_unlock(); return result; } static int safesetid_security_capable(const struct cred *cred, struct user_namespace *ns, int cap, unsigned int opts) { /* We're only interested in CAP_SETUID and CAP_SETGID. */ if (cap != CAP_SETUID && cap != CAP_SETGID) return 0; /* * If CAP_SET{U/G}ID is currently used for a setid or setgroups syscall, we * want to let it go through here; the real security check happens later, in * the task_fix_set{u/g}id or task_fix_setgroups hooks. */ if ((opts & CAP_OPT_INSETID) != 0) return 0; switch (cap) { case CAP_SETUID: /* * If no policy applies to this task, allow the use of CAP_SETUID for * other purposes. */ if (setid_policy_lookup((kid_t){.uid = cred->uid}, INVALID_ID, UID) == SIDPOL_DEFAULT) return 0; /* * Reject use of CAP_SETUID for functionality other than calling * set*uid() (e.g. setting up userns uid mappings). */ pr_warn("Operation requires CAP_SETUID, which is not available to UID %u for operations besides approved set*uid transitions\n", __kuid_val(cred->uid)); return -EPERM; case CAP_SETGID: /* * If no policy applies to this task, allow the use of CAP_SETGID for * other purposes. */ if (setid_policy_lookup((kid_t){.gid = cred->gid}, INVALID_ID, GID) == SIDPOL_DEFAULT) return 0; /* * Reject use of CAP_SETUID for functionality other than calling * set*gid() (e.g. setting up userns gid mappings). */ pr_warn("Operation requires CAP_SETGID, which is not available to GID %u for operations besides approved set*gid transitions\n", __kgid_val(cred->gid)); return -EPERM; default: /* Error, the only capabilities were checking for is CAP_SETUID/GID */ return 0; } return 0; } /* * Check whether a caller with old credentials @old is allowed to switch to * credentials that contain @new_id. */ static bool id_permitted_for_cred(const struct cred *old, kid_t new_id, enum setid_type new_type) { bool permitted; /* If our old creds already had this ID in it, it's fine. */ if (new_type == UID) { if (uid_eq(new_id.uid, old->uid) || uid_eq(new_id.uid, old->euid) || uid_eq(new_id.uid, old->suid)) return true; } else if (new_type == GID){ if (gid_eq(new_id.gid, old->gid) || gid_eq(new_id.gid, old->egid) || gid_eq(new_id.gid, old->sgid)) return true; } else /* Error, new_type is an invalid type */ return false; /* * Transitions to new UIDs require a check against the policy of the old * RUID. */ permitted = setid_policy_lookup((kid_t){.uid = old->uid}, new_id, new_type) != SIDPOL_CONSTRAINED; if (!permitted) { if (new_type == UID) { pr_warn("UID transition ((%d,%d,%d) -> %d) blocked\n", __kuid_val(old->uid), __kuid_val(old->euid), __kuid_val(old->suid), __kuid_val(new_id.uid)); } else if (new_type == GID) { pr_warn("GID transition ((%d,%d,%d) -> %d) blocked\n", __kgid_val(old->gid), __kgid_val(old->egid), __kgid_val(old->sgid), __kgid_val(new_id.gid)); } else /* Error, new_type is an invalid type */ return false; } return permitted; } /* * Check whether there is either an exception for user under old cred struct to * set*uid to user under new cred struct, or the UID transition is allowed (by * Linux set*uid rules) even without CAP_SETUID. */ static int safesetid_task_fix_setuid(struct cred *new, const struct cred *old, int flags) { /* Do nothing if there are no setuid restrictions for our old RUID. */ if (setid_policy_lookup((kid_t){.uid = old->uid}, INVALID_ID, UID) == SIDPOL_DEFAULT) return 0; if (id_permitted_for_cred(old, (kid_t){.uid = new->uid}, UID) && id_permitted_for_cred(old, (kid_t){.uid = new->euid}, UID) && id_permitted_for_cred(old, (kid_t){.uid = new->suid}, UID) && id_permitted_for_cred(old, (kid_t){.uid = new->fsuid}, UID)) return 0; /* * Kill this process to avoid potential security vulnerabilities * that could arise from a missing allowlist entry preventing a * privileged process from dropping to a lesser-privileged one. */ force_sig(SIGKILL); return -EACCES; } static int safesetid_task_fix_setgid(struct cred *new, const struct cred *old, int flags) { /* Do nothing if there are no setgid restrictions for our old RGID. */ if (setid_policy_lookup((kid_t){.gid = old->gid}, INVALID_ID, GID) == SIDPOL_DEFAULT) return 0; if (id_permitted_for_cred(old, (kid_t){.gid = new->gid}, GID) && id_permitted_for_cred(old, (kid_t){.gid = new->egid}, GID) && id_permitted_for_cred(old, (kid_t){.gid = new->sgid}, GID) && id_permitted_for_cred(old, (kid_t){.gid = new->fsgid}, GID)) return 0; /* * Kill this process to avoid potential security vulnerabilities * that could arise from a missing allowlist entry preventing a * privileged process from dropping to a lesser-privileged one. */ force_sig(SIGKILL); return -EACCES; } static int safesetid_task_fix_setgroups(struct cred *new, const struct cred *old) { int i; /* Do nothing if there are no setgid restrictions for our old RGID. */ if (setid_policy_lookup((kid_t){.gid = old->gid}, INVALID_ID, GID) == SIDPOL_DEFAULT) return 0; get_group_info(new->group_info); for (i = 0; i < new->group_info->ngroups; i++) { if (!id_permitted_for_cred(old, (kid_t){.gid = new->group_info->gid[i]}, GID)) { put_group_info(new->group_info); /* * Kill this process to avoid potential security vulnerabilities * that could arise from a missing allowlist entry preventing a * privileged process from dropping to a lesser-privileged one. */ force_sig(SIGKILL); return -EACCES; } } put_group_info(new->group_info); return 0; } static const struct lsm_id safesetid_lsmid = { .name = "safesetid", .id = LSM_ID_SAFESETID, }; static struct security_hook_list safesetid_security_hooks[] = { LSM_HOOK_INIT(task_fix_setuid, safesetid_task_fix_setuid), LSM_HOOK_INIT(task_fix_setgid, safesetid_task_fix_setgid), LSM_HOOK_INIT(task_fix_setgroups, safesetid_task_fix_setgroups), LSM_HOOK_INIT(capable, safesetid_security_capable) }; static int __init safesetid_security_init(void) { security_add_hooks(safesetid_security_hooks, ARRAY_SIZE(safesetid_security_hooks), &safesetid_lsmid); /* Report that SafeSetID successfully initialized */ safesetid_initialized = 1; return 0; } DEFINE_LSM(safesetid_security_init) = { .init = safesetid_security_init, .name = "safesetid", };
93 94 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 /* SPDX-License-Identifier: GPL-2.0 */ /* * Implementations of the security context functions. * * Author: Ondrej Mosnacek <omosnacek@gmail.com> * Copyright (C) 2020 Red Hat, Inc. */ #include <linux/jhash.h> #include "context.h" #include "mls.h" u32 context_compute_hash(const struct context *c) { u32 hash = 0; /* * If a context is invalid, it will always be represented by a * context struct with only the len & str set (and vice versa) * under a given policy. Since context structs from different * policies should never meet, it is safe to hash valid and * invalid contexts differently. The context_equal() function * already operates under the same assumption. */ if (c->len) return full_name_hash(NULL, c->str, c->len); hash = jhash_3words(c->user, c->role, c->type, hash); hash = mls_range_hash(&c->range, hash); return hash; }
210 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 /* SPDX-License-Identifier: GPL-2.0+ */ /* * Read-Copy Update mechanism for mutual exclusion (tree-based version) * * Copyright IBM Corporation, 2008 * * Author: Dipankar Sarma <dipankar@in.ibm.com> * Paul E. McKenney <paulmck@linux.ibm.com> Hierarchical algorithm * * Based on the original work by Paul McKenney <paulmck@linux.ibm.com> * and inputs from Rusty Russell, Andrea Arcangeli and Andi Kleen. * * For detailed explanation of Read-Copy Update mechanism see - * Documentation/RCU */ #ifndef __LINUX_RCUTREE_H #define __LINUX_RCUTREE_H void rcu_softirq_qs(void); void rcu_note_context_switch(bool preempt); int rcu_needs_cpu(void); void rcu_cpu_stall_reset(void); void rcu_request_urgent_qs_task(struct task_struct *t); /* * Note a virtualization-based context switch. This is simply a * wrapper around rcu_note_context_switch(), which allows TINY_RCU * to save a few bytes. The caller must have disabled interrupts. */ static inline void rcu_virt_note_context_switch(void) { rcu_note_context_switch(false); } void synchronize_rcu_expedited(void); void rcu_barrier(void); void rcu_momentary_eqs(void); struct rcu_gp_oldstate { unsigned long rgos_norm; unsigned long rgos_exp; }; // Maximum number of rcu_gp_oldstate values corresponding to // not-yet-completed RCU grace periods. #define NUM_ACTIVE_RCU_POLL_FULL_OLDSTATE 4 /** * same_state_synchronize_rcu_full - Are two old-state values identical? * @rgosp1: First old-state value. * @rgosp2: Second old-state value. * * The two old-state values must have been obtained from either * get_state_synchronize_rcu_full(), start_poll_synchronize_rcu_full(), * or get_completed_synchronize_rcu_full(). Returns @true if the two * values are identical and @false otherwise. This allows structures * whose lifetimes are tracked by old-state values to push these values * to a list header, allowing those structures to be slightly smaller. * * Note that equality is judged on a bitwise basis, so that an * @rcu_gp_oldstate structure with an already-completed state in one field * will compare not-equal to a structure with an already-completed state * in the other field. After all, the @rcu_gp_oldstate structure is opaque * so how did such a situation come to pass in the first place? */ static inline bool same_state_synchronize_rcu_full(struct rcu_gp_oldstate *rgosp1, struct rcu_gp_oldstate *rgosp2) { return rgosp1->rgos_norm == rgosp2->rgos_norm && rgosp1->rgos_exp == rgosp2->rgos_exp; } unsigned long start_poll_synchronize_rcu_expedited(void); void start_poll_synchronize_rcu_expedited_full(struct rcu_gp_oldstate *rgosp); void cond_synchronize_rcu_expedited(unsigned long oldstate); void cond_synchronize_rcu_expedited_full(struct rcu_gp_oldstate *rgosp); unsigned long get_state_synchronize_rcu(void); void get_state_synchronize_rcu_full(struct rcu_gp_oldstate *rgosp); unsigned long start_poll_synchronize_rcu(void); void start_poll_synchronize_rcu_full(struct rcu_gp_oldstate *rgosp); bool poll_state_synchronize_rcu(unsigned long oldstate); bool poll_state_synchronize_rcu_full(struct rcu_gp_oldstate *rgosp); void cond_synchronize_rcu(unsigned long oldstate); void cond_synchronize_rcu_full(struct rcu_gp_oldstate *rgosp); #ifdef CONFIG_PROVE_RCU void rcu_irq_exit_check_preempt(void); #else static inline void rcu_irq_exit_check_preempt(void) { } #endif struct task_struct; void rcu_preempt_deferred_qs(struct task_struct *t); void exit_rcu(void); void rcu_scheduler_starting(void); extern int rcu_scheduler_active; void rcu_end_inkernel_boot(void); bool rcu_inkernel_boot_has_ended(void); bool rcu_is_watching(void); #ifndef CONFIG_PREEMPT_RCU void rcu_all_qs(void); #endif /* RCUtree hotplug events */ int rcutree_prepare_cpu(unsigned int cpu); int rcutree_online_cpu(unsigned int cpu); void rcutree_report_cpu_starting(unsigned int cpu); #ifdef CONFIG_HOTPLUG_CPU int rcutree_dead_cpu(unsigned int cpu); int rcutree_dying_cpu(unsigned int cpu); int rcutree_offline_cpu(unsigned int cpu); #else #define rcutree_dead_cpu NULL #define rcutree_dying_cpu NULL #define rcutree_offline_cpu NULL #endif void rcutree_migrate_callbacks(int cpu); /* Called from hotplug and also arm64 early secondary boot failure */ void rcutree_report_cpu_dead(void); #endif /* __LINUX_RCUTREE_H */
1429 270 12 921 703 662 29 28 821 1200 740 746 135 1185 588 125 1458 1447 29 1459 902 478 181 79 452 962 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 /* SPDX-License-Identifier: GPL-2.0-only */ /* * Based on arch/arm/include/asm/atomic.h * * Copyright (C) 1996 Russell King. * Copyright (C) 2002 Deep Blue Solutions Ltd. * Copyright (C) 2012 ARM Ltd. */ #ifndef __ASM_ATOMIC_LSE_H #define __ASM_ATOMIC_LSE_H #define ATOMIC_OP(op, asm_op) \ static __always_inline void \ __lse_atomic_##op(int i, atomic_t *v) \ { \ asm volatile( \ __LSE_PREAMBLE \ " " #asm_op " %w[i], %[v]\n" \ : [v] "+Q" (v->counter) \ : [i] "r" (i)); \ } ATOMIC_OP(andnot, stclr) ATOMIC_OP(or, stset) ATOMIC_OP(xor, steor) ATOMIC_OP(add, stadd) static __always_inline void __lse_atomic_sub(int i, atomic_t *v) { __lse_atomic_add(-i, v); } #undef ATOMIC_OP #define ATOMIC_FETCH_OP(name, mb, op, asm_op, cl...) \ static __always_inline int \ __lse_atomic_fetch_##op##name(int i, atomic_t *v) \ { \ int old; \ \ asm volatile( \ __LSE_PREAMBLE \ " " #asm_op #mb " %w[i], %w[old], %[v]" \ : [v] "+Q" (v->counter), \ [old] "=r" (old) \ : [i] "r" (i) \ : cl); \ \ return old; \ } #define ATOMIC_FETCH_OPS(op, asm_op) \ ATOMIC_FETCH_OP(_relaxed, , op, asm_op) \ ATOMIC_FETCH_OP(_acquire, a, op, asm_op, "memory") \ ATOMIC_FETCH_OP(_release, l, op, asm_op, "memory") \ ATOMIC_FETCH_OP( , al, op, asm_op, "memory") ATOMIC_FETCH_OPS(andnot, ldclr) ATOMIC_FETCH_OPS(or, ldset) ATOMIC_FETCH_OPS(xor, ldeor) ATOMIC_FETCH_OPS(add, ldadd) #undef ATOMIC_FETCH_OP #undef ATOMIC_FETCH_OPS #define ATOMIC_FETCH_OP_SUB(name) \ static __always_inline int \ __lse_atomic_fetch_sub##name(int i, atomic_t *v) \ { \ return __lse_atomic_fetch_add##name(-i, v); \ } ATOMIC_FETCH_OP_SUB(_relaxed) ATOMIC_FETCH_OP_SUB(_acquire) ATOMIC_FETCH_OP_SUB(_release) ATOMIC_FETCH_OP_SUB( ) #undef ATOMIC_FETCH_OP_SUB #define ATOMIC_OP_ADD_SUB_RETURN(name) \ static __always_inline int \ __lse_atomic_add_return##name(int i, atomic_t *v) \ { \ return __lse_atomic_fetch_add##name(i, v) + i; \ } \ \ static __always_inline int \ __lse_atomic_sub_return##name(int i, atomic_t *v) \ { \ return __lse_atomic_fetch_sub(i, v) - i; \ } ATOMIC_OP_ADD_SUB_RETURN(_relaxed) ATOMIC_OP_ADD_SUB_RETURN(_acquire) ATOMIC_OP_ADD_SUB_RETURN(_release) ATOMIC_OP_ADD_SUB_RETURN( ) #undef ATOMIC_OP_ADD_SUB_RETURN static __always_inline void __lse_atomic_and(int i, atomic_t *v) { return __lse_atomic_andnot(~i, v); } #define ATOMIC_FETCH_OP_AND(name, mb, cl...) \ static __always_inline int \ __lse_atomic_fetch_and##name(int i, atomic_t *v) \ { \ return __lse_atomic_fetch_andnot##name(~i, v); \ } ATOMIC_FETCH_OP_AND(_relaxed, ) ATOMIC_FETCH_OP_AND(_acquire, a, "memory") ATOMIC_FETCH_OP_AND(_release, l, "memory") ATOMIC_FETCH_OP_AND( , al, "memory") #undef ATOMIC_FETCH_OP_AND #define ATOMIC64_OP(op, asm_op) \ static __always_inline void \ __lse_atomic64_##op(s64 i, atomic64_t *v) \ { \ asm volatile( \ __LSE_PREAMBLE \ " " #asm_op " %[i], %[v]\n" \ : [v] "+Q" (v->counter) \ : [i] "r" (i)); \ } ATOMIC64_OP(andnot, stclr) ATOMIC64_OP(or, stset) ATOMIC64_OP(xor, steor) ATOMIC64_OP(add, stadd) static __always_inline void __lse_atomic64_sub(s64 i, atomic64_t *v) { __lse_atomic64_add(-i, v); } #undef ATOMIC64_OP #define ATOMIC64_FETCH_OP(name, mb, op, asm_op, cl...) \ static __always_inline long \ __lse_atomic64_fetch_##op##name(s64 i, atomic64_t *v) \ { \ s64 old; \ \ asm volatile( \ __LSE_PREAMBLE \ " " #asm_op #mb " %[i], %[old], %[v]" \ : [v] "+Q" (v->counter), \ [old] "=r" (old) \ : [i] "r" (i) \ : cl); \ \ return old; \ } #define ATOMIC64_FETCH_OPS(op, asm_op) \ ATOMIC64_FETCH_OP(_relaxed, , op, asm_op) \ ATOMIC64_FETCH_OP(_acquire, a, op, asm_op, "memory") \ ATOMIC64_FETCH_OP(_release, l, op, asm_op, "memory") \ ATOMIC64_FETCH_OP( , al, op, asm_op, "memory") ATOMIC64_FETCH_OPS(andnot, ldclr) ATOMIC64_FETCH_OPS(or, ldset) ATOMIC64_FETCH_OPS(xor, ldeor) ATOMIC64_FETCH_OPS(add, ldadd) #undef ATOMIC64_FETCH_OP #undef ATOMIC64_FETCH_OPS #define ATOMIC64_FETCH_OP_SUB(name) \ static __always_inline long \ __lse_atomic64_fetch_sub##name(s64 i, atomic64_t *v) \ { \ return __lse_atomic64_fetch_add##name(-i, v); \ } ATOMIC64_FETCH_OP_SUB(_relaxed) ATOMIC64_FETCH_OP_SUB(_acquire) ATOMIC64_FETCH_OP_SUB(_release) ATOMIC64_FETCH_OP_SUB( ) #undef ATOMIC64_FETCH_OP_SUB #define ATOMIC64_OP_ADD_SUB_RETURN(name) \ static __always_inline long \ __lse_atomic64_add_return##name(s64 i, atomic64_t *v) \ { \ return __lse_atomic64_fetch_add##name(i, v) + i; \ } \ \ static __always_inline long \ __lse_atomic64_sub_return##name(s64 i, atomic64_t *v) \ { \ return __lse_atomic64_fetch_sub##name(i, v) - i; \ } ATOMIC64_OP_ADD_SUB_RETURN(_relaxed) ATOMIC64_OP_ADD_SUB_RETURN(_acquire) ATOMIC64_OP_ADD_SUB_RETURN(_release) ATOMIC64_OP_ADD_SUB_RETURN( ) #undef ATOMIC64_OP_ADD_SUB_RETURN static __always_inline void __lse_atomic64_and(s64 i, atomic64_t *v) { return __lse_atomic64_andnot(~i, v); } #define ATOMIC64_FETCH_OP_AND(name, mb, cl...) \ static __always_inline long \ __lse_atomic64_fetch_and##name(s64 i, atomic64_t *v) \ { \ return __lse_atomic64_fetch_andnot##name(~i, v); \ } ATOMIC64_FETCH_OP_AND(_relaxed, ) ATOMIC64_FETCH_OP_AND(_acquire, a, "memory") ATOMIC64_FETCH_OP_AND(_release, l, "memory") ATOMIC64_FETCH_OP_AND( , al, "memory") #undef ATOMIC64_FETCH_OP_AND static __always_inline s64 __lse_atomic64_dec_if_positive(atomic64_t *v) { unsigned long tmp; asm volatile( __LSE_PREAMBLE "1: ldr %x[tmp], %[v]\n" " subs %[ret], %x[tmp], #1\n" " b.lt 2f\n" " casal %x[tmp], %[ret], %[v]\n" " sub %x[tmp], %x[tmp], #1\n" " sub %x[tmp], %x[tmp], %[ret]\n" " cbnz %x[tmp], 1b\n" "2:" : [ret] "+&r" (v), [v] "+Q" (v->counter), [tmp] "=&r" (tmp) : : "cc", "memory"); return (long)v; } #define __CMPXCHG_CASE(w, sfx, name, sz, mb, cl...) \ static __always_inline u##sz \ __lse__cmpxchg_case_##name##sz(volatile void *ptr, \ u##sz old, \ u##sz new) \ { \ asm volatile( \ __LSE_PREAMBLE \ " cas" #mb #sfx " %" #w "[old], %" #w "[new], %[v]\n" \ : [v] "+Q" (*(u##sz *)ptr), \ [old] "+r" (old) \ : [new] "rZ" (new) \ : cl); \ \ return old; \ } __CMPXCHG_CASE(w, b, , 8, ) __CMPXCHG_CASE(w, h, , 16, ) __CMPXCHG_CASE(w, , , 32, ) __CMPXCHG_CASE(x, , , 64, ) __CMPXCHG_CASE(w, b, acq_, 8, a, "memory") __CMPXCHG_CASE(w, h, acq_, 16, a, "memory") __CMPXCHG_CASE(w, , acq_, 32, a, "memory") __CMPXCHG_CASE(x, , acq_, 64, a, "memory") __CMPXCHG_CASE(w, b, rel_, 8, l, "memory") __CMPXCHG_CASE(w, h, rel_, 16, l, "memory") __CMPXCHG_CASE(w, , rel_, 32, l, "memory") __CMPXCHG_CASE(x, , rel_, 64, l, "memory") __CMPXCHG_CASE(w, b, mb_, 8, al, "memory") __CMPXCHG_CASE(w, h, mb_, 16, al, "memory") __CMPXCHG_CASE(w, , mb_, 32, al, "memory") __CMPXCHG_CASE(x, , mb_, 64, al, "memory") #undef __CMPXCHG_CASE #define __CMPXCHG128(name, mb, cl...) \ static __always_inline u128 \ __lse__cmpxchg128##name(volatile u128 *ptr, u128 old, u128 new) \ { \ union __u128_halves r, o = { .full = (old) }, \ n = { .full = (new) }; \ register unsigned long x0 asm ("x0") = o.low; \ register unsigned long x1 asm ("x1") = o.high; \ register unsigned long x2 asm ("x2") = n.low; \ register unsigned long x3 asm ("x3") = n.high; \ register unsigned long x4 asm ("x4") = (unsigned long)ptr; \ \ asm volatile( \ __LSE_PREAMBLE \ " casp" #mb "\t%[old1], %[old2], %[new1], %[new2], %[v]\n"\ : [old1] "+&r" (x0), [old2] "+&r" (x1), \ [v] "+Q" (*(u128 *)ptr) \ : [new1] "r" (x2), [new2] "r" (x3), [ptr] "r" (x4), \ [oldval1] "r" (o.low), [oldval2] "r" (o.high) \ : cl); \ \ r.low = x0; r.high = x1; \ \ return r.full; \ } __CMPXCHG128( , ) __CMPXCHG128(_mb, al, "memory") #undef __CMPXCHG128 #endif /* __ASM_ATOMIC_LSE_H */
3 3 3 3 1 2 5 2 3 3 1 1 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 /* * llc_input.c - Minimal input path for LLC * * Copyright (c) 1997 by Procom Technology, Inc. * 2001-2003 by Arnaldo Carvalho de Melo <acme@conectiva.com.br> * * This program can be redistributed or modified under the terms of the * GNU General Public License as published by the Free Software Foundation. * This program is distributed without any warranty or implied warranty * of merchantability or fitness for a particular purpose. * * See the GNU General Public License for more details. */ #include <linux/netdevice.h> #include <linux/slab.h> #include <linux/export.h> #include <net/net_namespace.h> #include <net/llc.h> #include <net/llc_pdu.h> #include <net/llc_sap.h> #if 0 #define dprintk(args...) printk(KERN_DEBUG args) #else #define dprintk(args...) #endif /* * Packet handler for the station, registerable because in the minimal * LLC core that is taking shape only the very minimal subset of LLC that * is needed for things like IPX, Appletalk, etc will stay, with all the * rest in the llc1 and llc2 modules. */ static void (*llc_station_handler)(struct sk_buff *skb); /* * Packet handlers for LLC_DEST_SAP and LLC_DEST_CONN. */ static void (*llc_type_handlers[2])(struct llc_sap *sap, struct sk_buff *skb); void llc_add_pack(int type, void (*handler)(struct llc_sap *sap, struct sk_buff *skb)) { smp_wmb(); /* ensure initialisation is complete before it's called */ if (type == LLC_DEST_SAP || type == LLC_DEST_CONN) llc_type_handlers[type - 1] = handler; } void llc_remove_pack(int type) { if (type == LLC_DEST_SAP || type == LLC_DEST_CONN) llc_type_handlers[type - 1] = NULL; synchronize_net(); } void llc_set_station_handler(void (*handler)(struct sk_buff *skb)) { /* Ensure initialisation is complete before it's called */ if (handler) smp_wmb(); llc_station_handler = handler; if (!handler) synchronize_net(); } /** * llc_pdu_type - returns which LLC component must handle for PDU * @skb: input skb * * This function returns which LLC component must handle this PDU. */ static __inline__ int llc_pdu_type(struct sk_buff *skb) { int type = LLC_DEST_CONN; /* I-PDU or S-PDU type */ struct llc_pdu_sn *pdu = llc_pdu_sn_hdr(skb); if ((pdu->ctrl_1 & LLC_PDU_TYPE_MASK) != LLC_PDU_TYPE_U) goto out; switch (LLC_U_PDU_CMD(pdu)) { case LLC_1_PDU_CMD_XID: case LLC_1_PDU_CMD_UI: case LLC_1_PDU_CMD_TEST: type = LLC_DEST_SAP; break; case LLC_2_PDU_CMD_SABME: case LLC_2_PDU_CMD_DISC: case LLC_2_PDU_RSP_UA: case LLC_2_PDU_RSP_DM: case LLC_2_PDU_RSP_FRMR: break; default: type = LLC_DEST_INVALID; break; } out: return type; } /** * llc_fixup_skb - initializes skb pointers * @skb: This argument points to incoming skb * * Initializes internal skb pointer to start of network layer by deriving * length of LLC header; finds length of LLC control field in LLC header * by looking at the two lowest-order bits of the first control field * byte; field is either 3 or 4 bytes long. */ static inline int llc_fixup_skb(struct sk_buff *skb) { u8 llc_len = 2; struct llc_pdu_un *pdu; if (unlikely(!pskb_may_pull(skb, sizeof(*pdu)))) return 0; pdu = (struct llc_pdu_un *)skb->data; if ((pdu->ctrl_1 & LLC_PDU_TYPE_MASK) == LLC_PDU_TYPE_U) llc_len = 1; llc_len += 2; if (unlikely(!pskb_may_pull(skb, llc_len))) return 0; skb_pull(skb, llc_len); skb_reset_transport_header(skb); if (skb->protocol == htons(ETH_P_802_2)) { __be16 pdulen; s32 data_size; if (skb->mac_len < ETH_HLEN) return 0; pdulen = eth_hdr(skb)->h_proto; data_size = ntohs(pdulen) - llc_len; if (data_size < 0 || !pskb_may_pull(skb, data_size)) return 0; if (unlikely(pskb_trim_rcsum(skb, data_size))) return 0; } return 1; } /** * llc_rcv - 802.2 entry point from net lower layers * @skb: received pdu * @dev: device that receive pdu * @pt: packet type * @orig_dev: the original receive net device * * When the system receives a 802.2 frame this function is called. It * checks SAP and connection of received pdu and passes frame to * llc_{station,sap,conn}_rcv for sending to proper state machine. If * the frame is related to a busy connection (a connection is sending * data now), it queues this frame in the connection's backlog. */ int llc_rcv(struct sk_buff *skb, struct net_device *dev, struct packet_type *pt, struct net_device *orig_dev) { struct llc_sap *sap; struct llc_pdu_sn *pdu; int dest; int (*rcv)(struct sk_buff *, struct net_device *, struct packet_type *, struct net_device *); void (*sta_handler)(struct sk_buff *skb); void (*sap_handler)(struct llc_sap *sap, struct sk_buff *skb); /* * When the interface is in promisc. mode, drop all the crap that it * receives, do not try to analyse it. */ if (unlikely(skb->pkt_type == PACKET_OTHERHOST)) { dprintk("%s: PACKET_OTHERHOST\n", __func__); goto drop; } skb = skb_share_check(skb, GFP_ATOMIC); if (unlikely(!skb)) goto out; if (unlikely(!llc_fixup_skb(skb))) goto drop; pdu = llc_pdu_sn_hdr(skb); if (unlikely(!pdu->dsap)) /* NULL DSAP, refer to station */ goto handle_station; sap = llc_sap_find(pdu->dsap); if (unlikely(!sap)) {/* unknown SAP */ dprintk("%s: llc_sap_find(%02X) failed!\n", __func__, pdu->dsap); goto drop; } /* * First the upper layer protocols that don't need the full * LLC functionality */ rcv = rcu_dereference(sap->rcv_func); dest = llc_pdu_type(skb); sap_handler = dest ? READ_ONCE(llc_type_handlers[dest - 1]) : NULL; if (unlikely(!sap_handler)) { if (rcv) rcv(skb, dev, pt, orig_dev); else kfree_skb(skb); } else { if (rcv) { struct sk_buff *cskb = skb_clone(skb, GFP_ATOMIC); if (cskb) rcv(cskb, dev, pt, orig_dev); } sap_handler(sap, skb); } llc_sap_put(sap); out: return 0; drop: kfree_skb(skb); goto out; handle_station: sta_handler = READ_ONCE(llc_station_handler); if (!sta_handler) goto drop; sta_handler(skb); goto out; } EXPORT_SYMBOL(llc_add_pack); EXPORT_SYMBOL(llc_remove_pack); EXPORT_SYMBOL(llc_set_station_handler);
774 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 /* SPDX-License-Identifier: GPL-2.0-only */ /* * Copyright (C) 2016 ARM Ltd. */ #ifndef __ASM_PGTABLE_PROT_H #define __ASM_PGTABLE_PROT_H #include <asm/memory.h> #include <asm/pgtable-hwdef.h> #include <linux/const.h> /* * Software defined PTE bits definition. */ #define PTE_WRITE (PTE_DBM) /* same as DBM (51) */ #define PTE_SWP_EXCLUSIVE (_AT(pteval_t, 1) << 2) /* only for swp ptes */ #define PTE_DIRTY (_AT(pteval_t, 1) << 55) #define PTE_SPECIAL (_AT(pteval_t, 1) << 56) /* * PTE_PRESENT_INVALID=1 & PTE_VALID=0 indicates that the pte's fields should be * interpreted according to the HW layout by SW but any attempted HW access to * the address will result in a fault. pte_present() returns true. */ #define PTE_PRESENT_INVALID (PTE_NG) /* only when !PTE_VALID */ #ifdef CONFIG_HAVE_ARCH_USERFAULTFD_WP #define PTE_UFFD_WP (_AT(pteval_t, 1) << 58) /* uffd-wp tracking */ #define PTE_SWP_UFFD_WP (_AT(pteval_t, 1) << 3) /* only for swp ptes */ #else #define PTE_UFFD_WP (_AT(pteval_t, 0)) #define PTE_SWP_UFFD_WP (_AT(pteval_t, 0)) #endif /* CONFIG_HAVE_ARCH_USERFAULTFD_WP */ #define _PROT_DEFAULT (PTE_TYPE_PAGE | PTE_AF | PTE_SHARED) #define PROT_DEFAULT (PTE_TYPE_PAGE | PTE_MAYBE_NG | PTE_MAYBE_SHARED | PTE_AF) #define PROT_SECT_DEFAULT (PMD_TYPE_SECT | PMD_MAYBE_NG | PMD_MAYBE_SHARED | PMD_SECT_AF) #define PROT_DEVICE_nGnRnE (PROT_DEFAULT | PTE_PXN | PTE_UXN | PTE_WRITE | PTE_ATTRINDX(MT_DEVICE_nGnRnE)) #define PROT_DEVICE_nGnRE (PROT_DEFAULT | PTE_PXN | PTE_UXN | PTE_WRITE | PTE_ATTRINDX(MT_DEVICE_nGnRE)) #define PROT_NORMAL_NC (PROT_DEFAULT | PTE_PXN | PTE_UXN | PTE_WRITE | PTE_ATTRINDX(MT_NORMAL_NC)) #define PROT_NORMAL (PROT_DEFAULT | PTE_PXN | PTE_UXN | PTE_WRITE | PTE_ATTRINDX(MT_NORMAL)) #define PROT_NORMAL_TAGGED (PROT_DEFAULT | PTE_PXN | PTE_UXN | PTE_WRITE | PTE_ATTRINDX(MT_NORMAL_TAGGED)) #define PROT_SECT_DEVICE_nGnRE (PROT_SECT_DEFAULT | PMD_SECT_PXN | PMD_SECT_UXN | PMD_ATTRINDX(MT_DEVICE_nGnRE)) #define PROT_SECT_NORMAL (PROT_SECT_DEFAULT | PMD_SECT_PXN | PMD_SECT_UXN | PTE_WRITE | PMD_ATTRINDX(MT_NORMAL)) #define PROT_SECT_NORMAL_EXEC (PROT_SECT_DEFAULT | PMD_SECT_UXN | PMD_ATTRINDX(MT_NORMAL)) #define _PAGE_DEFAULT (_PROT_DEFAULT | PTE_ATTRINDX(MT_NORMAL)) #define _PAGE_KERNEL (PROT_NORMAL) #define _PAGE_KERNEL_RO ((PROT_NORMAL & ~PTE_WRITE) | PTE_RDONLY) #define _PAGE_KERNEL_ROX ((PROT_NORMAL & ~(PTE_WRITE | PTE_PXN)) | PTE_RDONLY) #define _PAGE_KERNEL_EXEC (PROT_NORMAL & ~PTE_PXN) #define _PAGE_KERNEL_EXEC_CONT ((PROT_NORMAL & ~PTE_PXN) | PTE_CONT) #define _PAGE_SHARED (_PAGE_DEFAULT | PTE_USER | PTE_RDONLY | PTE_NG | PTE_PXN | PTE_UXN | PTE_WRITE) #define _PAGE_SHARED_EXEC (_PAGE_DEFAULT | PTE_USER | PTE_RDONLY | PTE_NG | PTE_PXN | PTE_WRITE) #define _PAGE_READONLY (_PAGE_DEFAULT | PTE_USER | PTE_RDONLY | PTE_NG | PTE_PXN | PTE_UXN) #define _PAGE_READONLY_EXEC (_PAGE_DEFAULT | PTE_USER | PTE_RDONLY | PTE_NG | PTE_PXN) #define _PAGE_EXECONLY (_PAGE_DEFAULT | PTE_RDONLY | PTE_NG | PTE_PXN) #ifndef __ASSEMBLY__ #include <asm/cpufeature.h> #include <asm/pgtable-types.h> #include <asm/rsi.h> extern bool arm64_use_ng_mappings; extern unsigned long prot_ns_shared; #define PROT_NS_SHARED (is_realm_world() ? prot_ns_shared : 0) #define PTE_MAYBE_NG (arm64_use_ng_mappings ? PTE_NG : 0) #define PMD_MAYBE_NG (arm64_use_ng_mappings ? PMD_SECT_NG : 0) #ifndef CONFIG_ARM64_LPA2 #define lpa2_is_enabled() false #define PTE_MAYBE_SHARED PTE_SHARED #define PMD_MAYBE_SHARED PMD_SECT_S #define PHYS_MASK_SHIFT (CONFIG_ARM64_PA_BITS) #else static inline bool __pure lpa2_is_enabled(void) { return read_tcr() & TCR_DS; } #define PTE_MAYBE_SHARED (lpa2_is_enabled() ? 0 : PTE_SHARED) #define PMD_MAYBE_SHARED (lpa2_is_enabled() ? 0 : PMD_SECT_S) #define PHYS_MASK_SHIFT (lpa2_is_enabled() ? CONFIG_ARM64_PA_BITS : 48) #endif /* * Highest possible physical address supported. */ #define PHYS_MASK ((UL(1) << PHYS_MASK_SHIFT) - 1) /* * If we have userspace only BTI we don't want to mark kernel pages * guarded even if the system does support BTI. */ #define PTE_MAYBE_GP (system_supports_bti_kernel() ? PTE_GP : 0) #define PAGE_KERNEL __pgprot(_PAGE_KERNEL) #define PAGE_KERNEL_RO __pgprot(_PAGE_KERNEL_RO) #define PAGE_KERNEL_ROX __pgprot(_PAGE_KERNEL_ROX) #define PAGE_KERNEL_EXEC __pgprot(_PAGE_KERNEL_EXEC) #define PAGE_KERNEL_EXEC_CONT __pgprot(_PAGE_KERNEL_EXEC_CONT) #define PAGE_S2_MEMATTR(attr, has_fwb) \ ({ \ u64 __val; \ if (has_fwb) \ __val = PTE_S2_MEMATTR(MT_S2_FWB_ ## attr); \ else \ __val = PTE_S2_MEMATTR(MT_S2_ ## attr); \ __val; \ }) #define PAGE_NONE __pgprot(((_PAGE_DEFAULT) & ~PTE_VALID) | PTE_PRESENT_INVALID | PTE_RDONLY | PTE_NG | PTE_PXN | PTE_UXN) /* shared+writable pages are clean by default, hence PTE_RDONLY|PTE_WRITE */ #define PAGE_SHARED __pgprot(_PAGE_SHARED) #define PAGE_SHARED_EXEC __pgprot(_PAGE_SHARED_EXEC) #define PAGE_READONLY __pgprot(_PAGE_READONLY) #define PAGE_READONLY_EXEC __pgprot(_PAGE_READONLY_EXEC) #define PAGE_EXECONLY __pgprot(_PAGE_EXECONLY) #endif /* __ASSEMBLY__ */ #define pte_pi_index(pte) ( \ ((pte & BIT(PTE_PI_IDX_3)) >> (PTE_PI_IDX_3 - 3)) | \ ((pte & BIT(PTE_PI_IDX_2)) >> (PTE_PI_IDX_2 - 2)) | \ ((pte & BIT(PTE_PI_IDX_1)) >> (PTE_PI_IDX_1 - 1)) | \ ((pte & BIT(PTE_PI_IDX_0)) >> (PTE_PI_IDX_0 - 0))) /* * Page types used via Permission Indirection Extension (PIE). PIE uses * the USER, DBM, PXN and UXN bits to to generate an index which is used * to look up the actual permission in PIR_ELx and PIRE0_EL1. We define * combinations we use on non-PIE systems with the same encoding, for * convenience these are listed here as comments as are the unallocated * encodings. */ /* 0: PAGE_DEFAULT */ /* 1: PTE_USER */ /* 2: PTE_WRITE */ /* 3: PTE_WRITE | PTE_USER */ /* 4: PAGE_EXECONLY PTE_PXN */ /* 5: PAGE_READONLY_EXEC PTE_PXN | PTE_USER */ /* 6: PTE_PXN | PTE_WRITE */ /* 7: PAGE_SHARED_EXEC PTE_PXN | PTE_WRITE | PTE_USER */ /* 8: PAGE_KERNEL_ROX PTE_UXN */ /* 9: PAGE_GCS_RO PTE_UXN | PTE_USER */ /* a: PAGE_KERNEL_EXEC PTE_UXN | PTE_WRITE */ /* b: PAGE_GCS PTE_UXN | PTE_WRITE | PTE_USER */ /* c: PAGE_KERNEL_RO PTE_UXN | PTE_PXN */ /* d: PAGE_READONLY PTE_UXN | PTE_PXN | PTE_USER */ /* e: PAGE_KERNEL PTE_UXN | PTE_PXN | PTE_WRITE */ /* f: PAGE_SHARED PTE_UXN | PTE_PXN | PTE_WRITE | PTE_USER */ #define _PAGE_GCS (_PAGE_DEFAULT | PTE_NG | PTE_UXN | PTE_WRITE | PTE_USER) #define _PAGE_GCS_RO (_PAGE_DEFAULT | PTE_NG | PTE_UXN | PTE_USER) #define PAGE_GCS __pgprot(_PAGE_GCS) #define PAGE_GCS_RO __pgprot(_PAGE_GCS_RO) #define PIE_E0 ( \ PIRx_ELx_PERM_PREP(pte_pi_index(_PAGE_GCS), PIE_GCS) | \ PIRx_ELx_PERM_PREP(pte_pi_index(_PAGE_GCS_RO), PIE_R) | \ PIRx_ELx_PERM_PREP(pte_pi_index(_PAGE_EXECONLY), PIE_X_O) | \ PIRx_ELx_PERM_PREP(pte_pi_index(_PAGE_READONLY_EXEC), PIE_RX_O) | \ PIRx_ELx_PERM_PREP(pte_pi_index(_PAGE_SHARED_EXEC), PIE_RWX_O) | \ PIRx_ELx_PERM_PREP(pte_pi_index(_PAGE_READONLY), PIE_R_O) | \ PIRx_ELx_PERM_PREP(pte_pi_index(_PAGE_SHARED), PIE_RW_O)) #define PIE_E1 ( \ PIRx_ELx_PERM_PREP(pte_pi_index(_PAGE_GCS), PIE_NONE_O) | \ PIRx_ELx_PERM_PREP(pte_pi_index(_PAGE_GCS_RO), PIE_NONE_O) | \ PIRx_ELx_PERM_PREP(pte_pi_index(_PAGE_EXECONLY), PIE_NONE_O) | \ PIRx_ELx_PERM_PREP(pte_pi_index(_PAGE_READONLY_EXEC), PIE_R) | \ PIRx_ELx_PERM_PREP(pte_pi_index(_PAGE_SHARED_EXEC), PIE_RW) | \ PIRx_ELx_PERM_PREP(pte_pi_index(_PAGE_READONLY), PIE_R) | \ PIRx_ELx_PERM_PREP(pte_pi_index(_PAGE_SHARED), PIE_RW) | \ PIRx_ELx_PERM_PREP(pte_pi_index(_PAGE_KERNEL_ROX), PIE_RX) | \ PIRx_ELx_PERM_PREP(pte_pi_index(_PAGE_KERNEL_EXEC), PIE_RWX) | \ PIRx_ELx_PERM_PREP(pte_pi_index(_PAGE_KERNEL_RO), PIE_R) | \ PIRx_ELx_PERM_PREP(pte_pi_index(_PAGE_KERNEL), PIE_RW)) #endif /* __ASM_PGTABLE_PROT_H */
334 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 // SPDX-License-Identifier: GPL-2.0 /* * Copyright IBM Corporation, 2021 * * Author: Mike Rapoport <rppt@linux.ibm.com> */ #include <linux/mm.h> #include <linux/fs.h> #include <linux/swap.h> #include <linux/mount.h> #include <linux/memfd.h> #include <linux/bitops.h> #include <linux/printk.h> #include <linux/pagemap.h> #include <linux/syscalls.h> #include <linux/pseudo_fs.h> #include <linux/secretmem.h> #include <linux/set_memory.h> #include <linux/sched/signal.h> #include <uapi/linux/magic.h> #include <asm/tlbflush.h> #include "internal.h" #undef pr_fmt #define pr_fmt(fmt) "secretmem: " fmt /* * Define mode and flag masks to allow validation of the system call * parameters. */ #define SECRETMEM_MODE_MASK (0x0) #define SECRETMEM_FLAGS_MASK SECRETMEM_MODE_MASK static bool secretmem_enable __ro_after_init = 1; module_param_named(enable, secretmem_enable, bool, 0400); MODULE_PARM_DESC(secretmem_enable, "Enable secretmem and memfd_secret(2) system call"); static atomic_t secretmem_users; bool secretmem_active(void) { return !!atomic_read(&secretmem_users); } static vm_fault_t secretmem_fault(struct vm_fault *vmf) { struct address_space *mapping = vmf->vma->vm_file->f_mapping; struct inode *inode = file_inode(vmf->vma->vm_file); pgoff_t offset = vmf->pgoff; gfp_t gfp = vmf->gfp_mask; unsigned long addr; struct folio *folio; vm_fault_t ret; int err; if (((loff_t)vmf->pgoff << PAGE_SHIFT) >= i_size_read(inode)) return vmf_error(-EINVAL); filemap_invalidate_lock_shared(mapping); retry: folio = filemap_lock_folio(mapping, offset); if (IS_ERR(folio)) { folio = folio_alloc(gfp | __GFP_ZERO, 0); if (!folio) { ret = VM_FAULT_OOM; goto out; } err = set_direct_map_invalid_noflush(folio_page(folio, 0)); if (err) { folio_put(folio); ret = vmf_error(err); goto out; } __folio_mark_uptodate(folio); err = filemap_add_folio(mapping, folio, offset, gfp); if (unlikely(err)) { folio_put(folio); /* * If a split of large page was required, it * already happened when we marked the page invalid * which guarantees that this call won't fail */ set_direct_map_default_noflush(folio_page(folio, 0)); if (err == -EEXIST) goto retry; ret = vmf_error(err); goto out; } addr = (unsigned long)folio_address(folio); flush_tlb_kernel_range(addr, addr + PAGE_SIZE); } vmf->page = folio_file_page(folio, vmf->pgoff); ret = VM_FAULT_LOCKED; out: filemap_invalidate_unlock_shared(mapping); return ret; } static const struct vm_operations_struct secretmem_vm_ops = { .fault = secretmem_fault, }; static int secretmem_release(struct inode *inode, struct file *file) { atomic_dec(&secretmem_users); return 0; } static int secretmem_mmap_prepare(struct vm_area_desc *desc) { const unsigned long len = desc->end - desc->start; if ((desc->vm_flags & (VM_SHARED | VM_MAYSHARE)) == 0) return -EINVAL; if (!mlock_future_ok(desc->mm, desc->vm_flags | VM_LOCKED, len)) return -EAGAIN; desc->vm_flags |= VM_LOCKED | VM_DONTDUMP; desc->vm_ops = &secretmem_vm_ops; return 0; } bool vma_is_secretmem(struct vm_area_struct *vma) { return vma->vm_ops == &secretmem_vm_ops; } static const struct file_operations secretmem_fops = { .release = secretmem_release, .mmap_prepare = secretmem_mmap_prepare, }; static int secretmem_migrate_folio(struct address_space *mapping, struct folio *dst, struct folio *src, enum migrate_mode mode) { return -EBUSY; } static void secretmem_free_folio(struct folio *folio) { set_direct_map_default_noflush(folio_page(folio, 0)); folio_zero_segment(folio, 0, folio_size(folio)); } const struct address_space_operations secretmem_aops = { .dirty_folio = noop_dirty_folio, .free_folio = secretmem_free_folio, .migrate_folio = secretmem_migrate_folio, }; static int secretmem_setattr(struct mnt_idmap *idmap, struct dentry *dentry, struct iattr *iattr) { struct inode *inode = d_inode(dentry); struct address_space *mapping = inode->i_mapping; unsigned int ia_valid = iattr->ia_valid; int ret; filemap_invalidate_lock(mapping); if ((ia_valid & ATTR_SIZE) && inode->i_size) ret = -EINVAL; else ret = simple_setattr(idmap, dentry, iattr); filemap_invalidate_unlock(mapping); return ret; } static const struct inode_operations secretmem_iops = { .setattr = secretmem_setattr, }; static struct vfsmount *secretmem_mnt; static struct file *secretmem_file_create(unsigned long flags) { struct file *file; struct inode *inode; const char *anon_name = "[secretmem]"; inode = anon_inode_make_secure_inode(secretmem_mnt->mnt_sb, anon_name, NULL); if (IS_ERR(inode)) return ERR_CAST(inode); file = alloc_file_pseudo(inode, secretmem_mnt, "secretmem", O_RDWR | O_LARGEFILE, &secretmem_fops); if (IS_ERR(file)) goto err_free_inode; mapping_set_gfp_mask(inode->i_mapping, GFP_HIGHUSER); mapping_set_unevictable(inode->i_mapping); inode->i_op = &secretmem_iops; inode->i_mapping->a_ops = &secretmem_aops; /* pretend we are a normal file with zero size */ inode->i_mode |= S_IFREG; inode->i_size = 0; atomic_inc(&secretmem_users); return file; err_free_inode: iput(inode); return file; } SYSCALL_DEFINE1(memfd_secret, unsigned int, flags) { struct file *file; int fd, err; /* make sure local flags do not confict with global fcntl.h */ BUILD_BUG_ON(SECRETMEM_FLAGS_MASK & O_CLOEXEC); if (!secretmem_enable || !can_set_direct_map()) return -ENOSYS; if (flags & ~(SECRETMEM_FLAGS_MASK | O_CLOEXEC)) return -EINVAL; if (atomic_read(&secretmem_users) < 0) return -ENFILE; fd = get_unused_fd_flags(flags & O_CLOEXEC); if (fd < 0) return fd; file = secretmem_file_create(flags); if (IS_ERR(file)) { err = PTR_ERR(file); goto err_put_fd; } fd_install(fd, file); return fd; err_put_fd: put_unused_fd(fd); return err; } static int secretmem_init_fs_context(struct fs_context *fc) { struct pseudo_fs_context *ctx; ctx = init_pseudo(fc, SECRETMEM_MAGIC); if (!ctx) return -ENOMEM; fc->s_iflags |= SB_I_NOEXEC; fc->s_iflags |= SB_I_NODEV; return 0; } static struct file_system_type secretmem_fs = { .name = "secretmem", .init_fs_context = secretmem_init_fs_context, .kill_sb = kill_anon_super, }; static int __init secretmem_init(void) { if (!secretmem_enable || !can_set_direct_map()) return 0; secretmem_mnt = kern_mount(&secretmem_fs); if (IS_ERR(secretmem_mnt)) return PTR_ERR(secretmem_mnt); return 0; } fs_initcall(secretmem_init);
93 94 3 93 93 98 3 3 3 3 98 3 131 3 98 131 130 131 130 131 41 92 92 3 3 3 40 41 41 4 4 4 41 41 41 4 40 41 4 41 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 // SPDX-License-Identifier: GPL-2.0 /* * Implementation of the SID table type. * * Original author: Stephen Smalley, <stephen.smalley.work@gmail.com> * Author: Ondrej Mosnacek, <omosnacek@gmail.com> * * Copyright (C) 2018 Red Hat, Inc. */ #include <linux/errno.h> #include <linux/kernel.h> #include <linux/list.h> #include <linux/rcupdate.h> #include <linux/slab.h> #include <linux/sched.h> #include <linux/spinlock.h> #include <asm/barrier.h> #include "flask.h" #include "security.h" #include "sidtab.h" #include "services.h" struct sidtab_str_cache { struct rcu_head rcu_member; struct list_head lru_member; struct sidtab_entry *parent; u32 len; char str[] __counted_by(len); }; #define index_to_sid(index) ((index) + SECINITSID_NUM + 1) #define sid_to_index(sid) ((sid) - (SECINITSID_NUM + 1)) int sidtab_init(struct sidtab *s) { u32 i; memset(s->roots, 0, sizeof(s->roots)); for (i = 0; i < SECINITSID_NUM; i++) s->isids[i].set = 0; s->frozen = false; s->count = 0; s->convert = NULL; hash_init(s->context_to_sid); spin_lock_init(&s->lock); #if CONFIG_SECURITY_SELINUX_SID2STR_CACHE_SIZE > 0 s->cache_free_slots = CONFIG_SECURITY_SELINUX_SID2STR_CACHE_SIZE; INIT_LIST_HEAD(&s->cache_lru_list); spin_lock_init(&s->cache_lock); #endif return 0; } static u32 context_to_sid(struct sidtab *s, struct context *context, u32 hash) { struct sidtab_entry *entry; u32 sid = 0; rcu_read_lock(); hash_for_each_possible_rcu(s->context_to_sid, entry, list, hash) { if (entry->hash != hash) continue; if (context_equal(&entry->context, context)) { sid = entry->sid; break; } } rcu_read_unlock(); return sid; } int sidtab_set_initial(struct sidtab *s, u32 sid, struct context *context) { struct sidtab_isid_entry *isid; u32 hash; int rc; if (sid == 0 || sid > SECINITSID_NUM) return -EINVAL; isid = &s->isids[sid - 1]; rc = context_cpy(&isid->entry.context, context); if (rc) return rc; #if CONFIG_SECURITY_SELINUX_SID2STR_CACHE_SIZE > 0 isid->entry.cache = NULL; #endif isid->set = 1; hash = context_compute_hash(context); /* * Multiple initial sids may map to the same context. Check that this * context is not already represented in the context_to_sid hashtable * to avoid duplicate entries and long linked lists upon hash * collision. */ if (!context_to_sid(s, context, hash)) { isid->entry.sid = sid; isid->entry.hash = hash; hash_add(s->context_to_sid, &isid->entry.list, hash); } return 0; } int sidtab_hash_stats(struct sidtab *sidtab, char *page) { unsigned int i; int chain_len = 0; int slots_used = 0; int entries = 0; int max_chain_len = 0; unsigned int cur_bucket = 0; struct sidtab_entry *entry; rcu_read_lock(); hash_for_each_rcu(sidtab->context_to_sid, i, entry, list) { entries++; if (i == cur_bucket) { chain_len++; if (chain_len == 1) slots_used++; } else { cur_bucket = i; if (chain_len > max_chain_len) max_chain_len = chain_len; chain_len = 0; } } rcu_read_unlock(); if (chain_len > max_chain_len) max_chain_len = chain_len; return scnprintf(page, PAGE_SIZE, "entries: %d\nbuckets used: %d/%d\n" "longest chain: %d\n", entries, slots_used, SIDTAB_HASH_BUCKETS, max_chain_len); } static u32 sidtab_level_from_count(u32 count) { u32 capacity = SIDTAB_LEAF_ENTRIES; u32 level = 0; while (count > capacity) { capacity <<= SIDTAB_INNER_SHIFT; ++level; } return level; } static int sidtab_alloc_roots(struct sidtab *s, u32 level) { u32 l; if (!s->roots[0].ptr_leaf) { s->roots[0].ptr_leaf = kzalloc(SIDTAB_NODE_ALLOC_SIZE, GFP_ATOMIC); if (!s->roots[0].ptr_leaf) return -ENOMEM; } for (l = 1; l <= level; ++l) if (!s->roots[l].ptr_inner) { s->roots[l].ptr_inner = kzalloc(SIDTAB_NODE_ALLOC_SIZE, GFP_ATOMIC); if (!s->roots[l].ptr_inner) return -ENOMEM; s->roots[l].ptr_inner->entries[0] = s->roots[l - 1]; } return 0; } static struct sidtab_entry *sidtab_do_lookup(struct sidtab *s, u32 index, int alloc) { union sidtab_entry_inner *entry; u32 level, capacity_shift, leaf_index = index / SIDTAB_LEAF_ENTRIES; /* find the level of the subtree we need */ level = sidtab_level_from_count(index + 1); capacity_shift = level * SIDTAB_INNER_SHIFT; /* allocate roots if needed */ if (alloc && sidtab_alloc_roots(s, level) != 0) return NULL; /* lookup inside the subtree */ entry = &s->roots[level]; while (level != 0) { capacity_shift -= SIDTAB_INNER_SHIFT; --level; entry = &entry->ptr_inner->entries[leaf_index >> capacity_shift]; leaf_index &= ((u32)1 << capacity_shift) - 1; if (!entry->ptr_inner) { if (alloc) entry->ptr_inner = kzalloc( SIDTAB_NODE_ALLOC_SIZE, GFP_ATOMIC); if (!entry->ptr_inner) return NULL; } } if (!entry->ptr_leaf) { if (alloc) entry->ptr_leaf = kzalloc(SIDTAB_NODE_ALLOC_SIZE, GFP_ATOMIC); if (!entry->ptr_leaf) return NULL; } return &entry->ptr_leaf->entries[index % SIDTAB_LEAF_ENTRIES]; } static struct sidtab_entry *sidtab_lookup(struct sidtab *s, u32 index) { /* read entries only after reading count */ u32 count = smp_load_acquire(&s->count); if (index >= count) return NULL; return sidtab_do_lookup(s, index, 0); } static struct sidtab_entry *sidtab_lookup_initial(struct sidtab *s, u32 sid) { return s->isids[sid - 1].set ? &s->isids[sid - 1].entry : NULL; } static struct sidtab_entry *sidtab_search_core(struct sidtab *s, u32 sid, int force) { if (sid != 0) { struct sidtab_entry *entry; if (sid > SECINITSID_NUM) entry = sidtab_lookup(s, sid_to_index(sid)); else entry = sidtab_lookup_initial(s, sid); if (entry && (!entry->context.len || force)) return entry; } return sidtab_lookup_initial(s, SECINITSID_UNLABELED); } struct sidtab_entry *sidtab_search_entry(struct sidtab *s, u32 sid) { return sidtab_search_core(s, sid, 0); } struct sidtab_entry *sidtab_search_entry_force(struct sidtab *s, u32 sid) { return sidtab_search_core(s, sid, 1); } int sidtab_context_to_sid(struct sidtab *s, struct context *context, u32 *sid) { unsigned long flags; u32 count, hash = context_compute_hash(context); struct sidtab_convert_params *convert; struct sidtab_entry *dst, *dst_convert; int rc; *sid = context_to_sid(s, context, hash); if (*sid) return 0; /* lock-free search failed: lock, re-search, and insert if not found */ spin_lock_irqsave(&s->lock, flags); rc = 0; *sid = context_to_sid(s, context, hash); if (*sid) goto out_unlock; if (unlikely(s->frozen)) { /* * This sidtab is now frozen - tell the caller to abort and * get the new one. */ rc = -ESTALE; goto out_unlock; } count = s->count; /* bail out if we already reached max entries */ rc = -EOVERFLOW; if (count >= SIDTAB_MAX) goto out_unlock; /* insert context into new entry */ rc = -ENOMEM; dst = sidtab_do_lookup(s, count, 1); if (!dst) goto out_unlock; dst->sid = index_to_sid(count); dst->hash = hash; rc = context_cpy(&dst->context, context); if (rc) goto out_unlock; /* * if we are building a new sidtab, we need to convert the context * and insert it there as well */ convert = s->convert; if (convert) { struct sidtab *target = convert->target; rc = -ENOMEM; dst_convert = sidtab_do_lookup(target, count, 1); if (!dst_convert) { context_destroy(&dst->context); goto out_unlock; } rc = services_convert_context(convert->args, context, &dst_convert->context, GFP_ATOMIC); if (rc) { context_destroy(&dst->context); goto out_unlock; } dst_convert->sid = index_to_sid(count); dst_convert->hash = context_compute_hash(&dst_convert->context); target->count = count + 1; hash_add_rcu(target->context_to_sid, &dst_convert->list, dst_convert->hash); } if (context->len) pr_info("SELinux: Context %s is not valid (left unmapped).\n", context->str); *sid = index_to_sid(count); /* write entries before updating count */ smp_store_release(&s->count, count + 1); hash_add_rcu(s->context_to_sid, &dst->list, dst->hash); rc = 0; out_unlock: spin_unlock_irqrestore(&s->lock, flags); return rc; } static void sidtab_convert_hashtable(struct sidtab *s, u32 count) { struct sidtab_entry *entry; u32 i; for (i = 0; i < count; i++) { entry = sidtab_do_lookup(s, i, 0); entry->sid = index_to_sid(i); entry->hash = context_compute_hash(&entry->context); hash_add_rcu(s->context_to_sid, &entry->list, entry->hash); } } static int sidtab_convert_tree(union sidtab_entry_inner *edst, union sidtab_entry_inner *esrc, u32 *pos, u32 count, u32 level, struct sidtab_convert_params *convert) { int rc; u32 i; if (level != 0) { if (!edst->ptr_inner) { edst->ptr_inner = kzalloc(SIDTAB_NODE_ALLOC_SIZE, GFP_KERNEL); if (!edst->ptr_inner) return -ENOMEM; } i = 0; while (i < SIDTAB_INNER_ENTRIES && *pos < count) { rc = sidtab_convert_tree(&edst->ptr_inner->entries[i], &esrc->ptr_inner->entries[i], pos, count, level - 1, convert); if (rc) return rc; i++; } } else { if (!edst->ptr_leaf) { edst->ptr_leaf = kzalloc(SIDTAB_NODE_ALLOC_SIZE, GFP_KERNEL); if (!edst->ptr_leaf) return -ENOMEM; } i = 0; while (i < SIDTAB_LEAF_ENTRIES && *pos < count) { rc = services_convert_context( convert->args, &esrc->ptr_leaf->entries[i].context, &edst->ptr_leaf->entries[i].context, GFP_KERNEL); if (rc) return rc; (*pos)++; i++; } cond_resched(); } return 0; } int sidtab_convert(struct sidtab *s, struct sidtab_convert_params *params) { unsigned long flags; u32 count, level, pos; int rc; spin_lock_irqsave(&s->lock, flags); /* concurrent policy loads are not allowed */ if (s->convert) { spin_unlock_irqrestore(&s->lock, flags); return -EBUSY; } count = s->count; level = sidtab_level_from_count(count); /* allocate last leaf in the new sidtab (to avoid race with * live convert) */ rc = sidtab_do_lookup(params->target, count - 1, 1) ? 0 : -ENOMEM; if (rc) { spin_unlock_irqrestore(&s->lock, flags); return rc; } /* set count in case no new entries are added during conversion */ params->target->count = count; /* enable live convert of new entries */ s->convert = params; /* we can safely convert the tree outside the lock */ spin_unlock_irqrestore(&s->lock, flags); pr_info("SELinux: Converting %u SID table entries...\n", count); /* convert all entries not covered by live convert */ pos = 0; rc = sidtab_convert_tree(&params->target->roots[level], &s->roots[level], &pos, count, level, params); if (rc) { /* we need to keep the old table - disable live convert */ spin_lock_irqsave(&s->lock, flags); s->convert = NULL; spin_unlock_irqrestore(&s->lock, flags); return rc; } /* * The hashtable can also be modified in sidtab_context_to_sid() * so we must re-acquire the lock here. */ spin_lock_irqsave(&s->lock, flags); sidtab_convert_hashtable(params->target, count); spin_unlock_irqrestore(&s->lock, flags); return 0; } void sidtab_cancel_convert(struct sidtab *s) { unsigned long flags; /* cancelling policy load - disable live convert of sidtab */ spin_lock_irqsave(&s->lock, flags); s->convert = NULL; spin_unlock_irqrestore(&s->lock, flags); } void sidtab_freeze_begin(struct sidtab *s, unsigned long *flags) __acquires(&s->lock) { spin_lock_irqsave(&s->lock, *flags); s->frozen = true; s->convert = NULL; } void sidtab_freeze_end(struct sidtab *s, unsigned long *flags) __releases(&s->lock) { spin_unlock_irqrestore(&s->lock, *flags); } static void sidtab_destroy_entry(struct sidtab_entry *entry) { context_destroy(&entry->context); #if CONFIG_SECURITY_SELINUX_SID2STR_CACHE_SIZE > 0 kfree(rcu_dereference_raw(entry->cache)); #endif } static void sidtab_destroy_tree(union sidtab_entry_inner entry, u32 level) { u32 i; if (level != 0) { struct sidtab_node_inner *node = entry.ptr_inner; if (!node) return; for (i = 0; i < SIDTAB_INNER_ENTRIES; i++) sidtab_destroy_tree(node->entries[i], level - 1); kfree(node); } else { struct sidtab_node_leaf *node = entry.ptr_leaf; if (!node) return; for (i = 0; i < SIDTAB_LEAF_ENTRIES; i++) sidtab_destroy_entry(&node->entries[i]); kfree(node); } } void sidtab_destroy(struct sidtab *s) { u32 i, level; for (i = 0; i < SECINITSID_NUM; i++) if (s->isids[i].set) sidtab_destroy_entry(&s->isids[i].entry); level = SIDTAB_MAX_LEVEL; while (level && !s->roots[level].ptr_inner) --level; sidtab_destroy_tree(s->roots[level], level); /* * The context_to_sid hashtable's objects are all shared * with the isids array and context tree, and so don't need * to be cleaned up here. */ } #if CONFIG_SECURITY_SELINUX_SID2STR_CACHE_SIZE > 0 void sidtab_sid2str_put(struct sidtab *s, struct sidtab_entry *entry, const char *str, u32 str_len) { struct sidtab_str_cache *cache, *victim = NULL; unsigned long flags; /* do not cache invalid contexts */ if (entry->context.len) return; spin_lock_irqsave(&s->cache_lock, flags); cache = rcu_dereference_protected(entry->cache, lockdep_is_held(&s->cache_lock)); if (cache) { /* entry in cache - just bump to the head of LRU list */ list_move(&cache->lru_member, &s->cache_lru_list); goto out_unlock; } cache = kmalloc(struct_size(cache, str, str_len), GFP_ATOMIC); if (!cache) goto out_unlock; if (s->cache_free_slots == 0) { /* pop a cache entry from the tail and free it */ victim = container_of(s->cache_lru_list.prev, struct sidtab_str_cache, lru_member); list_del(&victim->lru_member); rcu_assign_pointer(victim->parent->cache, NULL); } else { s->cache_free_slots--; } cache->parent = entry; cache->len = str_len; memcpy(cache->str, str, str_len); list_add(&cache->lru_member, &s->cache_lru_list); rcu_assign_pointer(entry->cache, cache); out_unlock: spin_unlock_irqrestore(&s->cache_lock, flags); kfree_rcu(victim, rcu_member); } int sidtab_sid2str_get(struct sidtab *s, struct sidtab_entry *entry, char **out, u32 *out_len) { struct sidtab_str_cache *cache; int rc = 0; if (entry->context.len) return -ENOENT; /* do not cache invalid contexts */ rcu_read_lock(); cache = rcu_dereference(entry->cache); if (!cache) { rc = -ENOENT; } else { *out_len = cache->len; if (out) { *out = kmemdup(cache->str, cache->len, GFP_ATOMIC); if (!*out) rc = -ENOMEM; } } rcu_read_unlock(); if (!rc && out) sidtab_sid2str_put(s, entry, *out, *out_len); return rc; } #endif /* CONFIG_SECURITY_SELINUX_SID2STR_CACHE_SIZE > 0 */
1 1 1 1 1 1 1 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 // SPDX-License-Identifier: GPL-2.0-only #include <linux/ethtool_netlink.h> #include <linux/bitmap.h> #include "netlink.h" #include "bitset.h" /* Some bitmaps are internally represented as an array of unsigned long, some * as an array of u32 (some even as single u32 for now). To avoid the need of * wrappers on caller side, we provide two set of functions: those with "32" * suffix in their names expect u32 based bitmaps, those without it expect * unsigned long bitmaps. */ static u32 ethnl_lower_bits(unsigned int n) { return ~(u32)0 >> (32 - n % 32); } static u32 ethnl_upper_bits(unsigned int n) { return ~(u32)0 << (n % 32); } /** * ethnl_bitmap32_clear() - Clear u32 based bitmap * @dst: bitmap to clear * @start: beginning of the interval * @end: end of the interval * @mod: set if bitmap was modified * * Clear @nbits bits of a bitmap with indices @start <= i < @end */ static void ethnl_bitmap32_clear(u32 *dst, unsigned int start, unsigned int end, bool *mod) { unsigned int start_word = start / 32; unsigned int end_word = end / 32; unsigned int i; u32 mask; if (end <= start) return; if (start % 32) { mask = ethnl_upper_bits(start); if (end_word == start_word) { mask &= ethnl_lower_bits(end); if (dst[start_word] & mask) { dst[start_word] &= ~mask; *mod = true; } return; } if (dst[start_word] & mask) { dst[start_word] &= ~mask; *mod = true; } start_word++; } for (i = start_word; i < end_word; i++) { if (dst[i]) { dst[i] = 0; *mod = true; } } if (end % 32) { mask = ethnl_lower_bits(end); if (dst[end_word] & mask) { dst[end_word] &= ~mask; *mod = true; } } } /** * ethnl_bitmap32_not_zero() - Check if any bit is set in an interval * @map: bitmap to test * @start: beginning of the interval * @end: end of the interval * * Return: true if there is non-zero bit with index @start <= i < @end, * false if the whole interval is zero */ static bool ethnl_bitmap32_not_zero(const u32 *map, unsigned int start, unsigned int end) { unsigned int start_word = start / 32; unsigned int end_word = end / 32; u32 mask; if (end <= start) return true; if (start % 32) { mask = ethnl_upper_bits(start); if (end_word == start_word) { mask &= ethnl_lower_bits(end); return map[start_word] & mask; } if (map[start_word] & mask) return true; start_word++; } if (!memchr_inv(map + start_word, '\0', (end_word - start_word) * sizeof(u32))) return true; if (end % 32 == 0) return true; return map[end_word] & ethnl_lower_bits(end); } /** * ethnl_bitmap32_update() - Modify u32 based bitmap according to value/mask * pair * @dst: bitmap to update * @nbits: bit size of the bitmap * @value: values to set * @mask: mask of bits to set * @mod: set to true if bitmap is modified, preserve if not * * Set bits in @dst bitmap which are set in @mask to values from @value, leave * the rest untouched. If destination bitmap was modified, set @mod to true, * leave as it is if not. */ static void ethnl_bitmap32_update(u32 *dst, unsigned int nbits, const u32 *value, const u32 *mask, bool *mod) { while (nbits > 0) { u32 real_mask = mask ? *mask : ~(u32)0; u32 new_value; if (nbits < 32) real_mask &= ethnl_lower_bits(nbits); new_value = (*dst & ~real_mask) | (*value & real_mask); if (new_value != *dst) { *dst = new_value; *mod = true; } if (nbits <= 32) break; dst++; nbits -= 32; value++; if (mask) mask++; } } static bool ethnl_bitmap32_test_bit(const u32 *map, unsigned int index) { return map[index / 32] & (1U << (index % 32)); } /** * ethnl_bitset32_size() - Calculate size of bitset nested attribute * @val: value bitmap (u32 based) * @mask: mask bitmap (u32 based, optional) * @nbits: bit length of the bitset * @names: array of bit names (optional) * @compact: assume compact format for output * * Estimate length of netlink attribute composed by a later call to * ethnl_put_bitset32() call with the same arguments. * * Return: negative error code or attribute length estimate */ int ethnl_bitset32_size(const u32 *val, const u32 *mask, unsigned int nbits, ethnl_string_array_t names, bool compact) { unsigned int len = 0; /* list flag */ if (!mask) len += nla_total_size(sizeof(u32)); /* size */ len += nla_total_size(sizeof(u32)); if (compact) { unsigned int nwords = DIV_ROUND_UP(nbits, 32); /* value, mask */ len += (mask ? 2 : 1) * nla_total_size(nwords * sizeof(u32)); } else { unsigned int bits_len = 0; unsigned int bit_len, i; for (i = 0; i < nbits; i++) { const char *name = names ? names[i] : NULL; if (!ethnl_bitmap32_test_bit(mask ?: val, i)) continue; /* index */ bit_len = nla_total_size(sizeof(u32)); /* name */ if (name) bit_len += ethnl_strz_size(name); /* value */ if (mask && ethnl_bitmap32_test_bit(val, i)) bit_len += nla_total_size(0); /* bit nest */ bits_len += nla_total_size(bit_len); } /* bits nest */ len += nla_total_size(bits_len); } /* outermost nest */ return nla_total_size(len); } /** * ethnl_put_bitset32() - Put a bitset nest into a message * @skb: skb with the message * @attrtype: attribute type for the bitset nest * @val: value bitmap (u32 based) * @mask: mask bitmap (u32 based, optional) * @nbits: bit length of the bitset * @names: array of bit names (optional) * @compact: use compact format for the output * * Compose a nested attribute representing a bitset. If @mask is null, simple * bitmap (bit list) is created, if @mask is provided, represent a value/mask * pair. Bit names are only used in verbose mode and when provided by calller. * * Return: 0 on success, negative error value on error */ int ethnl_put_bitset32(struct sk_buff *skb, int attrtype, const u32 *val, const u32 *mask, unsigned int nbits, ethnl_string_array_t names, bool compact) { struct nlattr *nest; struct nlattr *attr; nest = nla_nest_start(skb, attrtype); if (!nest) return -EMSGSIZE; if (!mask && nla_put_flag(skb, ETHTOOL_A_BITSET_NOMASK)) goto nla_put_failure; if (nla_put_u32(skb, ETHTOOL_A_BITSET_SIZE, nbits)) goto nla_put_failure; if (compact) { unsigned int nwords = DIV_ROUND_UP(nbits, 32); unsigned int nbytes = nwords * sizeof(u32); u32 *dst; attr = nla_reserve(skb, ETHTOOL_A_BITSET_VALUE, nbytes); if (!attr) goto nla_put_failure; dst = nla_data(attr); memcpy(dst, val, nbytes); if (nbits % 32) dst[nwords - 1] &= ethnl_lower_bits(nbits); if (mask) { attr = nla_reserve(skb, ETHTOOL_A_BITSET_MASK, nbytes); if (!attr) goto nla_put_failure; dst = nla_data(attr); memcpy(dst, mask, nbytes); if (nbits % 32) dst[nwords - 1] &= ethnl_lower_bits(nbits); } } else { struct nlattr *bits; unsigned int i; bits = nla_nest_start(skb, ETHTOOL_A_BITSET_BITS); if (!bits) goto nla_put_failure; for (i = 0; i < nbits; i++) { const char *name = names ? names[i] : NULL; if (!ethnl_bitmap32_test_bit(mask ?: val, i)) continue; attr = nla_nest_start(skb, ETHTOOL_A_BITSET_BITS_BIT); if (!attr) goto nla_put_failure; if (nla_put_u32(skb, ETHTOOL_A_BITSET_BIT_INDEX, i)) goto nla_put_failure; if (name && ethnl_put_strz(skb, ETHTOOL_A_BITSET_BIT_NAME, name)) goto nla_put_failure; if (mask && ethnl_bitmap32_test_bit(val, i) && nla_put_flag(skb, ETHTOOL_A_BITSET_BIT_VALUE)) goto nla_put_failure; nla_nest_end(skb, attr); } nla_nest_end(skb, bits); } nla_nest_end(skb, nest); return 0; nla_put_failure: nla_nest_cancel(skb, nest); return -EMSGSIZE; } static const struct nla_policy bitset_policy[] = { [ETHTOOL_A_BITSET_NOMASK] = { .type = NLA_FLAG }, [ETHTOOL_A_BITSET_SIZE] = NLA_POLICY_MAX(NLA_U32, ETHNL_MAX_BITSET_SIZE), [ETHTOOL_A_BITSET_BITS] = { .type = NLA_NESTED }, [ETHTOOL_A_BITSET_VALUE] = { .type = NLA_BINARY }, [ETHTOOL_A_BITSET_MASK] = { .type = NLA_BINARY }, }; static const struct nla_policy bit_policy[] = { [ETHTOOL_A_BITSET_BIT_INDEX] = { .type = NLA_U32 }, [ETHTOOL_A_BITSET_BIT_NAME] = { .type = NLA_NUL_STRING }, [ETHTOOL_A_BITSET_BIT_VALUE] = { .type = NLA_FLAG }, }; /** * ethnl_bitset_is_compact() - check if bitset attribute represents a compact * bitset * @bitset: nested attribute representing a bitset * @compact: pointer for return value * * Return: 0 on success, negative error code on failure */ int ethnl_bitset_is_compact(const struct nlattr *bitset, bool *compact) { struct nlattr *tb[ARRAY_SIZE(bitset_policy)]; int ret; ret = nla_parse_nested(tb, ARRAY_SIZE(bitset_policy) - 1, bitset, bitset_policy, NULL); if (ret < 0) return ret; if (tb[ETHTOOL_A_BITSET_BITS]) { if (tb[ETHTOOL_A_BITSET_VALUE] || tb[ETHTOOL_A_BITSET_MASK]) return -EINVAL; *compact = false; return 0; } if (!tb[ETHTOOL_A_BITSET_SIZE] || !tb[ETHTOOL_A_BITSET_VALUE]) return -EINVAL; *compact = true; return 0; } /** * ethnl_name_to_idx() - look up string index for a name * @names: array of ETH_GSTRING_LEN sized strings * @n_names: number of strings in the array * @name: name to look up * * Return: index of the string if found, -ENOENT if not found */ static int ethnl_name_to_idx(ethnl_string_array_t names, unsigned int n_names, const char *name) { unsigned int i; if (!names) return -ENOENT; for (i = 0; i < n_names; i++) { /* names[i] may not be null terminated */ if (!strncmp(names[i], name, ETH_GSTRING_LEN) && strlen(name) <= ETH_GSTRING_LEN) return i; } return -ENOENT; } static int ethnl_parse_bit(unsigned int *index, bool *val, unsigned int nbits, const struct nlattr *bit_attr, bool no_mask, ethnl_string_array_t names, struct netlink_ext_ack *extack) { struct nlattr *tb[ARRAY_SIZE(bit_policy)]; int ret, idx; ret = nla_parse_nested(tb, ARRAY_SIZE(bit_policy) - 1, bit_attr, bit_policy, extack); if (ret < 0) return ret; if (tb[ETHTOOL_A_BITSET_BIT_INDEX]) { const char *name; idx = nla_get_u32(tb[ETHTOOL_A_BITSET_BIT_INDEX]); if (idx >= nbits) { NL_SET_ERR_MSG_ATTR(extack, tb[ETHTOOL_A_BITSET_BIT_INDEX], "bit index too high"); return -EOPNOTSUPP; } name = names ? names[idx] : NULL; if (tb[ETHTOOL_A_BITSET_BIT_NAME] && name && strncmp(nla_data(tb[ETHTOOL_A_BITSET_BIT_NAME]), name, nla_len(tb[ETHTOOL_A_BITSET_BIT_NAME]))) { NL_SET_ERR_MSG_ATTR(extack, bit_attr, "bit index and name mismatch"); return -EINVAL; } } else if (tb[ETHTOOL_A_BITSET_BIT_NAME]) { idx = ethnl_name_to_idx(names, nbits, nla_data(tb[ETHTOOL_A_BITSET_BIT_NAME])); if (idx < 0) { NL_SET_ERR_MSG_ATTR(extack, tb[ETHTOOL_A_BITSET_BIT_NAME], "bit name not found"); return -EOPNOTSUPP; } } else { NL_SET_ERR_MSG_ATTR(extack, bit_attr, "neither bit index nor name specified"); return -EINVAL; } *index = idx; *val = no_mask || tb[ETHTOOL_A_BITSET_BIT_VALUE]; return 0; } /** * ethnl_bitmap32_equal() - Compare two bitmaps * @map1: first bitmap * @map2: second bitmap * @nbits: bit size to compare * * Return: true if first @nbits are equal, false if not */ static bool ethnl_bitmap32_equal(const u32 *map1, const u32 *map2, unsigned int nbits) { if (memcmp(map1, map2, nbits / 32 * sizeof(u32))) return false; if (nbits % 32 == 0) return true; return !((map1[nbits / 32] ^ map2[nbits / 32]) & ethnl_lower_bits(nbits % 32)); } static int ethnl_update_bitset32_verbose(u32 *bitmap, unsigned int nbits, const struct nlattr *attr, struct nlattr **tb, ethnl_string_array_t names, struct netlink_ext_ack *extack, bool *mod) { u32 *saved_bitmap = NULL; struct nlattr *bit_attr; bool no_mask; int rem; int ret; if (tb[ETHTOOL_A_BITSET_VALUE]) { NL_SET_ERR_MSG_ATTR(extack, tb[ETHTOOL_A_BITSET_VALUE], "value only allowed in compact bitset"); return -EINVAL; } if (tb[ETHTOOL_A_BITSET_MASK]) { NL_SET_ERR_MSG_ATTR(extack, tb[ETHTOOL_A_BITSET_MASK], "mask only allowed in compact bitset"); return -EINVAL; } no_mask = tb[ETHTOOL_A_BITSET_NOMASK]; if (no_mask) { unsigned int nwords = DIV_ROUND_UP(nbits, 32); unsigned int nbytes = nwords * sizeof(u32); bool dummy; /* The bitmap size is only the size of the map part without * its mask part. */ saved_bitmap = kcalloc(nwords, sizeof(u32), GFP_KERNEL); if (!saved_bitmap) return -ENOMEM; memcpy(saved_bitmap, bitmap, nbytes); ethnl_bitmap32_clear(bitmap, 0, nbits, &dummy); } nla_for_each_nested(bit_attr, tb[ETHTOOL_A_BITSET_BITS], rem) { bool old_val, new_val; unsigned int idx; if (nla_type(bit_attr) != ETHTOOL_A_BITSET_BITS_BIT) { NL_SET_ERR_MSG_ATTR(extack, bit_attr, "only ETHTOOL_A_BITSET_BITS_BIT allowed in ETHTOOL_A_BITSET_BITS"); kfree(saved_bitmap); return -EINVAL; } ret = ethnl_parse_bit(&idx, &new_val, nbits, bit_attr, no_mask, names, extack); if (ret < 0) { kfree(saved_bitmap); return ret; } old_val = bitmap[idx / 32] & ((u32)1 << (idx % 32)); if (new_val != old_val) { if (new_val) bitmap[idx / 32] |= ((u32)1 << (idx % 32)); else bitmap[idx / 32] &= ~((u32)1 << (idx % 32)); if (!no_mask) *mod = true; } } if (no_mask && !ethnl_bitmap32_equal(saved_bitmap, bitmap, nbits)) *mod = true; kfree(saved_bitmap); return 0; } static int ethnl_compact_sanity_checks(unsigned int nbits, const struct nlattr *nest, struct nlattr **tb, struct netlink_ext_ack *extack) { bool no_mask = tb[ETHTOOL_A_BITSET_NOMASK]; unsigned int attr_nbits, attr_nwords; const struct nlattr *test_attr; if (no_mask && tb[ETHTOOL_A_BITSET_MASK]) { NL_SET_ERR_MSG_ATTR(extack, tb[ETHTOOL_A_BITSET_MASK], "mask not allowed in list bitset"); return -EINVAL; } if (!tb[ETHTOOL_A_BITSET_SIZE]) { NL_SET_ERR_MSG_ATTR(extack, nest, "missing size in compact bitset"); return -EINVAL; } if (!tb[ETHTOOL_A_BITSET_VALUE]) { NL_SET_ERR_MSG_ATTR(extack, nest, "missing value in compact bitset"); return -EINVAL; } if (!no_mask && !tb[ETHTOOL_A_BITSET_MASK]) { NL_SET_ERR_MSG_ATTR(extack, nest, "missing mask in compact nonlist bitset"); return -EINVAL; } attr_nbits = nla_get_u32(tb[ETHTOOL_A_BITSET_SIZE]); attr_nwords = DIV_ROUND_UP(attr_nbits, 32); if (nla_len(tb[ETHTOOL_A_BITSET_VALUE]) != attr_nwords * sizeof(u32)) { NL_SET_ERR_MSG_ATTR(extack, tb[ETHTOOL_A_BITSET_VALUE], "bitset value length does not match size"); return -EINVAL; } if (tb[ETHTOOL_A_BITSET_MASK] && nla_len(tb[ETHTOOL_A_BITSET_MASK]) != attr_nwords * sizeof(u32)) { NL_SET_ERR_MSG_ATTR(extack, tb[ETHTOOL_A_BITSET_MASK], "bitset mask length does not match size"); return -EINVAL; } if (attr_nbits <= nbits) return 0; test_attr = no_mask ? tb[ETHTOOL_A_BITSET_VALUE] : tb[ETHTOOL_A_BITSET_MASK]; if (ethnl_bitmap32_not_zero(nla_data(test_attr), nbits, attr_nbits)) { NL_SET_ERR_MSG_ATTR(extack, test_attr, "cannot modify bits past kernel bitset size"); return -EINVAL; } return 0; } /** * ethnl_update_bitset32() - Apply a bitset nest to a u32 based bitmap * @bitmap: bitmap to update * @nbits: size of the updated bitmap in bits * @attr: nest attribute to parse and apply * @names: array of bit names; may be null for compact format * @extack: extack for error reporting * @mod: set this to true if bitmap is modified, leave as it is if not * * Apply bitset netsted attribute to a bitmap. If the attribute represents * a bit list, @bitmap is set to its contents; otherwise, bits in mask are * set to values from value. Bitmaps in the attribute may be longer than * @nbits but the message must not request modifying any bits past @nbits. * * Return: negative error code on failure, 0 on success */ int ethnl_update_bitset32(u32 *bitmap, unsigned int nbits, const struct nlattr *attr, ethnl_string_array_t names, struct netlink_ext_ack *extack, bool *mod) { struct nlattr *tb[ARRAY_SIZE(bitset_policy)]; unsigned int change_bits; bool no_mask; int ret; if (!attr) return 0; ret = nla_parse_nested(tb, ARRAY_SIZE(bitset_policy) - 1, attr, bitset_policy, extack); if (ret < 0) return ret; if (tb[ETHTOOL_A_BITSET_BITS]) return ethnl_update_bitset32_verbose(bitmap, nbits, attr, tb, names, extack, mod); ret = ethnl_compact_sanity_checks(nbits, attr, tb, extack); if (ret < 0) return ret; no_mask = tb[ETHTOOL_A_BITSET_NOMASK]; change_bits = min_t(unsigned int, nla_get_u32(tb[ETHTOOL_A_BITSET_SIZE]), nbits); ethnl_bitmap32_update(bitmap, change_bits, nla_data(tb[ETHTOOL_A_BITSET_VALUE]), no_mask ? NULL : nla_data(tb[ETHTOOL_A_BITSET_MASK]), mod); if (no_mask && change_bits < nbits) ethnl_bitmap32_clear(bitmap, change_bits, nbits, mod); return 0; } /** * ethnl_parse_bitset() - Compute effective value and mask from bitset nest * @val: unsigned long based bitmap to put value into * @mask: unsigned long based bitmap to put mask into * @nbits: size of @val and @mask bitmaps * @attr: nest attribute to parse and apply * @names: array of bit names; may be null for compact format * @extack: extack for error reporting * * Provide @nbits size long bitmaps for value and mask so that * x = (val & mask) | (x & ~mask) would modify any @nbits sized bitmap x * the same way ethnl_update_bitset() with the same bitset attribute would. * * Return: negative error code on failure, 0 on success */ int ethnl_parse_bitset(unsigned long *val, unsigned long *mask, unsigned int nbits, const struct nlattr *attr, ethnl_string_array_t names, struct netlink_ext_ack *extack) { struct nlattr *tb[ARRAY_SIZE(bitset_policy)]; const struct nlattr *bit_attr; bool no_mask; int rem; int ret; if (!attr) return 0; ret = nla_parse_nested(tb, ARRAY_SIZE(bitset_policy) - 1, attr, bitset_policy, extack); if (ret < 0) return ret; no_mask = tb[ETHTOOL_A_BITSET_NOMASK]; if (!tb[ETHTOOL_A_BITSET_BITS]) { unsigned int change_bits; ret = ethnl_compact_sanity_checks(nbits, attr, tb, extack); if (ret < 0) return ret; change_bits = nla_get_u32(tb[ETHTOOL_A_BITSET_SIZE]); if (change_bits > nbits) change_bits = nbits; bitmap_from_arr32(val, nla_data(tb[ETHTOOL_A_BITSET_VALUE]), change_bits); if (change_bits < nbits) bitmap_clear(val, change_bits, nbits - change_bits); if (no_mask) { bitmap_fill(mask, nbits); } else { bitmap_from_arr32(mask, nla_data(tb[ETHTOOL_A_BITSET_MASK]), change_bits); if (change_bits < nbits) bitmap_clear(mask, change_bits, nbits - change_bits); } return 0; } if (tb[ETHTOOL_A_BITSET_VALUE]) { NL_SET_ERR_MSG_ATTR(extack, tb[ETHTOOL_A_BITSET_VALUE], "value only allowed in compact bitset"); return -EINVAL; } if (tb[ETHTOOL_A_BITSET_MASK]) { NL_SET_ERR_MSG_ATTR(extack, tb[ETHTOOL_A_BITSET_MASK], "mask only allowed in compact bitset"); return -EINVAL; } bitmap_zero(val, nbits); if (no_mask) bitmap_fill(mask, nbits); else bitmap_zero(mask, nbits); nla_for_each_nested(bit_attr, tb[ETHTOOL_A_BITSET_BITS], rem) { unsigned int idx; bool bit_val; ret = ethnl_parse_bit(&idx, &bit_val, nbits, bit_attr, no_mask, names, extack); if (ret < 0) return ret; if (bit_val) __set_bit(idx, val); if (!no_mask) __set_bit(idx, mask); } return 0; } #if BITS_PER_LONG == 64 && defined(__BIG_ENDIAN) /* 64-bit big endian architectures are the only case when u32 based bitmaps * and unsigned long based bitmaps have different memory layout so that we * cannot simply cast the latter to the former and need actual wrappers * converting the latter to the former. * * To reduce the number of slab allocations, the wrappers use fixed size local * variables for bitmaps up to ETHNL_SMALL_BITMAP_BITS bits which is the * majority of bitmaps used by ethtool. */ #define ETHNL_SMALL_BITMAP_BITS 128 #define ETHNL_SMALL_BITMAP_WORDS DIV_ROUND_UP(ETHNL_SMALL_BITMAP_BITS, 32) int ethnl_bitset_size(const unsigned long *val, const unsigned long *mask, unsigned int nbits, ethnl_string_array_t names, bool compact) { u32 small_mask32[ETHNL_SMALL_BITMAP_WORDS]; u32 small_val32[ETHNL_SMALL_BITMAP_WORDS]; u32 *mask32; u32 *val32; int ret; if (nbits > ETHNL_SMALL_BITMAP_BITS) { unsigned int nwords = DIV_ROUND_UP(nbits, 32); val32 = kmalloc_array(2 * nwords, sizeof(u32), GFP_KERNEL); if (!val32) return -ENOMEM; mask32 = val32 + nwords; } else { val32 = small_val32; mask32 = small_mask32; } bitmap_to_arr32(val32, val, nbits); if (mask) bitmap_to_arr32(mask32, mask, nbits); else mask32 = NULL; ret = ethnl_bitset32_size(val32, mask32, nbits, names, compact); if (nbits > ETHNL_SMALL_BITMAP_BITS) kfree(val32); return ret; } int ethnl_put_bitset(struct sk_buff *skb, int attrtype, const unsigned long *val, const unsigned long *mask, unsigned int nbits, ethnl_string_array_t names, bool compact) { u32 small_mask32[ETHNL_SMALL_BITMAP_WORDS]; u32 small_val32[ETHNL_SMALL_BITMAP_WORDS]; u32 *mask32; u32 *val32; int ret; if (nbits > ETHNL_SMALL_BITMAP_BITS) { unsigned int nwords = DIV_ROUND_UP(nbits, 32); val32 = kmalloc_array(2 * nwords, sizeof(u32), GFP_KERNEL); if (!val32) return -ENOMEM; mask32 = val32 + nwords; } else { val32 = small_val32; mask32 = small_mask32; } bitmap_to_arr32(val32, val, nbits); if (mask) bitmap_to_arr32(mask32, mask, nbits); else mask32 = NULL; ret = ethnl_put_bitset32(skb, attrtype, val32, mask32, nbits, names, compact); if (nbits > ETHNL_SMALL_BITMAP_BITS) kfree(val32); return ret; } int ethnl_update_bitset(unsigned long *bitmap, unsigned int nbits, const struct nlattr *attr, ethnl_string_array_t names, struct netlink_ext_ack *extack, bool *mod) { u32 small_bitmap32[ETHNL_SMALL_BITMAP_WORDS]; u32 *bitmap32 = small_bitmap32; bool u32_mod = false; int ret; if (nbits > ETHNL_SMALL_BITMAP_BITS) { unsigned int dst_words = DIV_ROUND_UP(nbits, 32); bitmap32 = kmalloc_array(dst_words, sizeof(u32), GFP_KERNEL); if (!bitmap32) return -ENOMEM; } bitmap_to_arr32(bitmap32, bitmap, nbits); ret = ethnl_update_bitset32(bitmap32, nbits, attr, names, extack, &u32_mod); if (u32_mod) { bitmap_from_arr32(bitmap, bitmap32, nbits); *mod = true; } if (nbits > ETHNL_SMALL_BITMAP_BITS) kfree(bitmap32); return ret; } #else /* On little endian 64-bit and all 32-bit architectures, an unsigned long * based bitmap can be interpreted as u32 based one using a simple cast. */ int ethnl_bitset_size(const unsigned long *val, const unsigned long *mask, unsigned int nbits, ethnl_string_array_t names, bool compact) { return ethnl_bitset32_size((const u32 *)val, (const u32 *)mask, nbits, names, compact); } int ethnl_put_bitset(struct sk_buff *skb, int attrtype, const unsigned long *val, const unsigned long *mask, unsigned int nbits, ethnl_string_array_t names, bool compact) { return ethnl_put_bitset32(skb, attrtype, (const u32 *)val, (const u32 *)mask, nbits, names, compact); } int ethnl_update_bitset(unsigned long *bitmap, unsigned int nbits, const struct nlattr *attr, ethnl_string_array_t names, struct netlink_ext_ack *extack, bool *mod) { return ethnl_update_bitset32((u32 *)bitmap, nbits, attr, names, extack, mod); } #endif /* BITS_PER_LONG == 64 && defined(__BIG_ENDIAN) */
29 29 28 29 29 29 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 /* SPDX-License-Identifier: GPL-2.0-only */ #ifndef _LINUX_RCUREF_H #define _LINUX_RCUREF_H #include <linux/atomic.h> #include <linux/bug.h> #include <linux/limits.h> #include <linux/lockdep.h> #include <linux/preempt.h> #include <linux/rcupdate.h> #define RCUREF_ONEREF 0x00000000U #define RCUREF_MAXREF 0x7FFFFFFFU #define RCUREF_SATURATED 0xA0000000U #define RCUREF_RELEASED 0xC0000000U #define RCUREF_DEAD 0xE0000000U #define RCUREF_NOREF 0xFFFFFFFFU /** * rcuref_init - Initialize a rcuref reference count with the given reference count * @ref: Pointer to the reference count * @cnt: The initial reference count typically '1' */ static inline void rcuref_init(rcuref_t *ref, unsigned int cnt) { atomic_set(&ref->refcnt, cnt - 1); } /** * rcuref_read - Read the number of held reference counts of a rcuref * @ref: Pointer to the reference count * * Return: The number of held references (0 ... N). The value 0 does not * indicate that it is safe to schedule the object, protected by this reference * counter, for deconstruction. * If you want to know if the reference counter has been marked DEAD (as * signaled by rcuref_put()) please use rcuread_is_dead(). */ static inline unsigned int rcuref_read(rcuref_t *ref) { unsigned int c = atomic_read(&ref->refcnt); /* Return 0 if within the DEAD zone. */ return c >= RCUREF_RELEASED ? 0 : c + 1; } /** * rcuref_is_dead - Check if the rcuref has been already marked dead * @ref: Pointer to the reference count * * Return: True if the object has been marked DEAD. This signals that a previous * invocation of rcuref_put() returned true on this reference counter meaning * the protected object can safely be scheduled for deconstruction. * Otherwise, returns false. */ static inline bool rcuref_is_dead(rcuref_t *ref) { unsigned int c = atomic_read(&ref->refcnt); return (c >= RCUREF_RELEASED) && (c < RCUREF_NOREF); } extern __must_check bool rcuref_get_slowpath(rcuref_t *ref); /** * rcuref_get - Acquire one reference on a rcuref reference count * @ref: Pointer to the reference count * * Similar to atomic_inc_not_zero() but saturates at RCUREF_MAXREF. * * Provides no memory ordering, it is assumed the caller has guaranteed the * object memory to be stable (RCU, etc.). It does provide a control dependency * and thereby orders future stores. See documentation in lib/rcuref.c * * Return: * False if the attempt to acquire a reference failed. This happens * when the last reference has been put already * * True if a reference was successfully acquired */ static inline __must_check bool rcuref_get(rcuref_t *ref) { /* * Unconditionally increase the reference count. The saturation and * dead zones provide enough tolerance for this. */ if (likely(!atomic_add_negative_relaxed(1, &ref->refcnt))) return true; /* Handle the cases inside the saturation and dead zones */ return rcuref_get_slowpath(ref); } extern __must_check bool rcuref_put_slowpath(rcuref_t *ref, unsigned int cnt); /* * Internal helper. Do not invoke directly. */ static __always_inline __must_check bool __rcuref_put(rcuref_t *ref) { int cnt; RCU_LOCKDEP_WARN(!rcu_read_lock_held() && preemptible(), "suspicious rcuref_put_rcusafe() usage"); /* * Unconditionally decrease the reference count. The saturation and * dead zones provide enough tolerance for this. */ cnt = atomic_sub_return_release(1, &ref->refcnt); if (likely(cnt >= 0)) return false; /* * Handle the last reference drop and cases inside the saturation * and dead zones. */ return rcuref_put_slowpath(ref, cnt); } /** * rcuref_put_rcusafe -- Release one reference for a rcuref reference count RCU safe * @ref: Pointer to the reference count * * Provides release memory ordering, such that prior loads and stores are done * before, and provides an acquire ordering on success such that free() * must come after. * * Can be invoked from contexts, which guarantee that no grace period can * happen which would free the object concurrently if the decrement drops * the last reference and the slowpath races against a concurrent get() and * put() pair. rcu_read_lock()'ed and atomic contexts qualify. * * Return: * True if this was the last reference with no future references * possible. This signals the caller that it can safely release the * object which is protected by the reference counter. * * False if there are still active references or the put() raced * with a concurrent get()/put() pair. Caller is not allowed to * release the protected object. */ static inline __must_check bool rcuref_put_rcusafe(rcuref_t *ref) { return __rcuref_put(ref); } /** * rcuref_put -- Release one reference for a rcuref reference count * @ref: Pointer to the reference count * * Can be invoked from any context. * * Provides release memory ordering, such that prior loads and stores are done * before, and provides an acquire ordering on success such that free() * must come after. * * Return: * * True if this was the last reference with no future references * possible. This signals the caller that it can safely schedule the * object, which is protected by the reference counter, for * deconstruction. * * False if there are still active references or the put() raced * with a concurrent get()/put() pair. Caller is not allowed to * deconstruct the protected object. */ static inline __must_check bool rcuref_put(rcuref_t *ref) { bool released; preempt_disable(); released = __rcuref_put(ref); preempt_enable(); return released; } #endif
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 /* SPDX-License-Identifier: GPL-2.0 */ #ifndef __NET_GENERIC_NETLINK_H #define __NET_GENERIC_NETLINK_H #include <linux/net.h> #include <net/netlink.h> #include <net/net_namespace.h> #include <uapi/linux/genetlink.h> #define GENLMSG_DEFAULT_SIZE (NLMSG_DEFAULT_SIZE - GENL_HDRLEN) /* Non-parallel generic netlink requests are serialized by a global lock. */ void genl_lock(void); void genl_unlock(void); #define MODULE_ALIAS_GENL_FAMILY(family) \ MODULE_ALIAS_NET_PF_PROTO_NAME(PF_NETLINK, NETLINK_GENERIC, "-family-" family) /* Binding to multicast group requires %CAP_NET_ADMIN */ #define GENL_MCAST_CAP_NET_ADMIN BIT(0) /* Binding to multicast group requires %CAP_SYS_ADMIN */ #define GENL_MCAST_CAP_SYS_ADMIN BIT(1) /** * struct genl_multicast_group - generic netlink multicast group * @name: name of the multicast group, names are per-family * @flags: GENL_MCAST_* flags */ struct genl_multicast_group { char name[GENL_NAMSIZ]; u8 flags; }; struct genl_split_ops; struct genl_info; /** * struct genl_family - generic netlink family * @hdrsize: length of user specific header in bytes * @name: name of family * @version: protocol version * @maxattr: maximum number of attributes supported * @policy: netlink policy * @netnsok: set to true if the family can handle network * namespaces and should be presented in all of them * @parallel_ops: operations can be called in parallel and aren't * synchronized by the core genetlink code * @pre_doit: called before an operation's doit callback, it may * do additional, common, filtering and return an error * @post_doit: called after an operation's doit callback, it may * undo operations done by pre_doit, for example release locks * @bind: called when family multicast group is added to a netlink socket * @unbind: called when family multicast group is removed from a netlink socket * @module: pointer to the owning module (set to THIS_MODULE) * @mcgrps: multicast groups used by this family * @n_mcgrps: number of multicast groups * @resv_start_op: first operation for which reserved fields of the header * can be validated and policies are required (see below); * new families should leave this field at zero * @ops: the operations supported by this family * @n_ops: number of operations supported by this family * @small_ops: the small-struct operations supported by this family * @n_small_ops: number of small-struct operations supported by this family * @split_ops: the split do/dump form of operation definition * @n_split_ops: number of entries in @split_ops, not that with split do/dump * ops the number of entries is not the same as number of commands * @sock_priv_size: the size of per-socket private memory * @sock_priv_init: the per-socket private memory initializer * @sock_priv_destroy: the per-socket private memory destructor * * Attribute policies (the combination of @policy and @maxattr fields) * can be attached at the family level or at the operation level. * If both are present the per-operation policy takes precedence. * For operations before @resv_start_op lack of policy means that the core * will perform no attribute parsing or validation. For newer operations * if policy is not provided core will reject all TLV attributes. */ struct genl_family { unsigned int hdrsize; char name[GENL_NAMSIZ]; unsigned int version; unsigned int maxattr; u8 netnsok:1; u8 parallel_ops:1; u8 n_ops; u8 n_small_ops; u8 n_split_ops; u8 n_mcgrps; u8 resv_start_op; const struct nla_policy *policy; int (*pre_doit)(const struct genl_split_ops *ops, struct sk_buff *skb, struct genl_info *info); void (*post_doit)(const struct genl_split_ops *ops, struct sk_buff *skb, struct genl_info *info); int (*bind)(int mcgrp); void (*unbind)(int mcgrp); const struct genl_ops * ops; const struct genl_small_ops *small_ops; const struct genl_split_ops *split_ops; const struct genl_multicast_group *mcgrps; struct module *module; size_t sock_priv_size; void (*sock_priv_init)(void *priv); void (*sock_priv_destroy)(void *priv); /* private: internal use only */ /* protocol family identifier */ int id; /* starting number of multicast group IDs in this family */ unsigned int mcgrp_offset; /* list of per-socket privs */ struct xarray *sock_privs; }; /** * struct genl_info - receiving information * @snd_seq: sending sequence number * @snd_portid: netlink portid of sender * @family: generic netlink family * @nlhdr: netlink message header * @genlhdr: generic netlink message header * @attrs: netlink attributes * @_net: network namespace * @ctx: storage space for the use by the family * @user_ptr: user pointers (deprecated, use ctx instead) * @extack: extended ACK report struct */ struct genl_info { u32 snd_seq; u32 snd_portid; const struct genl_family *family; const struct nlmsghdr * nlhdr; struct genlmsghdr * genlhdr; struct nlattr ** attrs; possible_net_t _net; union { u8 ctx[NETLINK_CTX_SIZE]; void * user_ptr[2]; }; struct netlink_ext_ack *extack; }; static inline struct net *genl_info_net(const struct genl_info *info) { return read_pnet(&info->_net); } static inline void genl_info_net_set(struct genl_info *info, struct net *net) { write_pnet(&info->_net, net); } static inline void *genl_info_userhdr(const struct genl_info *info) { return (u8 *)info->genlhdr + GENL_HDRLEN; } #define GENL_SET_ERR_MSG(info, msg) NL_SET_ERR_MSG((info)->extack, msg) #define GENL_SET_ERR_MSG_FMT(info, msg, args...) \ NL_SET_ERR_MSG_FMT((info)->extack, msg, ##args) /* Report that a root attribute is missing */ #define GENL_REQ_ATTR_CHECK(info, attr) ({ \ const struct genl_info *__info = (info); \ \ NL_REQ_ATTR_CHECK(__info->extack, NULL, __info->attrs, (attr)); \ }) enum genl_validate_flags { GENL_DONT_VALIDATE_STRICT = BIT(0), GENL_DONT_VALIDATE_DUMP = BIT(1), GENL_DONT_VALIDATE_DUMP_STRICT = BIT(2), }; /** * struct genl_small_ops - generic netlink operations (small version) * @cmd: command identifier * @internal_flags: flags used by the family * @flags: GENL_* flags (%GENL_ADMIN_PERM or %GENL_UNS_ADMIN_PERM) * @validate: validation flags from enum genl_validate_flags * @doit: standard command callback * @dumpit: callback for dumpers * * This is a cut-down version of struct genl_ops for users who don't need * most of the ancillary infra and want to save space. */ struct genl_small_ops { int (*doit)(struct sk_buff *skb, struct genl_info *info); int (*dumpit)(struct sk_buff *skb, struct netlink_callback *cb); u8 cmd; u8 internal_flags; u8 flags; u8 validate; }; /** * struct genl_ops - generic netlink operations * @cmd: command identifier * @internal_flags: flags used by the family * @flags: GENL_* flags (%GENL_ADMIN_PERM or %GENL_UNS_ADMIN_PERM) * @maxattr: maximum number of attributes supported * @policy: netlink policy (takes precedence over family policy) * @validate: validation flags from enum genl_validate_flags * @doit: standard command callback * @start: start callback for dumps * @dumpit: callback for dumpers * @done: completion callback for dumps */ struct genl_ops { int (*doit)(struct sk_buff *skb, struct genl_info *info); int (*start)(struct netlink_callback *cb); int (*dumpit)(struct sk_buff *skb, struct netlink_callback *cb); int (*done)(struct netlink_callback *cb); const struct nla_policy *policy; unsigned int maxattr; u8 cmd; u8 internal_flags; u8 flags; u8 validate; }; /** * struct genl_split_ops - generic netlink operations (do/dump split version) * @cmd: command identifier * @internal_flags: flags used by the family * @flags: GENL_* flags (%GENL_ADMIN_PERM or %GENL_UNS_ADMIN_PERM) * @validate: validation flags from enum genl_validate_flags * @policy: netlink policy (takes precedence over family policy) * @maxattr: maximum number of attributes supported * * Do callbacks: * @pre_doit: called before an operation's @doit callback, it may * do additional, common, filtering and return an error * @doit: standard command callback * @post_doit: called after an operation's @doit callback, it may * undo operations done by pre_doit, for example release locks * * Dump callbacks: * @start: start callback for dumps * @dumpit: callback for dumpers * @done: completion callback for dumps * * Do callbacks can be used if %GENL_CMD_CAP_DO is set in @flags. * Dump callbacks can be used if %GENL_CMD_CAP_DUMP is set in @flags. * Exactly one of those flags must be set. */ struct genl_split_ops { union { struct { int (*pre_doit)(const struct genl_split_ops *ops, struct sk_buff *skb, struct genl_info *info); int (*doit)(struct sk_buff *skb, struct genl_info *info); void (*post_doit)(const struct genl_split_ops *ops, struct sk_buff *skb, struct genl_info *info); }; struct { int (*start)(struct netlink_callback *cb); int (*dumpit)(struct sk_buff *skb, struct netlink_callback *cb); int (*done)(struct netlink_callback *cb); }; }; const struct nla_policy *policy; unsigned int maxattr; u8 cmd; u8 internal_flags; u8 flags; u8 validate; }; /** * struct genl_dumpit_info - info that is available during dumpit op call * @op: generic netlink ops - for internal genl code usage * @attrs: netlink attributes * @info: struct genl_info describing the request */ struct genl_dumpit_info { struct genl_split_ops op; struct genl_info info; }; static inline const struct genl_dumpit_info * genl_dumpit_info(struct netlink_callback *cb) { return cb->data; } static inline const struct genl_info * genl_info_dump(struct netlink_callback *cb) { return &genl_dumpit_info(cb)->info; } /** * genl_info_init_ntf() - initialize genl_info for notifications * @info: genl_info struct to set up * @family: pointer to the genetlink family * @cmd: command to be used in the notification * * Initialize a locally declared struct genl_info to pass to various APIs. * Intended to be used when creating notifications. */ static inline void genl_info_init_ntf(struct genl_info *info, const struct genl_family *family, u8 cmd) { struct genlmsghdr *hdr = (void *) &info->user_ptr[0]; memset(info, 0, sizeof(*info)); info->family = family; info->genlhdr = hdr; hdr->cmd = cmd; } static inline bool genl_info_is_ntf(const struct genl_info *info) { return !info->nlhdr; } void *__genl_sk_priv_get(struct genl_family *family, struct sock *sk); void *genl_sk_priv_get(struct genl_family *family, struct sock *sk); int genl_register_family(struct genl_family *family); int genl_unregister_family(const struct genl_family *family); void genl_notify(const struct genl_family *family, struct sk_buff *skb, struct genl_info *info, u32 group, gfp_t flags); void *genlmsg_put(struct sk_buff *skb, u32 portid, u32 seq, const struct genl_family *family, int flags, u8 cmd); static inline void * __genlmsg_iput(struct sk_buff *skb, const struct genl_info *info, int flags) { return genlmsg_put(skb, info->snd_portid, info->snd_seq, info->family, flags, info->genlhdr->cmd); } /** * genlmsg_iput - start genetlink message based on genl_info * @skb: skb in which message header will be placed * @info: genl_info as provided to do/dump handlers * * Convenience wrapper which starts a genetlink message based on * information in user request. @info should be either the struct passed * by genetlink core to do/dump handlers (when constructing replies to * such requests) or a struct initialized by genl_info_init_ntf() * when constructing notifications. * * Returns: pointer to new genetlink header. */ static inline void * genlmsg_iput(struct sk_buff *skb, const struct genl_info *info) { return __genlmsg_iput(skb, info, 0); } /** * genlmsg_nlhdr - Obtain netlink header from user specified header * @user_hdr: user header as returned from genlmsg_put() * * Returns: pointer to netlink header. */ static inline struct nlmsghdr *genlmsg_nlhdr(void *user_hdr) { return (struct nlmsghdr *)((char *)user_hdr - GENL_HDRLEN - NLMSG_HDRLEN); } /** * genlmsg_parse_deprecated - parse attributes of a genetlink message * @nlh: netlink message header * @family: genetlink message family * @tb: destination array with maxtype+1 elements * @maxtype: maximum attribute type to be expected * @policy: validation policy * @extack: extended ACK report struct */ static inline int genlmsg_parse_deprecated(const struct nlmsghdr *nlh, const struct genl_family *family, struct nlattr *tb[], int maxtype, const struct nla_policy *policy, struct netlink_ext_ack *extack) { return __nlmsg_parse(nlh, family->hdrsize + GENL_HDRLEN, tb, maxtype, policy, NL_VALIDATE_LIBERAL, extack); } /** * genlmsg_parse - parse attributes of a genetlink message * @nlh: netlink message header * @family: genetlink message family * @tb: destination array with maxtype+1 elements * @maxtype: maximum attribute type to be expected * @policy: validation policy * @extack: extended ACK report struct */ static inline int genlmsg_parse(const struct nlmsghdr *nlh, const struct genl_family *family, struct nlattr *tb[], int maxtype, const struct nla_policy *policy, struct netlink_ext_ack *extack) { return __nlmsg_parse(nlh, family->hdrsize + GENL_HDRLEN, tb, maxtype, policy, NL_VALIDATE_STRICT, extack); } /** * genl_dump_check_consistent - check if sequence is consistent and advertise if not * @cb: netlink callback structure that stores the sequence number * @user_hdr: user header as returned from genlmsg_put() * * Cf. nl_dump_check_consistent(), this just provides a wrapper to make it * simpler to use with generic netlink. */ static inline void genl_dump_check_consistent(struct netlink_callback *cb, void *user_hdr) { nl_dump_check_consistent(cb, genlmsg_nlhdr(user_hdr)); } /** * genlmsg_put_reply - Add generic netlink header to a reply message * @skb: socket buffer holding the message * @info: receiver info * @family: generic netlink family * @flags: netlink message flags * @cmd: generic netlink command * * Returns: pointer to user specific header */ static inline void *genlmsg_put_reply(struct sk_buff *skb, struct genl_info *info, const struct genl_family *family, int flags, u8 cmd) { return genlmsg_put(skb, info->snd_portid, info->snd_seq, family, flags, cmd); } /** * genlmsg_end - Finalize a generic netlink message * @skb: socket buffer the message is stored in * @hdr: user specific header */ static inline void genlmsg_end(struct sk_buff *skb, void *hdr) { nlmsg_end(skb, hdr - GENL_HDRLEN - NLMSG_HDRLEN); } /** * genlmsg_cancel - Cancel construction of a generic netlink message * @skb: socket buffer the message is stored in * @hdr: generic netlink message header */ static inline void genlmsg_cancel(struct sk_buff *skb, void *hdr) { if (hdr) nlmsg_cancel(skb, hdr - GENL_HDRLEN - NLMSG_HDRLEN); } /** * genlmsg_multicast_netns_filtered - multicast a netlink message * to a specific netns with filter * function * @family: the generic netlink family * @net: the net namespace * @skb: netlink message as socket buffer * @portid: own netlink portid to avoid sending to yourself * @group: offset of multicast group in groups array * @flags: allocation flags * @filter: filter function * @filter_data: filter function private data * * Return: 0 on success, negative error code for failure. */ static inline int genlmsg_multicast_netns_filtered(const struct genl_family *family, struct net *net, struct sk_buff *skb, u32 portid, unsigned int group, gfp_t flags, netlink_filter_fn filter, void *filter_data) { if (WARN_ON_ONCE(group >= family->n_mcgrps)) return -EINVAL; group = family->mcgrp_offset + group; return nlmsg_multicast_filtered(net->genl_sock, skb, portid, group, flags, filter, filter_data); } /** * genlmsg_multicast_netns - multicast a netlink message to a specific netns * @family: the generic netlink family * @net: the net namespace * @skb: netlink message as socket buffer * @portid: own netlink portid to avoid sending to yourself * @group: offset of multicast group in groups array * @flags: allocation flags */ static inline int genlmsg_multicast_netns(const struct genl_family *family, struct net *net, struct sk_buff *skb, u32 portid, unsigned int group, gfp_t flags) { return genlmsg_multicast_netns_filtered(family, net, skb, portid, group, flags, NULL, NULL); } /** * genlmsg_multicast - multicast a netlink message to the default netns * @family: the generic netlink family * @skb: netlink message as socket buffer * @portid: own netlink portid to avoid sending to yourself * @group: offset of multicast group in groups array * @flags: allocation flags */ static inline int genlmsg_multicast(const struct genl_family *family, struct sk_buff *skb, u32 portid, unsigned int group, gfp_t flags) { return genlmsg_multicast_netns(family, &init_net, skb, portid, group, flags); } /** * genlmsg_multicast_allns - multicast a netlink message to all net namespaces * @family: the generic netlink family * @skb: netlink message as socket buffer * @portid: own netlink portid to avoid sending to yourself * @group: offset of multicast group in groups array * * This function must hold the RTNL or rcu_read_lock(). */ int genlmsg_multicast_allns(const struct genl_family *family, struct sk_buff *skb, u32 portid, unsigned int group); /** * genlmsg_unicast - unicast a netlink message * @net: network namespace to look up @portid in * @skb: netlink message as socket buffer * @portid: netlink portid of the destination socket */ static inline int genlmsg_unicast(struct net *net, struct sk_buff *skb, u32 portid) { return nlmsg_unicast(net->genl_sock, skb, portid); } /** * genlmsg_reply - reply to a request * @skb: netlink message to be sent back * @info: receiver information */ static inline int genlmsg_reply(struct sk_buff *skb, struct genl_info *info) { return genlmsg_unicast(genl_info_net(info), skb, info->snd_portid); } /** * genlmsg_data - head of message payload * @gnlh: genetlink message header */ static inline void *genlmsg_data(const struct genlmsghdr *gnlh) { return ((unsigned char *) gnlh + GENL_HDRLEN); } /** * genlmsg_len - length of message payload * @gnlh: genetlink message header */ static inline int genlmsg_len(const struct genlmsghdr *gnlh) { struct nlmsghdr *nlh = (struct nlmsghdr *)((unsigned char *)gnlh - NLMSG_HDRLEN); return (nlh->nlmsg_len - GENL_HDRLEN - NLMSG_HDRLEN); } /** * genlmsg_msg_size - length of genetlink message not including padding * @payload: length of message payload */ static inline int genlmsg_msg_size(int payload) { return GENL_HDRLEN + payload; } /** * genlmsg_total_size - length of genetlink message including padding * @payload: length of message payload */ static inline int genlmsg_total_size(int payload) { return NLMSG_ALIGN(genlmsg_msg_size(payload)); } /** * genlmsg_new - Allocate a new generic netlink message * @payload: size of the message payload * @flags: the type of memory to allocate. */ static inline struct sk_buff *genlmsg_new(size_t payload, gfp_t flags) { return nlmsg_new(genlmsg_total_size(payload), flags); } /** * genl_set_err - report error to genetlink broadcast listeners * @family: the generic netlink family * @net: the network namespace to report the error to * @portid: the PORTID of a process that we want to skip (if any) * @group: the broadcast group that will notice the error * (this is the offset of the multicast group in the groups array) * @code: error code, must be negative (as usual in kernelspace) * * This function returns the number of broadcast listeners that have set the * NETLINK_RECV_NO_ENOBUFS socket option. */ static inline int genl_set_err(const struct genl_family *family, struct net *net, u32 portid, u32 group, int code) { if (WARN_ON_ONCE(group >= family->n_mcgrps)) return -EINVAL; group = family->mcgrp_offset + group; return netlink_set_err(net->genl_sock, portid, group, code); } static inline int genl_has_listeners(const struct genl_family *family, struct net *net, unsigned int group) { if (WARN_ON_ONCE(group >= family->n_mcgrps)) return -EINVAL; group = family->mcgrp_offset + group; return netlink_has_listeners(net->genl_sock, group); } #endif /* __NET_GENERIC_NETLINK_H */
13 1 24 18 14 54 54 6 47 30 1 31 143 104 36 16 16 16 56 56 36 75 61 31 28 35 35 54 54 54 54 54 54 54 1 54 54 1 37 37 37 37 37 31 31 31 37 37 37 37 1 2 2 1 1 1 1 1 1 1 1 2 2 2 2 2 1 1 36 36 36 35 36 36 36 71 54 20 6 28 29 29 54 53 54 54 54 53 1 54 1 55 55 55 53 54 5 1 4 5 54 54 54 1 54 48 6 48 48 48 48 47 49 49 49 1 48 60 60 60 60 54 1 1 19 5 5 5 72 56 16 71 72 71 71 72 72 6 6 6 6 5 5 4 6 6 6 6 6 7 1 6 14 14 14 14 14 2 2 2 14 12 10 10 5 16 16 16 7 72 36 36 36 56 56 56 56 56 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 1517 1518 1519 1520 1521 1522 1523 1524 1525 1526 1527 1528 1529 1530 1531 1532 1533 1534 1535 1536 1537 1538 1539 1540 1541 1542 1543 1544 1545 1546 1547 1548 1549 1550 1551 1552 1553 1554 1555 1556 1557 1558 1559 1560 1561 1562 1563 1564 1565 1566 1567 1568 1569 1570 1571 1572 1573 1574 1575 1576 1577 1578 1579 1580 1581 1582 1583 1584 1585 1586 1587 1588 1589 1590 1591 1592 1593 1594 1595 1596 1597 1598 1599 1600 1601 1602 1603 1604 1605 1606 1607 1608 1609 1610 1611 1612 1613 1614 1615 1616 1617 1618 1619 1620 1621 1622 1623 1624 1625 1626 1627 1628 1629 1630 1631 1632 1633 1634 1635 1636 1637 1638 1639 1640 1641 1642 1643 1644 1645 1646 1647 1648 1649 1650 1651 1652 1653 1654 1655 1656 1657 1658 1659 1660 1661 1662 1663 1664 1665 1666 1667 1668 1669 1670 1671 1672 1673 1674 1675 1676 1677 1678 1679 1680 1681 1682 1683 1684 1685 1686 1687 1688 1689 1690 1691 1692 1693 1694 1695 1696 1697 1698 1699 1700 1701 1702 1703 1704 1705 1706 1707 1708 1709 1710 1711 1712 1713 1714 1715 1716 1717 1718 1719 1720 1721 1722 1723 1724 1725 1726 1727 1728 1729 1730 1731 1732 1733 1734 1735 1736 1737 1738 1739 1740 1741 1742 1743 1744 1745 1746 1747 1748 1749 1750 1751 1752 1753 1754 1755 1756 1757 1758 1759 1760 1761 1762 1763 1764 1765 1766 1767 1768 1769 1770 1771 1772 1773 1774 1775 1776 1777 1778 1779 1780 1781 1782 1783 1784 1785 1786 1787 1788 1789 1790 1791 1792 1793 1794 1795 1796 1797 1798 1799 1800 1801 1802 1803 1804 1805 1806 1807 1808 1809 1810 1811 1812 1813 1814 1815 1816 1817 1818 1819 1820 1821 1822 1823 1824 1825 1826 1827 1828 1829 1830 1831 1832 1833 1834 1835 1836 1837 1838 1839 1840 1841 1842 1843 1844 1845 1846 1847 1848 1849 1850 1851 1852 1853 1854 1855 1856 1857 1858 1859 1860 1861 1862 1863 1864 1865 1866 1867 1868 1869 1870 1871 1872 1873 1874 1875 1876 1877 1878 1879 1880 1881 1882 1883 1884 1885 1886 1887 1888 1889 1890 1891 1892 1893 1894 1895 1896 1897 1898 1899 1900 1901 1902 1903 1904 1905 1906 1907 1908 1909 1910 1911 1912 1913 1914 1915 1916 1917 1918 1919 1920 1921 1922 1923 1924 1925 1926 1927 1928 1929 1930 1931 1932 1933 1934 1935 1936 1937 1938 1939 1940 1941 1942 1943 1944 1945 1946 1947 1948 1949 1950 1951 1952 1953 1954 1955 1956 1957 1958 1959 1960 1961 1962 1963 1964 1965 1966 1967 1968 1969 1970 1971 1972 1973 1974 1975 1976 1977 1978 1979 1980 1981 1982 1983 1984 1985 1986 1987 1988 1989 1990 1991 1992 1993 1994 1995 1996 1997 1998 1999 2000 2001 2002 2003 2004 2005 2006 2007 2008 2009 2010 2011 2012 2013 2014 2015 2016 2017 2018 2019 2020 2021 2022 2023 2024 2025 2026 2027 2028 2029 2030 2031 2032 2033 2034 2035 2036 2037 2038 2039 2040 2041 2042 2043 2044 2045 2046 2047 2048 2049 2050 2051 2052 2053 2054 2055 2056 2057 2058 2059 2060 2061 2062 2063 2064 2065 2066 2067 2068 2069 2070 2071 2072 2073 2074 2075 2076 2077 2078 2079 2080 2081 2082 2083 2084 2085 2086 2087 2088 2089 2090 2091 2092 2093 2094 2095 2096 2097 2098 2099 2100 2101 2102 2103 2104 2105 2106 2107 2108 2109 2110 2111 2112 2113 2114 2115 2116 2117 2118 2119 2120 2121 2122 2123 2124 2125 2126 2127 2128 2129 2130 2131 2132 2133 2134 2135 2136 2137 2138 2139 2140 2141 2142 2143 2144 2145 2146 2147 2148 2149 2150 2151 2152 2153 2154 2155 2156 2157 2158 2159 2160 2161 2162 2163 2164 2165 2166 2167 2168 2169 2170 2171 2172 2173 2174 2175 2176 2177 2178 2179 2180 2181 2182 2183 2184 2185 2186 2187 2188 2189 2190 2191 2192 2193 2194 2195 2196 2197 2198 2199 2200 2201 2202 2203 2204 2205 2206 2207 2208 2209 2210 2211 2212 2213 2214 2215 2216 2217 2218 2219 2220 2221 2222 2223 2224 2225 2226 2227 2228 2229 2230 2231 2232 2233 2234 2235 2236 2237 2238 2239 2240 2241 2242 2243 2244 2245 2246 2247 2248 2249 2250 2251 2252 2253 2254 2255 2256 2257 2258 2259 2260 2261 2262 2263 2264 2265 2266 2267 2268 2269 2270 2271 2272 2273 2274 2275 2276 2277 2278 2279 2280 2281 2282 2283 2284 2285 2286 2287 2288 2289 2290 2291 2292 2293 2294 2295 2296 2297 2298 2299 2300 2301 2302 2303 2304 2305 2306 2307 2308 2309 2310 2311 2312 2313 2314 2315 2316 2317 2318 2319 2320 2321 2322 2323 2324 2325 2326 2327 2328 2329 2330 2331 2332 2333 2334 2335 2336 2337 2338 2339 2340 2341 2342 2343 2344 2345 2346 2347 2348 2349 2350 2351 2352 2353 2354 2355 2356 2357 2358 2359 2360 2361 2362 2363 2364 2365 2366 2367 2368 2369 2370 2371 2372 2373 2374 2375 2376 2377 2378 2379 2380 2381 2382 2383 2384 2385 2386 2387 2388 2389 2390 2391 2392 2393 2394 2395 2396 2397 2398 2399 2400 2401 2402 2403 2404 2405 2406 2407 2408 2409 2410 2411 2412 2413 2414 2415 2416 2417 2418 2419 2420 2421 2422 2423 2424 2425 2426 2427 2428 2429 2430 2431 2432 2433 2434 2435 2436 2437 2438 2439 2440 2441 2442 2443 2444 2445 2446 2447 2448 2449 2450 2451 2452 2453 2454 2455 2456 2457 2458 2459 2460 2461 2462 2463 2464 2465 2466 2467 2468 2469 2470 2471 2472 2473 2474 2475 2476 2477 2478 2479 2480 2481 2482 2483 2484 2485 2486 2487 2488 2489 2490 2491 2492 2493 2494 2495 2496 2497 2498 2499 2500 2501 2502 2503 2504 2505 2506 2507 2508 2509 2510 2511 2512 2513 2514 2515 2516 2517 2518 2519 2520 2521 2522 2523 2524 2525 2526 2527 2528 2529 2530 2531 2532 2533 2534 2535 2536 2537 2538 2539 2540 2541 2542 2543 2544 2545 2546 2547 2548 2549 2550 2551 2552 2553 2554 2555 2556 2557 2558 2559 2560 2561 2562 2563 2564 2565 2566 2567 2568 2569 2570 2571 2572 2573 2574 2575 2576 2577 2578 2579 2580 2581 2582 2583 2584 2585 2586 2587 2588 2589 2590 2591 2592 2593 2594 2595 2596 2597 2598 2599 2600 2601 2602 2603 2604 2605 2606 2607 2608 2609 2610 2611 2612 2613 2614 2615 2616 2617 2618 2619 2620 2621 2622 2623 2624 2625 2626 2627 2628 2629 2630 2631 2632 2633 2634 2635 2636 2637 2638 2639 2640 2641 2642 2643 2644 2645 2646 2647 2648 2649 2650 2651 2652 2653 2654 2655 2656 2657 2658 2659 2660 2661 2662 2663 2664 2665 2666 2667 2668 2669 2670 2671 2672 2673 2674 2675 2676 2677 2678 2679 2680 2681 2682 2683 2684 2685 2686 2687 2688 2689 2690 2691 2692 2693 2694 2695 2696 2697 2698 2699 2700 2701 2702 2703 2704 2705 2706 2707 2708 2709 2710 2711 2712 2713 2714 2715 2716 2717 2718 2719 2720 2721 2722 2723 2724 2725 2726 2727 2728 2729 2730 2731 2732 2733 2734 2735 2736 2737 2738 2739 2740 2741 2742 2743 2744 2745 2746 2747 2748 2749 2750 2751 2752 2753 2754 2755 2756 2757 2758 2759 2760 2761 2762 2763 2764 2765 2766 2767 2768 2769 2770 2771 2772 2773 2774 2775 2776 2777 2778 2779 2780 2781 2782 2783 2784 2785 2786 2787 2788 2789 2790 2791 2792 2793 2794 2795 2796 2797 2798 2799 2800 2801 2802 2803 2804 2805 2806 2807 2808 2809 2810 2811 2812 2813 2814 2815 2816 2817 2818 2819 2820 2821 2822 2823 2824 2825 2826 2827 2828 2829 2830 2831 2832 2833 2834 2835 2836 2837 2838 2839 2840 2841 2842 2843 2844 2845 2846 2847 2848 2849 2850 2851 2852 2853 2854 2855 2856 2857 2858 2859 2860 2861 2862 2863 2864 2865 2866 2867 2868 2869 2870 2871 2872 2873 2874 2875 2876 2877 2878 2879 2880 2881 2882 2883 2884 2885 2886 2887 2888 2889 2890 2891 2892 2893 2894 2895 2896 2897 2898 2899 2900 2901 2902 2903 2904 2905 2906 2907 2908 2909 2910 2911 2912 2913 2914 2915 2916 2917 2918 2919 2920 2921 2922 2923 2924 2925 2926 2927 2928 2929 2930 2931 2932 2933 2934 2935 2936 2937 2938 2939 2940 2941 2942 2943 2944 2945 2946 2947 2948 2949 2950 2951 2952 2953 2954 2955 2956 2957 2958 2959 2960 2961 2962 2963 2964 2965 2966 2967 2968 2969 2970 2971 2972 2973 2974 2975 2976 2977 2978 2979 2980 2981 2982 2983 2984 2985 2986 2987 2988 2989 2990 2991 2992 2993 2994 2995 2996 2997 2998 2999 3000 3001 3002 3003 3004 3005 3006 3007 3008 3009 3010 3011 3012 3013 3014 3015 3016 3017 3018 3019 3020 3021 3022 3023 3024 3025 3026 3027 3028 3029 3030 3031 3032 3033 3034 3035 3036 3037 3038 3039 3040 3041 3042 3043 3044 3045 3046 3047 3048 3049 3050 3051 3052 3053 3054 3055 3056 3057 3058 3059 3060 3061 3062 3063 3064 3065 3066 3067 3068 3069 3070 3071 3072 3073 3074 3075 3076 3077 3078 3079 3080 3081 3082 3083 3084 3085 3086 3087 3088 3089 3090 3091 3092 3093 3094 3095 3096 3097 3098 3099 3100 3101 3102 3103 3104 3105 3106 3107 3108 3109 3110 3111 3112 3113 3114 3115 3116 3117 3118 3119 3120 3121 3122 3123 3124 3125 3126 3127 3128 3129 3130 3131 3132 3133 3134 3135 3136 3137 3138 3139 3140 3141 3142 3143 3144 3145 3146 3147 3148 3149 3150 3151 3152 3153 3154 3155 3156 3157 3158 3159 3160 3161 3162 3163 3164 3165 3166 3167 3168 3169 3170 3171 3172 3173 3174 3175 3176 3177 3178 3179 3180 3181 3182 3183 3184 3185 3186 3187 3188 3189 3190 3191 3192 3193 3194 3195 3196 3197 3198 3199 3200 3201 3202 3203 3204 3205 3206 3207 3208 3209 3210 3211 3212 3213 3214 3215 3216 3217 3218 3219 3220 3221 3222 3223 3224 3225 3226 3227 3228 3229 3230 3231 3232 3233 3234 3235 3236 3237 3238 3239 3240 3241 3242 3243 3244 3245 3246 3247 3248 3249 3250 3251 3252 3253 3254 3255 3256 3257 3258 3259 3260 3261 3262 3263 3264 3265 3266 3267 3268 3269 3270 3271 3272 3273 3274 3275 3276 3277 3278 3279 3280 3281 3282 3283 3284 3285 3286 3287 3288 3289 3290 3291 3292 3293 3294 3295 3296 3297 3298 3299 3300 3301 3302 3303 3304 3305 3306 3307 3308 3309 3310 3311 3312 3313 3314 3315 3316 3317 3318 3319 3320 3321 3322 3323 3324 3325 3326 3327 3328 3329 3330 3331 3332 3333 3334 3335 3336 3337 3338 3339 3340 3341 3342 3343 3344 3345 3346 3347 3348 3349 3350 3351 3352 3353 3354 3355 3356 3357 3358 3359 3360 3361 3362 3363 3364 3365 3366 3367 3368 3369 3370 3371 3372 3373 3374 3375 3376 3377 3378 3379 3380 3381 3382 3383 3384 3385 3386 3387 3388 3389 3390 3391 3392 3393 3394 3395 3396 3397 3398 3399 3400 3401 3402 3403 3404 3405 3406 3407 3408 3409 3410 3411 3412 3413 3414 3415 3416 3417 3418 3419 3420 3421 3422 3423 3424 3425 3426 3427 3428 3429 3430 3431 3432 3433 3434 3435 3436 3437 3438 3439 3440 3441 3442 3443 3444 3445 3446 3447 3448 3449 3450 3451 3452 3453 3454 3455 3456 3457 3458 3459 3460 3461 3462 3463 3464 3465 3466 3467 3468 3469 3470 3471 3472 3473 3474 3475 3476 3477 3478 3479 3480 3481 3482 3483 3484 3485 3486 3487 3488 3489 3490 3491 3492 3493 3494 3495 3496 3497 3498 3499 3500 3501 3502 3503 3504 3505 3506 3507 3508 3509 3510 3511 3512 3513 3514 3515 3516 3517 3518 3519 3520 3521 3522 3523 3524 3525 3526 3527 3528 3529 3530 3531 3532 3533 3534 3535 3536 3537 3538 3539 3540 3541 3542 3543 3544 3545 3546 3547 3548 3549 3550 3551 3552 3553 3554 3555 3556 3557 3558 3559 3560 3561 3562 3563 3564 3565 3566 3567 3568 3569 3570 3571 3572 3573 3574 3575 3576 3577 3578 3579 3580 3581 3582 3583 3584 3585 3586 3587 3588 3589 3590 3591 3592 3593 3594 3595 3596 3597 3598 3599 3600 3601 3602 3603 3604 3605 3606 3607 3608 3609 3610 3611 3612 3613 3614 3615 3616 3617 3618 3619 3620 3621 3622 3623 3624 3625 3626 3627 3628 3629 3630 3631 3632 3633 3634 3635 3636 3637 3638 3639 3640 3641 3642 3643 3644 3645 3646 3647 3648 3649 3650 3651 3652 3653 3654 3655 3656 3657 3658 3659 3660 3661 3662 3663 3664 3665 3666 3667 3668 3669 3670 3671 3672 3673 3674 3675 3676 3677 3678 3679 3680 3681 3682 3683 3684 3685 3686 3687 3688 3689 3690 3691 3692 3693 3694 3695 3696 3697 3698 3699 3700 3701 3702 3703 3704 3705 3706 3707 3708 3709 3710 3711 3712 3713 3714 3715 3716 3717 3718 3719 3720 3721 3722 3723 3724 3725 3726 3727 3728 3729 3730 3731 3732 3733 3734 3735 3736 3737 3738 3739 3740 3741 3742 3743 3744 3745 3746 3747 3748 3749 3750 3751 3752 3753 3754 3755 3756 3757 3758 3759 3760 3761 3762 3763 3764 3765 3766 3767 3768 3769 3770 3771 3772 3773 3774 3775 3776 3777 3778 3779 3780 3781 3782 3783 3784 3785 3786 3787 3788 3789 3790 3791 3792 3793 3794 3795 3796 3797 3798 3799 3800 3801 3802 3803 3804 3805 3806 3807 3808 3809 3810 3811 3812 3813 3814 3815 3816 3817 3818 3819 3820 3821 3822 3823 3824 3825 3826 3827 3828 3829 3830 3831 3832 3833 3834 3835 3836 3837 3838 3839 3840 3841 3842 3843 3844 3845 3846 3847 3848 3849 3850 3851 3852 3853 3854 3855 3856 3857 3858 3859 3860 3861 3862 3863 3864 3865 3866 3867 3868 3869 3870 3871 3872 3873 3874 3875 3876 3877 3878 3879 3880 3881 3882 3883 3884 3885 3886 3887 3888 3889 3890 3891 3892 3893 3894 3895 3896 3897 3898 3899 3900 3901 3902 3903 3904 3905 3906 3907 3908 3909 3910 3911 3912 3913 3914 3915 3916 3917 3918 3919 3920 3921 3922 3923 3924 3925 3926 3927 3928 3929 3930 3931 3932 3933 3934 3935 3936 3937 3938 3939 3940 3941 3942 3943 3944 3945 3946 3947 3948 3949 3950 3951 3952 3953 3954 3955 3956 3957 3958 3959 3960 3961 3962 3963 3964 3965 3966 3967 3968 3969 3970 3971 3972 3973 3974 3975 3976 3977 3978 3979 3980 3981 3982 3983 3984 3985 3986 3987 3988 3989 3990 3991 3992 3993 3994 3995 3996 3997 3998 3999 4000 4001 4002 4003 4004 4005 4006 4007 4008 4009 4010 4011 4012 4013 4014 4015 4016 4017 4018 4019 4020 4021 4022 4023 4024 4025 4026 4027 4028 4029 4030 4031 4032 4033 4034 4035 4036 4037 4038 4039 4040 4041 4042 4043 4044 4045 4046 4047 4048 4049 4050 4051 4052 4053 4054 4055 4056 4057 4058 4059 4060 4061 4062 4063 4064 4065 4066 4067 4068 4069 4070 4071 4072 4073 4074 4075 4076 4077 4078 4079 4080 4081 4082 4083 4084 4085 4086 4087 4088 4089 4090 4091 4092 4093 4094 4095 4096 4097 4098 4099 4100 4101 4102 4103 4104 4105 4106 4107 4108 4109 4110 4111 4112 4113 4114 4115 4116 4117 4118 4119 4120 4121 4122 4123 4124 4125 4126 4127 4128 4129 4130 4131 4132 4133 4134 4135 4136 4137 4138 4139 4140 4141 4142 4143 4144 4145 4146 4147 4148 4149 4150 4151 4152 4153 4154 4155 4156 4157 4158 4159 4160 4161 4162 4163 4164 4165 4166 4167 4168 4169 4170 4171 4172 4173 4174 4175 4176 4177 4178 4179 4180 4181 4182 4183 4184 4185 4186 4187 4188 4189 4190 4191 4192 4193 4194 4195 4196 4197 4198 4199 4200 4201 4202 4203 4204 4205 4206 4207 4208 4209 4210 4211 4212 4213 4214 4215 4216 4217 4218 4219 4220 4221 4222 4223 4224 4225 4226 4227 4228 4229 4230 4231 4232 4233 4234 4235 4236 4237 4238 4239 4240 4241 4242 4243 4244 4245 4246 4247 4248 4249 4250 4251 4252 4253 4254 4255 4256 4257 4258 4259 4260 4261 4262 4263 4264 4265 4266 4267 4268 4269 4270 4271 4272 4273 4274 4275 4276 4277 4278 4279 4280 4281 4282 4283 4284 4285 4286 4287 4288 4289 4290 4291 4292 4293 4294 4295 4296 4297 4298 4299 4300 4301 4302 4303 4304 4305 4306 4307 4308 4309 4310 4311 4312 4313 4314 4315 4316 4317 4318 4319 4320 4321 4322 4323 4324 4325 4326 4327 4328 4329 4330 4331 4332 4333 4334 4335 4336 4337 4338 4339 4340 4341 4342 4343 4344 4345 4346 4347 4348 4349 4350 4351 4352 4353 4354 4355 4356 4357 4358 4359 4360 4361 4362 4363 4364 4365 4366 4367 4368 4369 4370 4371 4372 4373 4374 4375 4376 4377 4378 4379 4380 4381 4382 4383 4384 4385 4386 4387 4388 4389 4390 4391 4392 4393 4394 4395 4396 4397 4398 4399 4400 4401 4402 4403 4404 4405 4406 4407 4408 4409 4410 4411 4412 4413 4414 4415 4416 4417 4418 4419 4420 4421 4422 4423 4424 4425 4426 4427 4428 4429 4430 4431 4432 4433 4434 4435 4436 4437 4438 4439 4440 4441 4442 4443 4444 4445 4446 4447 4448 4449 4450 4451 4452 4453 4454 4455 4456 4457 4458 4459 4460 4461 4462 4463 4464 4465 4466 4467 4468 4469 4470 4471 4472 4473 4474 4475 4476 4477 4478 4479 4480 4481 4482 4483 4484 4485 4486 4487 4488 4489 4490 4491 4492 4493 4494 4495 4496 4497 4498 4499 4500 4501 4502 4503 4504 4505 4506 4507 4508 4509 4510 4511 4512 4513 4514 4515 4516 4517 4518 4519 4520 4521 4522 4523 4524 4525 4526 4527 4528 4529 4530 4531 4532 4533 4534 4535 4536 4537 4538 4539 4540 4541 4542 4543 4544 4545 4546 4547 4548 4549 4550 4551 4552 4553 4554 4555 4556 4557 4558 4559 4560 4561 4562 4563 4564 4565 4566 4567 4568 4569 4570 4571 4572 4573 4574 4575 4576 4577 4578 4579 4580 4581 4582 4583 4584 4585 4586 4587 4588 4589 4590 4591 4592 4593 4594 4595 4596 4597 4598 4599 4600 4601 4602 4603 4604 4605 4606 4607 4608 4609 4610 4611 4612 4613 4614 4615 4616 4617 4618 4619 4620 4621 4622 4623 4624 4625 4626 4627 4628 4629 4630 4631 4632 4633 4634 4635 4636 4637 4638 4639 4640 4641 4642 4643 4644 4645 4646 4647 4648 4649 4650 4651 4652 4653 4654 4655 4656 4657 4658 4659 4660 4661 4662 4663 4664 4665 4666 4667 4668 4669 4670 4671 4672 4673 4674 4675 4676 4677 4678 4679 4680 4681 4682 4683 4684 4685 4686 4687 4688 4689 4690 4691 4692 4693 4694 4695 4696 4697 4698 4699 4700 4701 4702 4703 4704 4705 4706 4707 4708 4709 4710 4711 4712 4713 4714 4715 4716 4717 4718 4719 4720 4721 4722 4723 4724 4725 4726 4727 4728 4729 4730 4731 4732 4733 4734 4735 4736 4737 4738 4739 4740 4741 4742 4743 4744 4745 4746 4747 4748 4749 4750 4751 4752 4753 4754 4755 4756 4757 4758 4759 4760 4761 4762 4763 4764 4765 4766 4767 4768 4769 4770 4771 4772 4773 4774 4775 4776 4777 4778 4779 4780 4781 4782 4783 4784 4785 4786 4787 4788 4789 4790 4791 4792 4793 4794 4795 4796 4797 4798 4799 4800 4801 4802 4803 4804 4805 4806 4807 4808 4809 4810 4811 4812 4813 4814 4815 4816 4817 4818 4819 4820 4821 4822 4823 4824 4825 4826 4827 4828 4829 4830 4831 4832 4833 4834 4835 4836 4837 4838 4839 4840 4841 4842 4843 4844 4845 4846 4847 4848 4849 4850 4851 4852 4853 4854 4855 4856 4857 4858 4859 4860 4861 4862 4863 4864 4865 4866 4867 4868 4869 4870 4871 4872 4873 4874 4875 4876 4877 4878 4879 4880 4881 4882 4883 4884 4885 4886 4887 4888 4889 4890 4891 4892 4893 4894 4895 4896 4897 4898 4899 4900 4901 4902 4903 4904 4905 4906 4907 4908 4909 4910 4911 4912 4913 4914 4915 4916 4917 4918 4919 4920 4921 4922 4923 4924 4925 4926 4927 4928 4929 4930 4931 4932 4933 4934 4935 4936 4937 4938 4939 4940 4941 4942 4943 4944 4945 4946 4947 4948 4949 4950 4951 4952 4953 4954 4955 4956 4957 4958 4959 4960 4961 4962 4963 4964 4965 4966 4967 4968 4969 4970 4971 4972 4973 4974 4975 4976 4977 4978 4979 4980 4981 4982 4983 4984 4985 4986 4987 4988 4989 4990 4991 4992 4993 4994 4995 4996 4997 4998 4999 5000 5001 5002 5003 5004 5005 5006 5007 5008 5009 5010 5011 5012 5013 5014 5015 5016 5017 5018 5019 5020 5021 5022 5023 5024 5025 5026 5027 5028 5029 5030 5031 5032 5033 5034 5035 5036 5037 5038 5039 5040 5041 5042 5043 5044 5045 5046 5047 5048 5049 5050 5051 5052 5053 5054 5055 5056 5057 5058 5059 5060 5061 5062 5063 5064 5065 5066 5067 5068 5069 5070 5071 5072 5073 5074 5075 5076 5077 5078 5079 5080 5081 5082 5083 5084 5085 5086 5087 5088 5089 5090 5091 5092 5093 5094 5095 5096 5097 5098 5099 5100 5101 5102 5103 5104 5105 5106 5107 5108 5109 5110 5111 5112 5113 5114 5115 5116 5117 5118 5119 5120 5121 5122 5123 5124 5125 5126 5127 5128 5129 5130 5131 5132 5133 5134 5135 5136 5137 5138 5139 5140 5141 5142 5143 5144 5145 5146 5147 5148 5149 5150 5151 5152 5153 5154 5155 5156 5157 5158 5159 5160 5161 5162 5163 5164 5165 5166 5167 5168 5169 5170 5171 5172 5173 5174 5175 5176 5177 5178 5179 5180 5181 5182 5183 5184 5185 5186 5187 5188 5189 5190 5191 5192 5193 5194 5195 5196 5197 5198 5199 5200 5201 5202 5203 5204 5205 5206 5207 5208 5209 5210 5211 5212 5213 5214 5215 5216 5217 5218 5219 5220 5221 5222 5223 5224 5225 5226 5227 5228 5229 5230 5231 5232 5233 5234 5235 5236 5237 5238 5239 5240 5241 5242 5243 5244 5245 5246 5247 5248 5249 5250 5251 5252 5253 5254 5255 5256 5257 5258 5259 5260 5261 5262 5263 5264 5265 5266 5267 5268 5269 5270 5271 5272 5273 5274 5275 5276 5277 5278 5279 5280 5281 5282 5283 5284 5285 5286 5287 5288 5289 5290 5291 5292 5293 5294 5295 5296 5297 5298 5299 5300 5301 5302 5303 5304 5305 5306 5307 5308 5309 5310 5311 5312 5313 5314 5315 5316 5317 5318 5319 5320 5321 5322 5323 5324 5325 5326 5327 5328 5329 5330 5331 5332 5333 5334 5335 5336 5337 5338 5339 5340 5341 5342 5343 5344 5345 5346 5347 5348 5349 5350 5351 5352 5353 5354 5355 5356 5357 5358 5359 5360 5361 5362 5363 5364 5365 5366 5367 5368 5369 5370 5371 5372 5373 5374 5375 5376 5377 5378 5379 5380 5381 5382 5383 5384 5385 5386 5387 5388 5389 5390 5391 5392 5393 5394 5395 5396 5397 5398 5399 5400 5401 5402 5403 5404 5405 5406 5407 5408 5409 5410 5411 5412 5413 5414 5415 5416 5417 5418 5419 5420 5421 5422 5423 5424 5425 5426 5427 5428 5429 5430 5431 5432 5433 5434 5435 5436 5437 5438 5439 5440 5441 5442 5443 5444 5445 5446 5447 5448 5449 5450 5451 5452 5453 5454 5455 5456 5457 5458 5459 5460 5461 5462 5463 5464 5465 5466 5467 5468 5469 5470 5471 5472 5473 5474 5475 5476 5477 5478 5479 5480 5481 5482 5483 5484 5485 5486 5487 5488 5489 5490 5491 5492 5493 5494 5495 5496 5497 5498 5499 5500 5501 5502 5503 5504 5505 5506 5507 5508 5509 5510 5511 5512 5513 5514 5515 5516 5517 5518 5519 5520 5521 5522 5523 5524 5525 5526 5527 5528 5529 5530 5531 5532 5533 5534 5535 5536 5537 5538 5539 5540 5541 5542 5543 5544 5545 5546 5547 5548 5549 5550 5551 5552 5553 5554 5555 5556 5557 5558 5559 5560 5561 5562 5563 5564 5565 5566 5567 5568 5569 5570 5571 5572 5573 5574 5575 5576 5577 5578 5579 5580 5581 5582 5583 5584 5585 5586 5587 5588 5589 5590 5591 5592 5593 5594 5595 5596 5597 5598 5599 5600 5601 5602 5603 5604 5605 5606 5607 5608 5609 5610 5611 5612 5613 5614 5615 5616 5617 5618 5619 5620 5621 5622 5623 5624 5625 5626 5627 5628 5629 5630 5631 5632 5633 5634 5635 5636 5637 5638 5639 5640 5641 5642 5643 5644 5645 5646 5647 5648 5649 5650 5651 5652 5653 5654 5655 5656 5657 5658 5659 5660 5661 5662 5663 5664 5665 5666 5667 5668 5669 5670 5671 5672 5673 5674 5675 5676 5677 5678 5679 5680 5681 5682 5683 5684 5685 5686 5687 5688 5689 5690 5691 5692 5693 5694 5695 5696 5697 5698 5699 5700 5701 5702 5703 5704 5705 5706 5707 5708 5709 5710 5711 5712 5713 5714 5715 5716 5717 5718 5719 5720 5721 5722 5723 5724 5725 5726 5727 5728 5729 5730 5731 5732 5733 5734 5735 5736 5737 5738 5739 5740 5741 5742 5743 5744 5745 5746 5747 5748 5749 5750 5751 5752 5753 5754 5755 5756 5757 5758 5759 5760 5761 5762 5763 5764 5765 5766 5767 5768 5769 5770 5771 5772 5773 5774 5775 5776 5777 5778 5779 5780 5781 5782 5783 5784 5785 5786 5787 5788 5789 5790 5791 5792 5793 5794 5795 5796 5797 5798 5799 5800 5801 5802 5803 5804 5805 5806 5807 5808 5809 5810 5811 5812 5813 5814 5815 5816 5817 5818 5819 5820 5821 5822 5823 5824 5825 5826 5827 5828 5829 5830 5831 5832 5833 5834 5835 5836 5837 5838 5839 5840 5841 5842 5843 5844 5845 5846 5847 5848 5849 5850 5851 5852 5853 5854 5855 5856 5857 5858 5859 5860 5861 5862 5863 5864 5865 5866 5867 5868 5869 5870 5871 5872 5873 5874 5875 5876 5877 5878 5879 5880 5881 5882 5883 5884 5885 5886 5887 5888 5889 5890 5891 5892 5893 5894 5895 5896 5897 5898 5899 5900 5901 5902 5903 5904 5905 5906 5907 5908 5909 5910 5911 5912 5913 5914 5915 5916 5917 5918 5919 5920 5921 5922 5923 5924 5925 5926 5927 5928 5929 5930 5931 5932 5933 5934 5935 5936 5937 5938 5939 5940 5941 5942 5943 5944 5945 5946 5947 5948 5949 5950 5951 5952 5953 5954 5955 5956 5957 5958 5959 5960 5961 5962 5963 5964 5965 5966 5967 5968 5969 5970 5971 5972 5973 5974 5975 5976 5977 5978 5979 5980 5981 5982 5983 5984 5985 5986 5987 5988 5989 5990 5991 5992 5993 5994 5995 5996 5997 5998 5999 6000 6001 6002 6003 6004 6005 6006 6007 6008 6009 6010 6011 6012 6013 6014 6015 6016 6017 6018 6019 6020 6021 6022 6023 6024 6025 /* * Resizable virtual memory filesystem for Linux. * * Copyright (C) 2000 Linus Torvalds. * 2000 Transmeta Corp. * 2000-2001 Christoph Rohland * 2000-2001 SAP AG * 2002 Red Hat Inc. * Copyright (C) 2002-2011 Hugh Dickins. * Copyright (C) 2011 Google Inc. * Copyright (C) 2002-2005 VERITAS Software Corporation. * Copyright (C) 2004 Andi Kleen, SuSE Labs * * Extended attribute support for tmpfs: * Copyright (c) 2004, Luke Kenneth Casson Leighton <lkcl@lkcl.net> * Copyright (c) 2004 Red Hat, Inc., James Morris <jmorris@redhat.com> * * tiny-shmem: * Copyright (c) 2004, 2008 Matt Mackall <mpm@selenic.com> * * This file is released under the GPL. */ #include <linux/fs.h> #include <linux/init.h> #include <linux/vfs.h> #include <linux/mount.h> #include <linux/ramfs.h> #include <linux/pagemap.h> #include <linux/file.h> #include <linux/fileattr.h> #include <linux/mm.h> #include <linux/random.h> #include <linux/sched/signal.h> #include <linux/export.h> #include <linux/shmem_fs.h> #include <linux/swap.h> #include <linux/uio.h> #include <linux/hugetlb.h> #include <linux/fs_parser.h> #include <linux/swapfile.h> #include <linux/iversion.h> #include <linux/unicode.h> #include "swap.h" static struct vfsmount *shm_mnt __ro_after_init; #ifdef CONFIG_SHMEM /* * This virtual memory filesystem is heavily based on the ramfs. It * extends ramfs by the ability to use swap and honor resource limits * which makes it a completely usable filesystem. */ #include <linux/xattr.h> #include <linux/exportfs.h> #include <linux/posix_acl.h> #include <linux/posix_acl_xattr.h> #include <linux/mman.h> #include <linux/string.h> #include <linux/slab.h> #include <linux/backing-dev.h> #include <linux/writeback.h> #include <linux/pagevec.h> #include <linux/percpu_counter.h> #include <linux/falloc.h> #include <linux/splice.h> #include <linux/security.h> #include <linux/swapops.h> #include <linux/mempolicy.h> #include <linux/namei.h> #include <linux/ctype.h> #include <linux/migrate.h> #include <linux/highmem.h> #include <linux/seq_file.h> #include <linux/magic.h> #include <linux/syscalls.h> #include <linux/fcntl.h> #include <uapi/linux/memfd.h> #include <linux/rmap.h> #include <linux/uuid.h> #include <linux/quotaops.h> #include <linux/rcupdate_wait.h> #include <linux/uaccess.h> #include "internal.h" #define VM_ACCT(size) (PAGE_ALIGN(size) >> PAGE_SHIFT) /* Pretend that each entry is of this size in directory's i_size */ #define BOGO_DIRENT_SIZE 20 /* Pretend that one inode + its dentry occupy this much memory */ #define BOGO_INODE_SIZE 1024 /* Symlink up to this size is kmalloc'ed instead of using a swappable page */ #define SHORT_SYMLINK_LEN 128 /* * shmem_fallocate communicates with shmem_fault or shmem_writeout via * inode->i_private (with i_rwsem making sure that it has only one user at * a time): we would prefer not to enlarge the shmem inode just for that. */ struct shmem_falloc { wait_queue_head_t *waitq; /* faults into hole wait for punch to end */ pgoff_t start; /* start of range currently being fallocated */ pgoff_t next; /* the next page offset to be fallocated */ pgoff_t nr_falloced; /* how many new pages have been fallocated */ pgoff_t nr_unswapped; /* how often writeout refused to swap out */ }; struct shmem_options { unsigned long long blocks; unsigned long long inodes; struct mempolicy *mpol; kuid_t uid; kgid_t gid; umode_t mode; bool full_inums; int huge; int seen; bool noswap; unsigned short quota_types; struct shmem_quota_limits qlimits; #if IS_ENABLED(CONFIG_UNICODE) struct unicode_map *encoding; bool strict_encoding; #endif #define SHMEM_SEEN_BLOCKS 1 #define SHMEM_SEEN_INODES 2 #define SHMEM_SEEN_HUGE 4 #define SHMEM_SEEN_INUMS 8 #define SHMEM_SEEN_NOSWAP 16 #define SHMEM_SEEN_QUOTA 32 }; #ifdef CONFIG_TRANSPARENT_HUGEPAGE static unsigned long huge_shmem_orders_always __read_mostly; static unsigned long huge_shmem_orders_madvise __read_mostly; static unsigned long huge_shmem_orders_inherit __read_mostly; static unsigned long huge_shmem_orders_within_size __read_mostly; static bool shmem_orders_configured __initdata; #endif #ifdef CONFIG_TMPFS static unsigned long shmem_default_max_blocks(void) { return totalram_pages() / 2; } static unsigned long shmem_default_max_inodes(void) { unsigned long nr_pages = totalram_pages(); return min3(nr_pages - totalhigh_pages(), nr_pages / 2, ULONG_MAX / BOGO_INODE_SIZE); } #endif static int shmem_swapin_folio(struct inode *inode, pgoff_t index, struct folio **foliop, enum sgp_type sgp, gfp_t gfp, struct vm_area_struct *vma, vm_fault_t *fault_type); static inline struct shmem_sb_info *SHMEM_SB(struct super_block *sb) { return sb->s_fs_info; } /* * shmem_file_setup pre-accounts the whole fixed size of a VM object, * for shared memory and for shared anonymous (/dev/zero) mappings * (unless MAP_NORESERVE and sysctl_overcommit_memory <= 1), * consistent with the pre-accounting of private mappings ... */ static inline int shmem_acct_size(unsigned long flags, loff_t size) { return (flags & VM_NORESERVE) ? 0 : security_vm_enough_memory_mm(current->mm, VM_ACCT(size)); } static inline void shmem_unacct_size(unsigned long flags, loff_t size) { if (!(flags & VM_NORESERVE)) vm_unacct_memory(VM_ACCT(size)); } static inline int shmem_reacct_size(unsigned long flags, loff_t oldsize, loff_t newsize) { if (!(flags & VM_NORESERVE)) { if (VM_ACCT(newsize) > VM_ACCT(oldsize)) return security_vm_enough_memory_mm(current->mm, VM_ACCT(newsize) - VM_ACCT(oldsize)); else if (VM_ACCT(newsize) < VM_ACCT(oldsize)) vm_unacct_memory(VM_ACCT(oldsize) - VM_ACCT(newsize)); } return 0; } /* * ... whereas tmpfs objects are accounted incrementally as * pages are allocated, in order to allow large sparse files. * shmem_get_folio reports shmem_acct_blocks failure as -ENOSPC not -ENOMEM, * so that a failure on a sparse tmpfs mapping will give SIGBUS not OOM. */ static inline int shmem_acct_blocks(unsigned long flags, long pages) { if (!(flags & VM_NORESERVE)) return 0; return security_vm_enough_memory_mm(current->mm, pages * VM_ACCT(PAGE_SIZE)); } static inline void shmem_unacct_blocks(unsigned long flags, long pages) { if (flags & VM_NORESERVE) vm_unacct_memory(pages * VM_ACCT(PAGE_SIZE)); } static int shmem_inode_acct_blocks(struct inode *inode, long pages) { struct shmem_inode_info *info = SHMEM_I(inode); struct shmem_sb_info *sbinfo = SHMEM_SB(inode->i_sb); int err = -ENOSPC; if (shmem_acct_blocks(info->flags, pages)) return err; might_sleep(); /* when quotas */ if (sbinfo->max_blocks) { if (!percpu_counter_limited_add(&sbinfo->used_blocks, sbinfo->max_blocks, pages)) goto unacct; err = dquot_alloc_block_nodirty(inode, pages); if (err) { percpu_counter_sub(&sbinfo->used_blocks, pages); goto unacct; } } else { err = dquot_alloc_block_nodirty(inode, pages); if (err) goto unacct; } return 0; unacct: shmem_unacct_blocks(info->flags, pages); return err; } static void shmem_inode_unacct_blocks(struct inode *inode, long pages) { struct shmem_inode_info *info = SHMEM_I(inode); struct shmem_sb_info *sbinfo = SHMEM_SB(inode->i_sb); might_sleep(); /* when quotas */ dquot_free_block_nodirty(inode, pages); if (sbinfo->max_blocks) percpu_counter_sub(&sbinfo->used_blocks, pages); shmem_unacct_blocks(info->flags, pages); } static const struct super_operations shmem_ops; static const struct address_space_operations shmem_aops; static const struct file_operations shmem_file_operations; static const struct inode_operations shmem_inode_operations; static const struct inode_operations shmem_dir_inode_operations; static const struct inode_operations shmem_special_inode_operations; static const struct vm_operations_struct shmem_vm_ops; static const struct vm_operations_struct shmem_anon_vm_ops; static struct file_system_type shmem_fs_type; bool shmem_mapping(struct address_space *mapping) { return mapping->a_ops == &shmem_aops; } EXPORT_SYMBOL_GPL(shmem_mapping); bool vma_is_anon_shmem(struct vm_area_struct *vma) { return vma->vm_ops == &shmem_anon_vm_ops; } bool vma_is_shmem(struct vm_area_struct *vma) { return vma_is_anon_shmem(vma) || vma->vm_ops == &shmem_vm_ops; } static LIST_HEAD(shmem_swaplist); static DEFINE_SPINLOCK(shmem_swaplist_lock); #ifdef CONFIG_TMPFS_QUOTA static int shmem_enable_quotas(struct super_block *sb, unsigned short quota_types) { int type, err = 0; sb_dqopt(sb)->flags |= DQUOT_QUOTA_SYS_FILE | DQUOT_NOLIST_DIRTY; for (type = 0; type < SHMEM_MAXQUOTAS; type++) { if (!(quota_types & (1 << type))) continue; err = dquot_load_quota_sb(sb, type, QFMT_SHMEM, DQUOT_USAGE_ENABLED | DQUOT_LIMITS_ENABLED); if (err) goto out_err; } return 0; out_err: pr_warn("tmpfs: failed to enable quota tracking (type=%d, err=%d)\n", type, err); for (type--; type >= 0; type--) dquot_quota_off(sb, type); return err; } static void shmem_disable_quotas(struct super_block *sb) { int type; for (type = 0; type < SHMEM_MAXQUOTAS; type++) dquot_quota_off(sb, type); } static struct dquot __rcu **shmem_get_dquots(struct inode *inode) { return SHMEM_I(inode)->i_dquot; } #endif /* CONFIG_TMPFS_QUOTA */ /* * shmem_reserve_inode() performs bookkeeping to reserve a shmem inode, and * produces a novel ino for the newly allocated inode. * * It may also be called when making a hard link to permit the space needed by * each dentry. However, in that case, no new inode number is needed since that * internally draws from another pool of inode numbers (currently global * get_next_ino()). This case is indicated by passing NULL as inop. */ #define SHMEM_INO_BATCH 1024 static int shmem_reserve_inode(struct super_block *sb, ino_t *inop) { struct shmem_sb_info *sbinfo = SHMEM_SB(sb); ino_t ino; if (!(sb->s_flags & SB_KERNMOUNT)) { raw_spin_lock(&sbinfo->stat_lock); if (sbinfo->max_inodes) { if (sbinfo->free_ispace < BOGO_INODE_SIZE) { raw_spin_unlock(&sbinfo->stat_lock); return -ENOSPC; } sbinfo->free_ispace -= BOGO_INODE_SIZE; } if (inop) { ino = sbinfo->next_ino++; if (unlikely(is_zero_ino(ino))) ino = sbinfo->next_ino++; if (unlikely(!sbinfo->full_inums && ino > UINT_MAX)) { /* * Emulate get_next_ino uint wraparound for * compatibility */ if (IS_ENABLED(CONFIG_64BIT)) pr_warn("%s: inode number overflow on device %d, consider using inode64 mount option\n", __func__, MINOR(sb->s_dev)); sbinfo->next_ino = 1; ino = sbinfo->next_ino++; } *inop = ino; } raw_spin_unlock(&sbinfo->stat_lock); } else if (inop) { /* * __shmem_file_setup, one of our callers, is lock-free: it * doesn't hold stat_lock in shmem_reserve_inode since * max_inodes is always 0, and is called from potentially * unknown contexts. As such, use a per-cpu batched allocator * which doesn't require the per-sb stat_lock unless we are at * the batch boundary. * * We don't need to worry about inode{32,64} since SB_KERNMOUNT * shmem mounts are not exposed to userspace, so we don't need * to worry about things like glibc compatibility. */ ino_t *next_ino; next_ino = per_cpu_ptr(sbinfo->ino_batch, get_cpu()); ino = *next_ino; if (unlikely(ino % SHMEM_INO_BATCH == 0)) { raw_spin_lock(&sbinfo->stat_lock); ino = sbinfo->next_ino; sbinfo->next_ino += SHMEM_INO_BATCH; raw_spin_unlock(&sbinfo->stat_lock); if (unlikely(is_zero_ino(ino))) ino++; } *inop = ino; *next_ino = ++ino; put_cpu(); } return 0; } static void shmem_free_inode(struct super_block *sb, size_t freed_ispace) { struct shmem_sb_info *sbinfo = SHMEM_SB(sb); if (sbinfo->max_inodes) { raw_spin_lock(&sbinfo->stat_lock); sbinfo->free_ispace += BOGO_INODE_SIZE + freed_ispace; raw_spin_unlock(&sbinfo->stat_lock); } } /** * shmem_recalc_inode - recalculate the block usage of an inode * @inode: inode to recalc * @alloced: the change in number of pages allocated to inode * @swapped: the change in number of pages swapped from inode * * We have to calculate the free blocks since the mm can drop * undirtied hole pages behind our back. * * But normally info->alloced == inode->i_mapping->nrpages + info->swapped * So mm freed is info->alloced - (inode->i_mapping->nrpages + info->swapped) * * Return: true if swapped was incremented from 0, for shmem_writeout(). */ static bool shmem_recalc_inode(struct inode *inode, long alloced, long swapped) { struct shmem_inode_info *info = SHMEM_I(inode); bool first_swapped = false; long freed; spin_lock(&info->lock); info->alloced += alloced; info->swapped += swapped; freed = info->alloced - info->swapped - READ_ONCE(inode->i_mapping->nrpages); /* * Special case: whereas normally shmem_recalc_inode() is called * after i_mapping->nrpages has already been adjusted (up or down), * shmem_writeout() has to raise swapped before nrpages is lowered - * to stop a racing shmem_recalc_inode() from thinking that a page has * been freed. Compensate here, to avoid the need for a followup call. */ if (swapped > 0) { if (info->swapped == swapped) first_swapped = true; freed += swapped; } if (freed > 0) info->alloced -= freed; spin_unlock(&info->lock); /* The quota case may block */ if (freed > 0) shmem_inode_unacct_blocks(inode, freed); return first_swapped; } bool shmem_charge(struct inode *inode, long pages) { struct address_space *mapping = inode->i_mapping; if (shmem_inode_acct_blocks(inode, pages)) return false; /* nrpages adjustment first, then shmem_recalc_inode() when balanced */ xa_lock_irq(&mapping->i_pages); mapping->nrpages += pages; xa_unlock_irq(&mapping->i_pages); shmem_recalc_inode(inode, pages, 0); return true; } void shmem_uncharge(struct inode *inode, long pages) { /* pages argument is currently unused: keep it to help debugging */ /* nrpages adjustment done by __filemap_remove_folio() or caller */ shmem_recalc_inode(inode, 0, 0); } /* * Replace item expected in xarray by a new item, while holding xa_lock. */ static int shmem_replace_entry(struct address_space *mapping, pgoff_t index, void *expected, void *replacement) { XA_STATE(xas, &mapping->i_pages, index); void *item; VM_BUG_ON(!expected); VM_BUG_ON(!replacement); item = xas_load(&xas); if (item != expected) return -ENOENT; xas_store(&xas, replacement); return 0; } /* * Sometimes, before we decide whether to proceed or to fail, we must check * that an entry was not already brought back or split by a racing thread. * * Checking folio is not enough: by the time a swapcache folio is locked, it * might be reused, and again be swapcache, using the same swap as before. * Returns the swap entry's order if it still presents, else returns -1. */ static int shmem_confirm_swap(struct address_space *mapping, pgoff_t index, swp_entry_t swap) { XA_STATE(xas, &mapping->i_pages, index); int ret = -1; void *entry; rcu_read_lock(); do { entry = xas_load(&xas); if (entry == swp_to_radix_entry(swap)) ret = xas_get_order(&xas); } while (xas_retry(&xas, entry)); rcu_read_unlock(); return ret; } /* * Definitions for "huge tmpfs": tmpfs mounted with the huge= option * * SHMEM_HUGE_NEVER: * disables huge pages for the mount; * SHMEM_HUGE_ALWAYS: * enables huge pages for the mount; * SHMEM_HUGE_WITHIN_SIZE: * only allocate huge pages if the page will be fully within i_size, * also respect madvise() hints; * SHMEM_HUGE_ADVISE: * only allocate huge pages if requested with madvise(); */ #define SHMEM_HUGE_NEVER 0 #define SHMEM_HUGE_ALWAYS 1 #define SHMEM_HUGE_WITHIN_SIZE 2 #define SHMEM_HUGE_ADVISE 3 /* * Special values. * Only can be set via /sys/kernel/mm/transparent_hugepage/shmem_enabled: * * SHMEM_HUGE_DENY: * disables huge on shm_mnt and all mounts, for emergency use; * SHMEM_HUGE_FORCE: * enables huge on shm_mnt and all mounts, w/o needing option, for testing; * */ #define SHMEM_HUGE_DENY (-1) #define SHMEM_HUGE_FORCE (-2) #ifdef CONFIG_TRANSPARENT_HUGEPAGE /* ifdef here to avoid bloating shmem.o when not necessary */ static int shmem_huge __read_mostly = SHMEM_HUGE_NEVER; static int tmpfs_huge __read_mostly = SHMEM_HUGE_NEVER; /** * shmem_mapping_size_orders - Get allowable folio orders for the given file size. * @mapping: Target address_space. * @index: The page index. * @write_end: end of a write, could extend inode size. * * This returns huge orders for folios (when supported) based on the file size * which the mapping currently allows at the given index. The index is relevant * due to alignment considerations the mapping might have. The returned order * may be less than the size passed. * * Return: The orders. */ static inline unsigned int shmem_mapping_size_orders(struct address_space *mapping, pgoff_t index, loff_t write_end) { unsigned int order; size_t size; if (!mapping_large_folio_support(mapping) || !write_end) return 0; /* Calculate the write size based on the write_end */ size = write_end - (index << PAGE_SHIFT); order = filemap_get_order(size); if (!order) return 0; /* If we're not aligned, allocate a smaller folio */ if (index & ((1UL << order) - 1)) order = __ffs(index); order = min_t(size_t, order, MAX_PAGECACHE_ORDER); return order > 0 ? BIT(order + 1) - 1 : 0; } static unsigned int shmem_get_orders_within_size(struct inode *inode, unsigned long within_size_orders, pgoff_t index, loff_t write_end) { pgoff_t aligned_index; unsigned long order; loff_t i_size; order = highest_order(within_size_orders); while (within_size_orders) { aligned_index = round_up(index + 1, 1 << order); i_size = max(write_end, i_size_read(inode)); i_size = round_up(i_size, PAGE_SIZE); if (i_size >> PAGE_SHIFT >= aligned_index) return within_size_orders; order = next_order(&within_size_orders, order); } return 0; } static unsigned int shmem_huge_global_enabled(struct inode *inode, pgoff_t index, loff_t write_end, bool shmem_huge_force, struct vm_area_struct *vma, vm_flags_t vm_flags) { unsigned int maybe_pmd_order = HPAGE_PMD_ORDER > MAX_PAGECACHE_ORDER ? 0 : BIT(HPAGE_PMD_ORDER); unsigned long within_size_orders; if (!S_ISREG(inode->i_mode)) return 0; if (shmem_huge == SHMEM_HUGE_DENY) return 0; if (shmem_huge_force || shmem_huge == SHMEM_HUGE_FORCE) return maybe_pmd_order; /* * The huge order allocation for anon shmem is controlled through * the mTHP interface, so we still use PMD-sized huge order to * check whether global control is enabled. * * For tmpfs mmap()'s huge order, we still use PMD-sized order to * allocate huge pages due to lack of a write size hint. * * Otherwise, tmpfs will allow getting a highest order hint based on * the size of write and fallocate paths, then will try each allowable * huge orders. */ switch (SHMEM_SB(inode->i_sb)->huge) { case SHMEM_HUGE_ALWAYS: if (vma) return maybe_pmd_order; return shmem_mapping_size_orders(inode->i_mapping, index, write_end); case SHMEM_HUGE_WITHIN_SIZE: if (vma) within_size_orders = maybe_pmd_order; else within_size_orders = shmem_mapping_size_orders(inode->i_mapping, index, write_end); within_size_orders = shmem_get_orders_within_size(inode, within_size_orders, index, write_end); if (within_size_orders > 0) return within_size_orders; fallthrough; case SHMEM_HUGE_ADVISE: if (vm_flags & VM_HUGEPAGE) return maybe_pmd_order; fallthrough; default: return 0; } } static int shmem_parse_huge(const char *str) { int huge; if (!str) return -EINVAL; if (!strcmp(str, "never")) huge = SHMEM_HUGE_NEVER; else if (!strcmp(str, "always")) huge = SHMEM_HUGE_ALWAYS; else if (!strcmp(str, "within_size")) huge = SHMEM_HUGE_WITHIN_SIZE; else if (!strcmp(str, "advise")) huge = SHMEM_HUGE_ADVISE; else if (!strcmp(str, "deny")) huge = SHMEM_HUGE_DENY; else if (!strcmp(str, "force")) huge = SHMEM_HUGE_FORCE; else return -EINVAL; if (!has_transparent_hugepage() && huge != SHMEM_HUGE_NEVER && huge != SHMEM_HUGE_DENY) return -EINVAL; /* Do not override huge allocation policy with non-PMD sized mTHP */ if (huge == SHMEM_HUGE_FORCE && huge_shmem_orders_inherit != BIT(HPAGE_PMD_ORDER)) return -EINVAL; return huge; } #if defined(CONFIG_SYSFS) || defined(CONFIG_TMPFS) static const char *shmem_format_huge(int huge) { switch (huge) { case SHMEM_HUGE_NEVER: return "never"; case SHMEM_HUGE_ALWAYS: return "always"; case SHMEM_HUGE_WITHIN_SIZE: return "within_size"; case SHMEM_HUGE_ADVISE: return "advise"; case SHMEM_HUGE_DENY: return "deny"; case SHMEM_HUGE_FORCE: return "force"; default: VM_BUG_ON(1); return "bad_val"; } } #endif static unsigned long shmem_unused_huge_shrink(struct shmem_sb_info *sbinfo, struct shrink_control *sc, unsigned long nr_to_free) { LIST_HEAD(list), *pos, *next; struct inode *inode; struct shmem_inode_info *info; struct folio *folio; unsigned long batch = sc ? sc->nr_to_scan : 128; unsigned long split = 0, freed = 0; if (list_empty(&sbinfo->shrinklist)) return SHRINK_STOP; spin_lock(&sbinfo->shrinklist_lock); list_for_each_safe(pos, next, &sbinfo->shrinklist) { info = list_entry(pos, struct shmem_inode_info, shrinklist); /* pin the inode */ inode = igrab(&info->vfs_inode); /* inode is about to be evicted */ if (!inode) { list_del_init(&info->shrinklist); goto next; } list_move(&info->shrinklist, &list); next: sbinfo->shrinklist_len--; if (!--batch) break; } spin_unlock(&sbinfo->shrinklist_lock); list_for_each_safe(pos, next, &list) { pgoff_t next, end; loff_t i_size; int ret; info = list_entry(pos, struct shmem_inode_info, shrinklist); inode = &info->vfs_inode; if (nr_to_free && freed >= nr_to_free) goto move_back; i_size = i_size_read(inode); folio = filemap_get_entry(inode->i_mapping, i_size / PAGE_SIZE); if (!folio || xa_is_value(folio)) goto drop; /* No large folio at the end of the file: nothing to split */ if (!folio_test_large(folio)) { folio_put(folio); goto drop; } /* Check if there is anything to gain from splitting */ next = folio_next_index(folio); end = shmem_fallocend(inode, DIV_ROUND_UP(i_size, PAGE_SIZE)); if (end <= folio->index || end >= next) { folio_put(folio); goto drop; } /* * Move the inode on the list back to shrinklist if we failed * to lock the page at this time. * * Waiting for the lock may lead to deadlock in the * reclaim path. */ if (!folio_trylock(folio)) { folio_put(folio); goto move_back; } ret = split_folio(folio); folio_unlock(folio); folio_put(folio); /* If split failed move the inode on the list back to shrinklist */ if (ret) goto move_back; freed += next - end; split++; drop: list_del_init(&info->shrinklist); goto put; move_back: /* * Make sure the inode is either on the global list or deleted * from any local list before iput() since it could be deleted * in another thread once we put the inode (then the local list * is corrupted). */ spin_lock(&sbinfo->shrinklist_lock); list_move(&info->shrinklist, &sbinfo->shrinklist); sbinfo->shrinklist_len++; spin_unlock(&sbinfo->shrinklist_lock); put: iput(inode); } return split; } static long shmem_unused_huge_scan(struct super_block *sb, struct shrink_control *sc) { struct shmem_sb_info *sbinfo = SHMEM_SB(sb); if (!READ_ONCE(sbinfo->shrinklist_len)) return SHRINK_STOP; return shmem_unused_huge_shrink(sbinfo, sc, 0); } static long shmem_unused_huge_count(struct super_block *sb, struct shrink_control *sc) { struct shmem_sb_info *sbinfo = SHMEM_SB(sb); return READ_ONCE(sbinfo->shrinklist_len); } #else /* !CONFIG_TRANSPARENT_HUGEPAGE */ #define shmem_huge SHMEM_HUGE_DENY static unsigned long shmem_unused_huge_shrink(struct shmem_sb_info *sbinfo, struct shrink_control *sc, unsigned long nr_to_free) { return 0; } static unsigned int shmem_huge_global_enabled(struct inode *inode, pgoff_t index, loff_t write_end, bool shmem_huge_force, struct vm_area_struct *vma, vm_flags_t vm_flags) { return 0; } #endif /* CONFIG_TRANSPARENT_HUGEPAGE */ static void shmem_update_stats(struct folio *folio, int nr_pages) { if (folio_test_pmd_mappable(folio)) __lruvec_stat_mod_folio(folio, NR_SHMEM_THPS, nr_pages); __lruvec_stat_mod_folio(folio, NR_FILE_PAGES, nr_pages); __lruvec_stat_mod_folio(folio, NR_SHMEM, nr_pages); } /* * Somewhat like filemap_add_folio, but error if expected item has gone. */ static int shmem_add_to_page_cache(struct folio *folio, struct address_space *mapping, pgoff_t index, void *expected, gfp_t gfp) { XA_STATE_ORDER(xas, &mapping->i_pages, index, folio_order(folio)); unsigned long nr = folio_nr_pages(folio); swp_entry_t iter, swap; void *entry; VM_BUG_ON_FOLIO(index != round_down(index, nr), folio); VM_BUG_ON_FOLIO(!folio_test_locked(folio), folio); VM_BUG_ON_FOLIO(!folio_test_swapbacked(folio), folio); folio_ref_add(folio, nr); folio->mapping = mapping; folio->index = index; gfp &= GFP_RECLAIM_MASK; folio_throttle_swaprate(folio, gfp); swap = radix_to_swp_entry(expected); do { iter = swap; xas_lock_irq(&xas); xas_for_each_conflict(&xas, entry) { /* * The range must either be empty, or filled with * expected swap entries. Shmem swap entries are never * partially freed without split of both entry and * folio, so there shouldn't be any holes. */ if (!expected || entry != swp_to_radix_entry(iter)) { xas_set_err(&xas, -EEXIST); goto unlock; } iter.val += 1 << xas_get_order(&xas); } if (expected && iter.val - nr != swap.val) { xas_set_err(&xas, -EEXIST); goto unlock; } xas_store(&xas, folio); if (xas_error(&xas)) goto unlock; shmem_update_stats(folio, nr); mapping->nrpages += nr; unlock: xas_unlock_irq(&xas); } while (xas_nomem(&xas, gfp)); if (xas_error(&xas)) { folio->mapping = NULL; folio_ref_sub(folio, nr); return xas_error(&xas); } return 0; } /* * Somewhat like filemap_remove_folio, but substitutes swap for @folio. */ static void shmem_delete_from_page_cache(struct folio *folio, void *radswap) { struct address_space *mapping = folio->mapping; long nr = folio_nr_pages(folio); int error; xa_lock_irq(&mapping->i_pages); error = shmem_replace_entry(mapping, folio->index, folio, radswap); folio->mapping = NULL; mapping->nrpages -= nr; shmem_update_stats(folio, -nr); xa_unlock_irq(&mapping->i_pages); folio_put_refs(folio, nr); BUG_ON(error); } /* * Remove swap entry from page cache, free the swap and its page cache. Returns * the number of pages being freed. 0 means entry not found in XArray (0 pages * being freed). */ static long shmem_free_swap(struct address_space *mapping, pgoff_t index, void *radswap) { int order = xa_get_order(&mapping->i_pages, index); void *old; old = xa_cmpxchg_irq(&mapping->i_pages, index, radswap, NULL, 0); if (old != radswap) return 0; free_swap_and_cache_nr(radix_to_swp_entry(radswap), 1 << order); return 1 << order; } /* * Determine (in bytes) how many of the shmem object's pages mapped by the * given offsets are swapped out. * * This is safe to call without i_rwsem or the i_pages lock thanks to RCU, * as long as the inode doesn't go away and racy results are not a problem. */ unsigned long shmem_partial_swap_usage(struct address_space *mapping, pgoff_t start, pgoff_t end) { XA_STATE(xas, &mapping->i_pages, start); struct page *page; unsigned long swapped = 0; unsigned long max = end - 1; rcu_read_lock(); xas_for_each(&xas, page, max) { if (xas_retry(&xas, page)) continue; if (xa_is_value(page)) swapped += 1 << xas_get_order(&xas); if (xas.xa_index == max) break; if (need_resched()) { xas_pause(&xas); cond_resched_rcu(); } } rcu_read_unlock(); return swapped << PAGE_SHIFT; } /* * Determine (in bytes) how many of the shmem object's pages mapped by the * given vma is swapped out. * * This is safe to call without i_rwsem or the i_pages lock thanks to RCU, * as long as the inode doesn't go away and racy results are not a problem. */ unsigned long shmem_swap_usage(struct vm_area_struct *vma) { struct inode *inode = file_inode(vma->vm_file); struct shmem_inode_info *info = SHMEM_I(inode); struct address_space *mapping = inode->i_mapping; unsigned long swapped; /* Be careful as we don't hold info->lock */ swapped = READ_ONCE(info->swapped); /* * The easier cases are when the shmem object has nothing in swap, or * the vma maps it whole. Then we can simply use the stats that we * already track. */ if (!swapped) return 0; if (!vma->vm_pgoff && vma->vm_end - vma->vm_start >= inode->i_size) return swapped << PAGE_SHIFT; /* Here comes the more involved part */ return shmem_partial_swap_usage(mapping, vma->vm_pgoff, vma->vm_pgoff + vma_pages(vma)); } /* * SysV IPC SHM_UNLOCK restore Unevictable pages to their evictable lists. */ void shmem_unlock_mapping(struct address_space *mapping) { struct folio_batch fbatch; pgoff_t index = 0; folio_batch_init(&fbatch); /* * Minor point, but we might as well stop if someone else SHM_LOCKs it. */ while (!mapping_unevictable(mapping) && filemap_get_folios(mapping, &index, ~0UL, &fbatch)) { check_move_unevictable_folios(&fbatch); folio_batch_release(&fbatch); cond_resched(); } } static struct folio *shmem_get_partial_folio(struct inode *inode, pgoff_t index) { struct folio *folio; /* * At first avoid shmem_get_folio(,,,SGP_READ): that fails * beyond i_size, and reports fallocated folios as holes. */ folio = filemap_get_entry(inode->i_mapping, index); if (!folio) return folio; if (!xa_is_value(folio)) { folio_lock(folio); if (folio->mapping == inode->i_mapping) return folio; /* The folio has been swapped out */ folio_unlock(folio); folio_put(folio); } /* * But read a folio back from swap if any of it is within i_size * (although in some cases this is just a waste of time). */ folio = NULL; shmem_get_folio(inode, index, 0, &folio, SGP_READ); return folio; } /* * Remove range of pages and swap entries from page cache, and free them. * If !unfalloc, truncate or punch hole; if unfalloc, undo failed fallocate. */ static void shmem_undo_range(struct inode *inode, loff_t lstart, loff_t lend, bool unfalloc) { struct address_space *mapping = inode->i_mapping; struct shmem_inode_info *info = SHMEM_I(inode); pgoff_t start = (lstart + PAGE_SIZE - 1) >> PAGE_SHIFT; pgoff_t end = (lend + 1) >> PAGE_SHIFT; struct folio_batch fbatch; pgoff_t indices[PAGEVEC_SIZE]; struct folio *folio; bool same_folio; long nr_swaps_freed = 0; pgoff_t index; int i; if (lend == -1) end = -1; /* unsigned, so actually very big */ if (info->fallocend > start && info->fallocend <= end && !unfalloc) info->fallocend = start; folio_batch_init(&fbatch); index = start; while (index < end && find_lock_entries(mapping, &index, end - 1, &fbatch, indices)) { for (i = 0; i < folio_batch_count(&fbatch); i++) { folio = fbatch.folios[i]; if (xa_is_value(folio)) { if (unfalloc) continue; nr_swaps_freed += shmem_free_swap(mapping, indices[i], folio); continue; } if (!unfalloc || !folio_test_uptodate(folio)) truncate_inode_folio(mapping, folio); folio_unlock(folio); } folio_batch_remove_exceptionals(&fbatch); folio_batch_release(&fbatch); cond_resched(); } /* * When undoing a failed fallocate, we want none of the partial folio * zeroing and splitting below, but shall want to truncate the whole * folio when !uptodate indicates that it was added by this fallocate, * even when [lstart, lend] covers only a part of the folio. */ if (unfalloc) goto whole_folios; same_folio = (lstart >> PAGE_SHIFT) == (lend >> PAGE_SHIFT); folio = shmem_get_partial_folio(inode, lstart >> PAGE_SHIFT); if (folio) { same_folio = lend < folio_pos(folio) + folio_size(folio); folio_mark_dirty(folio); if (!truncate_inode_partial_folio(folio, lstart, lend)) { start = folio_next_index(folio); if (same_folio) end = folio->index; } folio_unlock(folio); folio_put(folio); folio = NULL; } if (!same_folio) folio = shmem_get_partial_folio(inode, lend >> PAGE_SHIFT); if (folio) { folio_mark_dirty(folio); if (!truncate_inode_partial_folio(folio, lstart, lend)) end = folio->index; folio_unlock(folio); folio_put(folio); } whole_folios: index = start; while (index < end) { cond_resched(); if (!find_get_entries(mapping, &index, end - 1, &fbatch, indices)) { /* If all gone or hole-punch or unfalloc, we're done */ if (index == start || end != -1) break; /* But if truncating, restart to make sure all gone */ index = start; continue; } for (i = 0; i < folio_batch_count(&fbatch); i++) { folio = fbatch.folios[i]; if (xa_is_value(folio)) { long swaps_freed; if (unfalloc) continue; swaps_freed = shmem_free_swap(mapping, indices[i], folio); if (!swaps_freed) { /* Swap was replaced by page: retry */ index = indices[i]; break; } nr_swaps_freed += swaps_freed; continue; } folio_lock(folio); if (!unfalloc || !folio_test_uptodate(folio)) { if (folio_mapping(folio) != mapping) { /* Page was replaced by swap: retry */ folio_unlock(folio); index = indices[i]; break; } VM_BUG_ON_FOLIO(folio_test_writeback(folio), folio); if (!folio_test_large(folio)) { truncate_inode_folio(mapping, folio); } else if (truncate_inode_partial_folio(folio, lstart, lend)) { /* * If we split a page, reset the loop so * that we pick up the new sub pages. * Otherwise the THP was entirely * dropped or the target range was * zeroed, so just continue the loop as * is. */ if (!folio_test_large(folio)) { folio_unlock(folio); index = start; break; } } } folio_unlock(folio); } folio_batch_remove_exceptionals(&fbatch); folio_batch_release(&fbatch); } shmem_recalc_inode(inode, 0, -nr_swaps_freed); } void shmem_truncate_range(struct inode *inode, loff_t lstart, loff_t lend) { shmem_undo_range(inode, lstart, lend, false); inode_set_mtime_to_ts(inode, inode_set_ctime_current(inode)); inode_inc_iversion(inode); } EXPORT_SYMBOL_GPL(shmem_truncate_range); static int shmem_getattr(struct mnt_idmap *idmap, const struct path *path, struct kstat *stat, u32 request_mask, unsigned int query_flags) { struct inode *inode = path->dentry->d_inode; struct shmem_inode_info *info = SHMEM_I(inode); if (info->alloced - info->swapped != inode->i_mapping->nrpages) shmem_recalc_inode(inode, 0, 0); if (info->fsflags & FS_APPEND_FL) stat->attributes |= STATX_ATTR_APPEND; if (info->fsflags & FS_IMMUTABLE_FL) stat->attributes |= STATX_ATTR_IMMUTABLE; if (info->fsflags & FS_NODUMP_FL) stat->attributes |= STATX_ATTR_NODUMP; stat->attributes_mask |= (STATX_ATTR_APPEND | STATX_ATTR_IMMUTABLE | STATX_ATTR_NODUMP); generic_fillattr(idmap, request_mask, inode, stat); if (shmem_huge_global_enabled(inode, 0, 0, false, NULL, 0)) stat->blksize = HPAGE_PMD_SIZE; if (request_mask & STATX_BTIME) { stat->result_mask |= STATX_BTIME; stat->btime.tv_sec = info->i_crtime.tv_sec; stat->btime.tv_nsec = info->i_crtime.tv_nsec; } return 0; } static int shmem_setattr(struct mnt_idmap *idmap, struct dentry *dentry, struct iattr *attr) { struct inode *inode = d_inode(dentry); struct shmem_inode_info *info = SHMEM_I(inode); int error; bool update_mtime = false; bool update_ctime = true; error = setattr_prepare(idmap, dentry, attr); if (error) return error; if ((info->seals & F_SEAL_EXEC) && (attr->ia_valid & ATTR_MODE)) { if ((inode->i_mode ^ attr->ia_mode) & 0111) { return -EPERM; } } if (S_ISREG(inode->i_mode) && (attr->ia_valid & ATTR_SIZE)) { loff_t oldsize = inode->i_size; loff_t newsize = attr->ia_size; /* protected by i_rwsem */ if ((newsize < oldsize && (info->seals & F_SEAL_SHRINK)) || (newsize > oldsize && (info->seals & F_SEAL_GROW))) return -EPERM; if (newsize != oldsize) { error = shmem_reacct_size(SHMEM_I(inode)->flags, oldsize, newsize); if (error) return error; i_size_write(inode, newsize); update_mtime = true; } else { update_ctime = false; } if (newsize <= oldsize) { loff_t holebegin = round_up(newsize, PAGE_SIZE); if (oldsize > holebegin) unmap_mapping_range(inode->i_mapping, holebegin, 0, 1); if (info->alloced) shmem_truncate_range(inode, newsize, (loff_t)-1); /* unmap again to remove racily COWed private pages */ if (oldsize > holebegin) unmap_mapping_range(inode->i_mapping, holebegin, 0, 1); } } if (is_quota_modification(idmap, inode, attr)) { error = dquot_initialize(inode); if (error) return error; } /* Transfer quota accounting */ if (i_uid_needs_update(idmap, attr, inode) || i_gid_needs_update(idmap, attr, inode)) { error = dquot_transfer(idmap, inode, attr); if (error) return error; } setattr_copy(idmap, inode, attr); if (attr->ia_valid & ATTR_MODE) error = posix_acl_chmod(idmap, dentry, inode->i_mode); if (!error && update_ctime) { inode_set_ctime_current(inode); if (update_mtime) inode_set_mtime_to_ts(inode, inode_get_ctime(inode)); inode_inc_iversion(inode); } return error; } static void shmem_evict_inode(struct inode *inode) { struct shmem_inode_info *info = SHMEM_I(inode); struct shmem_sb_info *sbinfo = SHMEM_SB(inode->i_sb); size_t freed = 0; if (shmem_mapping(inode->i_mapping)) { shmem_unacct_size(info->flags, inode->i_size); inode->i_size = 0; mapping_set_exiting(inode->i_mapping); shmem_truncate_range(inode, 0, (loff_t)-1); if (!list_empty(&info->shrinklist)) { spin_lock(&sbinfo->shrinklist_lock); if (!list_empty(&info->shrinklist)) { list_del_init(&info->shrinklist); sbinfo->shrinklist_len--; } spin_unlock(&sbinfo->shrinklist_lock); } while (!list_empty(&info->swaplist)) { /* Wait while shmem_unuse() is scanning this inode... */ wait_var_event(&info->stop_eviction, !atomic_read(&info->stop_eviction)); spin_lock(&shmem_swaplist_lock); /* ...but beware of the race if we peeked too early */ if (!atomic_read(&info->stop_eviction)) list_del_init(&info->swaplist); spin_unlock(&shmem_swaplist_lock); } } simple_xattrs_free(&info->xattrs, sbinfo->max_inodes ? &freed : NULL); shmem_free_inode(inode->i_sb, freed); WARN_ON(inode->i_blocks); clear_inode(inode); #ifdef CONFIG_TMPFS_QUOTA dquot_free_inode(inode); dquot_drop(inode); #endif } static unsigned int shmem_find_swap_entries(struct address_space *mapping, pgoff_t start, struct folio_batch *fbatch, pgoff_t *indices, unsigned int type) { XA_STATE(xas, &mapping->i_pages, start); struct folio *folio; swp_entry_t entry; rcu_read_lock(); xas_for_each(&xas, folio, ULONG_MAX) { if (xas_retry(&xas, folio)) continue; if (!xa_is_value(folio)) continue; entry = radix_to_swp_entry(folio); /* * swapin error entries can be found in the mapping. But they're * deliberately ignored here as we've done everything we can do. */ if (swp_type(entry) != type) continue; indices[folio_batch_count(fbatch)] = xas.xa_index; if (!folio_batch_add(fbatch, folio)) break; if (need_resched()) { xas_pause(&xas); cond_resched_rcu(); } } rcu_read_unlock(); return folio_batch_count(fbatch); } /* * Move the swapped pages for an inode to page cache. Returns the count * of pages swapped in, or the error in case of failure. */ static int shmem_unuse_swap_entries(struct inode *inode, struct folio_batch *fbatch, pgoff_t *indices) { int i = 0; int ret = 0; int error = 0; struct address_space *mapping = inode->i_mapping; for (i = 0; i < folio_batch_count(fbatch); i++) { struct folio *folio = fbatch->folios[i]; error = shmem_swapin_folio(inode, indices[i], &folio, SGP_CACHE, mapping_gfp_mask(mapping), NULL, NULL); if (error == 0) { folio_unlock(folio); folio_put(folio); ret++; } if (error == -ENOMEM) break; error = 0; } return error ? error : ret; } /* * If swap found in inode, free it and move page from swapcache to filecache. */ static int shmem_unuse_inode(struct inode *inode, unsigned int type) { struct address_space *mapping = inode->i_mapping; pgoff_t start = 0; struct folio_batch fbatch; pgoff_t indices[PAGEVEC_SIZE]; int ret = 0; do { folio_batch_init(&fbatch); if (!shmem_find_swap_entries(mapping, start, &fbatch, indices, type)) { ret = 0; break; } ret = shmem_unuse_swap_entries(inode, &fbatch, indices); if (ret < 0) break; start = indices[folio_batch_count(&fbatch) - 1]; } while (true); return ret; } /* * Read all the shared memory data that resides in the swap * device 'type' back into memory, so the swap device can be * unused. */ int shmem_unuse(unsigned int type) { struct shmem_inode_info *info, *next; int error = 0; if (list_empty(&shmem_swaplist)) return 0; spin_lock(&shmem_swaplist_lock); start_over: list_for_each_entry_safe(info, next, &shmem_swaplist, swaplist) { if (!info->swapped) { list_del_init(&info->swaplist); continue; } /* * Drop the swaplist mutex while searching the inode for swap; * but before doing so, make sure shmem_evict_inode() will not * remove placeholder inode from swaplist, nor let it be freed * (igrab() would protect from unlink, but not from unmount). */ atomic_inc(&info->stop_eviction); spin_unlock(&shmem_swaplist_lock); error = shmem_unuse_inode(&info->vfs_inode, type); cond_resched(); spin_lock(&shmem_swaplist_lock); if (atomic_dec_and_test(&info->stop_eviction)) wake_up_var(&info->stop_eviction); if (error) break; if (list_empty(&info->swaplist)) goto start_over; next = list_next_entry(info, swaplist); if (!info->swapped) list_del_init(&info->swaplist); } spin_unlock(&shmem_swaplist_lock); return error; } /** * shmem_writeout - Write the folio to swap * @folio: The folio to write * @plug: swap plug * @folio_list: list to put back folios on split * * Move the folio from the page cache to the swap cache. */ int shmem_writeout(struct folio *folio, struct swap_iocb **plug, struct list_head *folio_list) { struct address_space *mapping = folio->mapping; struct inode *inode = mapping->host; struct shmem_inode_info *info = SHMEM_I(inode); struct shmem_sb_info *sbinfo = SHMEM_SB(inode->i_sb); pgoff_t index; int nr_pages; bool split = false; if ((info->flags & VM_LOCKED) || sbinfo->noswap) goto redirty; if (!total_swap_pages) goto redirty; /* * If CONFIG_THP_SWAP is not enabled, the large folio should be * split when swapping. * * And shrinkage of pages beyond i_size does not split swap, so * swapout of a large folio crossing i_size needs to split too * (unless fallocate has been used to preallocate beyond EOF). */ if (folio_test_large(folio)) { index = shmem_fallocend(inode, DIV_ROUND_UP(i_size_read(inode), PAGE_SIZE)); if ((index > folio->index && index < folio_next_index(folio)) || !IS_ENABLED(CONFIG_THP_SWAP)) split = true; } if (split) { try_split: /* Ensure the subpages are still dirty */ folio_test_set_dirty(folio); if (split_folio_to_list(folio, folio_list)) goto redirty; folio_clear_dirty(folio); } index = folio->index; nr_pages = folio_nr_pages(folio); /* * This is somewhat ridiculous, but without plumbing a SWAP_MAP_FALLOC * value into swapfile.c, the only way we can correctly account for a * fallocated folio arriving here is now to initialize it and write it. * * That's okay for a folio already fallocated earlier, but if we have * not yet completed the fallocation, then (a) we want to keep track * of this folio in case we have to undo it, and (b) it may not be a * good idea to continue anyway, once we're pushing into swap. So * reactivate the folio, and let shmem_fallocate() quit when too many. */ if (!folio_test_uptodate(folio)) { if (inode->i_private) { struct shmem_falloc *shmem_falloc; spin_lock(&inode->i_lock); shmem_falloc = inode->i_private; if (shmem_falloc && !shmem_falloc->waitq && index >= shmem_falloc->start && index < shmem_falloc->next) shmem_falloc->nr_unswapped += nr_pages; else shmem_falloc = NULL; spin_unlock(&inode->i_lock); if (shmem_falloc) goto redirty; } folio_zero_range(folio, 0, folio_size(folio)); flush_dcache_folio(folio); folio_mark_uptodate(folio); } if (!folio_alloc_swap(folio, __GFP_HIGH | __GFP_NOMEMALLOC | __GFP_NOWARN)) { bool first_swapped = shmem_recalc_inode(inode, 0, nr_pages); int error; /* * Add inode to shmem_unuse()'s list of swapped-out inodes, * if it's not already there. Do it now before the folio is * removed from page cache, when its pagelock no longer * protects the inode from eviction. And do it now, after * we've incremented swapped, because shmem_unuse() will * prune a !swapped inode from the swaplist. */ if (first_swapped) { spin_lock(&shmem_swaplist_lock); if (list_empty(&info->swaplist)) list_add(&info->swaplist, &shmem_swaplist); spin_unlock(&shmem_swaplist_lock); } swap_shmem_alloc(folio->swap, nr_pages); shmem_delete_from_page_cache(folio, swp_to_radix_entry(folio->swap)); BUG_ON(folio_mapped(folio)); error = swap_writeout(folio, plug); if (error != AOP_WRITEPAGE_ACTIVATE) { /* folio has been unlocked */ return error; } /* * The intention here is to avoid holding on to the swap when * zswap was unable to compress and unable to writeback; but * it will be appropriate if other reactivate cases are added. */ error = shmem_add_to_page_cache(folio, mapping, index, swp_to_radix_entry(folio->swap), __GFP_HIGH | __GFP_NOMEMALLOC | __GFP_NOWARN); /* Swap entry might be erased by racing shmem_free_swap() */ if (!error) { shmem_recalc_inode(inode, 0, -nr_pages); swap_free_nr(folio->swap, nr_pages); } /* * The delete_from_swap_cache() below could be left for * shrink_folio_list()'s folio_free_swap() to dispose of; * but I'm a little nervous about letting this folio out of * shmem_writeout() in a hybrid half-tmpfs-half-swap state * e.g. folio_mapping(folio) might give an unexpected answer. */ delete_from_swap_cache(folio); goto redirty; } if (nr_pages > 1) goto try_split; redirty: folio_mark_dirty(folio); return AOP_WRITEPAGE_ACTIVATE; /* Return with folio locked */ } EXPORT_SYMBOL_GPL(shmem_writeout); #if defined(CONFIG_NUMA) && defined(CONFIG_TMPFS) static void shmem_show_mpol(struct seq_file *seq, struct mempolicy *mpol) { char buffer[64]; if (!mpol || mpol->mode == MPOL_DEFAULT) return; /* show nothing */ mpol_to_str(buffer, sizeof(buffer), mpol); seq_printf(seq, ",mpol=%s", buffer); } static struct mempolicy *shmem_get_sbmpol(struct shmem_sb_info *sbinfo) { struct mempolicy *mpol = NULL; if (sbinfo->mpol) { raw_spin_lock(&sbinfo->stat_lock); /* prevent replace/use races */ mpol = sbinfo->mpol; mpol_get(mpol); raw_spin_unlock(&sbinfo->stat_lock); } return mpol; } #else /* !CONFIG_NUMA || !CONFIG_TMPFS */ static inline void shmem_show_mpol(struct seq_file *seq, struct mempolicy *mpol) { } static inline struct mempolicy *shmem_get_sbmpol(struct shmem_sb_info *sbinfo) { return NULL; } #endif /* CONFIG_NUMA && CONFIG_TMPFS */ static struct mempolicy *shmem_get_pgoff_policy(struct shmem_inode_info *info, pgoff_t index, unsigned int order, pgoff_t *ilx); static struct folio *shmem_swapin_cluster(swp_entry_t swap, gfp_t gfp, struct shmem_inode_info *info, pgoff_t index) { struct mempolicy *mpol; pgoff_t ilx; struct folio *folio; mpol = shmem_get_pgoff_policy(info, index, 0, &ilx); folio = swap_cluster_readahead(swap, gfp, mpol, ilx); mpol_cond_put(mpol); return folio; } /* * Make sure huge_gfp is always more limited than limit_gfp. * Some of the flags set permissions, while others set limitations. */ static gfp_t limit_gfp_mask(gfp_t huge_gfp, gfp_t limit_gfp) { gfp_t allowflags = __GFP_IO | __GFP_FS | __GFP_RECLAIM; gfp_t denyflags = __GFP_NOWARN | __GFP_NORETRY; gfp_t zoneflags = limit_gfp & GFP_ZONEMASK; gfp_t result = huge_gfp & ~(allowflags | GFP_ZONEMASK); /* Allow allocations only from the originally specified zones. */ result |= zoneflags; /* * Minimize the result gfp by taking the union with the deny flags, * and the intersection of the allow flags. */ result |= (limit_gfp & denyflags); result |= (huge_gfp & limit_gfp) & allowflags; return result; } #ifdef CONFIG_TRANSPARENT_HUGEPAGE bool shmem_hpage_pmd_enabled(void) { if (shmem_huge == SHMEM_HUGE_DENY) return false; if (test_bit(HPAGE_PMD_ORDER, &huge_shmem_orders_always)) return true; if (test_bit(HPAGE_PMD_ORDER, &huge_shmem_orders_madvise)) return true; if (test_bit(HPAGE_PMD_ORDER, &huge_shmem_orders_within_size)) return true; if (test_bit(HPAGE_PMD_ORDER, &huge_shmem_orders_inherit) && shmem_huge != SHMEM_HUGE_NEVER) return true; return false; } unsigned long shmem_allowable_huge_orders(struct inode *inode, struct vm_area_struct *vma, pgoff_t index, loff_t write_end, bool shmem_huge_force) { unsigned long mask = READ_ONCE(huge_shmem_orders_always); unsigned long within_size_orders = READ_ONCE(huge_shmem_orders_within_size); vm_flags_t vm_flags = vma ? vma->vm_flags : 0; unsigned int global_orders; if (thp_disabled_by_hw() || (vma && vma_thp_disabled(vma, vm_flags))) return 0; global_orders = shmem_huge_global_enabled(inode, index, write_end, shmem_huge_force, vma, vm_flags); /* Tmpfs huge pages allocation */ if (!vma || !vma_is_anon_shmem(vma)) return global_orders; /* * Following the 'deny' semantics of the top level, force the huge * option off from all mounts. */ if (shmem_huge == SHMEM_HUGE_DENY) return 0; /* * Only allow inherit orders if the top-level value is 'force', which * means non-PMD sized THP can not override 'huge' mount option now. */ if (shmem_huge == SHMEM_HUGE_FORCE) return READ_ONCE(huge_shmem_orders_inherit); /* Allow mTHP that will be fully within i_size. */ mask |= shmem_get_orders_within_size(inode, within_size_orders, index, 0); if (vm_flags & VM_HUGEPAGE) mask |= READ_ONCE(huge_shmem_orders_madvise); if (global_orders > 0) mask |= READ_ONCE(huge_shmem_orders_inherit); return THP_ORDERS_ALL_FILE_DEFAULT & mask; } static unsigned long shmem_suitable_orders(struct inode *inode, struct vm_fault *vmf, struct address_space *mapping, pgoff_t index, unsigned long orders) { struct vm_area_struct *vma = vmf ? vmf->vma : NULL; pgoff_t aligned_index; unsigned long pages; int order; if (vma) { orders = thp_vma_suitable_orders(vma, vmf->address, orders); if (!orders) return 0; } /* Find the highest order that can add into the page cache */ order = highest_order(orders); while (orders) { pages = 1UL << order; aligned_index = round_down(index, pages); /* * Check for conflict before waiting on a huge allocation. * Conflict might be that a huge page has just been allocated * and added to page cache by a racing thread, or that there * is already at least one small page in the huge extent. * Be careful to retry when appropriate, but not forever! * Elsewhere -EEXIST would be the right code, but not here. */ if (!xa_find(&mapping->i_pages, &aligned_index, aligned_index + pages - 1, XA_PRESENT)) break; order = next_order(&orders, order); } return orders; } #else static unsigned long shmem_suitable_orders(struct inode *inode, struct vm_fault *vmf, struct address_space *mapping, pgoff_t index, unsigned long orders) { return 0; } #endif /* CONFIG_TRANSPARENT_HUGEPAGE */ static struct folio *shmem_alloc_folio(gfp_t gfp, int order, struct shmem_inode_info *info, pgoff_t index) { struct mempolicy *mpol; pgoff_t ilx; struct folio *folio; mpol = shmem_get_pgoff_policy(info, index, order, &ilx); folio = folio_alloc_mpol(gfp, order, mpol, ilx, numa_node_id()); mpol_cond_put(mpol); return folio; } static struct folio *shmem_alloc_and_add_folio(struct vm_fault *vmf, gfp_t gfp, struct inode *inode, pgoff_t index, struct mm_struct *fault_mm, unsigned long orders) { struct address_space *mapping = inode->i_mapping; struct shmem_inode_info *info = SHMEM_I(inode); unsigned long suitable_orders = 0; struct folio *folio = NULL; long pages; int error, order; if (!IS_ENABLED(CONFIG_TRANSPARENT_HUGEPAGE)) orders = 0; if (orders > 0) { suitable_orders = shmem_suitable_orders(inode, vmf, mapping, index, orders); order = highest_order(suitable_orders); while (suitable_orders) { pages = 1UL << order; index = round_down(index, pages); folio = shmem_alloc_folio(gfp, order, info, index); if (folio) goto allocated; if (pages == HPAGE_PMD_NR) count_vm_event(THP_FILE_FALLBACK); count_mthp_stat(order, MTHP_STAT_SHMEM_FALLBACK); order = next_order(&suitable_orders, order); } } else { pages = 1; folio = shmem_alloc_folio(gfp, 0, info, index); } if (!folio) return ERR_PTR(-ENOMEM); allocated: __folio_set_locked(folio); __folio_set_swapbacked(folio); gfp &= GFP_RECLAIM_MASK; error = mem_cgroup_charge(folio, fault_mm, gfp); if (error) { if (xa_find(&mapping->i_pages, &index, index + pages - 1, XA_PRESENT)) { error = -EEXIST; } else if (pages > 1) { if (pages == HPAGE_PMD_NR) { count_vm_event(THP_FILE_FALLBACK); count_vm_event(THP_FILE_FALLBACK_CHARGE); } count_mthp_stat(folio_order(folio), MTHP_STAT_SHMEM_FALLBACK); count_mthp_stat(folio_order(folio), MTHP_STAT_SHMEM_FALLBACK_CHARGE); } goto unlock; } error = shmem_add_to_page_cache(folio, mapping, index, NULL, gfp); if (error) goto unlock; error = shmem_inode_acct_blocks(inode, pages); if (error) { struct shmem_sb_info *sbinfo = SHMEM_SB(inode->i_sb); long freed; /* * Try to reclaim some space by splitting a few * large folios beyond i_size on the filesystem. */ shmem_unused_huge_shrink(sbinfo, NULL, pages); /* * And do a shmem_recalc_inode() to account for freed pages: * except our folio is there in cache, so not quite balanced. */ spin_lock(&info->lock); freed = pages + info->alloced - info->swapped - READ_ONCE(mapping->nrpages); if (freed > 0) info->alloced -= freed; spin_unlock(&info->lock); if (freed > 0) shmem_inode_unacct_blocks(inode, freed); error = shmem_inode_acct_blocks(inode, pages); if (error) { filemap_remove_folio(folio); goto unlock; } } shmem_recalc_inode(inode, pages, 0); folio_add_lru(folio); return folio; unlock: folio_unlock(folio); folio_put(folio); return ERR_PTR(error); } static struct folio *shmem_swap_alloc_folio(struct inode *inode, struct vm_area_struct *vma, pgoff_t index, swp_entry_t entry, int order, gfp_t gfp) { struct shmem_inode_info *info = SHMEM_I(inode); int nr_pages = 1 << order; struct folio *new; gfp_t alloc_gfp; void *shadow; /* * We have arrived here because our zones are constrained, so don't * limit chance of success with further cpuset and node constraints. */ gfp &= ~GFP_CONSTRAINT_MASK; alloc_gfp = gfp; if (!IS_ENABLED(CONFIG_TRANSPARENT_HUGEPAGE)) { if (WARN_ON_ONCE(order)) return ERR_PTR(-EINVAL); } else if (order) { /* * If uffd is active for the vma, we need per-page fault * fidelity to maintain the uffd semantics, then fallback * to swapin order-0 folio, as well as for zswap case. * Any existing sub folio in the swap cache also blocks * mTHP swapin. */ if ((vma && unlikely(userfaultfd_armed(vma))) || !zswap_never_enabled() || non_swapcache_batch(entry, nr_pages) != nr_pages) goto fallback; alloc_gfp = limit_gfp_mask(vma_thp_gfp_mask(vma), gfp); } retry: new = shmem_alloc_folio(alloc_gfp, order, info, index); if (!new) { new = ERR_PTR(-ENOMEM); goto fallback; } if (mem_cgroup_swapin_charge_folio(new, vma ? vma->vm_mm : NULL, alloc_gfp, entry)) { folio_put(new); new = ERR_PTR(-ENOMEM); goto fallback; } /* * Prevent parallel swapin from proceeding with the swap cache flag. * * Of course there is another possible concurrent scenario as well, * that is to say, the swap cache flag of a large folio has already * been set by swapcache_prepare(), while another thread may have * already split the large swap entry stored in the shmem mapping. * In this case, shmem_add_to_page_cache() will help identify the * concurrent swapin and return -EEXIST. */ if (swapcache_prepare(entry, nr_pages)) { folio_put(new); new = ERR_PTR(-EEXIST); /* Try smaller folio to avoid cache conflict */ goto fallback; } __folio_set_locked(new); __folio_set_swapbacked(new); new->swap = entry; memcg1_swapin(entry, nr_pages); shadow = get_shadow_from_swap_cache(entry); if (shadow) workingset_refault(new, shadow); folio_add_lru(new); swap_read_folio(new, NULL); return new; fallback: /* Order 0 swapin failed, nothing to fallback to, abort */ if (!order) return new; entry.val += index - round_down(index, nr_pages); alloc_gfp = gfp; nr_pages = 1; order = 0; goto retry; } /* * When a page is moved from swapcache to shmem filecache (either by the * usual swapin of shmem_get_folio_gfp(), or by the less common swapoff of * shmem_unuse_inode()), it may have been read in earlier from swap, in * ignorance of the mapping it belongs to. If that mapping has special * constraints (like the gma500 GEM driver, which requires RAM below 4GB), * we may need to copy to a suitable page before moving to filecache. * * In a future release, this may well be extended to respect cpuset and * NUMA mempolicy, and applied also to anonymous pages in do_swap_page(); * but for now it is a simple matter of zone. */ static bool shmem_should_replace_folio(struct folio *folio, gfp_t gfp) { return folio_zonenum(folio) > gfp_zone(gfp); } static int shmem_replace_folio(struct folio **foliop, gfp_t gfp, struct shmem_inode_info *info, pgoff_t index, struct vm_area_struct *vma) { struct folio *new, *old = *foliop; swp_entry_t entry = old->swap; struct address_space *swap_mapping = swap_address_space(entry); pgoff_t swap_index = swap_cache_index(entry); XA_STATE(xas, &swap_mapping->i_pages, swap_index); int nr_pages = folio_nr_pages(old); int error = 0, i; /* * We have arrived here because our zones are constrained, so don't * limit chance of success by further cpuset and node constraints. */ gfp &= ~GFP_CONSTRAINT_MASK; #ifdef CONFIG_TRANSPARENT_HUGEPAGE if (nr_pages > 1) { gfp_t huge_gfp = vma_thp_gfp_mask(vma); gfp = limit_gfp_mask(huge_gfp, gfp); } #endif new = shmem_alloc_folio(gfp, folio_order(old), info, index); if (!new) return -ENOMEM; folio_ref_add(new, nr_pages); folio_copy(new, old); flush_dcache_folio(new); __folio_set_locked(new); __folio_set_swapbacked(new); folio_mark_uptodate(new); new->swap = entry; folio_set_swapcache(new); /* Swap cache still stores N entries instead of a high-order entry */ xa_lock_irq(&swap_mapping->i_pages); for (i = 0; i < nr_pages; i++) { void *item = xas_load(&xas); if (item != old) { error = -ENOENT; break; } xas_store(&xas, new); xas_next(&xas); } if (!error) { mem_cgroup_replace_folio(old, new); shmem_update_stats(new, nr_pages); shmem_update_stats(old, -nr_pages); } xa_unlock_irq(&swap_mapping->i_pages); if (unlikely(error)) { /* * Is this possible? I think not, now that our callers * check both the swapcache flag and folio->private * after getting the folio lock; but be defensive. * Reverse old to newpage for clear and free. */ old = new; } else { folio_add_lru(new); *foliop = new; } folio_clear_swapcache(old); old->private = NULL; folio_unlock(old); /* * The old folio are removed from swap cache, drop the 'nr_pages' * reference, as well as one temporary reference getting from swap * cache. */ folio_put_refs(old, nr_pages + 1); return error; } static void shmem_set_folio_swapin_error(struct inode *inode, pgoff_t index, struct folio *folio, swp_entry_t swap, bool skip_swapcache) { struct address_space *mapping = inode->i_mapping; swp_entry_t swapin_error; void *old; int nr_pages; swapin_error = make_poisoned_swp_entry(); old = xa_cmpxchg_irq(&mapping->i_pages, index, swp_to_radix_entry(swap), swp_to_radix_entry(swapin_error), 0); if (old != swp_to_radix_entry(swap)) return; nr_pages = folio_nr_pages(folio); folio_wait_writeback(folio); if (!skip_swapcache) delete_from_swap_cache(folio); /* * Don't treat swapin error folio as alloced. Otherwise inode->i_blocks * won't be 0 when inode is released and thus trigger WARN_ON(i_blocks) * in shmem_evict_inode(). */ shmem_recalc_inode(inode, -nr_pages, -nr_pages); swap_free_nr(swap, nr_pages); } static int shmem_split_large_entry(struct inode *inode, pgoff_t index, swp_entry_t swap, gfp_t gfp) { struct address_space *mapping = inode->i_mapping; XA_STATE_ORDER(xas, &mapping->i_pages, index, 0); int split_order = 0, entry_order; int i; /* Convert user data gfp flags to xarray node gfp flags */ gfp &= GFP_RECLAIM_MASK; for (;;) { void *old = NULL; int cur_order; pgoff_t swap_index; xas_lock_irq(&xas); old = xas_load(&xas); if (!xa_is_value(old) || swp_to_radix_entry(swap) != old) { xas_set_err(&xas, -EEXIST); goto unlock; } entry_order = xas_get_order(&xas); if (!entry_order) goto unlock; /* Try to split large swap entry in pagecache */ cur_order = entry_order; swap_index = round_down(index, 1 << entry_order); split_order = xas_try_split_min_order(cur_order); while (cur_order > 0) { pgoff_t aligned_index = round_down(index, 1 << cur_order); pgoff_t swap_offset = aligned_index - swap_index; xas_set_order(&xas, index, split_order); xas_try_split(&xas, old, cur_order); if (xas_error(&xas)) goto unlock; /* * Re-set the swap entry after splitting, and the swap * offset of the original large entry must be continuous. */ for (i = 0; i < 1 << cur_order; i += (1 << split_order)) { swp_entry_t tmp; tmp = swp_entry(swp_type(swap), swp_offset(swap) + swap_offset + i); __xa_store(&mapping->i_pages, aligned_index + i, swp_to_radix_entry(tmp), 0); } cur_order = split_order; split_order = xas_try_split_min_order(split_order); } unlock: xas_unlock_irq(&xas); if (!xas_nomem(&xas, gfp)) break; } if (xas_error(&xas)) return xas_error(&xas); return 0; } /* * Swap in the folio pointed to by *foliop. * Caller has to make sure that *foliop contains a valid swapped folio. * Returns 0 and the folio in foliop if success. On failure, returns the * error code and NULL in *foliop. */ static int shmem_swapin_folio(struct inode *inode, pgoff_t index, struct folio **foliop, enum sgp_type sgp, gfp_t gfp, struct vm_area_struct *vma, vm_fault_t *fault_type) { struct address_space *mapping = inode->i_mapping; struct mm_struct *fault_mm = vma ? vma->vm_mm : NULL; struct shmem_inode_info *info = SHMEM_I(inode); swp_entry_t swap, index_entry; struct swap_info_struct *si; struct folio *folio = NULL; bool skip_swapcache = false; int error, nr_pages, order; pgoff_t offset; VM_BUG_ON(!*foliop || !xa_is_value(*foliop)); index_entry = radix_to_swp_entry(*foliop); swap = index_entry; *foliop = NULL; if (is_poisoned_swp_entry(index_entry)) return -EIO; si = get_swap_device(index_entry); order = shmem_confirm_swap(mapping, index, index_entry); if (unlikely(!si)) { if (order < 0) return -EEXIST; else return -EINVAL; } if (unlikely(order < 0)) { put_swap_device(si); return -EEXIST; } /* index may point to the middle of a large entry, get the sub entry */ if (order) { offset = index - round_down(index, 1 << order); swap = swp_entry(swp_type(swap), swp_offset(swap) + offset); } /* Look it up and read it in.. */ folio = swap_cache_get_folio(swap, NULL, 0); if (!folio) { if (data_race(si->flags & SWP_SYNCHRONOUS_IO)) { /* Direct swapin skipping swap cache & readahead */ folio = shmem_swap_alloc_folio(inode, vma, index, index_entry, order, gfp); if (IS_ERR(folio)) { error = PTR_ERR(folio); folio = NULL; goto failed; } skip_swapcache = true; } else { /* Cached swapin only supports order 0 folio */ folio = shmem_swapin_cluster(swap, gfp, info, index); if (!folio) { error = -ENOMEM; goto failed; } } if (fault_type) { *fault_type |= VM_FAULT_MAJOR; count_vm_event(PGMAJFAULT); count_memcg_event_mm(fault_mm, PGMAJFAULT); } } if (order > folio_order(folio)) { /* * Swapin may get smaller folios due to various reasons: * It may fallback to order 0 due to memory pressure or race, * swap readahead may swap in order 0 folios into swapcache * asynchronously, while the shmem mapping can still stores * large swap entries. In such cases, we should split the * large swap entry to prevent possible data corruption. */ error = shmem_split_large_entry(inode, index, index_entry, gfp); if (error) goto failed_nolock; } /* * If the folio is large, round down swap and index by folio size. * No matter what race occurs, the swap layer ensures we either get * a valid folio that has its swap entry aligned by size, or a * temporarily invalid one which we'll abort very soon and retry. * * shmem_add_to_page_cache ensures the whole range contains expected * entries and prevents any corruption, so any race split is fine * too, it will succeed as long as the entries are still there. */ nr_pages = folio_nr_pages(folio); if (nr_pages > 1) { swap.val = round_down(swap.val, nr_pages); index = round_down(index, nr_pages); } /* * We have to do this with the folio locked to prevent races. * The shmem_confirm_swap below only checks if the first swap * entry matches the folio, that's enough to ensure the folio * is not used outside of shmem, as shmem swap entries * and swap cache folios are never partially freed. */ folio_lock(folio); if ((!skip_swapcache && !folio_test_swapcache(folio)) || shmem_confirm_swap(mapping, index, swap) < 0 || folio->swap.val != swap.val) { error = -EEXIST; goto unlock; } if (!folio_test_uptodate(folio)) { error = -EIO; goto failed; } folio_wait_writeback(folio); nr_pages = folio_nr_pages(folio); /* * Some architectures may have to restore extra metadata to the * folio after reading from swap. */ arch_swap_restore(folio_swap(swap, folio), folio); if (shmem_should_replace_folio(folio, gfp)) { error = shmem_replace_folio(&folio, gfp, info, index, vma); if (error) goto failed; } error = shmem_add_to_page_cache(folio, mapping, index, swp_to_radix_entry(swap), gfp); if (error) goto failed; shmem_recalc_inode(inode, 0, -nr_pages); if (sgp == SGP_WRITE) folio_mark_accessed(folio); if (skip_swapcache) { folio->swap.val = 0; swapcache_clear(si, swap, nr_pages); } else { delete_from_swap_cache(folio); } folio_mark_dirty(folio); swap_free_nr(swap, nr_pages); put_swap_device(si); *foliop = folio; return 0; failed: if (shmem_confirm_swap(mapping, index, swap) < 0) error = -EEXIST; if (error == -EIO) shmem_set_folio_swapin_error(inode, index, folio, swap, skip_swapcache); unlock: if (folio) folio_unlock(folio); failed_nolock: if (skip_swapcache) swapcache_clear(si, folio->swap, folio_nr_pages(folio)); if (folio) folio_put(folio); put_swap_device(si); return error; } /* * shmem_get_folio_gfp - find page in cache, or get from swap, or allocate * * If we allocate a new one we do not mark it dirty. That's up to the * vm. If we swap it in we mark it dirty since we also free the swap * entry since a page cannot live in both the swap and page cache. * * vmf and fault_type are only supplied by shmem_fault: otherwise they are NULL. */ static int shmem_get_folio_gfp(struct inode *inode, pgoff_t index, loff_t write_end, struct folio **foliop, enum sgp_type sgp, gfp_t gfp, struct vm_fault *vmf, vm_fault_t *fault_type) { struct vm_area_struct *vma = vmf ? vmf->vma : NULL; struct mm_struct *fault_mm; struct folio *folio; int error; bool alloced; unsigned long orders = 0; if (WARN_ON_ONCE(!shmem_mapping(inode->i_mapping))) return -EINVAL; if (index > (MAX_LFS_FILESIZE >> PAGE_SHIFT)) return -EFBIG; repeat: if (sgp <= SGP_CACHE && ((loff_t)index << PAGE_SHIFT) >= i_size_read(inode)) return -EINVAL; alloced = false; fault_mm = vma ? vma->vm_mm : NULL; folio = filemap_get_entry(inode->i_mapping, index); if (folio && vma && userfaultfd_minor(vma)) { if (!xa_is_value(folio)) folio_put(folio); *fault_type = handle_userfault(vmf, VM_UFFD_MINOR); return 0; } if (xa_is_value(folio)) { error = shmem_swapin_folio(inode, index, &folio, sgp, gfp, vma, fault_type); if (error == -EEXIST) goto repeat; *foliop = folio; return error; } if (folio) { folio_lock(folio); /* Has the folio been truncated or swapped out? */ if (unlikely(folio->mapping != inode->i_mapping)) { folio_unlock(folio); folio_put(folio); goto repeat; } if (sgp == SGP_WRITE) folio_mark_accessed(folio); if (folio_test_uptodate(folio)) goto out; /* fallocated folio */ if (sgp != SGP_READ) goto clear; folio_unlock(folio); folio_put(folio); } /* * SGP_READ: succeed on hole, with NULL folio, letting caller zero. * SGP_NOALLOC: fail on hole, with NULL folio, letting caller fail. */ *foliop = NULL; if (sgp == SGP_READ) return 0; if (sgp == SGP_NOALLOC) return -ENOENT; /* * Fast cache lookup and swap lookup did not find it: allocate. */ if (vma && userfaultfd_missing(vma)) { *fault_type = handle_userfault(vmf, VM_UFFD_MISSING); return 0; } /* Find hugepage orders that are allowed for anonymous shmem and tmpfs. */ orders = shmem_allowable_huge_orders(inode, vma, index, write_end, false); if (orders > 0) { gfp_t huge_gfp; huge_gfp = vma_thp_gfp_mask(vma); huge_gfp = limit_gfp_mask(huge_gfp, gfp); folio = shmem_alloc_and_add_folio(vmf, huge_gfp, inode, index, fault_mm, orders); if (!IS_ERR(folio)) { if (folio_test_pmd_mappable(folio)) count_vm_event(THP_FILE_ALLOC); count_mthp_stat(folio_order(folio), MTHP_STAT_SHMEM_ALLOC); goto alloced; } if (PTR_ERR(folio) == -EEXIST) goto repeat; } folio = shmem_alloc_and_add_folio(vmf, gfp, inode, index, fault_mm, 0); if (IS_ERR(folio)) { error = PTR_ERR(folio); if (error == -EEXIST) goto repeat; folio = NULL; goto unlock; } alloced: alloced = true; if (folio_test_large(folio) && DIV_ROUND_UP(i_size_read(inode), PAGE_SIZE) < folio_next_index(folio)) { struct shmem_sb_info *sbinfo = SHMEM_SB(inode->i_sb); struct shmem_inode_info *info = SHMEM_I(inode); /* * Part of the large folio is beyond i_size: subject * to shrink under memory pressure. */ spin_lock(&sbinfo->shrinklist_lock); /* * _careful to defend against unlocked access to * ->shrink_list in shmem_unused_huge_shrink() */ if (list_empty_careful(&info->shrinklist)) { list_add_tail(&info->shrinklist, &sbinfo->shrinklist); sbinfo->shrinklist_len++; } spin_unlock(&sbinfo->shrinklist_lock); } if (sgp == SGP_WRITE) folio_set_referenced(folio); /* * Let SGP_FALLOC use the SGP_WRITE optimization on a new folio. */ if (sgp == SGP_FALLOC) sgp = SGP_WRITE; clear: /* * Let SGP_WRITE caller clear ends if write does not fill folio; * but SGP_FALLOC on a folio fallocated earlier must initialize * it now, lest undo on failure cancel our earlier guarantee. */ if (sgp != SGP_WRITE && !folio_test_uptodate(folio)) { long i, n = folio_nr_pages(folio); for (i = 0; i < n; i++) clear_highpage(folio_page(folio, i)); flush_dcache_folio(folio); folio_mark_uptodate(folio); } /* Perhaps the file has been truncated since we checked */ if (sgp <= SGP_CACHE && ((loff_t)index << PAGE_SHIFT) >= i_size_read(inode)) { error = -EINVAL; goto unlock; } out: *foliop = folio; return 0; /* * Error recovery. */ unlock: if (alloced) filemap_remove_folio(folio); shmem_recalc_inode(inode, 0, 0); if (folio) { folio_unlock(folio); folio_put(folio); } return error; } /** * shmem_get_folio - find, and lock a shmem folio. * @inode: inode to search * @index: the page index. * @write_end: end of a write, could extend inode size * @foliop: pointer to the folio if found * @sgp: SGP_* flags to control behavior * * Looks up the page cache entry at @inode & @index. If a folio is * present, it is returned locked with an increased refcount. * * If the caller modifies data in the folio, it must call folio_mark_dirty() * before unlocking the folio to ensure that the folio is not reclaimed. * There is no need to reserve space before calling folio_mark_dirty(). * * When no folio is found, the behavior depends on @sgp: * - for SGP_READ, *@foliop is %NULL and 0 is returned * - for SGP_NOALLOC, *@foliop is %NULL and -ENOENT is returned * - for all other flags a new folio is allocated, inserted into the * page cache and returned locked in @foliop. * * Context: May sleep. * Return: 0 if successful, else a negative error code. */ int shmem_get_folio(struct inode *inode, pgoff_t index, loff_t write_end, struct folio **foliop, enum sgp_type sgp) { return shmem_get_folio_gfp(inode, index, write_end, foliop, sgp, mapping_gfp_mask(inode->i_mapping), NULL, NULL); } EXPORT_SYMBOL_GPL(shmem_get_folio); /* * This is like autoremove_wake_function, but it removes the wait queue * entry unconditionally - even if something else had already woken the * target. */ static int synchronous_wake_function(wait_queue_entry_t *wait, unsigned int mode, int sync, void *key) { int ret = default_wake_function(wait, mode, sync, key); list_del_init(&wait->entry); return ret; } /* * Trinity finds that probing a hole which tmpfs is punching can * prevent the hole-punch from ever completing: which in turn * locks writers out with its hold on i_rwsem. So refrain from * faulting pages into the hole while it's being punched. Although * shmem_undo_range() does remove the additions, it may be unable to * keep up, as each new page needs its own unmap_mapping_range() call, * and the i_mmap tree grows ever slower to scan if new vmas are added. * * It does not matter if we sometimes reach this check just before the * hole-punch begins, so that one fault then races with the punch: * we just need to make racing faults a rare case. * * The implementation below would be much simpler if we just used a * standard mutex or completion: but we cannot take i_rwsem in fault, * and bloating every shmem inode for this unlikely case would be sad. */ static vm_fault_t shmem_falloc_wait(struct vm_fault *vmf, struct inode *inode) { struct shmem_falloc *shmem_falloc; struct file *fpin = NULL; vm_fault_t ret = 0; spin_lock(&inode->i_lock); shmem_falloc = inode->i_private; if (shmem_falloc && shmem_falloc->waitq && vmf->pgoff >= shmem_falloc->start && vmf->pgoff < shmem_falloc->next) { wait_queue_head_t *shmem_falloc_waitq; DEFINE_WAIT_FUNC(shmem_fault_wait, synchronous_wake_function); ret = VM_FAULT_NOPAGE; fpin = maybe_unlock_mmap_for_io(vmf, NULL); shmem_falloc_waitq = shmem_falloc->waitq; prepare_to_wait(shmem_falloc_waitq, &shmem_fault_wait, TASK_UNINTERRUPTIBLE); spin_unlock(&inode->i_lock); schedule(); /* * shmem_falloc_waitq points into the shmem_fallocate() * stack of the hole-punching task: shmem_falloc_waitq * is usually invalid by the time we reach here, but * finish_wait() does not dereference it in that case; * though i_lock needed lest racing with wake_up_all(). */ spin_lock(&inode->i_lock); finish_wait(shmem_falloc_waitq, &shmem_fault_wait); } spin_unlock(&inode->i_lock); if (fpin) { fput(fpin); ret = VM_FAULT_RETRY; } return ret; } static vm_fault_t shmem_fault(struct vm_fault *vmf) { struct inode *inode = file_inode(vmf->vma->vm_file); gfp_t gfp = mapping_gfp_mask(inode->i_mapping); struct folio *folio = NULL; vm_fault_t ret = 0; int err; /* * Trinity finds that probing a hole which tmpfs is punching can * prevent the hole-punch from ever completing: noted in i_private. */ if (unlikely(inode->i_private)) { ret = shmem_falloc_wait(vmf, inode); if (ret) return ret; } WARN_ON_ONCE(vmf->page != NULL); err = shmem_get_folio_gfp(inode, vmf->pgoff, 0, &folio, SGP_CACHE, gfp, vmf, &ret); if (err) return vmf_error(err); if (folio) { vmf->page = folio_file_page(folio, vmf->pgoff); ret |= VM_FAULT_LOCKED; } return ret; } unsigned long shmem_get_unmapped_area(struct file *file, unsigned long uaddr, unsigned long len, unsigned long pgoff, unsigned long flags) { unsigned long addr; unsigned long offset; unsigned long inflated_len; unsigned long inflated_addr; unsigned long inflated_offset; unsigned long hpage_size; if (len > TASK_SIZE) return -ENOMEM; addr = mm_get_unmapped_area(current->mm, file, uaddr, len, pgoff, flags); if (!IS_ENABLED(CONFIG_TRANSPARENT_HUGEPAGE)) return addr; if (IS_ERR_VALUE(addr)) return addr; if (addr & ~PAGE_MASK) return addr; if (addr > TASK_SIZE - len) return addr; if (shmem_huge == SHMEM_HUGE_DENY) return addr; if (flags & MAP_FIXED) return addr; /* * Our priority is to support MAP_SHARED mapped hugely; * and support MAP_PRIVATE mapped hugely too, until it is COWed. * But if caller specified an address hint and we allocated area there * successfully, respect that as before. */ if (uaddr == addr) return addr; hpage_size = HPAGE_PMD_SIZE; if (shmem_huge != SHMEM_HUGE_FORCE) { struct super_block *sb; unsigned long __maybe_unused hpage_orders; int order = 0; if (file) { VM_BUG_ON(file->f_op != &shmem_file_operations); sb = file_inode(file)->i_sb; } else { /* * Called directly from mm/mmap.c, or drivers/char/mem.c * for "/dev/zero", to create a shared anonymous object. */ if (IS_ERR(shm_mnt)) return addr; sb = shm_mnt->mnt_sb; /* * Find the highest mTHP order used for anonymous shmem to * provide a suitable alignment address. */ #ifdef CONFIG_TRANSPARENT_HUGEPAGE hpage_orders = READ_ONCE(huge_shmem_orders_always); hpage_orders |= READ_ONCE(huge_shmem_orders_within_size); hpage_orders |= READ_ONCE(huge_shmem_orders_madvise); if (SHMEM_SB(sb)->huge != SHMEM_HUGE_NEVER) hpage_orders |= READ_ONCE(huge_shmem_orders_inherit); if (hpage_orders > 0) { order = highest_order(hpage_orders); hpage_size = PAGE_SIZE << order; } #endif } if (SHMEM_SB(sb)->huge == SHMEM_HUGE_NEVER && !order) return addr; } if (len < hpage_size) return addr; offset = (pgoff << PAGE_SHIFT) & (hpage_size - 1); if (offset && offset + len < 2 * hpage_size) return addr; if ((addr & (hpage_size - 1)) == offset) return addr; inflated_len = len + hpage_size - PAGE_SIZE; if (inflated_len > TASK_SIZE) return addr; if (inflated_len < len) return addr; inflated_addr = mm_get_unmapped_area(current->mm, NULL, uaddr, inflated_len, 0, flags); if (IS_ERR_VALUE(inflated_addr)) return addr; if (inflated_addr & ~PAGE_MASK) return addr; inflated_offset = inflated_addr & (hpage_size - 1); inflated_addr += offset - inflated_offset; if (inflated_offset > offset) inflated_addr += hpage_size; if (inflated_addr > TASK_SIZE - len) return addr; return inflated_addr; } #ifdef CONFIG_NUMA static int shmem_set_policy(struct vm_area_struct *vma, struct mempolicy *mpol) { struct inode *inode = file_inode(vma->vm_file); return mpol_set_shared_policy(&SHMEM_I(inode)->policy, vma, mpol); } static struct mempolicy *shmem_get_policy(struct vm_area_struct *vma, unsigned long addr, pgoff_t *ilx) { struct inode *inode = file_inode(vma->vm_file); pgoff_t index; /* * Bias interleave by inode number to distribute better across nodes; * but this interface is independent of which page order is used, so * supplies only that bias, letting caller apply the offset (adjusted * by page order, as in shmem_get_pgoff_policy() and get_vma_policy()). */ *ilx = inode->i_ino; index = ((addr - vma->vm_start) >> PAGE_SHIFT) + vma->vm_pgoff; return mpol_shared_policy_lookup(&SHMEM_I(inode)->policy, index); } static struct mempolicy *shmem_get_pgoff_policy(struct shmem_inode_info *info, pgoff_t index, unsigned int order, pgoff_t *ilx) { struct mempolicy *mpol; /* Bias interleave by inode number to distribute better across nodes */ *ilx = info->vfs_inode.i_ino + (index >> order); mpol = mpol_shared_policy_lookup(&info->policy, index); return mpol ? mpol : get_task_policy(current); } #else static struct mempolicy *shmem_get_pgoff_policy(struct shmem_inode_info *info, pgoff_t index, unsigned int order, pgoff_t *ilx) { *ilx = 0; return NULL; } #endif /* CONFIG_NUMA */ int shmem_lock(struct file *file, int lock, struct ucounts *ucounts) { struct inode *inode = file_inode(file); struct shmem_inode_info *info = SHMEM_I(inode); int retval = -ENOMEM; /* * What serializes the accesses to info->flags? * ipc_lock_object() when called from shmctl_do_lock(), * no serialization needed when called from shm_destroy(). */ if (lock && !(info->flags & VM_LOCKED)) { if (!user_shm_lock(inode->i_size, ucounts)) goto out_nomem; info->flags |= VM_LOCKED; mapping_set_unevictable(file->f_mapping); } if (!lock && (info->flags & VM_LOCKED) && ucounts) { user_shm_unlock(inode->i_size, ucounts); info->flags &= ~VM_LOCKED; mapping_clear_unevictable(file->f_mapping); } retval = 0; out_nomem: return retval; } static int shmem_mmap(struct file *file, struct vm_area_struct *vma) { struct inode *inode = file_inode(file); file_accessed(file); /* This is anonymous shared memory if it is unlinked at the time of mmap */ if (inode->i_nlink) vma->vm_ops = &shmem_vm_ops; else vma->vm_ops = &shmem_anon_vm_ops; return 0; } static int shmem_file_open(struct inode *inode, struct file *file) { file->f_mode |= FMODE_CAN_ODIRECT; return generic_file_open(inode, file); } #ifdef CONFIG_TMPFS_XATTR static int shmem_initxattrs(struct inode *, const struct xattr *, void *); #if IS_ENABLED(CONFIG_UNICODE) /* * shmem_inode_casefold_flags - Deal with casefold file attribute flag * * The casefold file attribute needs some special checks. I can just be added to * an empty dir, and can't be removed from a non-empty dir. */ static int shmem_inode_casefold_flags(struct inode *inode, unsigned int fsflags, struct dentry *dentry, unsigned int *i_flags) { unsigned int old = inode->i_flags; struct super_block *sb = inode->i_sb; if (fsflags & FS_CASEFOLD_FL) { if (!(old & S_CASEFOLD)) { if (!sb->s_encoding) return -EOPNOTSUPP; if (!S_ISDIR(inode->i_mode)) return -ENOTDIR; if (dentry && !simple_empty(dentry)) return -ENOTEMPTY; } *i_flags = *i_flags | S_CASEFOLD; } else if (old & S_CASEFOLD) { if (dentry && !simple_empty(dentry)) return -ENOTEMPTY; } return 0; } #else static int shmem_inode_casefold_flags(struct inode *inode, unsigned int fsflags, struct dentry *dentry, unsigned int *i_flags) { if (fsflags & FS_CASEFOLD_FL) return -EOPNOTSUPP; return 0; } #endif /* * chattr's fsflags are unrelated to extended attributes, * but tmpfs has chosen to enable them under the same config option. */ static int shmem_set_inode_flags(struct inode *inode, unsigned int fsflags, struct dentry *dentry) { unsigned int i_flags = 0; int ret; ret = shmem_inode_casefold_flags(inode, fsflags, dentry, &i_flags); if (ret) return ret; if (fsflags & FS_NOATIME_FL) i_flags |= S_NOATIME; if (fsflags & FS_APPEND_FL) i_flags |= S_APPEND; if (fsflags & FS_IMMUTABLE_FL) i_flags |= S_IMMUTABLE; /* * But FS_NODUMP_FL does not require any action in i_flags. */ inode_set_flags(inode, i_flags, S_NOATIME | S_APPEND | S_IMMUTABLE | S_CASEFOLD); return 0; } #else static void shmem_set_inode_flags(struct inode *inode, unsigned int fsflags, struct dentry *dentry) { } #define shmem_initxattrs NULL #endif static struct offset_ctx *shmem_get_offset_ctx(struct inode *inode) { return &SHMEM_I(inode)->dir_offsets; } static struct inode *__shmem_get_inode(struct mnt_idmap *idmap, struct super_block *sb, struct inode *dir, umode_t mode, dev_t dev, unsigned long flags) { struct inode *inode; struct shmem_inode_info *info; struct shmem_sb_info *sbinfo = SHMEM_SB(sb); ino_t ino; int err; err = shmem_reserve_inode(sb, &ino); if (err) return ERR_PTR(err); inode = new_inode(sb); if (!inode) { shmem_free_inode(sb, 0); return ERR_PTR(-ENOSPC); } inode->i_ino = ino; inode_init_owner(idmap, inode, dir, mode); inode->i_blocks = 0; simple_inode_init_ts(inode); inode->i_generation = get_random_u32(); info = SHMEM_I(inode); memset(info, 0, (char *)inode - (char *)info); spin_lock_init(&info->lock); atomic_set(&info->stop_eviction, 0); info->seals = F_SEAL_SEAL; info->flags = flags & VM_NORESERVE; info->i_crtime = inode_get_mtime(inode); info->fsflags = (dir == NULL) ? 0 : SHMEM_I(dir)->fsflags & SHMEM_FL_INHERITED; if (info->fsflags) shmem_set_inode_flags(inode, info->fsflags, NULL); INIT_LIST_HEAD(&info->shrinklist); INIT_LIST_HEAD(&info->swaplist); simple_xattrs_init(&info->xattrs); cache_no_acl(inode); if (sbinfo->noswap) mapping_set_unevictable(inode->i_mapping); /* Don't consider 'deny' for emergencies and 'force' for testing */ if (sbinfo->huge) mapping_set_large_folios(inode->i_mapping); switch (mode & S_IFMT) { default: inode->i_op = &shmem_special_inode_operations; init_special_inode(inode, mode, dev); break; case S_IFREG: inode->i_mapping->a_ops = &shmem_aops; inode->i_op = &shmem_inode_operations; inode->i_fop = &shmem_file_operations; mpol_shared_policy_init(&info->policy, shmem_get_sbmpol(sbinfo)); break; case S_IFDIR: inc_nlink(inode); /* Some things misbehave if size == 0 on a directory */ inode->i_size = 2 * BOGO_DIRENT_SIZE; inode->i_op = &shmem_dir_inode_operations; inode->i_fop = &simple_offset_dir_operations; simple_offset_init(shmem_get_offset_ctx(inode)); break; case S_IFLNK: /* * Must not load anything in the rbtree, * mpol_free_shared_policy will not be called. */ mpol_shared_policy_init(&info->policy, NULL); break; } lockdep_annotate_inode_mutex_key(inode); return inode; } #ifdef CONFIG_TMPFS_QUOTA static struct inode *shmem_get_inode(struct mnt_idmap *idmap, struct super_block *sb, struct inode *dir, umode_t mode, dev_t dev, unsigned long flags) { int err; struct inode *inode; inode = __shmem_get_inode(idmap, sb, dir, mode, dev, flags); if (IS_ERR(inode)) return inode; err = dquot_initialize(inode); if (err) goto errout; err = dquot_alloc_inode(inode); if (err) { dquot_drop(inode); goto errout; } return inode; errout: inode->i_flags |= S_NOQUOTA; iput(inode); return ERR_PTR(err); } #else static inline struct inode *shmem_get_inode(struct mnt_idmap *idmap, struct super_block *sb, struct inode *dir, umode_t mode, dev_t dev, unsigned long flags) { return __shmem_get_inode(idmap, sb, dir, mode, dev, flags); } #endif /* CONFIG_TMPFS_QUOTA */ #ifdef CONFIG_USERFAULTFD int shmem_mfill_atomic_pte(pmd_t *dst_pmd, struct vm_area_struct *dst_vma, unsigned long dst_addr, unsigned long src_addr, uffd_flags_t flags, struct folio **foliop) { struct inode *inode = file_inode(dst_vma->vm_file); struct shmem_inode_info *info = SHMEM_I(inode); struct address_space *mapping = inode->i_mapping; gfp_t gfp = mapping_gfp_mask(mapping); pgoff_t pgoff = linear_page_index(dst_vma, dst_addr); void *page_kaddr; struct folio *folio; int ret; pgoff_t max_off; if (shmem_inode_acct_blocks(inode, 1)) { /* * We may have got a page, returned -ENOENT triggering a retry, * and now we find ourselves with -ENOMEM. Release the page, to * avoid a BUG_ON in our caller. */ if (unlikely(*foliop)) { folio_put(*foliop); *foliop = NULL; } return -ENOMEM; } if (!*foliop) { ret = -ENOMEM; folio = shmem_alloc_folio(gfp, 0, info, pgoff); if (!folio) goto out_unacct_blocks; if (uffd_flags_mode_is(flags, MFILL_ATOMIC_COPY)) { page_kaddr = kmap_local_folio(folio, 0); /* * The read mmap_lock is held here. Despite the * mmap_lock being read recursive a deadlock is still * possible if a writer has taken a lock. For example: * * process A thread 1 takes read lock on own mmap_lock * process A thread 2 calls mmap, blocks taking write lock * process B thread 1 takes page fault, read lock on own mmap lock * process B thread 2 calls mmap, blocks taking write lock * process A thread 1 blocks taking read lock on process B * process B thread 1 blocks taking read lock on process A * * Disable page faults to prevent potential deadlock * and retry the copy outside the mmap_lock. */ pagefault_disable(); ret = copy_from_user(page_kaddr, (const void __user *)src_addr, PAGE_SIZE); pagefault_enable(); kunmap_local(page_kaddr); /* fallback to copy_from_user outside mmap_lock */ if (unlikely(ret)) { *foliop = folio; ret = -ENOENT; /* don't free the page */ goto out_unacct_blocks; } flush_dcache_folio(folio); } else { /* ZEROPAGE */ clear_user_highpage(&folio->page, dst_addr); } } else { folio = *foliop; VM_BUG_ON_FOLIO(folio_test_large(folio), folio); *foliop = NULL; } VM_BUG_ON(folio_test_locked(folio)); VM_BUG_ON(folio_test_swapbacked(folio)); __folio_set_locked(folio); __folio_set_swapbacked(folio); __folio_mark_uptodate(folio); ret = -EFAULT; max_off = DIV_ROUND_UP(i_size_read(inode), PAGE_SIZE); if (unlikely(pgoff >= max_off)) goto out_release; ret = mem_cgroup_charge(folio, dst_vma->vm_mm, gfp); if (ret) goto out_release; ret = shmem_add_to_page_cache(folio, mapping, pgoff, NULL, gfp); if (ret) goto out_release; ret = mfill_atomic_install_pte(dst_pmd, dst_vma, dst_addr, &folio->page, true, flags); if (ret) goto out_delete_from_cache; shmem_recalc_inode(inode, 1, 0); folio_unlock(folio); return 0; out_delete_from_cache: filemap_remove_folio(folio); out_release: folio_unlock(folio); folio_put(folio); out_unacct_blocks: shmem_inode_unacct_blocks(inode, 1); return ret; } #endif /* CONFIG_USERFAULTFD */ #ifdef CONFIG_TMPFS static const struct inode_operations shmem_symlink_inode_operations; static const struct inode_operations shmem_short_symlink_operations; static int shmem_write_begin(const struct kiocb *iocb, struct address_space *mapping, loff_t pos, unsigned len, struct folio **foliop, void **fsdata) { struct inode *inode = mapping->host; struct shmem_inode_info *info = SHMEM_I(inode); pgoff_t index = pos >> PAGE_SHIFT; struct folio *folio; int ret = 0; /* i_rwsem is held by caller */ if (unlikely(info->seals & (F_SEAL_GROW | F_SEAL_WRITE | F_SEAL_FUTURE_WRITE))) { if (info->seals & (F_SEAL_WRITE | F_SEAL_FUTURE_WRITE)) return -EPERM; if ((info->seals & F_SEAL_GROW) && pos + len > inode->i_size) return -EPERM; } ret = shmem_get_folio(inode, index, pos + len, &folio, SGP_WRITE); if (ret) return ret; if (folio_contain_hwpoisoned_page(folio)) { folio_unlock(folio); folio_put(folio); return -EIO; } *foliop = folio; return 0; } static int shmem_write_end(const struct kiocb *iocb, struct address_space *mapping, loff_t pos, unsigned len, unsigned copied, struct folio *folio, void *fsdata) { struct inode *inode = mapping->host; if (pos + copied > inode->i_size) i_size_write(inode, pos + copied); if (!folio_test_uptodate(folio)) { if (copied < folio_size(folio)) { size_t from = offset_in_folio(folio, pos); folio_zero_segments(folio, 0, from, from + copied, folio_size(folio)); } folio_mark_uptodate(folio); } folio_mark_dirty(folio); folio_unlock(folio); folio_put(folio); return copied; } static ssize_t shmem_file_read_iter(struct kiocb *iocb, struct iov_iter *to) { struct file *file = iocb->ki_filp; struct inode *inode = file_inode(file); struct address_space *mapping = inode->i_mapping; pgoff_t index; unsigned long offset; int error = 0; ssize_t retval = 0; for (;;) { struct folio *folio = NULL; struct page *page = NULL; unsigned long nr, ret; loff_t end_offset, i_size = i_size_read(inode); bool fallback_page_copy = false; size_t fsize; if (unlikely(iocb->ki_pos >= i_size)) break; index = iocb->ki_pos >> PAGE_SHIFT; error = shmem_get_folio(inode, index, 0, &folio, SGP_READ); if (error) { if (error == -EINVAL) error = 0; break; } if (folio) { folio_unlock(folio); page = folio_file_page(folio, index); if (PageHWPoison(page)) { folio_put(folio); error = -EIO; break; } if (folio_test_large(folio) && folio_test_has_hwpoisoned(folio)) fallback_page_copy = true; } /* * We must evaluate after, since reads (unlike writes) * are called without i_rwsem protection against truncate */ i_size = i_size_read(inode); if (unlikely(iocb->ki_pos >= i_size)) { if (folio) folio_put(folio); break; } end_offset = min_t(loff_t, i_size, iocb->ki_pos + to->count); if (folio && likely(!fallback_page_copy)) fsize = folio_size(folio); else fsize = PAGE_SIZE; offset = iocb->ki_pos & (fsize - 1); nr = min_t(loff_t, end_offset - iocb->ki_pos, fsize - offset); if (folio) { /* * If users can be writing to this page using arbitrary * virtual addresses, take care about potential aliasing * before reading the page on the kernel side. */ if (mapping_writably_mapped(mapping)) { if (likely(!fallback_page_copy)) flush_dcache_folio(folio); else flush_dcache_page(page); } /* * Mark the folio accessed if we read the beginning. */ if (!offset) folio_mark_accessed(folio); /* * Ok, we have the page, and it's up-to-date, so * now we can copy it to user space... */ if (likely(!fallback_page_copy)) ret = copy_folio_to_iter(folio, offset, nr, to); else ret = copy_page_to_iter(page, offset, nr, to); folio_put(folio); } else if (user_backed_iter(to)) { /* * Copy to user tends to be so well optimized, but * clear_user() not so much, that it is noticeably * faster to copy the zero page instead of clearing. */ ret = copy_page_to_iter(ZERO_PAGE(0), offset, nr, to); } else { /* * But submitting the same page twice in a row to * splice() - or others? - can result in confusion: * so don't attempt that optimization on pipes etc. */ ret = iov_iter_zero(nr, to); } retval += ret; iocb->ki_pos += ret; if (!iov_iter_count(to)) break; if (ret < nr) { error = -EFAULT; break; } cond_resched(); } file_accessed(file); return retval ? retval : error; } static ssize_t shmem_file_write_iter(struct kiocb *iocb, struct iov_iter *from) { struct file *file = iocb->ki_filp; struct inode *inode = file->f_mapping->host; ssize_t ret; inode_lock(inode); ret = generic_write_checks(iocb, from); if (ret <= 0) goto unlock; ret = file_remove_privs(file); if (ret) goto unlock; ret = file_update_time(file); if (ret) goto unlock; ret = generic_perform_write(iocb, from); unlock: inode_unlock(inode); return ret; } static bool zero_pipe_buf_get(struct pipe_inode_info *pipe, struct pipe_buffer *buf) { return true; } static void zero_pipe_buf_release(struct pipe_inode_info *pipe, struct pipe_buffer *buf) { } static bool zero_pipe_buf_try_steal(struct pipe_inode_info *pipe, struct pipe_buffer *buf) { return false; } static const struct pipe_buf_operations zero_pipe_buf_ops = { .release = zero_pipe_buf_release, .try_steal = zero_pipe_buf_try_steal, .get = zero_pipe_buf_get, }; static size_t splice_zeropage_into_pipe(struct pipe_inode_info *pipe, loff_t fpos, size_t size) { size_t offset = fpos & ~PAGE_MASK; size = min_t(size_t, size, PAGE_SIZE - offset); if (!pipe_is_full(pipe)) { struct pipe_buffer *buf = pipe_head_buf(pipe); *buf = (struct pipe_buffer) { .ops = &zero_pipe_buf_ops, .page = ZERO_PAGE(0), .offset = offset, .len = size, }; pipe->head++; } return size; } static ssize_t shmem_file_splice_read(struct file *in, loff_t *ppos, struct pipe_inode_info *pipe, size_t len, unsigned int flags) { struct inode *inode = file_inode(in); struct address_space *mapping = inode->i_mapping; struct folio *folio = NULL; size_t total_spliced = 0, used, npages, n, part; loff_t isize; int error = 0; /* Work out how much data we can actually add into the pipe */ used = pipe_buf_usage(pipe); npages = max_t(ssize_t, pipe->max_usage - used, 0); len = min_t(size_t, len, npages * PAGE_SIZE); do { bool fallback_page_splice = false; struct page *page = NULL; pgoff_t index; size_t size; if (*ppos >= i_size_read(inode)) break; index = *ppos >> PAGE_SHIFT; error = shmem_get_folio(inode, index, 0, &folio, SGP_READ); if (error) { if (error == -EINVAL) error = 0; break; } if (folio) { folio_unlock(folio); page = folio_file_page(folio, index); if (PageHWPoison(page)) { error = -EIO; break; } if (folio_test_large(folio) && folio_test_has_hwpoisoned(folio)) fallback_page_splice = true; } /* * i_size must be checked after we know the pages are Uptodate. * * Checking i_size after the check allows us to calculate * the correct value for "nr", which means the zero-filled * part of the page is not copied back to userspace (unless * another truncate extends the file - this is desired though). */ isize = i_size_read(inode); if (unlikely(*ppos >= isize)) break; /* * Fallback to PAGE_SIZE splice if the large folio has hwpoisoned * pages. */ size = len; if (unlikely(fallback_page_splice)) { size_t offset = *ppos & ~PAGE_MASK; size = umin(size, PAGE_SIZE - offset); } part = min_t(loff_t, isize - *ppos, size); if (folio) { /* * If users can be writing to this page using arbitrary * virtual addresses, take care about potential aliasing * before reading the page on the kernel side. */ if (mapping_writably_mapped(mapping)) { if (likely(!fallback_page_splice)) flush_dcache_folio(folio); else flush_dcache_page(page); } folio_mark_accessed(folio); /* * Ok, we have the page, and it's up-to-date, so we can * now splice it into the pipe. */ n = splice_folio_into_pipe(pipe, folio, *ppos, part); folio_put(folio); folio = NULL; } else { n = splice_zeropage_into_pipe(pipe, *ppos, part); } if (!n) break; len -= n; total_spliced += n; *ppos += n; in->f_ra.prev_pos = *ppos; if (pipe_is_full(pipe)) break; cond_resched(); } while (len); if (folio) folio_put(folio); file_accessed(in); return total_spliced ? total_spliced : error; } static loff_t shmem_file_llseek(struct file *file, loff_t offset, int whence) { struct address_space *mapping = file->f_mapping; struct inode *inode = mapping->host; if (whence != SEEK_DATA && whence != SEEK_HOLE) return generic_file_llseek_size(file, offset, whence, MAX_LFS_FILESIZE, i_size_read(inode)); if (offset < 0) return -ENXIO; inode_lock(inode); /* We're holding i_rwsem so we can access i_size directly */ offset = mapping_seek_hole_data(mapping, offset, inode->i_size, whence); if (offset >= 0) offset = vfs_setpos(file, offset, MAX_LFS_FILESIZE); inode_unlock(inode); return offset; } static long shmem_fallocate(struct file *file, int mode, loff_t offset, loff_t len) { struct inode *inode = file_inode(file); struct shmem_sb_info *sbinfo = SHMEM_SB(inode->i_sb); struct shmem_inode_info *info = SHMEM_I(inode); struct shmem_falloc shmem_falloc; pgoff_t start, index, end, undo_fallocend; int error; if (mode & ~(FALLOC_FL_KEEP_SIZE | FALLOC_FL_PUNCH_HOLE)) return -EOPNOTSUPP; inode_lock(inode); if (mode & FALLOC_FL_PUNCH_HOLE) { struct address_space *mapping = file->f_mapping; loff_t unmap_start = round_up(offset, PAGE_SIZE); loff_t unmap_end = round_down(offset + len, PAGE_SIZE) - 1; DECLARE_WAIT_QUEUE_HEAD_ONSTACK(shmem_falloc_waitq); /* protected by i_rwsem */ if (info->seals & (F_SEAL_WRITE | F_SEAL_FUTURE_WRITE)) { error = -EPERM; goto out; } shmem_falloc.waitq = &shmem_falloc_waitq; shmem_falloc.start = (u64)unmap_start >> PAGE_SHIFT; shmem_falloc.next = (unmap_end + 1) >> PAGE_SHIFT; spin_lock(&inode->i_lock); inode->i_private = &shmem_falloc; spin_unlock(&inode->i_lock); if ((u64)unmap_end > (u64)unmap_start) unmap_mapping_range(mapping, unmap_start, 1 + unmap_end - unmap_start, 0); shmem_truncate_range(inode, offset, offset + len - 1); /* No need to unmap again: hole-punching leaves COWed pages */ spin_lock(&inode->i_lock); inode->i_private = NULL; wake_up_all(&shmem_falloc_waitq); WARN_ON_ONCE(!list_empty(&shmem_falloc_waitq.head)); spin_unlock(&inode->i_lock); error = 0; goto out; } /* We need to check rlimit even when FALLOC_FL_KEEP_SIZE */ error = inode_newsize_ok(inode, offset + len); if (error) goto out; if ((info->seals & F_SEAL_GROW) && offset + len > inode->i_size) { error = -EPERM; goto out; } start = offset >> PAGE_SHIFT; end = (offset + len + PAGE_SIZE - 1) >> PAGE_SHIFT; /* Try to avoid a swapstorm if len is impossible to satisfy */ if (sbinfo->max_blocks && end - start > sbinfo->max_blocks) { error = -ENOSPC; goto out; } shmem_falloc.waitq = NULL; shmem_falloc.start = start; shmem_falloc.next = start; shmem_falloc.nr_falloced = 0; shmem_falloc.nr_unswapped = 0; spin_lock(&inode->i_lock); inode->i_private = &shmem_falloc; spin_unlock(&inode->i_lock); /* * info->fallocend is only relevant when huge pages might be * involved: to prevent split_huge_page() freeing fallocated * pages when FALLOC_FL_KEEP_SIZE committed beyond i_size. */ undo_fallocend = info->fallocend; if (info->fallocend < end) info->fallocend = end; for (index = start; index < end; ) { struct folio *folio; /* * Check for fatal signal so that we abort early in OOM * situations. We don't want to abort in case of non-fatal * signals as large fallocate can take noticeable time and * e.g. periodic timers may result in fallocate constantly * restarting. */ if (fatal_signal_pending(current)) error = -EINTR; else if (shmem_falloc.nr_unswapped > shmem_falloc.nr_falloced) error = -ENOMEM; else error = shmem_get_folio(inode, index, offset + len, &folio, SGP_FALLOC); if (error) { info->fallocend = undo_fallocend; /* Remove the !uptodate folios we added */ if (index > start) { shmem_undo_range(inode, (loff_t)start << PAGE_SHIFT, ((loff_t)index << PAGE_SHIFT) - 1, true); } goto undone; } /* * Here is a more important optimization than it appears: * a second SGP_FALLOC on the same large folio will clear it, * making it uptodate and un-undoable if we fail later. */ index = folio_next_index(folio); /* Beware 32-bit wraparound */ if (!index) index--; /* * Inform shmem_writeout() how far we have reached. * No need for lock or barrier: we have the page lock. */ if (!folio_test_uptodate(folio)) shmem_falloc.nr_falloced += index - shmem_falloc.next; shmem_falloc.next = index; /* * If !uptodate, leave it that way so that freeable folios * can be recognized if we need to rollback on error later. * But mark it dirty so that memory pressure will swap rather * than free the folios we are allocating (and SGP_CACHE folios * might still be clean: we now need to mark those dirty too). */ folio_mark_dirty(folio); folio_unlock(folio); folio_put(folio); cond_resched(); } if (!(mode & FALLOC_FL_KEEP_SIZE) && offset + len > inode->i_size) i_size_write(inode, offset + len); undone: spin_lock(&inode->i_lock); inode->i_private = NULL; spin_unlock(&inode->i_lock); out: if (!error) file_modified(file); inode_unlock(inode); return error; } static int shmem_statfs(struct dentry *dentry, struct kstatfs *buf) { struct shmem_sb_info *sbinfo = SHMEM_SB(dentry->d_sb); buf->f_type = TMPFS_MAGIC; buf->f_bsize = PAGE_SIZE; buf->f_namelen = NAME_MAX; if (sbinfo->max_blocks) { buf->f_blocks = sbinfo->max_blocks; buf->f_bavail = buf->f_bfree = sbinfo->max_blocks - percpu_counter_sum(&sbinfo->used_blocks); } if (sbinfo->max_inodes) { buf->f_files = sbinfo->max_inodes; buf->f_ffree = sbinfo->free_ispace / BOGO_INODE_SIZE; } /* else leave those fields 0 like simple_statfs */ buf->f_fsid = uuid_to_fsid(dentry->d_sb->s_uuid.b); return 0; } /* * File creation. Allocate an inode, and we're done.. */ static int shmem_mknod(struct mnt_idmap *idmap, struct inode *dir, struct dentry *dentry, umode_t mode, dev_t dev) { struct inode *inode; int error; if (!generic_ci_validate_strict_name(dir, &dentry->d_name)) return -EINVAL; inode = shmem_get_inode(idmap, dir->i_sb, dir, mode, dev, VM_NORESERVE); if (IS_ERR(inode)) return PTR_ERR(inode); error = simple_acl_create(dir, inode); if (error) goto out_iput; error = security_inode_init_security(inode, dir, &dentry->d_name, shmem_initxattrs, NULL); if (error && error != -EOPNOTSUPP) goto out_iput; error = simple_offset_add(shmem_get_offset_ctx(dir), dentry); if (error) goto out_iput; dir->i_size += BOGO_DIRENT_SIZE; inode_set_mtime_to_ts(dir, inode_set_ctime_current(dir)); inode_inc_iversion(dir); if (IS_ENABLED(CONFIG_UNICODE) && IS_CASEFOLDED(dir)) d_add(dentry, inode); else d_instantiate(dentry, inode); dget(dentry); /* Extra count - pin the dentry in core */ return error; out_iput: iput(inode); return error; } static int shmem_tmpfile(struct mnt_idmap *idmap, struct inode *dir, struct file *file, umode_t mode) { struct inode *inode; int error; inode = shmem_get_inode(idmap, dir->i_sb, dir, mode, 0, VM_NORESERVE); if (IS_ERR(inode)) { error = PTR_ERR(inode); goto err_out; } error = security_inode_init_security(inode, dir, NULL, shmem_initxattrs, NULL); if (error && error != -EOPNOTSUPP) goto out_iput; error = simple_acl_create(dir, inode); if (error) goto out_iput; d_tmpfile(file, inode); err_out: return finish_open_simple(file, error); out_iput: iput(inode); return error; } static struct dentry *shmem_mkdir(struct mnt_idmap *idmap, struct inode *dir, struct dentry *dentry, umode_t mode) { int error; error = shmem_mknod(idmap, dir, dentry, mode | S_IFDIR, 0); if (error) return ERR_PTR(error); inc_nlink(dir); return NULL; } static int shmem_create(struct mnt_idmap *idmap, struct inode *dir, struct dentry *dentry, umode_t mode, bool excl) { return shmem_mknod(idmap, dir, dentry, mode | S_IFREG, 0); } /* * Link a file.. */ static int shmem_link(struct dentry *old_dentry, struct inode *dir, struct dentry *dentry) { struct inode *inode = d_inode(old_dentry); int ret = 0; /* * No ordinary (disk based) filesystem counts links as inodes; * but each new link needs a new dentry, pinning lowmem, and * tmpfs dentries cannot be pruned until they are unlinked. * But if an O_TMPFILE file is linked into the tmpfs, the * first link must skip that, to get the accounting right. */ if (inode->i_nlink) { ret = shmem_reserve_inode(inode->i_sb, NULL); if (ret) goto out; } ret = simple_offset_add(shmem_get_offset_ctx(dir), dentry); if (ret) { if (inode->i_nlink) shmem_free_inode(inode->i_sb, 0); goto out; } dir->i_size += BOGO_DIRENT_SIZE; inode_set_mtime_to_ts(dir, inode_set_ctime_to_ts(dir, inode_set_ctime_current(inode))); inode_inc_iversion(dir); inc_nlink(inode); ihold(inode); /* New dentry reference */ dget(dentry); /* Extra pinning count for the created dentry */ if (IS_ENABLED(CONFIG_UNICODE) && IS_CASEFOLDED(dir)) d_add(dentry, inode); else d_instantiate(dentry, inode); out: return ret; } static int shmem_unlink(struct inode *dir, struct dentry *dentry) { struct inode *inode = d_inode(dentry); if (inode->i_nlink > 1 && !S_ISDIR(inode->i_mode)) shmem_free_inode(inode->i_sb, 0); simple_offset_remove(shmem_get_offset_ctx(dir), dentry); dir->i_size -= BOGO_DIRENT_SIZE; inode_set_mtime_to_ts(dir, inode_set_ctime_to_ts(dir, inode_set_ctime_current(inode))); inode_inc_iversion(dir); drop_nlink(inode); dput(dentry); /* Undo the count from "create" - does all the work */ /* * For now, VFS can't deal with case-insensitive negative dentries, so * we invalidate them */ if (IS_ENABLED(CONFIG_UNICODE) && IS_CASEFOLDED(dir)) d_invalidate(dentry); return 0; } static int shmem_rmdir(struct inode *dir, struct dentry *dentry) { if (!simple_empty(dentry)) return -ENOTEMPTY; drop_nlink(d_inode(dentry)); drop_nlink(dir); return shmem_unlink(dir, dentry); } static int shmem_whiteout(struct mnt_idmap *idmap, struct inode *old_dir, struct dentry *old_dentry) { struct dentry *whiteout; int error; whiteout = d_alloc(old_dentry->d_parent, &old_dentry->d_name); if (!whiteout) return -ENOMEM; error = shmem_mknod(idmap, old_dir, whiteout, S_IFCHR | WHITEOUT_MODE, WHITEOUT_DEV); dput(whiteout); if (error) return error; /* * Cheat and hash the whiteout while the old dentry is still in * place, instead of playing games with FS_RENAME_DOES_D_MOVE. * * d_lookup() will consistently find one of them at this point, * not sure which one, but that isn't even important. */ d_rehash(whiteout); return 0; } /* * The VFS layer already does all the dentry stuff for rename, * we just have to decrement the usage count for the target if * it exists so that the VFS layer correctly free's it when it * gets overwritten. */ static int shmem_rename2(struct mnt_idmap *idmap, struct inode *old_dir, struct dentry *old_dentry, struct inode *new_dir, struct dentry *new_dentry, unsigned int flags) { struct inode *inode = d_inode(old_dentry); int they_are_dirs = S_ISDIR(inode->i_mode); int error; if (flags & ~(RENAME_NOREPLACE | RENAME_EXCHANGE | RENAME_WHITEOUT)) return -EINVAL; if (flags & RENAME_EXCHANGE) return simple_offset_rename_exchange(old_dir, old_dentry, new_dir, new_dentry); if (!simple_empty(new_dentry)) return -ENOTEMPTY; if (flags & RENAME_WHITEOUT) { error = shmem_whiteout(idmap, old_dir, old_dentry); if (error) return error; } error = simple_offset_rename(old_dir, old_dentry, new_dir, new_dentry); if (error) return error; if (d_really_is_positive(new_dentry)) { (void) shmem_unlink(new_dir, new_dentry); if (they_are_dirs) { drop_nlink(d_inode(new_dentry)); drop_nlink(old_dir); } } else if (they_are_dirs) { drop_nlink(old_dir); inc_nlink(new_dir); } old_dir->i_size -= BOGO_DIRENT_SIZE; new_dir->i_size += BOGO_DIRENT_SIZE; simple_rename_timestamp(old_dir, old_dentry, new_dir, new_dentry); inode_inc_iversion(old_dir); inode_inc_iversion(new_dir); return 0; } static int shmem_symlink(struct mnt_idmap *idmap, struct inode *dir, struct dentry *dentry, const char *symname) { int error; int len; struct inode *inode; struct folio *folio; char *link; len = strlen(symname) + 1; if (len > PAGE_SIZE) return -ENAMETOOLONG; inode = shmem_get_inode(idmap, dir->i_sb, dir, S_IFLNK | 0777, 0, VM_NORESERVE); if (IS_ERR(inode)) return PTR_ERR(inode); error = security_inode_init_security(inode, dir, &dentry->d_name, shmem_initxattrs, NULL); if (error && error != -EOPNOTSUPP) goto out_iput; error = simple_offset_add(shmem_get_offset_ctx(dir), dentry); if (error) goto out_iput; inode->i_size = len-1; if (len <= SHORT_SYMLINK_LEN) { link = kmemdup(symname, len, GFP_KERNEL); if (!link) { error = -ENOMEM; goto out_remove_offset; } inode->i_op = &shmem_short_symlink_operations; inode_set_cached_link(inode, link, len - 1); } else { inode_nohighmem(inode); inode->i_mapping->a_ops = &shmem_aops; error = shmem_get_folio(inode, 0, 0, &folio, SGP_WRITE); if (error) goto out_remove_offset; inode->i_op = &shmem_symlink_inode_operations; memcpy(folio_address(folio), symname, len); folio_mark_uptodate(folio); folio_mark_dirty(folio); folio_unlock(folio); folio_put(folio); } dir->i_size += BOGO_DIRENT_SIZE; inode_set_mtime_to_ts(dir, inode_set_ctime_current(dir)); inode_inc_iversion(dir); if (IS_ENABLED(CONFIG_UNICODE) && IS_CASEFOLDED(dir)) d_add(dentry, inode); else d_instantiate(dentry, inode); dget(dentry); return 0; out_remove_offset: simple_offset_remove(shmem_get_offset_ctx(dir), dentry); out_iput: iput(inode); return error; } static void shmem_put_link(void *arg) { folio_mark_accessed(arg); folio_put(arg); } static const char *shmem_get_link(struct dentry *dentry, struct inode *inode, struct delayed_call *done) { struct folio *folio = NULL; int error; if (!dentry) { folio = filemap_get_folio(inode->i_mapping, 0); if (IS_ERR(folio)) return ERR_PTR(-ECHILD); if (PageHWPoison(folio_page(folio, 0)) || !folio_test_uptodate(folio)) { folio_put(folio); return ERR_PTR(-ECHILD); } } else { error = shmem_get_folio(inode, 0, 0, &folio, SGP_READ); if (error) return ERR_PTR(error); if (!folio) return ERR_PTR(-ECHILD); if (PageHWPoison(folio_page(folio, 0))) { folio_unlock(folio); folio_put(folio); return ERR_PTR(-ECHILD); } folio_unlock(folio); } set_delayed_call(done, shmem_put_link, folio); return folio_address(folio); } #ifdef CONFIG_TMPFS_XATTR static int shmem_fileattr_get(struct dentry *dentry, struct file_kattr *fa) { struct shmem_inode_info *info = SHMEM_I(d_inode(dentry)); fileattr_fill_flags(fa, info->fsflags & SHMEM_FL_USER_VISIBLE); return 0; } static int shmem_fileattr_set(struct mnt_idmap *idmap, struct dentry *dentry, struct file_kattr *fa) { struct inode *inode = d_inode(dentry); struct shmem_inode_info *info = SHMEM_I(inode); int ret, flags; if (fileattr_has_fsx(fa)) return -EOPNOTSUPP; if (fa->flags & ~SHMEM_FL_USER_MODIFIABLE) return -EOPNOTSUPP; flags = (info->fsflags & ~SHMEM_FL_USER_MODIFIABLE) | (fa->flags & SHMEM_FL_USER_MODIFIABLE); ret = shmem_set_inode_flags(inode, flags, dentry); if (ret) return ret; info->fsflags = flags; inode_set_ctime_current(inode); inode_inc_iversion(inode); return 0; } /* * Superblocks without xattr inode operations may get some security.* xattr * support from the LSM "for free". As soon as we have any other xattrs * like ACLs, we also need to implement the security.* handlers at * filesystem level, though. */ /* * Callback for security_inode_init_security() for acquiring xattrs. */ static int shmem_initxattrs(struct inode *inode, const struct xattr *xattr_array, void *fs_info) { struct shmem_inode_info *info = SHMEM_I(inode); struct shmem_sb_info *sbinfo = SHMEM_SB(inode->i_sb); const struct xattr *xattr; struct simple_xattr *new_xattr; size_t ispace = 0; size_t len; if (sbinfo->max_inodes) { for (xattr = xattr_array; xattr->name != NULL; xattr++) { ispace += simple_xattr_space(xattr->name, xattr->value_len + XATTR_SECURITY_PREFIX_LEN); } if (ispace) { raw_spin_lock(&sbinfo->stat_lock); if (sbinfo->free_ispace < ispace) ispace = 0; else sbinfo->free_ispace -= ispace; raw_spin_unlock(&sbinfo->stat_lock); if (!ispace) return -ENOSPC; } } for (xattr = xattr_array; xattr->name != NULL; xattr++) { new_xattr = simple_xattr_alloc(xattr->value, xattr->value_len); if (!new_xattr) break; len = strlen(xattr->name) + 1; new_xattr->name = kmalloc(XATTR_SECURITY_PREFIX_LEN + len, GFP_KERNEL_ACCOUNT); if (!new_xattr->name) { kvfree(new_xattr); break; } memcpy(new_xattr->name, XATTR_SECURITY_PREFIX, XATTR_SECURITY_PREFIX_LEN); memcpy(new_xattr->name + XATTR_SECURITY_PREFIX_LEN, xattr->name, len); simple_xattr_add(&info->xattrs, new_xattr); } if (xattr->name != NULL) { if (ispace) { raw_spin_lock(&sbinfo->stat_lock); sbinfo->free_ispace += ispace; raw_spin_unlock(&sbinfo->stat_lock); } simple_xattrs_free(&info->xattrs, NULL); return -ENOMEM; } return 0; } static int shmem_xattr_handler_get(const struct xattr_handler *handler, struct dentry *unused, struct inode *inode, const char *name, void *buffer, size_t size) { struct shmem_inode_info *info = SHMEM_I(inode); name = xattr_full_name(handler, name); return simple_xattr_get(&info->xattrs, name, buffer, size); } static int shmem_xattr_handler_set(const struct xattr_handler *handler, struct mnt_idmap *idmap, struct dentry *unused, struct inode *inode, const char *name, const void *value, size_t size, int flags) { struct shmem_inode_info *info = SHMEM_I(inode); struct shmem_sb_info *sbinfo = SHMEM_SB(inode->i_sb); struct simple_xattr *old_xattr; size_t ispace = 0; name = xattr_full_name(handler, name); if (value && sbinfo->max_inodes) { ispace = simple_xattr_space(name, size); raw_spin_lock(&sbinfo->stat_lock); if (sbinfo->free_ispace < ispace) ispace = 0; else sbinfo->free_ispace -= ispace; raw_spin_unlock(&sbinfo->stat_lock); if (!ispace) return -ENOSPC; } old_xattr = simple_xattr_set(&info->xattrs, name, value, size, flags); if (!IS_ERR(old_xattr)) { ispace = 0; if (old_xattr && sbinfo->max_inodes) ispace = simple_xattr_space(old_xattr->name, old_xattr->size); simple_xattr_free(old_xattr); old_xattr = NULL; inode_set_ctime_current(inode); inode_inc_iversion(inode); } if (ispace) { raw_spin_lock(&sbinfo->stat_lock); sbinfo->free_ispace += ispace; raw_spin_unlock(&sbinfo->stat_lock); } return PTR_ERR(old_xattr); } static const struct xattr_handler shmem_security_xattr_handler = { .prefix = XATTR_SECURITY_PREFIX, .get = shmem_xattr_handler_get, .set = shmem_xattr_handler_set, }; static const struct xattr_handler shmem_trusted_xattr_handler = { .prefix = XATTR_TRUSTED_PREFIX, .get = shmem_xattr_handler_get, .set = shmem_xattr_handler_set, }; static const struct xattr_handler shmem_user_xattr_handler = { .prefix = XATTR_USER_PREFIX, .get = shmem_xattr_handler_get, .set = shmem_xattr_handler_set, }; static const struct xattr_handler * const shmem_xattr_handlers[] = { &shmem_security_xattr_handler, &shmem_trusted_xattr_handler, &shmem_user_xattr_handler, NULL }; static ssize_t shmem_listxattr(struct dentry *dentry, char *buffer, size_t size) { struct shmem_inode_info *info = SHMEM_I(d_inode(dentry)); return simple_xattr_list(d_inode(dentry), &info->xattrs, buffer, size); } #endif /* CONFIG_TMPFS_XATTR */ static const struct inode_operations shmem_short_symlink_operations = { .getattr = shmem_getattr, .setattr = shmem_setattr, .get_link = simple_get_link, #ifdef CONFIG_TMPFS_XATTR .listxattr = shmem_listxattr, #endif }; static const struct inode_operations shmem_symlink_inode_operations = { .getattr = shmem_getattr, .setattr = shmem_setattr, .get_link = shmem_get_link, #ifdef CONFIG_TMPFS_XATTR .listxattr = shmem_listxattr, #endif }; static struct dentry *shmem_get_parent(struct dentry *child) { return ERR_PTR(-ESTALE); } static int shmem_match(struct inode *ino, void *vfh) { __u32 *fh = vfh; __u64 inum = fh[2]; inum = (inum << 32) | fh[1]; return ino->i_ino == inum && fh[0] == ino->i_generation; } /* Find any alias of inode, but prefer a hashed alias */ static struct dentry *shmem_find_alias(struct inode *inode) { struct dentry *alias = d_find_alias(inode); return alias ?: d_find_any_alias(inode); } static struct dentry *shmem_fh_to_dentry(struct super_block *sb, struct fid *fid, int fh_len, int fh_type) { struct inode *inode; struct dentry *dentry = NULL; u64 inum; if (fh_len < 3) return NULL; inum = fid->raw[2]; inum = (inum << 32) | fid->raw[1]; inode = ilookup5(sb, (unsigned long)(inum + fid->raw[0]), shmem_match, fid->raw); if (inode) { dentry = shmem_find_alias(inode); iput(inode); } return dentry; } static int shmem_encode_fh(struct inode *inode, __u32 *fh, int *len, struct inode *parent) { if (*len < 3) { *len = 3; return FILEID_INVALID; } if (inode_unhashed(inode)) { /* Unfortunately insert_inode_hash is not idempotent, * so as we hash inodes here rather than at creation * time, we need a lock to ensure we only try * to do it once */ static DEFINE_SPINLOCK(lock); spin_lock(&lock); if (inode_unhashed(inode)) __insert_inode_hash(inode, inode->i_ino + inode->i_generation); spin_unlock(&lock); } fh[0] = inode->i_generation; fh[1] = inode->i_ino; fh[2] = ((__u64)inode->i_ino) >> 32; *len = 3; return 1; } static const struct export_operations shmem_export_ops = { .get_parent = shmem_get_parent, .encode_fh = shmem_encode_fh, .fh_to_dentry = shmem_fh_to_dentry, }; enum shmem_param { Opt_gid, Opt_huge, Opt_mode, Opt_mpol, Opt_nr_blocks, Opt_nr_inodes, Opt_size, Opt_uid, Opt_inode32, Opt_inode64, Opt_noswap, Opt_quota, Opt_usrquota, Opt_grpquota, Opt_usrquota_block_hardlimit, Opt_usrquota_inode_hardlimit, Opt_grpquota_block_hardlimit, Opt_grpquota_inode_hardlimit, Opt_casefold_version, Opt_casefold, Opt_strict_encoding, }; static const struct constant_table shmem_param_enums_huge[] = { {"never", SHMEM_HUGE_NEVER }, {"always", SHMEM_HUGE_ALWAYS }, {"within_size", SHMEM_HUGE_WITHIN_SIZE }, {"advise", SHMEM_HUGE_ADVISE }, {} }; const struct fs_parameter_spec shmem_fs_parameters[] = { fsparam_gid ("gid", Opt_gid), fsparam_enum ("huge", Opt_huge, shmem_param_enums_huge), fsparam_u32oct("mode", Opt_mode), fsparam_string("mpol", Opt_mpol), fsparam_string("nr_blocks", Opt_nr_blocks), fsparam_string("nr_inodes", Opt_nr_inodes), fsparam_string("size", Opt_size), fsparam_uid ("uid", Opt_uid), fsparam_flag ("inode32", Opt_inode32), fsparam_flag ("inode64", Opt_inode64), fsparam_flag ("noswap", Opt_noswap), #ifdef CONFIG_TMPFS_QUOTA fsparam_flag ("quota", Opt_quota), fsparam_flag ("usrquota", Opt_usrquota), fsparam_flag ("grpquota", Opt_grpquota), fsparam_string("usrquota_block_hardlimit", Opt_usrquota_block_hardlimit), fsparam_string("usrquota_inode_hardlimit", Opt_usrquota_inode_hardlimit), fsparam_string("grpquota_block_hardlimit", Opt_grpquota_block_hardlimit), fsparam_string("grpquota_inode_hardlimit", Opt_grpquota_inode_hardlimit), #endif fsparam_string("casefold", Opt_casefold_version), fsparam_flag ("casefold", Opt_casefold), fsparam_flag ("strict_encoding", Opt_strict_encoding), {} }; #if IS_ENABLED(CONFIG_UNICODE) static int shmem_parse_opt_casefold(struct fs_context *fc, struct fs_parameter *param, bool latest_version) { struct shmem_options *ctx = fc->fs_private; int version = UTF8_LATEST; struct unicode_map *encoding; char *version_str = param->string + 5; if (!latest_version) { if (strncmp(param->string, "utf8-", 5)) return invalfc(fc, "Only UTF-8 encodings are supported " "in the format: utf8-<version number>"); version = utf8_parse_version(version_str); if (version < 0) return invalfc(fc, "Invalid UTF-8 version: %s", version_str); } encoding = utf8_load(version); if (IS_ERR(encoding)) { return invalfc(fc, "Failed loading UTF-8 version: utf8-%u.%u.%u\n", unicode_major(version), unicode_minor(version), unicode_rev(version)); } pr_info("tmpfs: Using encoding : utf8-%u.%u.%u\n", unicode_major(version), unicode_minor(version), unicode_rev(version)); ctx->encoding = encoding; return 0; } #else static int shmem_parse_opt_casefold(struct fs_context *fc, struct fs_parameter *param, bool latest_version) { return invalfc(fc, "tmpfs: Kernel not built with CONFIG_UNICODE\n"); } #endif static int shmem_parse_one(struct fs_context *fc, struct fs_parameter *param) { struct shmem_options *ctx = fc->fs_private; struct fs_parse_result result; unsigned long long size; char *rest; int opt; kuid_t kuid; kgid_t kgid; opt = fs_parse(fc, shmem_fs_parameters, param, &result); if (opt < 0) return opt; switch (opt) { case Opt_size: size = memparse(param->string, &rest); if (*rest == '%') { size <<= PAGE_SHIFT; size *= totalram_pages(); do_div(size, 100); rest++; } if (*rest) goto bad_value; ctx->blocks = DIV_ROUND_UP(size, PAGE_SIZE); ctx->seen |= SHMEM_SEEN_BLOCKS; break; case Opt_nr_blocks: ctx->blocks = memparse(param->string, &rest); if (*rest || ctx->blocks > LONG_MAX) goto bad_value; ctx->seen |= SHMEM_SEEN_BLOCKS; break; case Opt_nr_inodes: ctx->inodes = memparse(param->string, &rest); if (*rest || ctx->inodes > ULONG_MAX / BOGO_INODE_SIZE) goto bad_value; ctx->seen |= SHMEM_SEEN_INODES; break; case Opt_mode: ctx->mode = result.uint_32 & 07777; break; case Opt_uid: kuid = result.uid; /* * The requested uid must be representable in the * filesystem's idmapping. */ if (!kuid_has_mapping(fc->user_ns, kuid)) goto bad_value; ctx->uid = kuid; break; case Opt_gid: kgid = result.gid; /* * The requested gid must be representable in the * filesystem's idmapping. */ if (!kgid_has_mapping(fc->user_ns, kgid)) goto bad_value; ctx->gid = kgid; break; case Opt_huge: ctx->huge = result.uint_32; if (ctx->huge != SHMEM_HUGE_NEVER && !(IS_ENABLED(CONFIG_TRANSPARENT_HUGEPAGE) && has_transparent_hugepage())) goto unsupported_parameter; ctx->seen |= SHMEM_SEEN_HUGE; break; case Opt_mpol: if (IS_ENABLED(CONFIG_NUMA)) { mpol_put(ctx->mpol); ctx->mpol = NULL; if (mpol_parse_str(param->string, &ctx->mpol)) goto bad_value; break; } goto unsupported_parameter; case Opt_inode32: ctx->full_inums = false; ctx->seen |= SHMEM_SEEN_INUMS; break; case Opt_inode64: if (sizeof(ino_t) < 8) { return invalfc(fc, "Cannot use inode64 with <64bit inums in kernel\n"); } ctx->full_inums = true; ctx->seen |= SHMEM_SEEN_INUMS; break; case Opt_noswap: if ((fc->user_ns != &init_user_ns) || !capable(CAP_SYS_ADMIN)) { return invalfc(fc, "Turning off swap in unprivileged tmpfs mounts unsupported"); } ctx->noswap = true; ctx->seen |= SHMEM_SEEN_NOSWAP; break; case Opt_quota: if (fc->user_ns != &init_user_ns) return invalfc(fc, "Quotas in unprivileged tmpfs mounts are unsupported"); ctx->seen |= SHMEM_SEEN_QUOTA; ctx->quota_types |= (QTYPE_MASK_USR | QTYPE_MASK_GRP); break; case Opt_usrquota: if (fc->user_ns != &init_user_ns) return invalfc(fc, "Quotas in unprivileged tmpfs mounts are unsupported"); ctx->seen |= SHMEM_SEEN_QUOTA; ctx->quota_types |= QTYPE_MASK_USR; break; case Opt_grpquota: if (fc->user_ns != &init_user_ns) return invalfc(fc, "Quotas in unprivileged tmpfs mounts are unsupported"); ctx->seen |= SHMEM_SEEN_QUOTA; ctx->quota_types |= QTYPE_MASK_GRP; break; case Opt_usrquota_block_hardlimit: size = memparse(param->string, &rest); if (*rest || !size) goto bad_value; if (size > SHMEM_QUOTA_MAX_SPC_LIMIT) return invalfc(fc, "User quota block hardlimit too large."); ctx->qlimits.usrquota_bhardlimit = size; break; case Opt_grpquota_block_hardlimit: size = memparse(param->string, &rest); if (*rest || !size) goto bad_value; if (size > SHMEM_QUOTA_MAX_SPC_LIMIT) return invalfc(fc, "Group quota block hardlimit too large."); ctx->qlimits.grpquota_bhardlimit = size; break; case Opt_usrquota_inode_hardlimit: size = memparse(param->string, &rest); if (*rest || !size) goto bad_value; if (size > SHMEM_QUOTA_MAX_INO_LIMIT) return invalfc(fc, "User quota inode hardlimit too large."); ctx->qlimits.usrquota_ihardlimit = size; break; case Opt_grpquota_inode_hardlimit: size = memparse(param->string, &rest); if (*rest || !size) goto bad_value; if (size > SHMEM_QUOTA_MAX_INO_LIMIT) return invalfc(fc, "Group quota inode hardlimit too large."); ctx->qlimits.grpquota_ihardlimit = size; break; case Opt_casefold_version: return shmem_parse_opt_casefold(fc, param, false); case Opt_casefold: return shmem_parse_opt_casefold(fc, param, true); case Opt_strict_encoding: #if IS_ENABLED(CONFIG_UNICODE) ctx->strict_encoding = true; break; #else return invalfc(fc, "tmpfs: Kernel not built with CONFIG_UNICODE\n"); #endif } return 0; unsupported_parameter: return invalfc(fc, "Unsupported parameter '%s'", param->key); bad_value: return invalfc(fc, "Bad value for '%s'", param->key); } static char *shmem_next_opt(char **s) { char *sbegin = *s; char *p; if (sbegin == NULL) return NULL; /* * NUL-terminate this option: unfortunately, * mount options form a comma-separated list, * but mpol's nodelist may also contain commas. */ for (;;) { p = strchr(*s, ','); if (p == NULL) break; *s = p + 1; if (!isdigit(*(p+1))) { *p = '\0'; return sbegin; } } *s = NULL; return sbegin; } static int shmem_parse_monolithic(struct fs_context *fc, void *data) { return vfs_parse_monolithic_sep(fc, data, shmem_next_opt); } /* * Reconfigure a shmem filesystem. */ static int shmem_reconfigure(struct fs_context *fc) { struct shmem_options *ctx = fc->fs_private; struct shmem_sb_info *sbinfo = SHMEM_SB(fc->root->d_sb); unsigned long used_isp; struct mempolicy *mpol = NULL; const char *err; raw_spin_lock(&sbinfo->stat_lock); used_isp = sbinfo->max_inodes * BOGO_INODE_SIZE - sbinfo->free_ispace; if ((ctx->seen & SHMEM_SEEN_BLOCKS) && ctx->blocks) { if (!sbinfo->max_blocks) { err = "Cannot retroactively limit size"; goto out; } if (percpu_counter_compare(&sbinfo->used_blocks, ctx->blocks) > 0) { err = "Too small a size for current use"; goto out; } } if ((ctx->seen & SHMEM_SEEN_INODES) && ctx->inodes) { if (!sbinfo->max_inodes) { err = "Cannot retroactively limit inodes"; goto out; } if (ctx->inodes * BOGO_INODE_SIZE < used_isp) { err = "Too few inodes for current use"; goto out; } } if ((ctx->seen & SHMEM_SEEN_INUMS) && !ctx->full_inums && sbinfo->next_ino > UINT_MAX) { err = "Current inum too high to switch to 32-bit inums"; goto out; } if ((ctx->seen & SHMEM_SEEN_NOSWAP) && ctx->noswap && !sbinfo->noswap) { err = "Cannot disable swap on remount"; goto out; } if (!(ctx->seen & SHMEM_SEEN_NOSWAP) && !ctx->noswap && sbinfo->noswap) { err = "Cannot enable swap on remount if it was disabled on first mount"; goto out; } if (ctx->seen & SHMEM_SEEN_QUOTA && !sb_any_quota_loaded(fc->root->d_sb)) { err = "Cannot enable quota on remount"; goto out; } #ifdef CONFIG_TMPFS_QUOTA #define CHANGED_LIMIT(name) \ (ctx->qlimits.name## hardlimit && \ (ctx->qlimits.name## hardlimit != sbinfo->qlimits.name## hardlimit)) if (CHANGED_LIMIT(usrquota_b) || CHANGED_LIMIT(usrquota_i) || CHANGED_LIMIT(grpquota_b) || CHANGED_LIMIT(grpquota_i)) { err = "Cannot change global quota limit on remount"; goto out; } #endif /* CONFIG_TMPFS_QUOTA */ if (ctx->seen & SHMEM_SEEN_HUGE) sbinfo->huge = ctx->huge; if (ctx->seen & SHMEM_SEEN_INUMS) sbinfo->full_inums = ctx->full_inums; if (ctx->seen & SHMEM_SEEN_BLOCKS) sbinfo->max_blocks = ctx->blocks; if (ctx->seen & SHMEM_SEEN_INODES) { sbinfo->max_inodes = ctx->inodes; sbinfo->free_ispace = ctx->inodes * BOGO_INODE_SIZE - used_isp; } /* * Preserve previous mempolicy unless mpol remount option was specified. */ if (ctx->mpol) { mpol = sbinfo->mpol; sbinfo->mpol = ctx->mpol; /* transfers initial ref */ ctx->mpol = NULL; } if (ctx->noswap) sbinfo->noswap = true; raw_spin_unlock(&sbinfo->stat_lock); mpol_put(mpol); return 0; out: raw_spin_unlock(&sbinfo->stat_lock); return invalfc(fc, "%s", err); } static int shmem_show_options(struct seq_file *seq, struct dentry *root) { struct shmem_sb_info *sbinfo = SHMEM_SB(root->d_sb); struct mempolicy *mpol; if (sbinfo->max_blocks != shmem_default_max_blocks()) seq_printf(seq, ",size=%luk", K(sbinfo->max_blocks)); if (sbinfo->max_inodes != shmem_default_max_inodes()) seq_printf(seq, ",nr_inodes=%lu", sbinfo->max_inodes); if (sbinfo->mode != (0777 | S_ISVTX)) seq_printf(seq, ",mode=%03ho", sbinfo->mode); if (!uid_eq(sbinfo->uid, GLOBAL_ROOT_UID)) seq_printf(seq, ",uid=%u", from_kuid_munged(&init_user_ns, sbinfo->uid)); if (!gid_eq(sbinfo->gid, GLOBAL_ROOT_GID)) seq_printf(seq, ",gid=%u", from_kgid_munged(&init_user_ns, sbinfo->gid)); /* * Showing inode{64,32} might be useful even if it's the system default, * since then people don't have to resort to checking both here and * /proc/config.gz to confirm 64-bit inums were successfully applied * (which may not even exist if IKCONFIG_PROC isn't enabled). * * We hide it when inode64 isn't the default and we are using 32-bit * inodes, since that probably just means the feature isn't even under * consideration. * * As such: * * +-----------------+-----------------+ * | TMPFS_INODE64=y | TMPFS_INODE64=n | * +------------------+-----------------+-----------------+ * | full_inums=true | show | show | * | full_inums=false | show | hide | * +------------------+-----------------+-----------------+ * */ if (IS_ENABLED(CONFIG_TMPFS_INODE64) || sbinfo->full_inums) seq_printf(seq, ",inode%d", (sbinfo->full_inums ? 64 : 32)); #ifdef CONFIG_TRANSPARENT_HUGEPAGE /* Rightly or wrongly, show huge mount option unmasked by shmem_huge */ if (sbinfo->huge) seq_printf(seq, ",huge=%s", shmem_format_huge(sbinfo->huge)); #endif mpol = shmem_get_sbmpol(sbinfo); shmem_show_mpol(seq, mpol); mpol_put(mpol); if (sbinfo->noswap) seq_printf(seq, ",noswap"); #ifdef CONFIG_TMPFS_QUOTA if (sb_has_quota_active(root->d_sb, USRQUOTA)) seq_printf(seq, ",usrquota"); if (sb_has_quota_active(root->d_sb, GRPQUOTA)) seq_printf(seq, ",grpquota"); if (sbinfo->qlimits.usrquota_bhardlimit) seq_printf(seq, ",usrquota_block_hardlimit=%lld", sbinfo->qlimits.usrquota_bhardlimit); if (sbinfo->qlimits.grpquota_bhardlimit) seq_printf(seq, ",grpquota_block_hardlimit=%lld", sbinfo->qlimits.grpquota_bhardlimit); if (sbinfo->qlimits.usrquota_ihardlimit) seq_printf(seq, ",usrquota_inode_hardlimit=%lld", sbinfo->qlimits.usrquota_ihardlimit); if (sbinfo->qlimits.grpquota_ihardlimit) seq_printf(seq, ",grpquota_inode_hardlimit=%lld", sbinfo->qlimits.grpquota_ihardlimit); #endif return 0; } #endif /* CONFIG_TMPFS */ static void shmem_put_super(struct super_block *sb) { struct shmem_sb_info *sbinfo = SHMEM_SB(sb); #if IS_ENABLED(CONFIG_UNICODE) if (sb->s_encoding) utf8_unload(sb->s_encoding); #endif #ifdef CONFIG_TMPFS_QUOTA shmem_disable_quotas(sb); #endif free_percpu(sbinfo->ino_batch); percpu_counter_destroy(&sbinfo->used_blocks); mpol_put(sbinfo->mpol); kfree(sbinfo); sb->s_fs_info = NULL; } #if IS_ENABLED(CONFIG_UNICODE) && defined(CONFIG_TMPFS) static const struct dentry_operations shmem_ci_dentry_ops = { .d_hash = generic_ci_d_hash, .d_compare = generic_ci_d_compare, }; #endif static int shmem_fill_super(struct super_block *sb, struct fs_context *fc) { struct shmem_options *ctx = fc->fs_private; struct inode *inode; struct shmem_sb_info *sbinfo; int error = -ENOMEM; /* Round up to L1_CACHE_BYTES to resist false sharing */ sbinfo = kzalloc(max((int)sizeof(struct shmem_sb_info), L1_CACHE_BYTES), GFP_KERNEL); if (!sbinfo) return error; sb->s_fs_info = sbinfo; #ifdef CONFIG_TMPFS /* * Per default we only allow half of the physical ram per * tmpfs instance, limiting inodes to one per page of lowmem; * but the internal instance is left unlimited. */ if (!(sb->s_flags & SB_KERNMOUNT)) { if (!(ctx->seen & SHMEM_SEEN_BLOCKS)) ctx->blocks = shmem_default_max_blocks(); if (!(ctx->seen & SHMEM_SEEN_INODES)) ctx->inodes = shmem_default_max_inodes(); if (!(ctx->seen & SHMEM_SEEN_INUMS)) ctx->full_inums = IS_ENABLED(CONFIG_TMPFS_INODE64); sbinfo->noswap = ctx->noswap; } else { sb->s_flags |= SB_NOUSER; } sb->s_export_op = &shmem_export_ops; sb->s_flags |= SB_NOSEC | SB_I_VERSION; #if IS_ENABLED(CONFIG_UNICODE) if (!ctx->encoding && ctx->strict_encoding) { pr_err("tmpfs: strict_encoding option without encoding is forbidden\n"); error = -EINVAL; goto failed; } if (ctx->encoding) { sb->s_encoding = ctx->encoding; set_default_d_op(sb, &shmem_ci_dentry_ops); if (ctx->strict_encoding) sb->s_encoding_flags = SB_ENC_STRICT_MODE_FL; } #endif #else sb->s_flags |= SB_NOUSER; #endif /* CONFIG_TMPFS */ sb->s_d_flags |= DCACHE_DONTCACHE; sbinfo->max_blocks = ctx->blocks; sbinfo->max_inodes = ctx->inodes; sbinfo->free_ispace = sbinfo->max_inodes * BOGO_INODE_SIZE; if (sb->s_flags & SB_KERNMOUNT) { sbinfo->ino_batch = alloc_percpu(ino_t); if (!sbinfo->ino_batch) goto failed; } sbinfo->uid = ctx->uid; sbinfo->gid = ctx->gid; sbinfo->full_inums = ctx->full_inums; sbinfo->mode = ctx->mode; #ifdef CONFIG_TRANSPARENT_HUGEPAGE if (ctx->seen & SHMEM_SEEN_HUGE) sbinfo->huge = ctx->huge; else sbinfo->huge = tmpfs_huge; #endif sbinfo->mpol = ctx->mpol; ctx->mpol = NULL; raw_spin_lock_init(&sbinfo->stat_lock); if (percpu_counter_init(&sbinfo->used_blocks, 0, GFP_KERNEL)) goto failed; spin_lock_init(&sbinfo->shrinklist_lock); INIT_LIST_HEAD(&sbinfo->shrinklist); sb->s_maxbytes = MAX_LFS_FILESIZE; sb->s_blocksize = PAGE_SIZE; sb->s_blocksize_bits = PAGE_SHIFT; sb->s_magic = TMPFS_MAGIC; sb->s_op = &shmem_ops; sb->s_time_gran = 1; #ifdef CONFIG_TMPFS_XATTR sb->s_xattr = shmem_xattr_handlers; #endif #ifdef CONFIG_TMPFS_POSIX_ACL sb->s_flags |= SB_POSIXACL; #endif uuid_t uuid; uuid_gen(&uuid); super_set_uuid(sb, uuid.b, sizeof(uuid)); #ifdef CONFIG_TMPFS_QUOTA if (ctx->seen & SHMEM_SEEN_QUOTA) { sb->dq_op = &shmem_quota_operations; sb->s_qcop = &dquot_quotactl_sysfile_ops; sb->s_quota_types = QTYPE_MASK_USR | QTYPE_MASK_GRP; /* Copy the default limits from ctx into sbinfo */ memcpy(&sbinfo->qlimits, &ctx->qlimits, sizeof(struct shmem_quota_limits)); if (shmem_enable_quotas(sb, ctx->quota_types)) goto failed; } #endif /* CONFIG_TMPFS_QUOTA */ inode = shmem_get_inode(&nop_mnt_idmap, sb, NULL, S_IFDIR | sbinfo->mode, 0, VM_NORESERVE); if (IS_ERR(inode)) { error = PTR_ERR(inode); goto failed; } inode->i_uid = sbinfo->uid; inode->i_gid = sbinfo->gid; sb->s_root = d_make_root(inode); if (!sb->s_root) goto failed; return 0; failed: shmem_put_super(sb); return error; } static int shmem_get_tree(struct fs_context *fc) { return get_tree_nodev(fc, shmem_fill_super); } static void shmem_free_fc(struct fs_context *fc) { struct shmem_options *ctx = fc->fs_private; if (ctx) { mpol_put(ctx->mpol); kfree(ctx); } } static const struct fs_context_operations shmem_fs_context_ops = { .free = shmem_free_fc, .get_tree = shmem_get_tree, #ifdef CONFIG_TMPFS .parse_monolithic = shmem_parse_monolithic, .parse_param = shmem_parse_one, .reconfigure = shmem_reconfigure, #endif }; static struct kmem_cache *shmem_inode_cachep __ro_after_init; static struct inode *shmem_alloc_inode(struct super_block *sb) { struct shmem_inode_info *info; info = alloc_inode_sb(sb, shmem_inode_cachep, GFP_KERNEL); if (!info) return NULL; return &info->vfs_inode; } static void shmem_free_in_core_inode(struct inode *inode) { if (S_ISLNK(inode->i_mode)) kfree(inode->i_link); kmem_cache_free(shmem_inode_cachep, SHMEM_I(inode)); } static void shmem_destroy_inode(struct inode *inode) { if (S_ISREG(inode->i_mode)) mpol_free_shared_policy(&SHMEM_I(inode)->policy); if (S_ISDIR(inode->i_mode)) simple_offset_destroy(shmem_get_offset_ctx(inode)); } static void shmem_init_inode(void *foo) { struct shmem_inode_info *info = foo; inode_init_once(&info->vfs_inode); } static void __init shmem_init_inodecache(void) { shmem_inode_cachep = kmem_cache_create("shmem_inode_cache", sizeof(struct shmem_inode_info), 0, SLAB_PANIC|SLAB_ACCOUNT, shmem_init_inode); } static void __init shmem_destroy_inodecache(void) { kmem_cache_destroy(shmem_inode_cachep); } /* Keep the page in page cache instead of truncating it */ static int shmem_error_remove_folio(struct address_space *mapping, struct folio *folio) { return 0; } static const struct address_space_operations shmem_aops = { .dirty_folio = noop_dirty_folio, #ifdef CONFIG_TMPFS .write_begin = shmem_write_begin, .write_end = shmem_write_end, #endif #ifdef CONFIG_MIGRATION .migrate_folio = migrate_folio, #endif .error_remove_folio = shmem_error_remove_folio, }; static const struct file_operations shmem_file_operations = { .mmap = shmem_mmap, .open = shmem_file_open, .get_unmapped_area = shmem_get_unmapped_area, #ifdef CONFIG_TMPFS .llseek = shmem_file_llseek, .read_iter = shmem_file_read_iter, .write_iter = shmem_file_write_iter, .fsync = noop_fsync, .splice_read = shmem_file_splice_read, .splice_write = iter_file_splice_write, .fallocate = shmem_fallocate, #endif }; static const struct inode_operations shmem_inode_operations = { .getattr = shmem_getattr, .setattr = shmem_setattr, #ifdef CONFIG_TMPFS_XATTR .listxattr = shmem_listxattr, .set_acl = simple_set_acl, .fileattr_get = shmem_fileattr_get, .fileattr_set = shmem_fileattr_set, #endif }; static const struct inode_operations shmem_dir_inode_operations = { #ifdef CONFIG_TMPFS .getattr = shmem_getattr, .create = shmem_create, .lookup = simple_lookup, .link = shmem_link, .unlink = shmem_unlink, .symlink = shmem_symlink, .mkdir = shmem_mkdir, .rmdir = shmem_rmdir, .mknod = shmem_mknod, .rename = shmem_rename2, .tmpfile = shmem_tmpfile, .get_offset_ctx = shmem_get_offset_ctx, #endif #ifdef CONFIG_TMPFS_XATTR .listxattr = shmem_listxattr, .fileattr_get = shmem_fileattr_get, .fileattr_set = shmem_fileattr_set, #endif #ifdef CONFIG_TMPFS_POSIX_ACL .setattr = shmem_setattr, .set_acl = simple_set_acl, #endif }; static const struct inode_operations shmem_special_inode_operations = { .getattr = shmem_getattr, #ifdef CONFIG_TMPFS_XATTR .listxattr = shmem_listxattr, #endif #ifdef CONFIG_TMPFS_POSIX_ACL .setattr = shmem_setattr, .set_acl = simple_set_acl, #endif }; static const struct super_operations shmem_ops = { .alloc_inode = shmem_alloc_inode, .free_inode = shmem_free_in_core_inode, .destroy_inode = shmem_destroy_inode, #ifdef CONFIG_TMPFS .statfs = shmem_statfs, .show_options = shmem_show_options, #endif #ifdef CONFIG_TMPFS_QUOTA .get_dquots = shmem_get_dquots, #endif .evict_inode = shmem_evict_inode, .drop_inode = generic_delete_inode, .put_super = shmem_put_super, #ifdef CONFIG_TRANSPARENT_HUGEPAGE .nr_cached_objects = shmem_unused_huge_count, .free_cached_objects = shmem_unused_huge_scan, #endif }; static const struct vm_operations_struct shmem_vm_ops = { .fault = shmem_fault, .map_pages = filemap_map_pages, #ifdef CONFIG_NUMA .set_policy = shmem_set_policy, .get_policy = shmem_get_policy, #endif }; static const struct vm_operations_struct shmem_anon_vm_ops = { .fault = shmem_fault, .map_pages = filemap_map_pages, #ifdef CONFIG_NUMA .set_policy = shmem_set_policy, .get_policy = shmem_get_policy, #endif }; int shmem_init_fs_context(struct fs_context *fc) { struct shmem_options *ctx; ctx = kzalloc(sizeof(struct shmem_options), GFP_KERNEL); if (!ctx) return -ENOMEM; ctx->mode = 0777 | S_ISVTX; ctx->uid = current_fsuid(); ctx->gid = current_fsgid(); #if IS_ENABLED(CONFIG_UNICODE) ctx->encoding = NULL; #endif fc->fs_private = ctx; fc->ops = &shmem_fs_context_ops; return 0; } static struct file_system_type shmem_fs_type = { .owner = THIS_MODULE, .name = "tmpfs", .init_fs_context = shmem_init_fs_context, #ifdef CONFIG_TMPFS .parameters = shmem_fs_parameters, #endif .kill_sb = kill_litter_super, .fs_flags = FS_USERNS_MOUNT | FS_ALLOW_IDMAP | FS_MGTIME, }; #if defined(CONFIG_SYSFS) && defined(CONFIG_TMPFS) #define __INIT_KOBJ_ATTR(_name, _mode, _show, _store) \ { \ .attr = { .name = __stringify(_name), .mode = _mode }, \ .show = _show, \ .store = _store, \ } #define TMPFS_ATTR_W(_name, _store) \ static struct kobj_attribute tmpfs_attr_##_name = \ __INIT_KOBJ_ATTR(_name, 0200, NULL, _store) #define TMPFS_ATTR_RW(_name, _show, _store) \ static struct kobj_attribute tmpfs_attr_##_name = \ __INIT_KOBJ_ATTR(_name, 0644, _show, _store) #define TMPFS_ATTR_RO(_name, _show) \ static struct kobj_attribute tmpfs_attr_##_name = \ __INIT_KOBJ_ATTR(_name, 0444, _show, NULL) #if IS_ENABLED(CONFIG_UNICODE) static ssize_t casefold_show(struct kobject *kobj, struct kobj_attribute *a, char *buf) { return sysfs_emit(buf, "supported\n"); } TMPFS_ATTR_RO(casefold, casefold_show); #endif static struct attribute *tmpfs_attributes[] = { #if IS_ENABLED(CONFIG_UNICODE) &tmpfs_attr_casefold.attr, #endif NULL }; static const struct attribute_group tmpfs_attribute_group = { .attrs = tmpfs_attributes, .name = "features" }; static struct kobject *tmpfs_kobj; static int __init tmpfs_sysfs_init(void) { int ret; tmpfs_kobj = kobject_create_and_add("tmpfs", fs_kobj); if (!tmpfs_kobj) return -ENOMEM; ret = sysfs_create_group(tmpfs_kobj, &tmpfs_attribute_group); if (ret) kobject_put(tmpfs_kobj); return ret; } #endif /* CONFIG_SYSFS && CONFIG_TMPFS */ void __init shmem_init(void) { int error; shmem_init_inodecache(); #ifdef CONFIG_TMPFS_QUOTA register_quota_format(&shmem_quota_format); #endif error = register_filesystem(&shmem_fs_type); if (error) { pr_err("Could not register tmpfs\n"); goto out2; } shm_mnt = kern_mount(&shmem_fs_type); if (IS_ERR(shm_mnt)) { error = PTR_ERR(shm_mnt); pr_err("Could not kern_mount tmpfs\n"); goto out1; } #if defined(CONFIG_SYSFS) && defined(CONFIG_TMPFS) error = tmpfs_sysfs_init(); if (error) { pr_err("Could not init tmpfs sysfs\n"); goto out1; } #endif #ifdef CONFIG_TRANSPARENT_HUGEPAGE if (has_transparent_hugepage() && shmem_huge > SHMEM_HUGE_DENY) SHMEM_SB(shm_mnt->mnt_sb)->huge = shmem_huge; else shmem_huge = SHMEM_HUGE_NEVER; /* just in case it was patched */ /* * Default to setting PMD-sized THP to inherit the global setting and * disable all other multi-size THPs. */ if (!shmem_orders_configured) huge_shmem_orders_inherit = BIT(HPAGE_PMD_ORDER); #endif return; out1: unregister_filesystem(&shmem_fs_type); out2: #ifdef CONFIG_TMPFS_QUOTA unregister_quota_format(&shmem_quota_format); #endif shmem_destroy_inodecache(); shm_mnt = ERR_PTR(error); } #if defined(CONFIG_TRANSPARENT_HUGEPAGE) && defined(CONFIG_SYSFS) static ssize_t shmem_enabled_show(struct kobject *kobj, struct kobj_attribute *attr, char *buf) { static const int values[] = { SHMEM_HUGE_ALWAYS, SHMEM_HUGE_WITHIN_SIZE, SHMEM_HUGE_ADVISE, SHMEM_HUGE_NEVER, SHMEM_HUGE_DENY, SHMEM_HUGE_FORCE, }; int len = 0; int i; for (i = 0; i < ARRAY_SIZE(values); i++) { len += sysfs_emit_at(buf, len, shmem_huge == values[i] ? "%s[%s]" : "%s%s", i ? " " : "", shmem_format_huge(values[i])); } len += sysfs_emit_at(buf, len, "\n"); return len; } static ssize_t shmem_enabled_store(struct kobject *kobj, struct kobj_attribute *attr, const char *buf, size_t count) { char tmp[16]; int huge, err; if (count + 1 > sizeof(tmp)) return -EINVAL; memcpy(tmp, buf, count); tmp[count] = '\0'; if (count && tmp[count - 1] == '\n') tmp[count - 1] = '\0'; huge = shmem_parse_huge(tmp); if (huge == -EINVAL) return huge; shmem_huge = huge; if (shmem_huge > SHMEM_HUGE_DENY) SHMEM_SB(shm_mnt->mnt_sb)->huge = shmem_huge; err = start_stop_khugepaged(); return err ? err : count; } struct kobj_attribute shmem_enabled_attr = __ATTR_RW(shmem_enabled); static DEFINE_SPINLOCK(huge_shmem_orders_lock); static ssize_t thpsize_shmem_enabled_show(struct kobject *kobj, struct kobj_attribute *attr, char *buf) { int order = to_thpsize(kobj)->order; const char *output; if (test_bit(order, &huge_shmem_orders_always)) output = "[always] inherit within_size advise never"; else if (test_bit(order, &huge_shmem_orders_inherit)) output = "always [inherit] within_size advise never"; else if (test_bit(order, &huge_shmem_orders_within_size)) output = "always inherit [within_size] advise never"; else if (test_bit(order, &huge_shmem_orders_madvise)) output = "always inherit within_size [advise] never"; else output = "always inherit within_size advise [never]"; return sysfs_emit(buf, "%s\n", output); } static ssize_t thpsize_shmem_enabled_store(struct kobject *kobj, struct kobj_attribute *attr, const char *buf, size_t count) { int order = to_thpsize(kobj)->order; ssize_t ret = count; if (sysfs_streq(buf, "always")) { spin_lock(&huge_shmem_orders_lock); clear_bit(order, &huge_shmem_orders_inherit); clear_bit(order, &huge_shmem_orders_madvise); clear_bit(order, &huge_shmem_orders_within_size); set_bit(order, &huge_shmem_orders_always); spin_unlock(&huge_shmem_orders_lock); } else if (sysfs_streq(buf, "inherit")) { /* Do not override huge allocation policy with non-PMD sized mTHP */ if (shmem_huge == SHMEM_HUGE_FORCE && order != HPAGE_PMD_ORDER) return -EINVAL; spin_lock(&huge_shmem_orders_lock); clear_bit(order, &huge_shmem_orders_always); clear_bit(order, &huge_shmem_orders_madvise); clear_bit(order, &huge_shmem_orders_within_size); set_bit(order, &huge_shmem_orders_inherit); spin_unlock(&huge_shmem_orders_lock); } else if (sysfs_streq(buf, "within_size")) { spin_lock(&huge_shmem_orders_lock); clear_bit(order, &huge_shmem_orders_always); clear_bit(order, &huge_shmem_orders_inherit); clear_bit(order, &huge_shmem_orders_madvise); set_bit(order, &huge_shmem_orders_within_size); spin_unlock(&huge_shmem_orders_lock); } else if (sysfs_streq(buf, "advise")) { spin_lock(&huge_shmem_orders_lock); clear_bit(order, &huge_shmem_orders_always); clear_bit(order, &huge_shmem_orders_inherit); clear_bit(order, &huge_shmem_orders_within_size); set_bit(order, &huge_shmem_orders_madvise); spin_unlock(&huge_shmem_orders_lock); } else if (sysfs_streq(buf, "never")) { spin_lock(&huge_shmem_orders_lock); clear_bit(order, &huge_shmem_orders_always); clear_bit(order, &huge_shmem_orders_inherit); clear_bit(order, &huge_shmem_orders_within_size); clear_bit(order, &huge_shmem_orders_madvise); spin_unlock(&huge_shmem_orders_lock); } else { ret = -EINVAL; } if (ret > 0) { int err = start_stop_khugepaged(); if (err) ret = err; } return ret; } struct kobj_attribute thpsize_shmem_enabled_attr = __ATTR(shmem_enabled, 0644, thpsize_shmem_enabled_show, thpsize_shmem_enabled_store); #endif /* CONFIG_TRANSPARENT_HUGEPAGE && CONFIG_SYSFS */ #if defined(CONFIG_TRANSPARENT_HUGEPAGE) static int __init setup_transparent_hugepage_shmem(char *str) { int huge; huge = shmem_parse_huge(str); if (huge == -EINVAL) { pr_warn("transparent_hugepage_shmem= cannot parse, ignored\n"); return huge; } shmem_huge = huge; return 1; } __setup("transparent_hugepage_shmem=", setup_transparent_hugepage_shmem); static int __init setup_transparent_hugepage_tmpfs(char *str) { int huge; huge = shmem_parse_huge(str); if (huge < 0) { pr_warn("transparent_hugepage_tmpfs= cannot parse, ignored\n"); return huge; } tmpfs_huge = huge; return 1; } __setup("transparent_hugepage_tmpfs=", setup_transparent_hugepage_tmpfs); static char str_dup[PAGE_SIZE] __initdata; static int __init setup_thp_shmem(char *str) { char *token, *range, *policy, *subtoken; unsigned long always, inherit, madvise, within_size; char *start_size, *end_size; int start, end, nr; char *p; if (!str || strlen(str) + 1 > PAGE_SIZE) goto err; strscpy(str_dup, str); always = huge_shmem_orders_always; inherit = huge_shmem_orders_inherit; madvise = huge_shmem_orders_madvise; within_size = huge_shmem_orders_within_size; p = str_dup; while ((token = strsep(&p, ";")) != NULL) { range = strsep(&token, ":"); policy = token; if (!policy) goto err; while ((subtoken = strsep(&range, ",")) != NULL) { if (strchr(subtoken, '-')) { start_size = strsep(&subtoken, "-"); end_size = subtoken; start = get_order_from_str(start_size, THP_ORDERS_ALL_FILE_DEFAULT); end = get_order_from_str(end_size, THP_ORDERS_ALL_FILE_DEFAULT); } else { start_size = end_size = subtoken; start = end = get_order_from_str(subtoken, THP_ORDERS_ALL_FILE_DEFAULT); } if (start < 0) { pr_err("invalid size %s in thp_shmem boot parameter\n", start_size); goto err; } if (end < 0) { pr_err("invalid size %s in thp_shmem boot parameter\n", end_size); goto err; } if (start > end) goto err; nr = end - start + 1; if (!strcmp(policy, "always")) { bitmap_set(&always, start, nr); bitmap_clear(&inherit, start, nr); bitmap_clear(&madvise, start, nr); bitmap_clear(&within_size, start, nr); } else if (!strcmp(policy, "advise")) { bitmap_set(&madvise, start, nr); bitmap_clear(&inherit, start, nr); bitmap_clear(&always, start, nr); bitmap_clear(&within_size, start, nr); } else if (!strcmp(policy, "inherit")) { bitmap_set(&inherit, start, nr); bitmap_clear(&madvise, start, nr); bitmap_clear(&always, start, nr); bitmap_clear(&within_size, start, nr); } else if (!strcmp(policy, "within_size")) { bitmap_set(&within_size, start, nr); bitmap_clear(&inherit, start, nr); bitmap_clear(&madvise, start, nr); bitmap_clear(&always, start, nr); } else if (!strcmp(policy, "never")) { bitmap_clear(&inherit, start, nr); bitmap_clear(&madvise, start, nr); bitmap_clear(&always, start, nr); bitmap_clear(&within_size, start, nr); } else { pr_err("invalid policy %s in thp_shmem boot parameter\n", policy); goto err; } } } huge_shmem_orders_always = always; huge_shmem_orders_madvise = madvise; huge_shmem_orders_inherit = inherit; huge_shmem_orders_within_size = within_size; shmem_orders_configured = true; return 1; err: pr_warn("thp_shmem=%s: error parsing string, ignoring setting\n", str); return 0; } __setup("thp_shmem=", setup_thp_shmem); #endif /* CONFIG_TRANSPARENT_HUGEPAGE */ #else /* !CONFIG_SHMEM */ /* * tiny-shmem: simple shmemfs and tmpfs using ramfs code * * This is intended for small system where the benefits of the full * shmem code (swap-backed and resource-limited) are outweighed by * their complexity. On systems without swap this code should be * effectively equivalent, but much lighter weight. */ static struct file_system_type shmem_fs_type = { .name = "tmpfs", .init_fs_context = ramfs_init_fs_context, .parameters = ramfs_fs_parameters, .kill_sb = ramfs_kill_sb, .fs_flags = FS_USERNS_MOUNT, }; void __init shmem_init(void) { BUG_ON(register_filesystem(&shmem_fs_type) != 0); shm_mnt = kern_mount(&shmem_fs_type); BUG_ON(IS_ERR(shm_mnt)); } int shmem_unuse(unsigned int type) { return 0; } int shmem_lock(struct file *file, int lock, struct ucounts *ucounts) { return 0; } void shmem_unlock_mapping(struct address_space *mapping) { } #ifdef CONFIG_MMU unsigned long shmem_get_unmapped_area(struct file *file, unsigned long addr, unsigned long len, unsigned long pgoff, unsigned long flags) { return mm_get_unmapped_area(current->mm, file, addr, len, pgoff, flags); } #endif void shmem_truncate_range(struct inode *inode, loff_t lstart, loff_t lend) { truncate_inode_pages_range(inode->i_mapping, lstart, lend); } EXPORT_SYMBOL_GPL(shmem_truncate_range); #define shmem_vm_ops generic_file_vm_ops #define shmem_anon_vm_ops generic_file_vm_ops #define shmem_file_operations ramfs_file_operations #define shmem_acct_size(flags, size) 0 #define shmem_unacct_size(flags, size) do {} while (0) static inline struct inode *shmem_get_inode(struct mnt_idmap *idmap, struct super_block *sb, struct inode *dir, umode_t mode, dev_t dev, unsigned long flags) { struct inode *inode = ramfs_get_inode(sb, dir, mode, dev); return inode ? inode : ERR_PTR(-ENOSPC); } #endif /* CONFIG_SHMEM */ /* common code */ static struct file *__shmem_file_setup(struct vfsmount *mnt, const char *name, loff_t size, unsigned long flags, unsigned int i_flags) { struct inode *inode; struct file *res; if (IS_ERR(mnt)) return ERR_CAST(mnt); if (size < 0 || size > MAX_LFS_FILESIZE) return ERR_PTR(-EINVAL); if (is_idmapped_mnt(mnt)) return ERR_PTR(-EINVAL); if (shmem_acct_size(flags, size)) return ERR_PTR(-ENOMEM); inode = shmem_get_inode(&nop_mnt_idmap, mnt->mnt_sb, NULL, S_IFREG | S_IRWXUGO, 0, flags); if (IS_ERR(inode)) { shmem_unacct_size(flags, size); return ERR_CAST(inode); } inode->i_flags |= i_flags; inode->i_size = size; clear_nlink(inode); /* It is unlinked */ res = ERR_PTR(ramfs_nommu_expand_for_mapping(inode, size)); if (!IS_ERR(res)) res = alloc_file_pseudo(inode, mnt, name, O_RDWR, &shmem_file_operations); if (IS_ERR(res)) iput(inode); return res; } /** * shmem_kernel_file_setup - get an unlinked file living in tmpfs which must be * kernel internal. There will be NO LSM permission checks against the * underlying inode. So users of this interface must do LSM checks at a * higher layer. The users are the big_key and shm implementations. LSM * checks are provided at the key or shm level rather than the inode. * @name: name for dentry (to be seen in /proc/<pid>/maps) * @size: size to be set for the file * @flags: VM_NORESERVE suppresses pre-accounting of the entire object size */ struct file *shmem_kernel_file_setup(const char *name, loff_t size, unsigned long flags) { return __shmem_file_setup(shm_mnt, name, size, flags, S_PRIVATE); } EXPORT_SYMBOL_GPL(shmem_kernel_file_setup); /** * shmem_file_setup - get an unlinked file living in tmpfs * @name: name for dentry (to be seen in /proc/<pid>/maps) * @size: size to be set for the file * @flags: VM_NORESERVE suppresses pre-accounting of the entire object size */ struct file *shmem_file_setup(const char *name, loff_t size, unsigned long flags) { return __shmem_file_setup(shm_mnt, name, size, flags, 0); } EXPORT_SYMBOL_GPL(shmem_file_setup); /** * shmem_file_setup_with_mnt - get an unlinked file living in tmpfs * @mnt: the tmpfs mount where the file will be created * @name: name for dentry (to be seen in /proc/<pid>/maps) * @size: size to be set for the file * @flags: VM_NORESERVE suppresses pre-accounting of the entire object size */ struct file *shmem_file_setup_with_mnt(struct vfsmount *mnt, const char *name, loff_t size, unsigned long flags) { return __shmem_file_setup(mnt, name, size, flags, 0); } EXPORT_SYMBOL_GPL(shmem_file_setup_with_mnt); /** * shmem_zero_setup - setup a shared anonymous mapping * @vma: the vma to be mmapped is prepared by do_mmap */ int shmem_zero_setup(struct vm_area_struct *vma) { struct file *file; loff_t size = vma->vm_end - vma->vm_start; /* * Cloning a new file under mmap_lock leads to a lock ordering conflict * between XFS directory reading and selinux: since this file is only * accessible to the user through its mapping, use S_PRIVATE flag to * bypass file security, in the same way as shmem_kernel_file_setup(). */ file = shmem_kernel_file_setup("dev/zero", size, vma->vm_flags); if (IS_ERR(file)) return PTR_ERR(file); if (vma->vm_file) fput(vma->vm_file); vma->vm_file = file; vma->vm_ops = &shmem_anon_vm_ops; return 0; } /** * shmem_read_folio_gfp - read into page cache, using specified page allocation flags. * @mapping: the folio's address_space * @index: the folio index * @gfp: the page allocator flags to use if allocating * * This behaves as a tmpfs "read_cache_page_gfp(mapping, index, gfp)", * with any new page allocations done using the specified allocation flags. * But read_cache_page_gfp() uses the ->read_folio() method: which does not * suit tmpfs, since it may have pages in swapcache, and needs to find those * for itself; although drivers/gpu/drm i915 and ttm rely upon this support. * * i915_gem_object_get_pages_gtt() mixes __GFP_NORETRY | __GFP_NOWARN in * with the mapping_gfp_mask(), to avoid OOMing the machine unnecessarily. */ struct folio *shmem_read_folio_gfp(struct address_space *mapping, pgoff_t index, gfp_t gfp) { #ifdef CONFIG_SHMEM struct inode *inode = mapping->host; struct folio *folio; int error; error = shmem_get_folio_gfp(inode, index, i_size_read(inode), &folio, SGP_CACHE, gfp, NULL, NULL); if (error) return ERR_PTR(error); folio_unlock(folio); return folio; #else /* * The tiny !SHMEM case uses ramfs without swap */ return mapping_read_folio_gfp(mapping, index, gfp); #endif } EXPORT_SYMBOL_GPL(shmem_read_folio_gfp); struct page *shmem_read_mapping_page_gfp(struct address_space *mapping, pgoff_t index, gfp_t gfp) { struct folio *folio = shmem_read_folio_gfp(mapping, index, gfp); struct page *page; if (IS_ERR(folio)) return &folio->page; page = folio_file_page(folio, index); if (PageHWPoison(page)) { folio_put(folio); return ERR_PTR(-EIO); } return page; } EXPORT_SYMBOL_GPL(shmem_read_mapping_page_gfp);
29 1181 1181 213 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 /* SPDX-License-Identifier: GPL-2.0-only */ /* * Copyright (C) 2012 ARM Ltd. */ #ifndef __ASM_FP_H #define __ASM_FP_H #include <asm/errno.h> #include <asm/percpu.h> #include <asm/ptrace.h> #include <asm/processor.h> #include <asm/sigcontext.h> #include <asm/sysreg.h> #ifndef __ASSEMBLY__ #include <linux/bitmap.h> #include <linux/build_bug.h> #include <linux/bug.h> #include <linux/cache.h> #include <linux/init.h> #include <linux/stddef.h> #include <linux/types.h> /* Masks for extracting the FPSR and FPCR from the FPSCR */ #define VFP_FPSCR_STAT_MASK 0xf800009f #define VFP_FPSCR_CTRL_MASK 0x07f79f00 /* * The VFP state has 32x64-bit registers and a single 32-bit * control/status register. */ #define VFP_STATE_SIZE ((32 * 8) + 4) static inline unsigned long cpacr_save_enable_kernel_sve(void) { unsigned long old = read_sysreg(cpacr_el1); unsigned long set = CPACR_EL1_FPEN_EL1EN | CPACR_EL1_ZEN_EL1EN; write_sysreg(old | set, cpacr_el1); isb(); return old; } static inline unsigned long cpacr_save_enable_kernel_sme(void) { unsigned long old = read_sysreg(cpacr_el1); unsigned long set = CPACR_EL1_FPEN_EL1EN | CPACR_EL1_SMEN_EL1EN; write_sysreg(old | set, cpacr_el1); isb(); return old; } static inline void cpacr_restore(unsigned long cpacr) { write_sysreg(cpacr, cpacr_el1); isb(); } /* * When we defined the maximum SVE vector length we defined the ABI so * that the maximum vector length included all the reserved for future * expansion bits in ZCR rather than those just currently defined by * the architecture. Using this length to allocate worst size buffers * results in excessively large allocations, and this effect is even * more pronounced for SME due to ZA. Define more suitable VLs for * these situations. */ #define ARCH_SVE_VQ_MAX ((ZCR_ELx_LEN_MASK >> ZCR_ELx_LEN_SHIFT) + 1) #define SME_VQ_MAX ((SMCR_ELx_LEN_MASK >> SMCR_ELx_LEN_SHIFT) + 1) struct task_struct; extern void fpsimd_save_state(struct user_fpsimd_state *state); extern void fpsimd_load_state(struct user_fpsimd_state *state); extern void fpsimd_thread_switch(struct task_struct *next); extern void fpsimd_flush_thread(void); extern void fpsimd_preserve_current_state(void); extern void fpsimd_restore_current_state(void); extern void fpsimd_update_current_state(struct user_fpsimd_state const *state); struct cpu_fp_state { struct user_fpsimd_state *st; void *sve_state; void *sme_state; u64 *svcr; u64 *fpmr; unsigned int sve_vl; unsigned int sme_vl; enum fp_type *fp_type; enum fp_type to_save; }; DECLARE_PER_CPU(struct cpu_fp_state, fpsimd_last_state); extern void fpsimd_bind_state_to_cpu(struct cpu_fp_state *fp_state); extern void fpsimd_flush_task_state(struct task_struct *target); extern void fpsimd_save_and_flush_current_state(void); extern void fpsimd_save_and_flush_cpu_state(void); static inline bool thread_sm_enabled(struct thread_struct *thread) { return system_supports_sme() && (thread->svcr & SVCR_SM_MASK); } static inline bool thread_za_enabled(struct thread_struct *thread) { return system_supports_sme() && (thread->svcr & SVCR_ZA_MASK); } extern void task_smstop_sm(struct task_struct *task); /* Maximum VL that SVE/SME VL-agnostic software can transparently support */ #define VL_ARCH_MAX 0x100 /* Offset of FFR in the SVE register dump */ static inline size_t sve_ffr_offset(int vl) { return SVE_SIG_FFR_OFFSET(sve_vq_from_vl(vl)) - SVE_SIG_REGS_OFFSET; } static inline void *sve_pffr(struct thread_struct *thread) { unsigned int vl; if (system_supports_sme() && thread_sm_enabled(thread)) vl = thread_get_sme_vl(thread); else vl = thread_get_sve_vl(thread); return (char *)thread->sve_state + sve_ffr_offset(vl); } static inline void *thread_zt_state(struct thread_struct *thread) { /* The ZT register state is stored immediately after the ZA state */ unsigned int sme_vq = sve_vq_from_vl(thread_get_sme_vl(thread)); return thread->sme_state + ZA_SIG_REGS_SIZE(sme_vq); } extern void sve_save_state(void *state, u32 *pfpsr, int save_ffr); extern void sve_load_state(void const *state, u32 const *pfpsr, int restore_ffr); extern void sve_flush_live(bool flush_ffr, unsigned long vq_minus_1); extern unsigned int sve_get_vl(void); extern void sve_set_vq(unsigned long vq_minus_1); extern void sme_set_vq(unsigned long vq_minus_1); extern void sme_save_state(void *state, int zt); extern void sme_load_state(void const *state, int zt); struct arm64_cpu_capabilities; extern void cpu_enable_fpsimd(const struct arm64_cpu_capabilities *__unused); extern void cpu_enable_sve(const struct arm64_cpu_capabilities *__unused); extern void cpu_enable_sme(const struct arm64_cpu_capabilities *__unused); extern void cpu_enable_sme2(const struct arm64_cpu_capabilities *__unused); extern void cpu_enable_fa64(const struct arm64_cpu_capabilities *__unused); extern void cpu_enable_fpmr(const struct arm64_cpu_capabilities *__unused); /* * Helpers to translate bit indices in sve_vq_map to VQ values (and * vice versa). This allows find_next_bit() to be used to find the * _maximum_ VQ not exceeding a certain value. */ static inline unsigned int __vq_to_bit(unsigned int vq) { return SVE_VQ_MAX - vq; } static inline unsigned int __bit_to_vq(unsigned int bit) { return SVE_VQ_MAX - bit; } struct vl_info { enum vec_type type; const char *name; /* For display purposes */ /* Minimum supported vector length across all CPUs */ int min_vl; /* Maximum supported vector length across all CPUs */ int max_vl; int max_virtualisable_vl; /* * Set of available vector lengths, * where length vq encoded as bit __vq_to_bit(vq): */ DECLARE_BITMAP(vq_map, SVE_VQ_MAX); /* Set of vector lengths present on at least one cpu: */ DECLARE_BITMAP(vq_partial_map, SVE_VQ_MAX); }; #ifdef CONFIG_ARM64_SVE extern void sve_alloc(struct task_struct *task, bool flush); extern void fpsimd_release_task(struct task_struct *task); extern void fpsimd_sync_from_effective_state(struct task_struct *task); extern void fpsimd_sync_to_effective_state_zeropad(struct task_struct *task); extern int vec_set_vector_length(struct task_struct *task, enum vec_type type, unsigned long vl, unsigned long flags); extern int sve_set_current_vl(unsigned long arg); extern int sve_get_current_vl(void); static inline void sve_user_disable(void) { sysreg_clear_set(cpacr_el1, CPACR_EL1_ZEN_EL0EN, 0); } static inline void sve_user_enable(void) { sysreg_clear_set(cpacr_el1, 0, CPACR_EL1_ZEN_EL0EN); } #define sve_cond_update_zcr_vq(val, reg) \ do { \ u64 __zcr = read_sysreg_s((reg)); \ u64 __new = __zcr & ~ZCR_ELx_LEN_MASK; \ __new |= (val) & ZCR_ELx_LEN_MASK; \ if (__zcr != __new) \ write_sysreg_s(__new, (reg)); \ } while (0) /* * Probing and setup functions. * Calls to these functions must be serialised with one another. */ enum vec_type; extern void __init vec_init_vq_map(enum vec_type type); extern void vec_update_vq_map(enum vec_type type); extern int vec_verify_vq_map(enum vec_type type); extern void __init sve_setup(void); extern __ro_after_init struct vl_info vl_info[ARM64_VEC_MAX]; static inline void write_vl(enum vec_type type, u64 val) { u64 tmp; switch (type) { #ifdef CONFIG_ARM64_SVE case ARM64_VEC_SVE: tmp = read_sysreg_s(SYS_ZCR_EL1) & ~ZCR_ELx_LEN_MASK; write_sysreg_s(tmp | val, SYS_ZCR_EL1); break; #endif #ifdef CONFIG_ARM64_SME case ARM64_VEC_SME: tmp = read_sysreg_s(SYS_SMCR_EL1) & ~SMCR_ELx_LEN_MASK; write_sysreg_s(tmp | val, SYS_SMCR_EL1); break; #endif default: WARN_ON_ONCE(1); break; } } static inline int vec_max_vl(enum vec_type type) { return vl_info[type].max_vl; } static inline int vec_max_virtualisable_vl(enum vec_type type) { return vl_info[type].max_virtualisable_vl; } static inline int sve_max_vl(void) { return vec_max_vl(ARM64_VEC_SVE); } static inline int sve_max_virtualisable_vl(void) { return vec_max_virtualisable_vl(ARM64_VEC_SVE); } /* Ensure vq >= SVE_VQ_MIN && vq <= SVE_VQ_MAX before calling this function */ static inline bool vq_available(enum vec_type type, unsigned int vq) { return test_bit(__vq_to_bit(vq), vl_info[type].vq_map); } static inline bool sve_vq_available(unsigned int vq) { return vq_available(ARM64_VEC_SVE, vq); } static inline size_t __sve_state_size(unsigned int sve_vl, unsigned int sme_vl) { unsigned int vl = max(sve_vl, sme_vl); return SVE_SIG_REGS_SIZE(sve_vq_from_vl(vl)); } /* * Return how many bytes of memory are required to store the full SVE * state for task, given task's currently configured vector length. */ static inline size_t sve_state_size(struct task_struct const *task) { unsigned int sve_vl = task_get_sve_vl(task); unsigned int sme_vl = task_get_sme_vl(task); return __sve_state_size(sve_vl, sme_vl); } #else /* ! CONFIG_ARM64_SVE */ static inline void sve_alloc(struct task_struct *task, bool flush) { } static inline void fpsimd_release_task(struct task_struct *task) { } static inline void fpsimd_sync_from_effective_state(struct task_struct *task) { } static inline void fpsimd_sync_to_effective_state_zeropad(struct task_struct *task) { } static inline int sve_max_virtualisable_vl(void) { return 0; } static inline int sve_set_current_vl(unsigned long arg) { return -EINVAL; } static inline int sve_get_current_vl(void) { return -EINVAL; } static inline int sve_max_vl(void) { return -EINVAL; } static inline bool sve_vq_available(unsigned int vq) { return false; } static inline void sve_user_disable(void) { BUILD_BUG(); } static inline void sve_user_enable(void) { BUILD_BUG(); } #define sve_cond_update_zcr_vq(val, reg) do { } while (0) static inline void vec_init_vq_map(enum vec_type t) { } static inline void vec_update_vq_map(enum vec_type t) { } static inline int vec_verify_vq_map(enum vec_type t) { return 0; } static inline void sve_setup(void) { } static inline size_t __sve_state_size(unsigned int sve_vl, unsigned int sme_vl) { return 0; } static inline size_t sve_state_size(struct task_struct const *task) { return 0; } #endif /* ! CONFIG_ARM64_SVE */ #ifdef CONFIG_ARM64_SME static inline void sme_user_disable(void) { sysreg_clear_set(cpacr_el1, CPACR_EL1_SMEN_EL0EN, 0); } static inline void sme_user_enable(void) { sysreg_clear_set(cpacr_el1, 0, CPACR_EL1_SMEN_EL0EN); } static inline void sme_smstart_sm(void) { asm volatile(__msr_s(SYS_SVCR_SMSTART_SM_EL0, "xzr")); } static inline void sme_smstop_sm(void) { asm volatile(__msr_s(SYS_SVCR_SMSTOP_SM_EL0, "xzr")); } static inline void sme_smstop(void) { asm volatile(__msr_s(SYS_SVCR_SMSTOP_SMZA_EL0, "xzr")); } extern void __init sme_setup(void); static inline int sme_max_vl(void) { return vec_max_vl(ARM64_VEC_SME); } static inline int sme_max_virtualisable_vl(void) { return vec_max_virtualisable_vl(ARM64_VEC_SME); } extern void sme_alloc(struct task_struct *task, bool flush); extern unsigned int sme_get_vl(void); extern int sme_set_current_vl(unsigned long arg); extern int sme_get_current_vl(void); extern void sme_suspend_exit(void); static inline size_t __sme_state_size(unsigned int sme_vl) { size_t size = ZA_SIG_REGS_SIZE(sve_vq_from_vl(sme_vl)); if (system_supports_sme2()) size += ZT_SIG_REG_SIZE; return size; } /* * Return how many bytes of memory are required to store the full SME * specific state for task, given task's currently configured vector * length. */ static inline size_t sme_state_size(struct task_struct const *task) { return __sme_state_size(task_get_sme_vl(task)); } #else static inline void sme_user_disable(void) { BUILD_BUG(); } static inline void sme_user_enable(void) { BUILD_BUG(); } static inline void sme_smstart_sm(void) { } static inline void sme_smstop_sm(void) { } static inline void sme_smstop(void) { } static inline void sme_alloc(struct task_struct *task, bool flush) { } static inline void sme_setup(void) { } static inline unsigned int sme_get_vl(void) { return 0; } static inline int sme_max_vl(void) { return 0; } static inline int sme_max_virtualisable_vl(void) { return 0; } static inline int sme_set_current_vl(unsigned long arg) { return -EINVAL; } static inline int sme_get_current_vl(void) { return -EINVAL; } static inline void sme_suspend_exit(void) { } static inline size_t __sme_state_size(unsigned int sme_vl) { return 0; } static inline size_t sme_state_size(struct task_struct const *task) { return 0; } #endif /* ! CONFIG_ARM64_SME */ /* For use by EFI runtime services calls only */ extern void __efi_fpsimd_begin(void); extern void __efi_fpsimd_end(void); #endif #endif
4 4 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 1517 1518 1519 1520 1521 1522 1523 1524 1525 1526 1527 1528 1529 1530 1531 1532 1533 1534 1535 1536 1537 1538 1539 1540 1541 1542 1543 1544 1545 1546 1547 1548 1549 // SPDX-License-Identifier: GPL-2.0-or-later /* * NetLabel Unlabeled Support * * This file defines functions for dealing with unlabeled packets for the * NetLabel system. The NetLabel system manages static and dynamic label * mappings for network protocols such as CIPSO and RIPSO. * * Author: Paul Moore <paul@paul-moore.com> */ /* * (c) Copyright Hewlett-Packard Development Company, L.P., 2006 - 2008 */ #include <linux/types.h> #include <linux/rcupdate.h> #include <linux/list.h> #include <linux/spinlock.h> #include <linux/socket.h> #include <linux/string.h> #include <linux/skbuff.h> #include <linux/audit.h> #include <linux/in.h> #include <linux/in6.h> #include <linux/ip.h> #include <linux/ipv6.h> #include <linux/notifier.h> #include <linux/netdevice.h> #include <linux/security.h> #include <linux/slab.h> #include <net/sock.h> #include <net/netlink.h> #include <net/genetlink.h> #include <net/ip.h> #include <net/ipv6.h> #include <net/net_namespace.h> #include <net/netlabel.h> #include <asm/bug.h> #include <linux/atomic.h> #include "netlabel_user.h" #include "netlabel_addrlist.h" #include "netlabel_domainhash.h" #include "netlabel_unlabeled.h" #include "netlabel_mgmt.h" /* NOTE: at present we always use init's network namespace since we don't * presently support different namespaces even though the majority of * the functions in this file are "namespace safe" */ /* The unlabeled connection hash table which we use to map network interfaces * and addresses of unlabeled packets to a user specified secid value for the * LSM. The hash table is used to lookup the network interface entry * (struct netlbl_unlhsh_iface) and then the interface entry is used to * lookup an IP address match from an ordered list. If a network interface * match can not be found in the hash table then the default entry * (netlbl_unlhsh_def) is used. The IP address entry list * (struct netlbl_unlhsh_addr) is ordered such that the entries with a * larger netmask come first. */ struct netlbl_unlhsh_tbl { struct list_head *tbl; u32 size; }; #define netlbl_unlhsh_addr4_entry(iter) \ container_of(iter, struct netlbl_unlhsh_addr4, list) struct netlbl_unlhsh_addr4 { u32 secid; struct netlbl_af4list list; struct rcu_head rcu; }; #define netlbl_unlhsh_addr6_entry(iter) \ container_of(iter, struct netlbl_unlhsh_addr6, list) struct netlbl_unlhsh_addr6 { u32 secid; struct netlbl_af6list list; struct rcu_head rcu; }; struct netlbl_unlhsh_iface { int ifindex; struct list_head addr4_list; struct list_head addr6_list; u32 valid; struct list_head list; struct rcu_head rcu; }; /* Argument struct for netlbl_unlhsh_walk() */ struct netlbl_unlhsh_walk_arg { struct netlink_callback *nl_cb; struct sk_buff *skb; u32 seq; }; /* Unlabeled connection hash table */ /* updates should be so rare that having one spinlock for the entire * hash table should be okay */ static DEFINE_SPINLOCK(netlbl_unlhsh_lock); #define netlbl_unlhsh_rcu_deref(p) \ rcu_dereference_check(p, lockdep_is_held(&netlbl_unlhsh_lock)) static struct netlbl_unlhsh_tbl __rcu *netlbl_unlhsh; static struct netlbl_unlhsh_iface __rcu *netlbl_unlhsh_def; /* Accept unlabeled packets flag */ static u8 netlabel_unlabel_acceptflg; /* NetLabel Generic NETLINK unlabeled family */ static struct genl_family netlbl_unlabel_gnl_family; /* NetLabel Netlink attribute policy */ static const struct nla_policy netlbl_unlabel_genl_policy[NLBL_UNLABEL_A_MAX + 1] = { [NLBL_UNLABEL_A_ACPTFLG] = { .type = NLA_U8 }, [NLBL_UNLABEL_A_IPV6ADDR] = { .type = NLA_BINARY, .len = sizeof(struct in6_addr) }, [NLBL_UNLABEL_A_IPV6MASK] = { .type = NLA_BINARY, .len = sizeof(struct in6_addr) }, [NLBL_UNLABEL_A_IPV4ADDR] = { .type = NLA_BINARY, .len = sizeof(struct in_addr) }, [NLBL_UNLABEL_A_IPV4MASK] = { .type = NLA_BINARY, .len = sizeof(struct in_addr) }, [NLBL_UNLABEL_A_IFACE] = { .type = NLA_NUL_STRING, .len = IFNAMSIZ - 1 }, [NLBL_UNLABEL_A_SECCTX] = { .type = NLA_BINARY } }; /* * Unlabeled Connection Hash Table Functions */ /** * netlbl_unlhsh_free_iface - Frees an interface entry from the hash table * @entry: the entry's RCU field * * Description: * This function is designed to be used as a callback to the call_rcu() * function so that memory allocated to a hash table interface entry can be * released safely. It is important to note that this function does not free * the IPv4 and IPv6 address lists contained as part of an interface entry. It * is up to the rest of the code to make sure an interface entry is only freed * once it's address lists are empty. * */ static void netlbl_unlhsh_free_iface(struct rcu_head *entry) { struct netlbl_unlhsh_iface *iface; struct netlbl_af4list *iter4; struct netlbl_af4list *tmp4; #if IS_ENABLED(CONFIG_IPV6) struct netlbl_af6list *iter6; struct netlbl_af6list *tmp6; #endif /* IPv6 */ iface = container_of(entry, struct netlbl_unlhsh_iface, rcu); /* no need for locks here since we are the only one with access to this * structure */ netlbl_af4list_foreach_safe(iter4, tmp4, &iface->addr4_list) { netlbl_af4list_remove_entry(iter4); kfree(netlbl_unlhsh_addr4_entry(iter4)); } #if IS_ENABLED(CONFIG_IPV6) netlbl_af6list_foreach_safe(iter6, tmp6, &iface->addr6_list) { netlbl_af6list_remove_entry(iter6); kfree(netlbl_unlhsh_addr6_entry(iter6)); } #endif /* IPv6 */ kfree(iface); } /** * netlbl_unlhsh_hash - Hashing function for the hash table * @ifindex: the network interface/device to hash * * Description: * This is the hashing function for the unlabeled hash table, it returns the * bucket number for the given device/interface. The caller is responsible for * ensuring that the hash table is protected with either a RCU read lock or * the hash table lock. * */ static u32 netlbl_unlhsh_hash(int ifindex) { return ifindex & (netlbl_unlhsh_rcu_deref(netlbl_unlhsh)->size - 1); } /** * netlbl_unlhsh_search_iface - Search for a matching interface entry * @ifindex: the network interface * * Description: * Searches the unlabeled connection hash table and returns a pointer to the * interface entry which matches @ifindex, otherwise NULL is returned. The * caller is responsible for ensuring that the hash table is protected with * either a RCU read lock or the hash table lock. * */ static struct netlbl_unlhsh_iface *netlbl_unlhsh_search_iface(int ifindex) { u32 bkt; struct list_head *bkt_list; struct netlbl_unlhsh_iface *iter; bkt = netlbl_unlhsh_hash(ifindex); bkt_list = &netlbl_unlhsh_rcu_deref(netlbl_unlhsh)->tbl[bkt]; list_for_each_entry_rcu(iter, bkt_list, list, lockdep_is_held(&netlbl_unlhsh_lock)) if (iter->valid && iter->ifindex == ifindex) return iter; return NULL; } /** * netlbl_unlhsh_add_addr4 - Add a new IPv4 address entry to the hash table * @iface: the associated interface entry * @addr: IPv4 address in network byte order * @mask: IPv4 address mask in network byte order * @secid: LSM secid value for entry * * Description: * Add a new address entry into the unlabeled connection hash table using the * interface entry specified by @iface. On success zero is returned, otherwise * a negative value is returned. * */ static int netlbl_unlhsh_add_addr4(struct netlbl_unlhsh_iface *iface, const struct in_addr *addr, const struct in_addr *mask, u32 secid) { int ret_val; struct netlbl_unlhsh_addr4 *entry; entry = kzalloc(sizeof(*entry), GFP_ATOMIC); if (entry == NULL) return -ENOMEM; entry->list.addr = addr->s_addr & mask->s_addr; entry->list.mask = mask->s_addr; entry->list.valid = 1; entry->secid = secid; spin_lock(&netlbl_unlhsh_lock); ret_val = netlbl_af4list_add(&entry->list, &iface->addr4_list); spin_unlock(&netlbl_unlhsh_lock); if (ret_val != 0) kfree(entry); return ret_val; } #if IS_ENABLED(CONFIG_IPV6) /** * netlbl_unlhsh_add_addr6 - Add a new IPv6 address entry to the hash table * @iface: the associated interface entry * @addr: IPv6 address in network byte order * @mask: IPv6 address mask in network byte order * @secid: LSM secid value for entry * * Description: * Add a new address entry into the unlabeled connection hash table using the * interface entry specified by @iface. On success zero is returned, otherwise * a negative value is returned. * */ static int netlbl_unlhsh_add_addr6(struct netlbl_unlhsh_iface *iface, const struct in6_addr *addr, const struct in6_addr *mask, u32 secid) { int ret_val; struct netlbl_unlhsh_addr6 *entry; entry = kzalloc(sizeof(*entry), GFP_ATOMIC); if (entry == NULL) return -ENOMEM; entry->list.addr = *addr; entry->list.addr.s6_addr32[0] &= mask->s6_addr32[0]; entry->list.addr.s6_addr32[1] &= mask->s6_addr32[1]; entry->list.addr.s6_addr32[2] &= mask->s6_addr32[2]; entry->list.addr.s6_addr32[3] &= mask->s6_addr32[3]; entry->list.mask = *mask; entry->list.valid = 1; entry->secid = secid; spin_lock(&netlbl_unlhsh_lock); ret_val = netlbl_af6list_add(&entry->list, &iface->addr6_list); spin_unlock(&netlbl_unlhsh_lock); if (ret_val != 0) kfree(entry); return 0; } #endif /* IPv6 */ /** * netlbl_unlhsh_add_iface - Adds a new interface entry to the hash table * @ifindex: network interface * * Description: * Add a new, empty, interface entry into the unlabeled connection hash table. * On success a pointer to the new interface entry is returned, on failure NULL * is returned. * */ static struct netlbl_unlhsh_iface *netlbl_unlhsh_add_iface(int ifindex) { u32 bkt; struct netlbl_unlhsh_iface *iface; iface = kzalloc(sizeof(*iface), GFP_ATOMIC); if (iface == NULL) return NULL; iface->ifindex = ifindex; INIT_LIST_HEAD(&iface->addr4_list); INIT_LIST_HEAD(&iface->addr6_list); iface->valid = 1; spin_lock(&netlbl_unlhsh_lock); if (ifindex > 0) { bkt = netlbl_unlhsh_hash(ifindex); if (netlbl_unlhsh_search_iface(ifindex) != NULL) goto add_iface_failure; list_add_tail_rcu(&iface->list, &netlbl_unlhsh_rcu_deref(netlbl_unlhsh)->tbl[bkt]); } else { INIT_LIST_HEAD(&iface->list); if (netlbl_unlhsh_rcu_deref(netlbl_unlhsh_def) != NULL) goto add_iface_failure; rcu_assign_pointer(netlbl_unlhsh_def, iface); } spin_unlock(&netlbl_unlhsh_lock); return iface; add_iface_failure: spin_unlock(&netlbl_unlhsh_lock); kfree(iface); return NULL; } /** * netlbl_unlhsh_add - Adds a new entry to the unlabeled connection hash table * @net: network namespace * @dev_name: interface name * @addr: IP address in network byte order * @mask: address mask in network byte order * @addr_len: length of address/mask (4 for IPv4, 16 for IPv6) * @secid: LSM secid value for the entry * @audit_info: NetLabel audit information * * Description: * Adds a new entry to the unlabeled connection hash table. Returns zero on * success, negative values on failure. * */ int netlbl_unlhsh_add(struct net *net, const char *dev_name, const void *addr, const void *mask, u32 addr_len, u32 secid, struct netlbl_audit *audit_info) { int ret_val; int ifindex; struct net_device *dev; struct netlbl_unlhsh_iface *iface; struct audit_buffer *audit_buf = NULL; struct lsm_context ctx; if (addr_len != sizeof(struct in_addr) && addr_len != sizeof(struct in6_addr)) return -EINVAL; rcu_read_lock(); if (dev_name != NULL) { dev = dev_get_by_name_rcu(net, dev_name); if (dev == NULL) { ret_val = -ENODEV; goto unlhsh_add_return; } ifindex = dev->ifindex; iface = netlbl_unlhsh_search_iface(ifindex); } else { ifindex = 0; iface = rcu_dereference(netlbl_unlhsh_def); } if (iface == NULL) { iface = netlbl_unlhsh_add_iface(ifindex); if (iface == NULL) { ret_val = -ENOMEM; goto unlhsh_add_return; } } audit_buf = netlbl_audit_start_common(AUDIT_MAC_UNLBL_STCADD, audit_info); switch (addr_len) { case sizeof(struct in_addr): { const struct in_addr *addr4 = addr; const struct in_addr *mask4 = mask; ret_val = netlbl_unlhsh_add_addr4(iface, addr4, mask4, secid); if (audit_buf != NULL) netlbl_af4list_audit_addr(audit_buf, 1, dev_name, addr4->s_addr, mask4->s_addr); break; } #if IS_ENABLED(CONFIG_IPV6) case sizeof(struct in6_addr): { const struct in6_addr *addr6 = addr; const struct in6_addr *mask6 = mask; ret_val = netlbl_unlhsh_add_addr6(iface, addr6, mask6, secid); if (audit_buf != NULL) netlbl_af6list_audit_addr(audit_buf, 1, dev_name, addr6, mask6); break; } #endif /* IPv6 */ default: ret_val = -EINVAL; } if (ret_val == 0) atomic_inc(&netlabel_mgmt_protocount); unlhsh_add_return: rcu_read_unlock(); if (audit_buf != NULL) { if (security_secid_to_secctx(secid, &ctx) >= 0) { audit_log_format(audit_buf, " sec_obj=%s", ctx.context); security_release_secctx(&ctx); } audit_log_format(audit_buf, " res=%u", ret_val == 0 ? 1 : 0); audit_log_end(audit_buf); } return ret_val; } /** * netlbl_unlhsh_remove_addr4 - Remove an IPv4 address entry * @net: network namespace * @iface: interface entry * @addr: IP address * @mask: IP address mask * @audit_info: NetLabel audit information * * Description: * Remove an IP address entry from the unlabeled connection hash table. * Returns zero on success, negative values on failure. * */ static int netlbl_unlhsh_remove_addr4(struct net *net, struct netlbl_unlhsh_iface *iface, const struct in_addr *addr, const struct in_addr *mask, struct netlbl_audit *audit_info) { struct netlbl_af4list *list_entry; struct netlbl_unlhsh_addr4 *entry; struct audit_buffer *audit_buf; struct net_device *dev; struct lsm_context ctx; spin_lock(&netlbl_unlhsh_lock); list_entry = netlbl_af4list_remove(addr->s_addr, mask->s_addr, &iface->addr4_list); spin_unlock(&netlbl_unlhsh_lock); if (list_entry != NULL) entry = netlbl_unlhsh_addr4_entry(list_entry); else entry = NULL; audit_buf = netlbl_audit_start_common(AUDIT_MAC_UNLBL_STCDEL, audit_info); if (audit_buf != NULL) { dev = dev_get_by_index(net, iface->ifindex); netlbl_af4list_audit_addr(audit_buf, 1, (dev != NULL ? dev->name : NULL), addr->s_addr, mask->s_addr); dev_put(dev); if (entry != NULL && security_secid_to_secctx(entry->secid, &ctx) >= 0) { audit_log_format(audit_buf, " sec_obj=%s", ctx.context); security_release_secctx(&ctx); } audit_log_format(audit_buf, " res=%u", entry != NULL ? 1 : 0); audit_log_end(audit_buf); } if (entry == NULL) return -ENOENT; kfree_rcu(entry, rcu); return 0; } #if IS_ENABLED(CONFIG_IPV6) /** * netlbl_unlhsh_remove_addr6 - Remove an IPv6 address entry * @net: network namespace * @iface: interface entry * @addr: IP address * @mask: IP address mask * @audit_info: NetLabel audit information * * Description: * Remove an IP address entry from the unlabeled connection hash table. * Returns zero on success, negative values on failure. * */ static int netlbl_unlhsh_remove_addr6(struct net *net, struct netlbl_unlhsh_iface *iface, const struct in6_addr *addr, const struct in6_addr *mask, struct netlbl_audit *audit_info) { struct netlbl_af6list *list_entry; struct netlbl_unlhsh_addr6 *entry; struct audit_buffer *audit_buf; struct net_device *dev; struct lsm_context ctx; spin_lock(&netlbl_unlhsh_lock); list_entry = netlbl_af6list_remove(addr, mask, &iface->addr6_list); spin_unlock(&netlbl_unlhsh_lock); if (list_entry != NULL) entry = netlbl_unlhsh_addr6_entry(list_entry); else entry = NULL; audit_buf = netlbl_audit_start_common(AUDIT_MAC_UNLBL_STCDEL, audit_info); if (audit_buf != NULL) { dev = dev_get_by_index(net, iface->ifindex); netlbl_af6list_audit_addr(audit_buf, 1, (dev != NULL ? dev->name : NULL), addr, mask); dev_put(dev); if (entry != NULL && security_secid_to_secctx(entry->secid, &ctx) >= 0) { audit_log_format(audit_buf, " sec_obj=%s", ctx.context); security_release_secctx(&ctx); } audit_log_format(audit_buf, " res=%u", entry != NULL ? 1 : 0); audit_log_end(audit_buf); } if (entry == NULL) return -ENOENT; kfree_rcu(entry, rcu); return 0; } #endif /* IPv6 */ /** * netlbl_unlhsh_condremove_iface - Remove an interface entry * @iface: the interface entry * * Description: * Remove an interface entry from the unlabeled connection hash table if it is * empty. An interface entry is considered to be empty if there are no * address entries assigned to it. * */ static void netlbl_unlhsh_condremove_iface(struct netlbl_unlhsh_iface *iface) { struct netlbl_af4list *iter4; #if IS_ENABLED(CONFIG_IPV6) struct netlbl_af6list *iter6; #endif /* IPv6 */ spin_lock(&netlbl_unlhsh_lock); netlbl_af4list_foreach_rcu(iter4, &iface->addr4_list) goto unlhsh_condremove_failure; #if IS_ENABLED(CONFIG_IPV6) netlbl_af6list_foreach_rcu(iter6, &iface->addr6_list) goto unlhsh_condremove_failure; #endif /* IPv6 */ iface->valid = 0; if (iface->ifindex > 0) list_del_rcu(&iface->list); else RCU_INIT_POINTER(netlbl_unlhsh_def, NULL); spin_unlock(&netlbl_unlhsh_lock); call_rcu(&iface->rcu, netlbl_unlhsh_free_iface); return; unlhsh_condremove_failure: spin_unlock(&netlbl_unlhsh_lock); } /** * netlbl_unlhsh_remove - Remove an entry from the unlabeled hash table * @net: network namespace * @dev_name: interface name * @addr: IP address in network byte order * @mask: address mask in network byte order * @addr_len: length of address/mask (4 for IPv4, 16 for IPv6) * @audit_info: NetLabel audit information * * Description: * Removes and existing entry from the unlabeled connection hash table. * Returns zero on success, negative values on failure. * */ int netlbl_unlhsh_remove(struct net *net, const char *dev_name, const void *addr, const void *mask, u32 addr_len, struct netlbl_audit *audit_info) { int ret_val; struct net_device *dev; struct netlbl_unlhsh_iface *iface; if (addr_len != sizeof(struct in_addr) && addr_len != sizeof(struct in6_addr)) return -EINVAL; rcu_read_lock(); if (dev_name != NULL) { dev = dev_get_by_name_rcu(net, dev_name); if (dev == NULL) { ret_val = -ENODEV; goto unlhsh_remove_return; } iface = netlbl_unlhsh_search_iface(dev->ifindex); } else iface = rcu_dereference(netlbl_unlhsh_def); if (iface == NULL) { ret_val = -ENOENT; goto unlhsh_remove_return; } switch (addr_len) { case sizeof(struct in_addr): ret_val = netlbl_unlhsh_remove_addr4(net, iface, addr, mask, audit_info); break; #if IS_ENABLED(CONFIG_IPV6) case sizeof(struct in6_addr): ret_val = netlbl_unlhsh_remove_addr6(net, iface, addr, mask, audit_info); break; #endif /* IPv6 */ default: ret_val = -EINVAL; } if (ret_val == 0) { netlbl_unlhsh_condremove_iface(iface); atomic_dec(&netlabel_mgmt_protocount); } unlhsh_remove_return: rcu_read_unlock(); return ret_val; } /* * General Helper Functions */ /** * netlbl_unlhsh_netdev_handler - Network device notification handler * @this: notifier block * @event: the event * @ptr: the netdevice notifier info (cast to void) * * Description: * Handle network device events, although at present all we care about is a * network device going away. In the case of a device going away we clear any * related entries from the unlabeled connection hash table. * */ static int netlbl_unlhsh_netdev_handler(struct notifier_block *this, unsigned long event, void *ptr) { struct net_device *dev = netdev_notifier_info_to_dev(ptr); struct netlbl_unlhsh_iface *iface = NULL; if (!net_eq(dev_net(dev), &init_net)) return NOTIFY_DONE; /* XXX - should this be a check for NETDEV_DOWN or _UNREGISTER? */ if (event == NETDEV_DOWN) { spin_lock(&netlbl_unlhsh_lock); iface = netlbl_unlhsh_search_iface(dev->ifindex); if (iface != NULL && iface->valid) { iface->valid = 0; list_del_rcu(&iface->list); } else iface = NULL; spin_unlock(&netlbl_unlhsh_lock); } if (iface != NULL) call_rcu(&iface->rcu, netlbl_unlhsh_free_iface); return NOTIFY_DONE; } /** * netlbl_unlabel_acceptflg_set - Set the unlabeled accept flag * @value: desired value * @audit_info: NetLabel audit information * * Description: * Set the value of the unlabeled accept flag to @value. * */ static void netlbl_unlabel_acceptflg_set(u8 value, struct netlbl_audit *audit_info) { struct audit_buffer *audit_buf; u8 old_val; old_val = netlabel_unlabel_acceptflg; netlabel_unlabel_acceptflg = value; audit_buf = netlbl_audit_start_common(AUDIT_MAC_UNLBL_ALLOW, audit_info); if (audit_buf != NULL) { audit_log_format(audit_buf, " unlbl_accept=%u old=%u", value, old_val); audit_log_end(audit_buf); } } /** * netlbl_unlabel_addrinfo_get - Get the IPv4/6 address information * @info: the Generic NETLINK info block * @addr: the IP address * @mask: the IP address mask * @len: the address length * * Description: * Examine the Generic NETLINK message and extract the IP address information. * Returns zero on success, negative values on failure. * */ static int netlbl_unlabel_addrinfo_get(struct genl_info *info, void **addr, void **mask, u32 *len) { u32 addr_len; if (info->attrs[NLBL_UNLABEL_A_IPV4ADDR] && info->attrs[NLBL_UNLABEL_A_IPV4MASK]) { addr_len = nla_len(info->attrs[NLBL_UNLABEL_A_IPV4ADDR]); if (addr_len != sizeof(struct in_addr) && addr_len != nla_len(info->attrs[NLBL_UNLABEL_A_IPV4MASK])) return -EINVAL; *len = addr_len; *addr = nla_data(info->attrs[NLBL_UNLABEL_A_IPV4ADDR]); *mask = nla_data(info->attrs[NLBL_UNLABEL_A_IPV4MASK]); return 0; } else if (info->attrs[NLBL_UNLABEL_A_IPV6ADDR]) { addr_len = nla_len(info->attrs[NLBL_UNLABEL_A_IPV6ADDR]); if (addr_len != sizeof(struct in6_addr) && addr_len != nla_len(info->attrs[NLBL_UNLABEL_A_IPV6MASK])) return -EINVAL; *len = addr_len; *addr = nla_data(info->attrs[NLBL_UNLABEL_A_IPV6ADDR]); *mask = nla_data(info->attrs[NLBL_UNLABEL_A_IPV6MASK]); return 0; } return -EINVAL; } /* * NetLabel Command Handlers */ /** * netlbl_unlabel_accept - Handle an ACCEPT message * @skb: the NETLINK buffer * @info: the Generic NETLINK info block * * Description: * Process a user generated ACCEPT message and set the accept flag accordingly. * Returns zero on success, negative values on failure. * */ static int netlbl_unlabel_accept(struct sk_buff *skb, struct genl_info *info) { u8 value; struct netlbl_audit audit_info; if (info->attrs[NLBL_UNLABEL_A_ACPTFLG]) { value = nla_get_u8(info->attrs[NLBL_UNLABEL_A_ACPTFLG]); if (value == 1 || value == 0) { netlbl_netlink_auditinfo(&audit_info); netlbl_unlabel_acceptflg_set(value, &audit_info); return 0; } } return -EINVAL; } /** * netlbl_unlabel_list - Handle a LIST message * @skb: the NETLINK buffer * @info: the Generic NETLINK info block * * Description: * Process a user generated LIST message and respond with the current status. * Returns zero on success, negative values on failure. * */ static int netlbl_unlabel_list(struct sk_buff *skb, struct genl_info *info) { int ret_val = -EINVAL; struct sk_buff *ans_skb; void *data; ans_skb = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL); if (ans_skb == NULL) goto list_failure; data = genlmsg_put_reply(ans_skb, info, &netlbl_unlabel_gnl_family, 0, NLBL_UNLABEL_C_LIST); if (data == NULL) { ret_val = -ENOMEM; goto list_failure; } ret_val = nla_put_u8(ans_skb, NLBL_UNLABEL_A_ACPTFLG, netlabel_unlabel_acceptflg); if (ret_val != 0) goto list_failure; genlmsg_end(ans_skb, data); return genlmsg_reply(ans_skb, info); list_failure: kfree_skb(ans_skb); return ret_val; } /** * netlbl_unlabel_staticadd - Handle a STATICADD message * @skb: the NETLINK buffer * @info: the Generic NETLINK info block * * Description: * Process a user generated STATICADD message and add a new unlabeled * connection entry to the hash table. Returns zero on success, negative * values on failure. * */ static int netlbl_unlabel_staticadd(struct sk_buff *skb, struct genl_info *info) { int ret_val; char *dev_name; void *addr; void *mask; u32 addr_len; u32 secid; struct netlbl_audit audit_info; /* Don't allow users to add both IPv4 and IPv6 addresses for a * single entry. However, allow users to create two entries, one each * for IPv4 and IPv6, with the same LSM security context which should * achieve the same result. */ if (!info->attrs[NLBL_UNLABEL_A_SECCTX] || !info->attrs[NLBL_UNLABEL_A_IFACE] || !((!info->attrs[NLBL_UNLABEL_A_IPV4ADDR] || !info->attrs[NLBL_UNLABEL_A_IPV4MASK]) ^ (!info->attrs[NLBL_UNLABEL_A_IPV6ADDR] || !info->attrs[NLBL_UNLABEL_A_IPV6MASK]))) return -EINVAL; netlbl_netlink_auditinfo(&audit_info); ret_val = netlbl_unlabel_addrinfo_get(info, &addr, &mask, &addr_len); if (ret_val != 0) return ret_val; dev_name = nla_data(info->attrs[NLBL_UNLABEL_A_IFACE]); ret_val = security_secctx_to_secid( nla_data(info->attrs[NLBL_UNLABEL_A_SECCTX]), nla_len(info->attrs[NLBL_UNLABEL_A_SECCTX]), &secid); if (ret_val != 0) return ret_val; return netlbl_unlhsh_add(&init_net, dev_name, addr, mask, addr_len, secid, &audit_info); } /** * netlbl_unlabel_staticadddef - Handle a STATICADDDEF message * @skb: the NETLINK buffer * @info: the Generic NETLINK info block * * Description: * Process a user generated STATICADDDEF message and add a new default * unlabeled connection entry. Returns zero on success, negative values on * failure. * */ static int netlbl_unlabel_staticadddef(struct sk_buff *skb, struct genl_info *info) { int ret_val; void *addr; void *mask; u32 addr_len; u32 secid; struct netlbl_audit audit_info; /* Don't allow users to add both IPv4 and IPv6 addresses for a * single entry. However, allow users to create two entries, one each * for IPv4 and IPv6, with the same LSM security context which should * achieve the same result. */ if (!info->attrs[NLBL_UNLABEL_A_SECCTX] || !((!info->attrs[NLBL_UNLABEL_A_IPV4ADDR] || !info->attrs[NLBL_UNLABEL_A_IPV4MASK]) ^ (!info->attrs[NLBL_UNLABEL_A_IPV6ADDR] || !info->attrs[NLBL_UNLABEL_A_IPV6MASK]))) return -EINVAL; netlbl_netlink_auditinfo(&audit_info); ret_val = netlbl_unlabel_addrinfo_get(info, &addr, &mask, &addr_len); if (ret_val != 0) return ret_val; ret_val = security_secctx_to_secid( nla_data(info->attrs[NLBL_UNLABEL_A_SECCTX]), nla_len(info->attrs[NLBL_UNLABEL_A_SECCTX]), &secid); if (ret_val != 0) return ret_val; return netlbl_unlhsh_add(&init_net, NULL, addr, mask, addr_len, secid, &audit_info); } /** * netlbl_unlabel_staticremove - Handle a STATICREMOVE message * @skb: the NETLINK buffer * @info: the Generic NETLINK info block * * Description: * Process a user generated STATICREMOVE message and remove the specified * unlabeled connection entry. Returns zero on success, negative values on * failure. * */ static int netlbl_unlabel_staticremove(struct sk_buff *skb, struct genl_info *info) { int ret_val; char *dev_name; void *addr; void *mask; u32 addr_len; struct netlbl_audit audit_info; /* See the note in netlbl_unlabel_staticadd() about not allowing both * IPv4 and IPv6 in the same entry. */ if (!info->attrs[NLBL_UNLABEL_A_IFACE] || !((!info->attrs[NLBL_UNLABEL_A_IPV4ADDR] || !info->attrs[NLBL_UNLABEL_A_IPV4MASK]) ^ (!info->attrs[NLBL_UNLABEL_A_IPV6ADDR] || !info->attrs[NLBL_UNLABEL_A_IPV6MASK]))) return -EINVAL; netlbl_netlink_auditinfo(&audit_info); ret_val = netlbl_unlabel_addrinfo_get(info, &addr, &mask, &addr_len); if (ret_val != 0) return ret_val; dev_name = nla_data(info->attrs[NLBL_UNLABEL_A_IFACE]); return netlbl_unlhsh_remove(&init_net, dev_name, addr, mask, addr_len, &audit_info); } /** * netlbl_unlabel_staticremovedef - Handle a STATICREMOVEDEF message * @skb: the NETLINK buffer * @info: the Generic NETLINK info block * * Description: * Process a user generated STATICREMOVEDEF message and remove the default * unlabeled connection entry. Returns zero on success, negative values on * failure. * */ static int netlbl_unlabel_staticremovedef(struct sk_buff *skb, struct genl_info *info) { int ret_val; void *addr; void *mask; u32 addr_len; struct netlbl_audit audit_info; /* See the note in netlbl_unlabel_staticadd() about not allowing both * IPv4 and IPv6 in the same entry. */ if (!((!info->attrs[NLBL_UNLABEL_A_IPV4ADDR] || !info->attrs[NLBL_UNLABEL_A_IPV4MASK]) ^ (!info->attrs[NLBL_UNLABEL_A_IPV6ADDR] || !info->attrs[NLBL_UNLABEL_A_IPV6MASK]))) return -EINVAL; netlbl_netlink_auditinfo(&audit_info); ret_val = netlbl_unlabel_addrinfo_get(info, &addr, &mask, &addr_len); if (ret_val != 0) return ret_val; return netlbl_unlhsh_remove(&init_net, NULL, addr, mask, addr_len, &audit_info); } /** * netlbl_unlabel_staticlist_gen - Generate messages for STATICLIST[DEF] * @cmd: command/message * @iface: the interface entry * @addr4: the IPv4 address entry * @addr6: the IPv6 address entry * @arg: the netlbl_unlhsh_walk_arg structure * * Description: * This function is designed to be used to generate a response for a * STATICLIST or STATICLISTDEF message. When called either @addr4 or @addr6 * can be specified, not both, the other unspecified entry should be set to * NULL by the caller. Returns the size of the message on success, negative * values on failure. * */ static int netlbl_unlabel_staticlist_gen(u32 cmd, const struct netlbl_unlhsh_iface *iface, const struct netlbl_unlhsh_addr4 *addr4, const struct netlbl_unlhsh_addr6 *addr6, void *arg) { int ret_val = -ENOMEM; struct netlbl_unlhsh_walk_arg *cb_arg = arg; struct net_device *dev; struct lsm_context ctx; void *data; u32 secid; data = genlmsg_put(cb_arg->skb, NETLINK_CB(cb_arg->nl_cb->skb).portid, cb_arg->seq, &netlbl_unlabel_gnl_family, NLM_F_MULTI, cmd); if (data == NULL) goto list_cb_failure; if (iface->ifindex > 0) { dev = dev_get_by_index(&init_net, iface->ifindex); if (!dev) { ret_val = -ENODEV; goto list_cb_failure; } ret_val = nla_put_string(cb_arg->skb, NLBL_UNLABEL_A_IFACE, dev->name); dev_put(dev); if (ret_val != 0) goto list_cb_failure; } if (addr4) { struct in_addr addr_struct; addr_struct.s_addr = addr4->list.addr; ret_val = nla_put_in_addr(cb_arg->skb, NLBL_UNLABEL_A_IPV4ADDR, addr_struct.s_addr); if (ret_val != 0) goto list_cb_failure; addr_struct.s_addr = addr4->list.mask; ret_val = nla_put_in_addr(cb_arg->skb, NLBL_UNLABEL_A_IPV4MASK, addr_struct.s_addr); if (ret_val != 0) goto list_cb_failure; secid = addr4->secid; } else { ret_val = nla_put_in6_addr(cb_arg->skb, NLBL_UNLABEL_A_IPV6ADDR, &addr6->list.addr); if (ret_val != 0) goto list_cb_failure; ret_val = nla_put_in6_addr(cb_arg->skb, NLBL_UNLABEL_A_IPV6MASK, &addr6->list.mask); if (ret_val != 0) goto list_cb_failure; secid = addr6->secid; } ret_val = security_secid_to_secctx(secid, &ctx); if (ret_val < 0) goto list_cb_failure; ret_val = nla_put(cb_arg->skb, NLBL_UNLABEL_A_SECCTX, ctx.len, ctx.context); security_release_secctx(&ctx); if (ret_val != 0) goto list_cb_failure; cb_arg->seq++; genlmsg_end(cb_arg->skb, data); return 0; list_cb_failure: genlmsg_cancel(cb_arg->skb, data); return ret_val; } /** * netlbl_unlabel_staticlist - Handle a STATICLIST message * @skb: the NETLINK buffer * @cb: the NETLINK callback * * Description: * Process a user generated STATICLIST message and dump the unlabeled * connection hash table in a form suitable for use in a kernel generated * STATICLIST message. Returns the length of @skb. * */ static int netlbl_unlabel_staticlist(struct sk_buff *skb, struct netlink_callback *cb) { struct netlbl_unlhsh_walk_arg cb_arg; u32 skip_bkt = cb->args[0]; u32 skip_chain = cb->args[1]; u32 skip_addr4 = cb->args[2]; u32 iter_bkt, iter_chain = 0, iter_addr4 = 0, iter_addr6 = 0; struct netlbl_unlhsh_iface *iface; struct list_head *iter_list; struct netlbl_af4list *addr4; #if IS_ENABLED(CONFIG_IPV6) u32 skip_addr6 = cb->args[3]; struct netlbl_af6list *addr6; #endif cb_arg.nl_cb = cb; cb_arg.skb = skb; cb_arg.seq = cb->nlh->nlmsg_seq; rcu_read_lock(); for (iter_bkt = skip_bkt; iter_bkt < rcu_dereference(netlbl_unlhsh)->size; iter_bkt++) { iter_list = &rcu_dereference(netlbl_unlhsh)->tbl[iter_bkt]; list_for_each_entry_rcu(iface, iter_list, list) { if (!iface->valid || iter_chain++ < skip_chain) continue; netlbl_af4list_foreach_rcu(addr4, &iface->addr4_list) { if (iter_addr4++ < skip_addr4) continue; if (netlbl_unlabel_staticlist_gen( NLBL_UNLABEL_C_STATICLIST, iface, netlbl_unlhsh_addr4_entry(addr4), NULL, &cb_arg) < 0) { iter_addr4--; iter_chain--; goto unlabel_staticlist_return; } } iter_addr4 = 0; skip_addr4 = 0; #if IS_ENABLED(CONFIG_IPV6) netlbl_af6list_foreach_rcu(addr6, &iface->addr6_list) { if (iter_addr6++ < skip_addr6) continue; if (netlbl_unlabel_staticlist_gen( NLBL_UNLABEL_C_STATICLIST, iface, NULL, netlbl_unlhsh_addr6_entry(addr6), &cb_arg) < 0) { iter_addr6--; iter_chain--; goto unlabel_staticlist_return; } } iter_addr6 = 0; skip_addr6 = 0; #endif /* IPv6 */ } iter_chain = 0; skip_chain = 0; } unlabel_staticlist_return: rcu_read_unlock(); cb->args[0] = iter_bkt; cb->args[1] = iter_chain; cb->args[2] = iter_addr4; cb->args[3] = iter_addr6; return skb->len; } /** * netlbl_unlabel_staticlistdef - Handle a STATICLISTDEF message * @skb: the NETLINK buffer * @cb: the NETLINK callback * * Description: * Process a user generated STATICLISTDEF message and dump the default * unlabeled connection entry in a form suitable for use in a kernel generated * STATICLISTDEF message. Returns the length of @skb. * */ static int netlbl_unlabel_staticlistdef(struct sk_buff *skb, struct netlink_callback *cb) { struct netlbl_unlhsh_walk_arg cb_arg; struct netlbl_unlhsh_iface *iface; u32 iter_addr4 = 0, iter_addr6 = 0; struct netlbl_af4list *addr4; #if IS_ENABLED(CONFIG_IPV6) struct netlbl_af6list *addr6; #endif cb_arg.nl_cb = cb; cb_arg.skb = skb; cb_arg.seq = cb->nlh->nlmsg_seq; rcu_read_lock(); iface = rcu_dereference(netlbl_unlhsh_def); if (iface == NULL || !iface->valid) goto unlabel_staticlistdef_return; netlbl_af4list_foreach_rcu(addr4, &iface->addr4_list) { if (iter_addr4++ < cb->args[0]) continue; if (netlbl_unlabel_staticlist_gen(NLBL_UNLABEL_C_STATICLISTDEF, iface, netlbl_unlhsh_addr4_entry(addr4), NULL, &cb_arg) < 0) { iter_addr4--; goto unlabel_staticlistdef_return; } } #if IS_ENABLED(CONFIG_IPV6) netlbl_af6list_foreach_rcu(addr6, &iface->addr6_list) { if (iter_addr6++ < cb->args[1]) continue; if (netlbl_unlabel_staticlist_gen(NLBL_UNLABEL_C_STATICLISTDEF, iface, NULL, netlbl_unlhsh_addr6_entry(addr6), &cb_arg) < 0) { iter_addr6--; goto unlabel_staticlistdef_return; } } #endif /* IPv6 */ unlabel_staticlistdef_return: rcu_read_unlock(); cb->args[0] = iter_addr4; cb->args[1] = iter_addr6; return skb->len; } /* * NetLabel Generic NETLINK Command Definitions */ static const struct genl_small_ops netlbl_unlabel_genl_ops[] = { { .cmd = NLBL_UNLABEL_C_STATICADD, .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, .flags = GENL_ADMIN_PERM, .doit = netlbl_unlabel_staticadd, .dumpit = NULL, }, { .cmd = NLBL_UNLABEL_C_STATICREMOVE, .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, .flags = GENL_ADMIN_PERM, .doit = netlbl_unlabel_staticremove, .dumpit = NULL, }, { .cmd = NLBL_UNLABEL_C_STATICLIST, .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, .flags = 0, .doit = NULL, .dumpit = netlbl_unlabel_staticlist, }, { .cmd = NLBL_UNLABEL_C_STATICADDDEF, .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, .flags = GENL_ADMIN_PERM, .doit = netlbl_unlabel_staticadddef, .dumpit = NULL, }, { .cmd = NLBL_UNLABEL_C_STATICREMOVEDEF, .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, .flags = GENL_ADMIN_PERM, .doit = netlbl_unlabel_staticremovedef, .dumpit = NULL, }, { .cmd = NLBL_UNLABEL_C_STATICLISTDEF, .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, .flags = 0, .doit = NULL, .dumpit = netlbl_unlabel_staticlistdef, }, { .cmd = NLBL_UNLABEL_C_ACCEPT, .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, .flags = GENL_ADMIN_PERM, .doit = netlbl_unlabel_accept, .dumpit = NULL, }, { .cmd = NLBL_UNLABEL_C_LIST, .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, .flags = 0, .doit = netlbl_unlabel_list, .dumpit = NULL, }, }; static struct genl_family netlbl_unlabel_gnl_family __ro_after_init = { .hdrsize = 0, .name = NETLBL_NLTYPE_UNLABELED_NAME, .version = NETLBL_PROTO_VERSION, .maxattr = NLBL_UNLABEL_A_MAX, .policy = netlbl_unlabel_genl_policy, .module = THIS_MODULE, .small_ops = netlbl_unlabel_genl_ops, .n_small_ops = ARRAY_SIZE(netlbl_unlabel_genl_ops), .resv_start_op = NLBL_UNLABEL_C_STATICLISTDEF + 1, }; /* * NetLabel Generic NETLINK Protocol Functions */ /** * netlbl_unlabel_genl_init - Register the Unlabeled NetLabel component * * Description: * Register the unlabeled packet NetLabel component with the Generic NETLINK * mechanism. Returns zero on success, negative values on failure. * */ int __init netlbl_unlabel_genl_init(void) { return genl_register_family(&netlbl_unlabel_gnl_family); } /* * NetLabel KAPI Hooks */ static struct notifier_block netlbl_unlhsh_netdev_notifier = { .notifier_call = netlbl_unlhsh_netdev_handler, }; /** * netlbl_unlabel_init - Initialize the unlabeled connection hash table * @size: the number of bits to use for the hash buckets * * Description: * Initializes the unlabeled connection hash table and registers a network * device notification handler. This function should only be called by the * NetLabel subsystem itself during initialization. Returns zero on success, * non-zero values on error. * */ int __init netlbl_unlabel_init(u32 size) { u32 iter; struct netlbl_unlhsh_tbl *hsh_tbl; if (size == 0) return -EINVAL; hsh_tbl = kmalloc(sizeof(*hsh_tbl), GFP_KERNEL); if (hsh_tbl == NULL) return -ENOMEM; hsh_tbl->size = 1 << size; hsh_tbl->tbl = kcalloc(hsh_tbl->size, sizeof(struct list_head), GFP_KERNEL); if (hsh_tbl->tbl == NULL) { kfree(hsh_tbl); return -ENOMEM; } for (iter = 0; iter < hsh_tbl->size; iter++) INIT_LIST_HEAD(&hsh_tbl->tbl[iter]); spin_lock(&netlbl_unlhsh_lock); rcu_assign_pointer(netlbl_unlhsh, hsh_tbl); spin_unlock(&netlbl_unlhsh_lock); register_netdevice_notifier(&netlbl_unlhsh_netdev_notifier); return 0; } /** * netlbl_unlabel_getattr - Get the security attributes for an unlabled packet * @skb: the packet * @family: protocol family * @secattr: the security attributes * * Description: * Determine the security attributes, if any, for an unlabled packet and return * them in @secattr. Returns zero on success and negative values on failure. * */ int netlbl_unlabel_getattr(const struct sk_buff *skb, u16 family, struct netlbl_lsm_secattr *secattr) { struct netlbl_unlhsh_iface *iface; rcu_read_lock(); iface = netlbl_unlhsh_search_iface(skb->skb_iif); if (iface == NULL) iface = rcu_dereference(netlbl_unlhsh_def); if (iface == NULL || !iface->valid) goto unlabel_getattr_nolabel; #if IS_ENABLED(CONFIG_IPV6) /* When resolving a fallback label, check the sk_buff version as * it is possible (e.g. SCTP) to have family = PF_INET6 while * receiving ip_hdr(skb)->version = 4. */ if (family == PF_INET6 && ip_hdr(skb)->version == 4) family = PF_INET; #endif /* IPv6 */ switch (family) { case PF_INET: { struct iphdr *hdr4; struct netlbl_af4list *addr4; hdr4 = ip_hdr(skb); addr4 = netlbl_af4list_search(hdr4->saddr, &iface->addr4_list); if (addr4 == NULL) goto unlabel_getattr_nolabel; secattr->attr.secid = netlbl_unlhsh_addr4_entry(addr4)->secid; break; } #if IS_ENABLED(CONFIG_IPV6) case PF_INET6: { struct ipv6hdr *hdr6; struct netlbl_af6list *addr6; hdr6 = ipv6_hdr(skb); addr6 = netlbl_af6list_search(&hdr6->saddr, &iface->addr6_list); if (addr6 == NULL) goto unlabel_getattr_nolabel; secattr->attr.secid = netlbl_unlhsh_addr6_entry(addr6)->secid; break; } #endif /* IPv6 */ default: goto unlabel_getattr_nolabel; } rcu_read_unlock(); secattr->flags |= NETLBL_SECATTR_SECID; secattr->type = NETLBL_NLTYPE_UNLABELED; return 0; unlabel_getattr_nolabel: rcu_read_unlock(); if (netlabel_unlabel_acceptflg == 0) return -ENOMSG; secattr->type = NETLBL_NLTYPE_UNLABELED; return 0; } /** * netlbl_unlabel_defconf - Set the default config to allow unlabeled packets * * Description: * Set the default NetLabel configuration to allow incoming unlabeled packets * and to send unlabeled network traffic by default. * */ int __init netlbl_unlabel_defconf(void) { int ret_val; struct netlbl_dom_map *entry; struct netlbl_audit audit_info; /* Only the kernel is allowed to call this function and the only time * it is called is at bootup before the audit subsystem is reporting * messages so don't worry to much about these values. */ security_current_getlsmprop_subj(&audit_info.prop); audit_info.loginuid = GLOBAL_ROOT_UID; audit_info.sessionid = 0; entry = kzalloc(sizeof(*entry), GFP_KERNEL); if (entry == NULL) return -ENOMEM; entry->family = AF_UNSPEC; entry->def.type = NETLBL_NLTYPE_UNLABELED; ret_val = netlbl_domhsh_add_default(entry, &audit_info); if (ret_val != 0) return ret_val; netlbl_unlabel_acceptflg_set(1, &audit_info); return 0; }
4 4 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 /* * Copyright (c) 2015, Mellanox Technologies inc. All rights reserved. * * This software is available to you under a choice of one of two * licenses. You may choose to be licensed under the terms of the GNU * General Public License (GPL) Version 2, available from the file * COPYING in the main directory of this source tree, or the * OpenIB.org BSD license below: * * Redistribution and use in source and binary forms, with or * without modification, are permitted provided that the following * conditions are met: * * - Redistributions of source code must retain the above * copyright notice, this list of conditions and the following * disclaimer. * * - Redistributions in binary form must reproduce the above * copyright notice, this list of conditions and the following * disclaimer in the documentation and/or other materials * provided with the distribution. * * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE * SOFTWARE. */ #include "core_priv.h" #include <linux/in.h> #include <linux/in6.h> /* For in6_dev_get/in6_dev_put */ #include <net/addrconf.h> #include <net/bonding.h> #include <rdma/ib_cache.h> #include <rdma/ib_addr.h> static struct workqueue_struct *gid_cache_wq; enum gid_op_type { GID_DEL = 0, GID_ADD }; struct update_gid_event_work { struct work_struct work; union ib_gid gid; struct ib_gid_attr gid_attr; enum gid_op_type gid_op; }; #define ROCE_NETDEV_CALLBACK_SZ 3 struct netdev_event_work_cmd { roce_netdev_callback cb; roce_netdev_filter filter; struct net_device *ndev; struct net_device *filter_ndev; }; struct netdev_event_work { struct work_struct work; struct netdev_event_work_cmd cmds[ROCE_NETDEV_CALLBACK_SZ]; }; static const struct { bool (*is_supported)(const struct ib_device *device, u32 port_num); enum ib_gid_type gid_type; } PORT_CAP_TO_GID_TYPE[] = { {rdma_protocol_roce_eth_encap, IB_GID_TYPE_ROCE}, {rdma_protocol_roce_udp_encap, IB_GID_TYPE_ROCE_UDP_ENCAP}, }; #define CAP_TO_GID_TABLE_SIZE ARRAY_SIZE(PORT_CAP_TO_GID_TYPE) unsigned long roce_gid_type_mask_support(struct ib_device *ib_dev, u32 port) { int i; unsigned int ret_flags = 0; if (!rdma_protocol_roce(ib_dev, port)) return 1UL << IB_GID_TYPE_IB; for (i = 0; i < CAP_TO_GID_TABLE_SIZE; i++) if (PORT_CAP_TO_GID_TYPE[i].is_supported(ib_dev, port)) ret_flags |= 1UL << PORT_CAP_TO_GID_TYPE[i].gid_type; return ret_flags; } EXPORT_SYMBOL(roce_gid_type_mask_support); static void update_gid(enum gid_op_type gid_op, struct ib_device *ib_dev, u32 port, union ib_gid *gid, struct ib_gid_attr *gid_attr) { int i; unsigned long gid_type_mask = roce_gid_type_mask_support(ib_dev, port); for (i = 0; i < IB_GID_TYPE_SIZE; i++) { if ((1UL << i) & gid_type_mask) { gid_attr->gid_type = i; switch (gid_op) { case GID_ADD: ib_cache_gid_add(ib_dev, port, gid, gid_attr); break; case GID_DEL: ib_cache_gid_del(ib_dev, port, gid, gid_attr); break; } } } } enum bonding_slave_state { BONDING_SLAVE_STATE_ACTIVE = 1UL << 0, BONDING_SLAVE_STATE_INACTIVE = 1UL << 1, /* No primary slave or the device isn't a slave in bonding */ BONDING_SLAVE_STATE_NA = 1UL << 2, }; static enum bonding_slave_state is_eth_active_slave_of_bonding_rcu(struct net_device *dev, struct net_device *upper) { if (upper && netif_is_bond_master(upper)) { struct net_device *pdev = bond_option_active_slave_get_rcu(netdev_priv(upper)); if (pdev) return dev == pdev ? BONDING_SLAVE_STATE_ACTIVE : BONDING_SLAVE_STATE_INACTIVE; } return BONDING_SLAVE_STATE_NA; } #define REQUIRED_BOND_STATES (BONDING_SLAVE_STATE_ACTIVE | \ BONDING_SLAVE_STATE_NA) static bool is_eth_port_of_netdev_filter(struct ib_device *ib_dev, u32 port, struct net_device *rdma_ndev, void *cookie) { struct net_device *real_dev; bool res; if (!rdma_ndev) return false; rcu_read_lock(); real_dev = rdma_vlan_dev_real_dev(cookie); if (!real_dev) real_dev = cookie; res = ((rdma_is_upper_dev_rcu(rdma_ndev, cookie) && (is_eth_active_slave_of_bonding_rcu(rdma_ndev, real_dev) & REQUIRED_BOND_STATES)) || real_dev == rdma_ndev); rcu_read_unlock(); return res; } static bool is_eth_port_inactive_slave_filter(struct ib_device *ib_dev, u32 port, struct net_device *rdma_ndev, void *cookie) { struct net_device *master_dev; bool res; if (!rdma_ndev) return false; rcu_read_lock(); master_dev = netdev_master_upper_dev_get_rcu(rdma_ndev); res = is_eth_active_slave_of_bonding_rcu(rdma_ndev, master_dev) == BONDING_SLAVE_STATE_INACTIVE; rcu_read_unlock(); return res; } /** * is_ndev_for_default_gid_filter - Check if a given netdevice * can be considered for default GIDs or not. * @ib_dev: IB device to check * @port: Port to consider for adding default GID * @rdma_ndev: rdma netdevice pointer * @cookie: Netdevice to consider to form a default GID * * is_ndev_for_default_gid_filter() returns true if a given netdevice can be * considered for deriving default RoCE GID, returns false otherwise. */ static bool is_ndev_for_default_gid_filter(struct ib_device *ib_dev, u32 port, struct net_device *rdma_ndev, void *cookie) { struct net_device *cookie_ndev = cookie; bool res; if (!rdma_ndev) return false; rcu_read_lock(); /* * When rdma netdevice is used in bonding, bonding master netdevice * should be considered for default GIDs. Therefore, ignore slave rdma * netdevices when bonding is considered. * Additionally when event(cookie) netdevice is bond master device, * make sure that it the upper netdevice of rdma netdevice. */ res = ((cookie_ndev == rdma_ndev && !netif_is_bond_slave(rdma_ndev)) || (netif_is_bond_master(cookie_ndev) && rdma_is_upper_dev_rcu(rdma_ndev, cookie_ndev))); rcu_read_unlock(); return res; } static bool pass_all_filter(struct ib_device *ib_dev, u32 port, struct net_device *rdma_ndev, void *cookie) { return true; } static bool upper_device_filter(struct ib_device *ib_dev, u32 port, struct net_device *rdma_ndev, void *cookie) { bool res; if (!rdma_ndev) return false; if (rdma_ndev == cookie) return true; rcu_read_lock(); res = rdma_is_upper_dev_rcu(rdma_ndev, cookie); rcu_read_unlock(); return res; } /** * is_upper_ndev_bond_master_filter - Check if a given netdevice * is bond master device of netdevice of the RDMA device of port. * @ib_dev: IB device to check * @port: Port to consider for adding default GID * @rdma_ndev: Pointer to rdma netdevice * @cookie: Netdevice to consider to form a default GID * * is_upper_ndev_bond_master_filter() returns true if a cookie_netdev * is bond master device and rdma_ndev is its lower netdevice. It might * not have been established as slave device yet. */ static bool is_upper_ndev_bond_master_filter(struct ib_device *ib_dev, u32 port, struct net_device *rdma_ndev, void *cookie) { struct net_device *cookie_ndev = cookie; bool match = false; if (!rdma_ndev) return false; rcu_read_lock(); if (netif_is_bond_master(cookie_ndev) && rdma_is_upper_dev_rcu(rdma_ndev, cookie_ndev)) match = true; rcu_read_unlock(); return match; } static void update_gid_ip(enum gid_op_type gid_op, struct ib_device *ib_dev, u32 port, struct net_device *ndev, struct sockaddr *addr) { union ib_gid gid; struct ib_gid_attr gid_attr; rdma_ip2gid(addr, &gid); memset(&gid_attr, 0, sizeof(gid_attr)); gid_attr.ndev = ndev; update_gid(gid_op, ib_dev, port, &gid, &gid_attr); } static void bond_delete_netdev_default_gids(struct ib_device *ib_dev, u32 port, struct net_device *rdma_ndev, struct net_device *event_ndev) { struct net_device *real_dev = rdma_vlan_dev_real_dev(event_ndev); unsigned long gid_type_mask; if (!rdma_ndev) return; if (!real_dev) real_dev = event_ndev; rcu_read_lock(); if (((rdma_ndev != event_ndev && !rdma_is_upper_dev_rcu(rdma_ndev, event_ndev)) || is_eth_active_slave_of_bonding_rcu(rdma_ndev, real_dev) == BONDING_SLAVE_STATE_INACTIVE)) { rcu_read_unlock(); return; } rcu_read_unlock(); gid_type_mask = roce_gid_type_mask_support(ib_dev, port); ib_cache_gid_set_default_gid(ib_dev, port, rdma_ndev, gid_type_mask, IB_CACHE_GID_DEFAULT_MODE_DELETE); } static void enum_netdev_ipv4_ips(struct ib_device *ib_dev, u32 port, struct net_device *ndev) { const struct in_ifaddr *ifa; struct in_device *in_dev; struct sin_list { struct list_head list; struct sockaddr_in ip; }; struct sin_list *sin_iter; struct sin_list *sin_temp; LIST_HEAD(sin_list); if (ndev->reg_state >= NETREG_UNREGISTERING) return; rcu_read_lock(); in_dev = __in_dev_get_rcu(ndev); if (!in_dev) { rcu_read_unlock(); return; } in_dev_for_each_ifa_rcu(ifa, in_dev) { struct sin_list *entry = kzalloc(sizeof(*entry), GFP_ATOMIC); if (!entry) continue; entry->ip.sin_family = AF_INET; entry->ip.sin_addr.s_addr = ifa->ifa_address; list_add_tail(&entry->list, &sin_list); } rcu_read_unlock(); list_for_each_entry_safe(sin_iter, sin_temp, &sin_list, list) { update_gid_ip(GID_ADD, ib_dev, port, ndev, (struct sockaddr *)&sin_iter->ip); list_del(&sin_iter->list); kfree(sin_iter); } } static void enum_netdev_ipv6_ips(struct ib_device *ib_dev, u32 port, struct net_device *ndev) { struct inet6_ifaddr *ifp; struct inet6_dev *in6_dev; struct sin6_list { struct list_head list; struct sockaddr_in6 sin6; }; struct sin6_list *sin6_iter; struct sin6_list *sin6_temp; struct ib_gid_attr gid_attr = {.ndev = ndev}; LIST_HEAD(sin6_list); if (ndev->reg_state >= NETREG_UNREGISTERING) return; in6_dev = in6_dev_get(ndev); if (!in6_dev) return; read_lock_bh(&in6_dev->lock); list_for_each_entry(ifp, &in6_dev->addr_list, if_list) { struct sin6_list *entry = kzalloc(sizeof(*entry), GFP_ATOMIC); if (!entry) continue; entry->sin6.sin6_family = AF_INET6; entry->sin6.sin6_addr = ifp->addr; list_add_tail(&entry->list, &sin6_list); } read_unlock_bh(&in6_dev->lock); in6_dev_put(in6_dev); list_for_each_entry_safe(sin6_iter, sin6_temp, &sin6_list, list) { union ib_gid gid; rdma_ip2gid((struct sockaddr *)&sin6_iter->sin6, &gid); update_gid(GID_ADD, ib_dev, port, &gid, &gid_attr); list_del(&sin6_iter->list); kfree(sin6_iter); } } static void _add_netdev_ips(struct ib_device *ib_dev, u32 port, struct net_device *ndev) { enum_netdev_ipv4_ips(ib_dev, port, ndev); if (IS_ENABLED(CONFIG_IPV6)) enum_netdev_ipv6_ips(ib_dev, port, ndev); } static void add_netdev_ips(struct ib_device *ib_dev, u32 port, struct net_device *rdma_ndev, void *cookie) { _add_netdev_ips(ib_dev, port, cookie); } static void del_netdev_ips(struct ib_device *ib_dev, u32 port, struct net_device *rdma_ndev, void *cookie) { ib_cache_gid_del_all_netdev_gids(ib_dev, port, cookie); } /** * del_default_gids - Delete default GIDs of the event/cookie netdevice * @ib_dev: RDMA device pointer * @port: Port of the RDMA device whose GID table to consider * @rdma_ndev: Unused rdma netdevice * @cookie: Pointer to event netdevice * * del_default_gids() deletes the default GIDs of the event/cookie netdevice. */ static void del_default_gids(struct ib_device *ib_dev, u32 port, struct net_device *rdma_ndev, void *cookie) { struct net_device *cookie_ndev = cookie; unsigned long gid_type_mask; gid_type_mask = roce_gid_type_mask_support(ib_dev, port); ib_cache_gid_set_default_gid(ib_dev, port, cookie_ndev, gid_type_mask, IB_CACHE_GID_DEFAULT_MODE_DELETE); } static void add_default_gids(struct ib_device *ib_dev, u32 port, struct net_device *rdma_ndev, void *cookie) { struct net_device *event_ndev = cookie; unsigned long gid_type_mask; gid_type_mask = roce_gid_type_mask_support(ib_dev, port); ib_cache_gid_set_default_gid(ib_dev, port, event_ndev, gid_type_mask, IB_CACHE_GID_DEFAULT_MODE_SET); } static void enum_all_gids_of_dev_cb(struct ib_device *ib_dev, u32 port, struct net_device *rdma_ndev, void *cookie) { struct net *net; struct net_device *ndev; /* Lock the rtnl to make sure the netdevs does not move under * our feet */ rtnl_lock(); down_read(&net_rwsem); for_each_net(net) for_each_netdev(net, ndev) { /* * Filter and add default GIDs of the primary netdevice * when not in bonding mode, or add default GIDs * of bond master device, when in bonding mode. */ if (is_ndev_for_default_gid_filter(ib_dev, port, rdma_ndev, ndev)) add_default_gids(ib_dev, port, rdma_ndev, ndev); if (is_eth_port_of_netdev_filter(ib_dev, port, rdma_ndev, ndev)) _add_netdev_ips(ib_dev, port, ndev); } up_read(&net_rwsem); rtnl_unlock(); } /** * rdma_roce_rescan_device - Rescan all of the network devices in the system * and add their gids, as needed, to the relevant RoCE devices. * * @ib_dev: the rdma device */ void rdma_roce_rescan_device(struct ib_device *ib_dev) { ib_enum_roce_netdev(ib_dev, pass_all_filter, NULL, enum_all_gids_of_dev_cb, NULL); } EXPORT_SYMBOL(rdma_roce_rescan_device); /** * rdma_roce_rescan_port - Rescan all of the network devices in the system * and add their gids if relevant to the port of the RoCE device. * * @ib_dev: IB device * @port: Port number */ void rdma_roce_rescan_port(struct ib_device *ib_dev, u32 port) { struct net_device *ndev = NULL; if (rdma_protocol_roce(ib_dev, port)) { ndev = ib_device_get_netdev(ib_dev, port); if (!ndev) return; enum_all_gids_of_dev_cb(ib_dev, port, ndev, ndev); dev_put(ndev); } } EXPORT_SYMBOL(rdma_roce_rescan_port); static void callback_for_addr_gid_device_scan(struct ib_device *device, u32 port, struct net_device *rdma_ndev, void *cookie) { struct update_gid_event_work *parsed = cookie; return update_gid(parsed->gid_op, device, port, &parsed->gid, &parsed->gid_attr); } struct upper_list { struct list_head list; struct net_device *upper; }; static int netdev_upper_walk(struct net_device *upper, struct netdev_nested_priv *priv) { struct upper_list *entry = kmalloc(sizeof(*entry), GFP_ATOMIC); struct list_head *upper_list = (struct list_head *)priv->data; if (!entry) return 0; list_add_tail(&entry->list, upper_list); dev_hold(upper); entry->upper = upper; return 0; } static void handle_netdev_upper(struct ib_device *ib_dev, u32 port, void *cookie, void (*handle_netdev)(struct ib_device *ib_dev, u32 port, struct net_device *ndev)) { struct net_device *ndev = cookie; struct netdev_nested_priv priv; struct upper_list *upper_iter; struct upper_list *upper_temp; LIST_HEAD(upper_list); priv.data = &upper_list; rcu_read_lock(); netdev_walk_all_upper_dev_rcu(ndev, netdev_upper_walk, &priv); rcu_read_unlock(); handle_netdev(ib_dev, port, ndev); list_for_each_entry_safe(upper_iter, upper_temp, &upper_list, list) { handle_netdev(ib_dev, port, upper_iter->upper); dev_put(upper_iter->upper); list_del(&upper_iter->list); kfree(upper_iter); } } void roce_del_all_netdev_gids(struct ib_device *ib_dev, u32 port, struct net_device *ndev) { ib_cache_gid_del_all_netdev_gids(ib_dev, port, ndev); } EXPORT_SYMBOL(roce_del_all_netdev_gids); static void del_netdev_upper_ips(struct ib_device *ib_dev, u32 port, struct net_device *rdma_ndev, void *cookie) { handle_netdev_upper(ib_dev, port, cookie, roce_del_all_netdev_gids); } static void add_netdev_upper_ips(struct ib_device *ib_dev, u32 port, struct net_device *rdma_ndev, void *cookie) { handle_netdev_upper(ib_dev, port, cookie, _add_netdev_ips); } static void del_netdev_default_ips_join(struct ib_device *ib_dev, u32 port, struct net_device *rdma_ndev, void *cookie) { struct net_device *master_ndev; rcu_read_lock(); master_ndev = netdev_master_upper_dev_get_rcu(rdma_ndev); dev_hold(master_ndev); rcu_read_unlock(); if (master_ndev) { bond_delete_netdev_default_gids(ib_dev, port, rdma_ndev, master_ndev); dev_put(master_ndev); } } /* The following functions operate on all IB devices. netdevice_event and * addr_event execute ib_enum_all_roce_netdevs through a work. * ib_enum_all_roce_netdevs iterates through all IB devices. */ static void netdevice_event_work_handler(struct work_struct *_work) { struct netdev_event_work *work = container_of(_work, struct netdev_event_work, work); unsigned int i; for (i = 0; i < ARRAY_SIZE(work->cmds) && work->cmds[i].cb; i++) { ib_enum_all_roce_netdevs(work->cmds[i].filter, work->cmds[i].filter_ndev, work->cmds[i].cb, work->cmds[i].ndev); dev_put(work->cmds[i].ndev); dev_put(work->cmds[i].filter_ndev); } kfree(work); } static int netdevice_queue_work(struct netdev_event_work_cmd *cmds, struct net_device *ndev) { unsigned int i; struct netdev_event_work *ndev_work = kmalloc(sizeof(*ndev_work), GFP_KERNEL); if (!ndev_work) return NOTIFY_DONE; memcpy(ndev_work->cmds, cmds, sizeof(ndev_work->cmds)); for (i = 0; i < ARRAY_SIZE(ndev_work->cmds) && ndev_work->cmds[i].cb; i++) { if (!ndev_work->cmds[i].ndev) ndev_work->cmds[i].ndev = ndev; if (!ndev_work->cmds[i].filter_ndev) ndev_work->cmds[i].filter_ndev = ndev; dev_hold(ndev_work->cmds[i].ndev); dev_hold(ndev_work->cmds[i].filter_ndev); } INIT_WORK(&ndev_work->work, netdevice_event_work_handler); queue_work(gid_cache_wq, &ndev_work->work); return NOTIFY_DONE; } static const struct netdev_event_work_cmd add_cmd = { .cb = add_netdev_ips, .filter = is_eth_port_of_netdev_filter }; static const struct netdev_event_work_cmd add_cmd_upper_ips = { .cb = add_netdev_upper_ips, .filter = is_eth_port_of_netdev_filter }; static void ndev_event_unlink(struct netdev_notifier_changeupper_info *changeupper_info, struct netdev_event_work_cmd *cmds) { static const struct netdev_event_work_cmd upper_ips_del_cmd = { .cb = del_netdev_upper_ips, .filter = upper_device_filter }; cmds[0] = upper_ips_del_cmd; cmds[0].ndev = changeupper_info->upper_dev; cmds[1] = add_cmd; } static const struct netdev_event_work_cmd bonding_default_add_cmd = { .cb = add_default_gids, .filter = is_upper_ndev_bond_master_filter }; static void ndev_event_link(struct net_device *event_ndev, struct netdev_notifier_changeupper_info *changeupper_info, struct netdev_event_work_cmd *cmds) { static const struct netdev_event_work_cmd bonding_default_del_cmd = { .cb = del_default_gids, .filter = is_upper_ndev_bond_master_filter }; /* * When a lower netdev is linked to its upper bonding * netdev, delete lower slave netdev's default GIDs. */ cmds[0] = bonding_default_del_cmd; cmds[0].ndev = event_ndev; cmds[0].filter_ndev = changeupper_info->upper_dev; /* Now add bonding upper device default GIDs */ cmds[1] = bonding_default_add_cmd; cmds[1].ndev = changeupper_info->upper_dev; cmds[1].filter_ndev = changeupper_info->upper_dev; /* Now add bonding upper device IP based GIDs */ cmds[2] = add_cmd_upper_ips; cmds[2].ndev = changeupper_info->upper_dev; cmds[2].filter_ndev = changeupper_info->upper_dev; } static void netdevice_event_changeupper(struct net_device *event_ndev, struct netdev_notifier_changeupper_info *changeupper_info, struct netdev_event_work_cmd *cmds) { if (changeupper_info->linking) ndev_event_link(event_ndev, changeupper_info, cmds); else ndev_event_unlink(changeupper_info, cmds); } static const struct netdev_event_work_cmd add_default_gid_cmd = { .cb = add_default_gids, .filter = is_ndev_for_default_gid_filter, }; static int netdevice_event(struct notifier_block *this, unsigned long event, void *ptr) { static const struct netdev_event_work_cmd del_cmd = { .cb = del_netdev_ips, .filter = pass_all_filter}; static const struct netdev_event_work_cmd bonding_default_del_cmd_join = { .cb = del_netdev_default_ips_join, .filter = is_eth_port_inactive_slave_filter }; static const struct netdev_event_work_cmd netdev_del_cmd = { .cb = del_netdev_ips, .filter = is_eth_port_of_netdev_filter }; static const struct netdev_event_work_cmd bonding_event_ips_del_cmd = { .cb = del_netdev_upper_ips, .filter = upper_device_filter}; struct net_device *ndev = netdev_notifier_info_to_dev(ptr); struct netdev_event_work_cmd cmds[ROCE_NETDEV_CALLBACK_SZ] = { {NULL} }; if (ndev->type != ARPHRD_ETHER) return NOTIFY_DONE; switch (event) { case NETDEV_REGISTER: case NETDEV_UP: cmds[0] = bonding_default_del_cmd_join; cmds[1] = add_default_gid_cmd; cmds[2] = add_cmd; break; case NETDEV_UNREGISTER: if (ndev->reg_state < NETREG_UNREGISTERED) cmds[0] = del_cmd; else return NOTIFY_DONE; break; case NETDEV_CHANGEADDR: cmds[0] = netdev_del_cmd; if (ndev->reg_state == NETREG_REGISTERED) { cmds[1] = add_default_gid_cmd; cmds[2] = add_cmd; } break; case NETDEV_CHANGEUPPER: netdevice_event_changeupper(ndev, container_of(ptr, struct netdev_notifier_changeupper_info, info), cmds); break; case NETDEV_BONDING_FAILOVER: cmds[0] = bonding_event_ips_del_cmd; /* Add default GIDs of the bond device */ cmds[1] = bonding_default_add_cmd; /* Add IP based GIDs of the bond device */ cmds[2] = add_cmd_upper_ips; break; default: return NOTIFY_DONE; } return netdevice_queue_work(cmds, ndev); } static void update_gid_event_work_handler(struct work_struct *_work) { struct update_gid_event_work *work = container_of(_work, struct update_gid_event_work, work); ib_enum_all_roce_netdevs(is_eth_port_of_netdev_filter, work->gid_attr.ndev, callback_for_addr_gid_device_scan, work); dev_put(work->gid_attr.ndev); kfree(work); } static int addr_event(struct notifier_block *this, unsigned long event, struct sockaddr *sa, struct net_device *ndev) { struct update_gid_event_work *work; enum gid_op_type gid_op; if (ndev->type != ARPHRD_ETHER) return NOTIFY_DONE; switch (event) { case NETDEV_UP: gid_op = GID_ADD; break; case NETDEV_DOWN: gid_op = GID_DEL; break; default: return NOTIFY_DONE; } work = kmalloc(sizeof(*work), GFP_ATOMIC); if (!work) return NOTIFY_DONE; INIT_WORK(&work->work, update_gid_event_work_handler); rdma_ip2gid(sa, &work->gid); work->gid_op = gid_op; memset(&work->gid_attr, 0, sizeof(work->gid_attr)); dev_hold(ndev); work->gid_attr.ndev = ndev; queue_work(gid_cache_wq, &work->work); return NOTIFY_DONE; } static int inetaddr_event(struct notifier_block *this, unsigned long event, void *ptr) { struct sockaddr_in in; struct net_device *ndev; struct in_ifaddr *ifa = ptr; in.sin_family = AF_INET; in.sin_addr.s_addr = ifa->ifa_address; ndev = ifa->ifa_dev->dev; return addr_event(this, event, (struct sockaddr *)&in, ndev); } static int inet6addr_event(struct notifier_block *this, unsigned long event, void *ptr) { struct sockaddr_in6 in6; struct net_device *ndev; struct inet6_ifaddr *ifa6 = ptr; in6.sin6_family = AF_INET6; in6.sin6_addr = ifa6->addr; ndev = ifa6->idev->dev; return addr_event(this, event, (struct sockaddr *)&in6, ndev); } static struct notifier_block nb_netdevice = { .notifier_call = netdevice_event }; static struct notifier_block nb_inetaddr = { .notifier_call = inetaddr_event }; static struct notifier_block nb_inet6addr = { .notifier_call = inet6addr_event }; int __init roce_gid_mgmt_init(void) { gid_cache_wq = alloc_ordered_workqueue("gid-cache-wq", 0); if (!gid_cache_wq) return -ENOMEM; register_inetaddr_notifier(&nb_inetaddr); if (IS_ENABLED(CONFIG_IPV6)) register_inet6addr_notifier(&nb_inet6addr); /* We relay on the netdevice notifier to enumerate all * existing devices in the system. Register to this notifier * last to make sure we will not miss any IP add/del * callbacks. */ register_netdevice_notifier(&nb_netdevice); return 0; } void __exit roce_gid_mgmt_cleanup(void) { if (IS_ENABLED(CONFIG_IPV6)) unregister_inet6addr_notifier(&nb_inet6addr); unregister_inetaddr_notifier(&nb_inetaddr); unregister_netdevice_notifier(&nb_netdevice); /* Ensure all gid deletion tasks complete before we go down, * to avoid any reference to free'd memory. By the time * ib-core is removed, all physical devices have been removed, * so no issue with remaining hardware contexts. */ destroy_workqueue(gid_cache_wq); }
9 9 9 9 264 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 /* SPDX-License-Identifier: GPL-2.0 */ /* * An extensible bitmap is a bitmap that supports an * arbitrary number of bits. Extensible bitmaps are * used to represent sets of values, such as types, * roles, categories, and classes. * * Each extensible bitmap is implemented as a linked * list of bitmap nodes, where each bitmap node has * an explicitly specified starting bit position within * the total bitmap. * * Author : Stephen Smalley, <stephen.smalley.work@gmail.com> */ #ifndef _SS_EBITMAP_H_ #define _SS_EBITMAP_H_ #include <net/netlabel.h> #ifdef CONFIG_64BIT #define EBITMAP_NODE_SIZE 64 #else #define EBITMAP_NODE_SIZE 32 #endif #define EBITMAP_UNIT_NUMS \ ((EBITMAP_NODE_SIZE - sizeof(void *) - sizeof(u32)) / \ sizeof(unsigned long)) #define EBITMAP_UNIT_SIZE BITS_PER_LONG #define EBITMAP_SIZE (EBITMAP_UNIT_NUMS * EBITMAP_UNIT_SIZE) #define EBITMAP_BIT 1UL #define EBITMAP_SHIFT_UNIT_SIZE(x) \ (((x) >> EBITMAP_UNIT_SIZE / 2) >> EBITMAP_UNIT_SIZE / 2) struct ebitmap_node { struct ebitmap_node *next; unsigned long maps[EBITMAP_UNIT_NUMS]; u32 startbit; }; struct ebitmap { struct ebitmap_node *node; /* first node in the bitmap */ u32 highbit; /* highest position in the total bitmap */ }; #define ebitmap_length(e) ((e)->highbit) static inline u32 ebitmap_start_positive(const struct ebitmap *e, struct ebitmap_node **n) { u32 ofs; for (*n = e->node; *n; *n = (*n)->next) { ofs = find_first_bit((*n)->maps, EBITMAP_SIZE); if (ofs < EBITMAP_SIZE) return (*n)->startbit + ofs; } return ebitmap_length(e); } static inline void ebitmap_init(struct ebitmap *e) { memset(e, 0, sizeof(*e)); } static inline u32 ebitmap_next_positive(const struct ebitmap *e, struct ebitmap_node **n, u32 bit) { u32 ofs; ofs = find_next_bit((*n)->maps, EBITMAP_SIZE, bit - (*n)->startbit + 1); if (ofs < EBITMAP_SIZE) return ofs + (*n)->startbit; for (*n = (*n)->next; *n; *n = (*n)->next) { ofs = find_first_bit((*n)->maps, EBITMAP_SIZE); if (ofs < EBITMAP_SIZE) return ofs + (*n)->startbit; } return ebitmap_length(e); } #define EBITMAP_NODE_INDEX(node, bit) \ (((bit) - (node)->startbit) / EBITMAP_UNIT_SIZE) #define EBITMAP_NODE_OFFSET(node, bit) \ (((bit) - (node)->startbit) % EBITMAP_UNIT_SIZE) static inline int ebitmap_node_get_bit(const struct ebitmap_node *n, u32 bit) { u32 index = EBITMAP_NODE_INDEX(n, bit); u32 ofs = EBITMAP_NODE_OFFSET(n, bit); BUG_ON(index >= EBITMAP_UNIT_NUMS); if ((n->maps[index] & (EBITMAP_BIT << ofs))) return 1; return 0; } static inline void ebitmap_node_set_bit(struct ebitmap_node *n, u32 bit) { u32 index = EBITMAP_NODE_INDEX(n, bit); u32 ofs = EBITMAP_NODE_OFFSET(n, bit); BUG_ON(index >= EBITMAP_UNIT_NUMS); n->maps[index] |= (EBITMAP_BIT << ofs); } static inline void ebitmap_node_clr_bit(struct ebitmap_node *n, u32 bit) { u32 index = EBITMAP_NODE_INDEX(n, bit); u32 ofs = EBITMAP_NODE_OFFSET(n, bit); BUG_ON(index >= EBITMAP_UNIT_NUMS); n->maps[index] &= ~(EBITMAP_BIT << ofs); } #define ebitmap_for_each_positive_bit(e, n, bit) \ for ((bit) = ebitmap_start_positive(e, &(n)); \ (bit) < ebitmap_length(e); \ (bit) = ebitmap_next_positive(e, &(n), bit)) bool ebitmap_equal(const struct ebitmap *e1, const struct ebitmap *e2); int ebitmap_cpy(struct ebitmap *dst, const struct ebitmap *src); int ebitmap_and(struct ebitmap *dst, const struct ebitmap *e1, const struct ebitmap *e2); int ebitmap_contains(const struct ebitmap *e1, const struct ebitmap *e2, u32 last_e2bit); int ebitmap_get_bit(const struct ebitmap *e, u32 bit); int ebitmap_set_bit(struct ebitmap *e, u32 bit, int value); void ebitmap_destroy(struct ebitmap *e); struct policy_file; int ebitmap_read(struct ebitmap *e, struct policy_file *fp); int ebitmap_write(const struct ebitmap *e, struct policy_file *fp); u32 ebitmap_hash(const struct ebitmap *e, u32 hash); #ifdef CONFIG_NETLABEL int ebitmap_netlbl_export(struct ebitmap *ebmap, struct netlbl_lsm_catmap **catmap); int ebitmap_netlbl_import(struct ebitmap *ebmap, struct netlbl_lsm_catmap *catmap); #else static inline int ebitmap_netlbl_export(struct ebitmap *ebmap, struct netlbl_lsm_catmap **catmap) { return -ENOMEM; } static inline int ebitmap_netlbl_import(struct ebitmap *ebmap, struct netlbl_lsm_catmap *catmap) { return -ENOMEM; } #endif #endif /* _SS_EBITMAP_H_ */
2 79 81 143 1 202 65 1 3 216 218 210 210 4 2 2 1 196 2 196 54 165 165 162 1 55 56 46 54 144 143 204 205 204 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 /* SPDX-License-Identifier: GPL-2.0-only */ /* * Copyright (C) 2012,2013 - ARM Ltd * Author: Marc Zyngier <marc.zyngier@arm.com> * * Derived from arch/arm/include/kvm_emulate.h * Copyright (C) 2012 - Virtual Open Systems and Columbia University * Author: Christoffer Dall <c.dall@virtualopensystems.com> */ #ifndef __ARM64_KVM_EMULATE_H__ #define __ARM64_KVM_EMULATE_H__ #include <linux/bitfield.h> #include <linux/kvm_host.h> #include <asm/debug-monitors.h> #include <asm/esr.h> #include <asm/kvm_arm.h> #include <asm/kvm_hyp.h> #include <asm/kvm_nested.h> #include <asm/ptrace.h> #include <asm/cputype.h> #include <asm/virt.h> #define CURRENT_EL_SP_EL0_VECTOR 0x0 #define CURRENT_EL_SP_ELx_VECTOR 0x200 #define LOWER_EL_AArch64_VECTOR 0x400 #define LOWER_EL_AArch32_VECTOR 0x600 enum exception_type { except_type_sync = 0, except_type_irq = 0x80, except_type_fiq = 0x100, except_type_serror = 0x180, }; #define kvm_exception_type_names \ { except_type_sync, "SYNC" }, \ { except_type_irq, "IRQ" }, \ { except_type_fiq, "FIQ" }, \ { except_type_serror, "SERROR" } bool kvm_condition_valid32(const struct kvm_vcpu *vcpu); void kvm_skip_instr32(struct kvm_vcpu *vcpu); void kvm_inject_undefined(struct kvm_vcpu *vcpu); int kvm_inject_serror_esr(struct kvm_vcpu *vcpu, u64 esr); int kvm_inject_sea(struct kvm_vcpu *vcpu, bool iabt, u64 addr); void kvm_inject_size_fault(struct kvm_vcpu *vcpu); static inline int kvm_inject_sea_dabt(struct kvm_vcpu *vcpu, u64 addr) { return kvm_inject_sea(vcpu, false, addr); } static inline int kvm_inject_sea_iabt(struct kvm_vcpu *vcpu, u64 addr) { return kvm_inject_sea(vcpu, true, addr); } static inline int kvm_inject_serror(struct kvm_vcpu *vcpu) { /* * ESR_ELx.ISV (later renamed to IDS) indicates whether or not * ESR_ELx.ISS contains IMPLEMENTATION DEFINED syndrome information. * * Set the bit when injecting an SError w/o an ESR to indicate ISS * does not follow the architected format. */ return kvm_inject_serror_esr(vcpu, ESR_ELx_ISV); } void kvm_vcpu_wfi(struct kvm_vcpu *vcpu); void kvm_emulate_nested_eret(struct kvm_vcpu *vcpu); int kvm_inject_nested_sync(struct kvm_vcpu *vcpu, u64 esr_el2); int kvm_inject_nested_irq(struct kvm_vcpu *vcpu); int kvm_inject_nested_sea(struct kvm_vcpu *vcpu, bool iabt, u64 addr); int kvm_inject_nested_serror(struct kvm_vcpu *vcpu, u64 esr); static inline void kvm_inject_nested_sve_trap(struct kvm_vcpu *vcpu) { u64 esr = FIELD_PREP(ESR_ELx_EC_MASK, ESR_ELx_EC_SVE) | ESR_ELx_IL; kvm_inject_nested_sync(vcpu, esr); } #if defined(__KVM_VHE_HYPERVISOR__) || defined(__KVM_NVHE_HYPERVISOR__) static __always_inline bool vcpu_el1_is_32bit(struct kvm_vcpu *vcpu) { return !(vcpu->arch.hcr_el2 & HCR_RW); } #else static __always_inline bool vcpu_el1_is_32bit(struct kvm_vcpu *vcpu) { return vcpu_has_feature(vcpu, KVM_ARM_VCPU_EL1_32BIT); } #endif static inline void vcpu_reset_hcr(struct kvm_vcpu *vcpu) { if (!vcpu_has_run_once(vcpu)) vcpu->arch.hcr_el2 = HCR_GUEST_FLAGS; /* * For non-FWB CPUs, we trap VM ops (HCR_EL2.TVM) until M+C * get set in SCTLR_EL1 such that we can detect when the guest * MMU gets turned on and do the necessary cache maintenance * then. */ if (!cpus_have_final_cap(ARM64_HAS_STAGE2_FWB)) vcpu->arch.hcr_el2 |= HCR_TVM; } static inline unsigned long *vcpu_hcr(struct kvm_vcpu *vcpu) { return (unsigned long *)&vcpu->arch.hcr_el2; } static inline void vcpu_clear_wfx_traps(struct kvm_vcpu *vcpu) { vcpu->arch.hcr_el2 &= ~HCR_TWE; if (atomic_read(&vcpu->arch.vgic_cpu.vgic_v3.its_vpe.vlpi_count) || vcpu->kvm->arch.vgic.nassgireq) vcpu->arch.hcr_el2 &= ~HCR_TWI; else vcpu->arch.hcr_el2 |= HCR_TWI; } static inline void vcpu_set_wfx_traps(struct kvm_vcpu *vcpu) { vcpu->arch.hcr_el2 |= HCR_TWE; vcpu->arch.hcr_el2 |= HCR_TWI; } static inline unsigned long vcpu_get_vsesr(struct kvm_vcpu *vcpu) { return vcpu->arch.vsesr_el2; } static inline void vcpu_set_vsesr(struct kvm_vcpu *vcpu, u64 vsesr) { vcpu->arch.vsesr_el2 = vsesr; } static __always_inline unsigned long *vcpu_pc(const struct kvm_vcpu *vcpu) { return (unsigned long *)&vcpu_gp_regs(vcpu)->pc; } static __always_inline unsigned long *vcpu_cpsr(const struct kvm_vcpu *vcpu) { return (unsigned long *)&vcpu_gp_regs(vcpu)->pstate; } static __always_inline bool vcpu_mode_is_32bit(const struct kvm_vcpu *vcpu) { return !!(*vcpu_cpsr(vcpu) & PSR_MODE32_BIT); } static __always_inline bool kvm_condition_valid(const struct kvm_vcpu *vcpu) { if (vcpu_mode_is_32bit(vcpu)) return kvm_condition_valid32(vcpu); return true; } static inline void vcpu_set_thumb(struct kvm_vcpu *vcpu) { *vcpu_cpsr(vcpu) |= PSR_AA32_T_BIT; } /* * vcpu_get_reg and vcpu_set_reg should always be passed a register number * coming from a read of ESR_EL2. Otherwise, it may give the wrong result on * AArch32 with banked registers. */ static __always_inline unsigned long vcpu_get_reg(const struct kvm_vcpu *vcpu, u8 reg_num) { return (reg_num == 31) ? 0 : vcpu_gp_regs(vcpu)->regs[reg_num]; } static __always_inline void vcpu_set_reg(struct kvm_vcpu *vcpu, u8 reg_num, unsigned long val) { if (reg_num != 31) vcpu_gp_regs(vcpu)->regs[reg_num] = val; } static inline bool vcpu_is_el2_ctxt(const struct kvm_cpu_context *ctxt) { switch (ctxt->regs.pstate & (PSR_MODE32_BIT | PSR_MODE_MASK)) { case PSR_MODE_EL2h: case PSR_MODE_EL2t: return true; default: return false; } } static inline bool vcpu_is_el2(const struct kvm_vcpu *vcpu) { return vcpu_is_el2_ctxt(&vcpu->arch.ctxt); } static inline bool vcpu_el2_e2h_is_set(const struct kvm_vcpu *vcpu) { return (!cpus_have_final_cap(ARM64_HAS_HCR_NV1) || (__vcpu_sys_reg(vcpu, HCR_EL2) & HCR_E2H)); } static inline bool vcpu_el2_tge_is_set(const struct kvm_vcpu *vcpu) { return ctxt_sys_reg(&vcpu->arch.ctxt, HCR_EL2) & HCR_TGE; } static inline bool vcpu_el2_amo_is_set(const struct kvm_vcpu *vcpu) { /* * DDI0487L.b Known Issue D22105 * * When executing at EL2 and HCR_EL2.{E2H,TGE} = {1, 0} it is * IMPLEMENTATION DEFINED whether the effective value of HCR_EL2.AMO * is the value programmed or 1. * * Make the implementation choice of treating the effective value as 1 as * we cannot subsequently catch changes to TGE or AMO that would * otherwise lead to the SError becoming deliverable. */ if (vcpu_is_el2(vcpu) && vcpu_el2_e2h_is_set(vcpu) && !vcpu_el2_tge_is_set(vcpu)) return true; return ctxt_sys_reg(&vcpu->arch.ctxt, HCR_EL2) & HCR_AMO; } static inline bool is_hyp_ctxt(const struct kvm_vcpu *vcpu) { bool e2h, tge; u64 hcr; if (!vcpu_has_nv(vcpu)) return false; hcr = __vcpu_sys_reg(vcpu, HCR_EL2); e2h = (hcr & HCR_E2H); tge = (hcr & HCR_TGE); /* * We are in a hypervisor context if the vcpu mode is EL2 or * E2H and TGE bits are set. The latter means we are in the user space * of the VHE kernel. ARMv8.1 ARM describes this as 'InHost' * * Note that the HCR_EL2.{E2H,TGE}={0,1} isn't really handled in the * rest of the KVM code, and will result in a misbehaving guest. */ return vcpu_is_el2(vcpu) || (e2h && tge) || tge; } static inline bool vcpu_is_host_el0(const struct kvm_vcpu *vcpu) { return is_hyp_ctxt(vcpu) && !vcpu_is_el2(vcpu); } static inline bool is_nested_ctxt(struct kvm_vcpu *vcpu) { return vcpu_has_nv(vcpu) && !is_hyp_ctxt(vcpu); } static inline bool vserror_state_is_nested(struct kvm_vcpu *vcpu) { if (!is_nested_ctxt(vcpu)) return false; return vcpu_el2_amo_is_set(vcpu) || (__vcpu_sys_reg(vcpu, HCRX_EL2) & HCRX_EL2_TMEA); } /* * The layout of SPSR for an AArch32 state is different when observed from an * AArch64 SPSR_ELx or an AArch32 SPSR_*. This function generates the AArch32 * view given an AArch64 view. * * In ARM DDI 0487E.a see: * * - The AArch64 view (SPSR_EL2) in section C5.2.18, page C5-426 * - The AArch32 view (SPSR_abt) in section G8.2.126, page G8-6256 * - The AArch32 view (SPSR_und) in section G8.2.132, page G8-6280 * * Which show the following differences: * * | Bit | AA64 | AA32 | Notes | * +-----+------+------+-----------------------------| * | 24 | DIT | J | J is RES0 in ARMv8 | * | 21 | SS | DIT | SS doesn't exist in AArch32 | * * ... and all other bits are (currently) common. */ static inline unsigned long host_spsr_to_spsr32(unsigned long spsr) { const unsigned long overlap = BIT(24) | BIT(21); unsigned long dit = !!(spsr & PSR_AA32_DIT_BIT); spsr &= ~overlap; spsr |= dit << 21; return spsr; } static inline bool vcpu_mode_priv(const struct kvm_vcpu *vcpu) { u32 mode; if (vcpu_mode_is_32bit(vcpu)) { mode = *vcpu_cpsr(vcpu) & PSR_AA32_MODE_MASK; return mode > PSR_AA32_MODE_USR; } mode = *vcpu_cpsr(vcpu) & PSR_MODE_MASK; return mode != PSR_MODE_EL0t; } static __always_inline u64 kvm_vcpu_get_esr(const struct kvm_vcpu *vcpu) { return vcpu->arch.fault.esr_el2; } static inline bool guest_hyp_wfx_traps_enabled(const struct kvm_vcpu *vcpu) { u64 esr = kvm_vcpu_get_esr(vcpu); bool is_wfe = !!(esr & ESR_ELx_WFx_ISS_WFE); u64 hcr_el2 = __vcpu_sys_reg(vcpu, HCR_EL2); if (!vcpu_has_nv(vcpu) || vcpu_is_el2(vcpu)) return false; return ((is_wfe && (hcr_el2 & HCR_TWE)) || (!is_wfe && (hcr_el2 & HCR_TWI))); } static __always_inline int kvm_vcpu_get_condition(const struct kvm_vcpu *vcpu) { u64 esr = kvm_vcpu_get_esr(vcpu); if (esr & ESR_ELx_CV) return (esr & ESR_ELx_COND_MASK) >> ESR_ELx_COND_SHIFT; return -1; } static __always_inline unsigned long kvm_vcpu_get_hfar(const struct kvm_vcpu *vcpu) { return vcpu->arch.fault.far_el2; } static __always_inline phys_addr_t kvm_vcpu_get_fault_ipa(const struct kvm_vcpu *vcpu) { u64 hpfar = vcpu->arch.fault.hpfar_el2; if (unlikely(!(hpfar & HPFAR_EL2_NS))) return INVALID_GPA; return FIELD_GET(HPFAR_EL2_FIPA, hpfar) << 12; } static inline u64 kvm_vcpu_get_disr(const struct kvm_vcpu *vcpu) { return vcpu->arch.fault.disr_el1; } static inline u32 kvm_vcpu_hvc_get_imm(const struct kvm_vcpu *vcpu) { return kvm_vcpu_get_esr(vcpu) & ESR_ELx_xVC_IMM_MASK; } static __always_inline bool kvm_vcpu_dabt_isvalid(const struct kvm_vcpu *vcpu) { return !!(kvm_vcpu_get_esr(vcpu) & ESR_ELx_ISV); } static inline unsigned long kvm_vcpu_dabt_iss_nisv_sanitized(const struct kvm_vcpu *vcpu) { return kvm_vcpu_get_esr(vcpu) & (ESR_ELx_CM | ESR_ELx_WNR | ESR_ELx_FSC); } static inline bool kvm_vcpu_dabt_issext(const struct kvm_vcpu *vcpu) { return !!(kvm_vcpu_get_esr(vcpu) & ESR_ELx_SSE); } static inline bool kvm_vcpu_dabt_issf(const struct kvm_vcpu *vcpu) { return !!(kvm_vcpu_get_esr(vcpu) & ESR_ELx_SF); } static __always_inline int kvm_vcpu_dabt_get_rd(const struct kvm_vcpu *vcpu) { return (kvm_vcpu_get_esr(vcpu) & ESR_ELx_SRT_MASK) >> ESR_ELx_SRT_SHIFT; } static __always_inline bool kvm_vcpu_abt_iss1tw(const struct kvm_vcpu *vcpu) { return !!(kvm_vcpu_get_esr(vcpu) & ESR_ELx_S1PTW); } /* Always check for S1PTW *before* using this. */ static __always_inline bool kvm_vcpu_dabt_iswrite(const struct kvm_vcpu *vcpu) { return kvm_vcpu_get_esr(vcpu) & ESR_ELx_WNR; } static inline bool kvm_vcpu_dabt_is_cm(const struct kvm_vcpu *vcpu) { return !!(kvm_vcpu_get_esr(vcpu) & ESR_ELx_CM); } static __always_inline unsigned int kvm_vcpu_dabt_get_as(const struct kvm_vcpu *vcpu) { return 1 << ((kvm_vcpu_get_esr(vcpu) & ESR_ELx_SAS) >> ESR_ELx_SAS_SHIFT); } /* This one is not specific to Data Abort */ static __always_inline bool kvm_vcpu_trap_il_is32bit(const struct kvm_vcpu *vcpu) { return !!(kvm_vcpu_get_esr(vcpu) & ESR_ELx_IL); } static __always_inline u8 kvm_vcpu_trap_get_class(const struct kvm_vcpu *vcpu) { return ESR_ELx_EC(kvm_vcpu_get_esr(vcpu)); } static inline bool kvm_vcpu_trap_is_iabt(const struct kvm_vcpu *vcpu) { return kvm_vcpu_trap_get_class(vcpu) == ESR_ELx_EC_IABT_LOW; } static inline bool kvm_vcpu_trap_is_exec_fault(const struct kvm_vcpu *vcpu) { return kvm_vcpu_trap_is_iabt(vcpu) && !kvm_vcpu_abt_iss1tw(vcpu); } static __always_inline u8 kvm_vcpu_trap_get_fault(const struct kvm_vcpu *vcpu) { return kvm_vcpu_get_esr(vcpu) & ESR_ELx_FSC; } static inline bool kvm_vcpu_trap_is_permission_fault(const struct kvm_vcpu *vcpu) { return esr_fsc_is_permission_fault(kvm_vcpu_get_esr(vcpu)); } static inline bool kvm_vcpu_trap_is_translation_fault(const struct kvm_vcpu *vcpu) { return esr_fsc_is_translation_fault(kvm_vcpu_get_esr(vcpu)); } static inline u64 kvm_vcpu_trap_get_perm_fault_granule(const struct kvm_vcpu *vcpu) { unsigned long esr = kvm_vcpu_get_esr(vcpu); BUG_ON(!esr_fsc_is_permission_fault(esr)); return BIT(ARM64_HW_PGTABLE_LEVEL_SHIFT(esr & ESR_ELx_FSC_LEVEL)); } static __always_inline bool kvm_vcpu_abt_issea(const struct kvm_vcpu *vcpu) { switch (kvm_vcpu_trap_get_fault(vcpu)) { case ESR_ELx_FSC_EXTABT: case ESR_ELx_FSC_SEA_TTW(-1) ... ESR_ELx_FSC_SEA_TTW(3): case ESR_ELx_FSC_SECC: case ESR_ELx_FSC_SECC_TTW(-1) ... ESR_ELx_FSC_SECC_TTW(3): return true; default: return false; } } static __always_inline int kvm_vcpu_sys_get_rt(struct kvm_vcpu *vcpu) { u64 esr = kvm_vcpu_get_esr(vcpu); return ESR_ELx_SYS64_ISS_RT(esr); } static inline bool kvm_is_write_fault(struct kvm_vcpu *vcpu) { if (kvm_vcpu_abt_iss1tw(vcpu)) { /* * Only a permission fault on a S1PTW should be * considered as a write. Otherwise, page tables baked * in a read-only memslot will result in an exception * being delivered in the guest. * * The drawback is that we end-up faulting twice if the * guest is using any of HW AF/DB: a translation fault * to map the page containing the PT (read only at * first), then a permission fault to allow the flags * to be set. */ return kvm_vcpu_trap_is_permission_fault(vcpu); } if (kvm_vcpu_trap_is_iabt(vcpu)) return false; return kvm_vcpu_dabt_iswrite(vcpu); } static inline unsigned long kvm_vcpu_get_mpidr_aff(struct kvm_vcpu *vcpu) { return __vcpu_sys_reg(vcpu, MPIDR_EL1) & MPIDR_HWID_BITMASK; } static inline void kvm_vcpu_set_be(struct kvm_vcpu *vcpu) { if (vcpu_mode_is_32bit(vcpu)) { *vcpu_cpsr(vcpu) |= PSR_AA32_E_BIT; } else { enum vcpu_sysreg r; u64 sctlr; r = vcpu_has_nv(vcpu) ? SCTLR_EL2 : SCTLR_EL1; sctlr = vcpu_read_sys_reg(vcpu, r); sctlr |= SCTLR_ELx_EE; vcpu_write_sys_reg(vcpu, sctlr, r); } } static inline bool kvm_vcpu_is_be(struct kvm_vcpu *vcpu) { enum vcpu_sysreg r; u64 bit; if (vcpu_mode_is_32bit(vcpu)) return !!(*vcpu_cpsr(vcpu) & PSR_AA32_E_BIT); r = is_hyp_ctxt(vcpu) ? SCTLR_EL2 : SCTLR_EL1; bit = vcpu_mode_priv(vcpu) ? SCTLR_ELx_EE : SCTLR_EL1_E0E; return vcpu_read_sys_reg(vcpu, r) & bit; } static inline unsigned long vcpu_data_guest_to_host(struct kvm_vcpu *vcpu, unsigned long data, unsigned int len) { if (kvm_vcpu_is_be(vcpu)) { switch (len) { case 1: return data & 0xff; case 2: return be16_to_cpu(data & 0xffff); case 4: return be32_to_cpu(data & 0xffffffff); default: return be64_to_cpu(data); } } else { switch (len) { case 1: return data & 0xff; case 2: return le16_to_cpu(data & 0xffff); case 4: return le32_to_cpu(data & 0xffffffff); default: return le64_to_cpu(data); } } return data; /* Leave LE untouched */ } static inline unsigned long vcpu_data_host_to_guest(struct kvm_vcpu *vcpu, unsigned long data, unsigned int len) { if (kvm_vcpu_is_be(vcpu)) { switch (len) { case 1: return data & 0xff; case 2: return cpu_to_be16(data & 0xffff); case 4: return cpu_to_be32(data & 0xffffffff); default: return cpu_to_be64(data); } } else { switch (len) { case 1: return data & 0xff; case 2: return cpu_to_le16(data & 0xffff); case 4: return cpu_to_le32(data & 0xffffffff); default: return cpu_to_le64(data); } } return data; /* Leave LE untouched */ } static __always_inline void kvm_incr_pc(struct kvm_vcpu *vcpu) { WARN_ON(vcpu_get_flag(vcpu, PENDING_EXCEPTION)); vcpu_set_flag(vcpu, INCREMENT_PC); } #define kvm_pend_exception(v, e) \ do { \ WARN_ON(vcpu_get_flag((v), INCREMENT_PC)); \ vcpu_set_flag((v), PENDING_EXCEPTION); \ vcpu_set_flag((v), e); \ } while (0) /* * Returns a 'sanitised' view of CPTR_EL2, translating from nVHE to the VHE * format if E2H isn't set. */ static inline u64 vcpu_sanitised_cptr_el2(const struct kvm_vcpu *vcpu) { u64 cptr = __vcpu_sys_reg(vcpu, CPTR_EL2); if (!vcpu_el2_e2h_is_set(vcpu)) cptr = translate_cptr_el2_to_cpacr_el1(cptr); return cptr; } static inline bool ____cptr_xen_trap_enabled(const struct kvm_vcpu *vcpu, unsigned int xen) { switch (xen) { case 0b00: case 0b10: return true; case 0b01: return vcpu_el2_tge_is_set(vcpu) && !vcpu_is_el2(vcpu); case 0b11: default: return false; } } #define __guest_hyp_cptr_xen_trap_enabled(vcpu, xen) \ (!vcpu_has_nv(vcpu) ? false : \ ____cptr_xen_trap_enabled(vcpu, \ SYS_FIELD_GET(CPACR_EL1, xen, \ vcpu_sanitised_cptr_el2(vcpu)))) static inline bool guest_hyp_fpsimd_traps_enabled(const struct kvm_vcpu *vcpu) { return __guest_hyp_cptr_xen_trap_enabled(vcpu, FPEN); } static inline bool guest_hyp_sve_traps_enabled(const struct kvm_vcpu *vcpu) { return __guest_hyp_cptr_xen_trap_enabled(vcpu, ZEN); } static inline void vcpu_set_hcrx(struct kvm_vcpu *vcpu) { struct kvm *kvm = vcpu->kvm; if (cpus_have_final_cap(ARM64_HAS_HCX)) { /* * In general, all HCRX_EL2 bits are gated by a feature. * The only reason we can set SMPME without checking any * feature is that its effects are not directly observable * from the guest. */ vcpu->arch.hcrx_el2 = HCRX_EL2_SMPME; if (kvm_has_feat(kvm, ID_AA64ISAR2_EL1, MOPS, IMP)) vcpu->arch.hcrx_el2 |= (HCRX_EL2_MSCEn | HCRX_EL2_MCE2); if (kvm_has_tcr2(kvm)) vcpu->arch.hcrx_el2 |= HCRX_EL2_TCR2En; if (kvm_has_fpmr(kvm)) vcpu->arch.hcrx_el2 |= HCRX_EL2_EnFPM; if (kvm_has_sctlr2(kvm)) vcpu->arch.hcrx_el2 |= HCRX_EL2_SCTLR2En; } } #endif /* __ARM64_KVM_EMULATE_H__ */
31 186 52 3 12 48 30 31 93 56 59 26 12 3 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 1517 1518 1519 1520 1521 1522 1523 1524 1525 1526 1527 1528 1529 1530 1531 1532 1533 1534 1535 1536 1537 1538 1539 1540 1541 1542 1543 1544 1545 1546 1547 1548 1549 1550 1551 1552 1553 1554 /* SPDX-License-Identifier: GPL-2.0 */ #ifndef _LINUX_PAGEMAP_H #define _LINUX_PAGEMAP_H /* * Copyright 1995 Linus Torvalds */ #include <linux/mm.h> #include <linux/fs.h> #include <linux/list.h> #include <linux/highmem.h> #include <linux/compiler.h> #include <linux/uaccess.h> #include <linux/gfp.h> #include <linux/bitops.h> #include <linux/hardirq.h> /* for in_interrupt() */ #include <linux/hugetlb_inline.h> struct folio_batch; unsigned long invalidate_mapping_pages(struct address_space *mapping, pgoff_t start, pgoff_t end); static inline void invalidate_remote_inode(struct inode *inode) { if (S_ISREG(inode->i_mode) || S_ISDIR(inode->i_mode) || S_ISLNK(inode->i_mode)) invalidate_mapping_pages(inode->i_mapping, 0, -1); } int invalidate_inode_pages2(struct address_space *mapping); int invalidate_inode_pages2_range(struct address_space *mapping, pgoff_t start, pgoff_t end); int kiocb_invalidate_pages(struct kiocb *iocb, size_t count); void kiocb_invalidate_post_direct_write(struct kiocb *iocb, size_t count); int filemap_invalidate_pages(struct address_space *mapping, loff_t pos, loff_t end, bool nowait); int write_inode_now(struct inode *, int sync); int filemap_fdatawrite(struct address_space *); int filemap_flush(struct address_space *); int filemap_fdatawait_keep_errors(struct address_space *mapping); int filemap_fdatawait_range(struct address_space *, loff_t lstart, loff_t lend); int filemap_fdatawait_range_keep_errors(struct address_space *mapping, loff_t start_byte, loff_t end_byte); int filemap_invalidate_inode(struct inode *inode, bool flush, loff_t start, loff_t end); static inline int filemap_fdatawait(struct address_space *mapping) { return filemap_fdatawait_range(mapping, 0, LLONG_MAX); } bool filemap_range_has_page(struct address_space *, loff_t lstart, loff_t lend); int filemap_write_and_wait_range(struct address_space *mapping, loff_t lstart, loff_t lend); int __filemap_fdatawrite_range(struct address_space *mapping, loff_t start, loff_t end, int sync_mode); int filemap_fdatawrite_range(struct address_space *mapping, loff_t start, loff_t end); int filemap_check_errors(struct address_space *mapping); void __filemap_set_wb_err(struct address_space *mapping, int err); int filemap_fdatawrite_wbc(struct address_space *mapping, struct writeback_control *wbc); int kiocb_write_and_wait(struct kiocb *iocb, size_t count); static inline int filemap_write_and_wait(struct address_space *mapping) { return filemap_write_and_wait_range(mapping, 0, LLONG_MAX); } /** * filemap_set_wb_err - set a writeback error on an address_space * @mapping: mapping in which to set writeback error * @err: error to be set in mapping * * When writeback fails in some way, we must record that error so that * userspace can be informed when fsync and the like are called. We endeavor * to report errors on any file that was open at the time of the error. Some * internal callers also need to know when writeback errors have occurred. * * When a writeback error occurs, most filesystems will want to call * filemap_set_wb_err to record the error in the mapping so that it will be * automatically reported whenever fsync is called on the file. */ static inline void filemap_set_wb_err(struct address_space *mapping, int err) { /* Fastpath for common case of no error */ if (unlikely(err)) __filemap_set_wb_err(mapping, err); } /** * filemap_check_wb_err - has an error occurred since the mark was sampled? * @mapping: mapping to check for writeback errors * @since: previously-sampled errseq_t * * Grab the errseq_t value from the mapping, and see if it has changed "since" * the given value was sampled. * * If it has then report the latest error set, otherwise return 0. */ static inline int filemap_check_wb_err(struct address_space *mapping, errseq_t since) { return errseq_check(&mapping->wb_err, since); } /** * filemap_sample_wb_err - sample the current errseq_t to test for later errors * @mapping: mapping to be sampled * * Writeback errors are always reported relative to a particular sample point * in the past. This function provides those sample points. */ static inline errseq_t filemap_sample_wb_err(struct address_space *mapping) { return errseq_sample(&mapping->wb_err); } /** * file_sample_sb_err - sample the current errseq_t to test for later errors * @file: file pointer to be sampled * * Grab the most current superblock-level errseq_t value for the given * struct file. */ static inline errseq_t file_sample_sb_err(struct file *file) { return errseq_sample(&file->f_path.dentry->d_sb->s_wb_err); } /* * Flush file data before changing attributes. Caller must hold any locks * required to prevent further writes to this file until we're done setting * flags. */ static inline int inode_drain_writes(struct inode *inode) { inode_dio_wait(inode); return filemap_write_and_wait(inode->i_mapping); } static inline bool mapping_empty(struct address_space *mapping) { return xa_empty(&mapping->i_pages); } /* * mapping_shrinkable - test if page cache state allows inode reclaim * @mapping: the page cache mapping * * This checks the mapping's cache state for the pupose of inode * reclaim and LRU management. * * The caller is expected to hold the i_lock, but is not required to * hold the i_pages lock, which usually protects cache state. That's * because the i_lock and the list_lru lock that protect the inode and * its LRU state don't nest inside the irq-safe i_pages lock. * * Cache deletions are performed under the i_lock, which ensures that * when an inode goes empty, it will reliably get queued on the LRU. * * Cache additions do not acquire the i_lock and may race with this * check, in which case we'll report the inode as shrinkable when it * has cache pages. This is okay: the shrinker also checks the * refcount and the referenced bit, which will be elevated or set in * the process of adding new cache pages to an inode. */ static inline bool mapping_shrinkable(struct address_space *mapping) { void *head; /* * On highmem systems, there could be lowmem pressure from the * inodes before there is highmem pressure from the page * cache. Make inodes shrinkable regardless of cache state. */ if (IS_ENABLED(CONFIG_HIGHMEM)) return true; /* Cache completely empty? Shrink away. */ head = rcu_access_pointer(mapping->i_pages.xa_head); if (!head) return true; /* * The xarray stores single offset-0 entries directly in the * head pointer, which allows non-resident page cache entries * to escape the shadow shrinker's list of xarray nodes. The * inode shrinker needs to pick them up under memory pressure. */ if (!xa_is_node(head) && xa_is_value(head)) return true; return false; } /* * Bits in mapping->flags. */ enum mapping_flags { AS_EIO = 0, /* IO error on async write */ AS_ENOSPC = 1, /* ENOSPC on async write */ AS_MM_ALL_LOCKS = 2, /* under mm_take_all_locks() */ AS_UNEVICTABLE = 3, /* e.g., ramdisk, SHM_LOCK */ AS_EXITING = 4, /* final truncate in progress */ /* writeback related tags are not used */ AS_NO_WRITEBACK_TAGS = 5, AS_RELEASE_ALWAYS = 6, /* Call ->release_folio(), even if no private data */ AS_STABLE_WRITES = 7, /* must wait for writeback before modifying folio contents */ AS_INACCESSIBLE = 8, /* Do not attempt direct R/W access to the mapping */ AS_WRITEBACK_MAY_DEADLOCK_ON_RECLAIM = 9, /* Bits 16-25 are used for FOLIO_ORDER */ AS_FOLIO_ORDER_BITS = 5, AS_FOLIO_ORDER_MIN = 16, AS_FOLIO_ORDER_MAX = AS_FOLIO_ORDER_MIN + AS_FOLIO_ORDER_BITS, }; #define AS_FOLIO_ORDER_BITS_MASK ((1u << AS_FOLIO_ORDER_BITS) - 1) #define AS_FOLIO_ORDER_MIN_MASK (AS_FOLIO_ORDER_BITS_MASK << AS_FOLIO_ORDER_MIN) #define AS_FOLIO_ORDER_MAX_MASK (AS_FOLIO_ORDER_BITS_MASK << AS_FOLIO_ORDER_MAX) #define AS_FOLIO_ORDER_MASK (AS_FOLIO_ORDER_MIN_MASK | AS_FOLIO_ORDER_MAX_MASK) /** * mapping_set_error - record a writeback error in the address_space * @mapping: the mapping in which an error should be set * @error: the error to set in the mapping * * When writeback fails in some way, we must record that error so that * userspace can be informed when fsync and the like are called. We endeavor * to report errors on any file that was open at the time of the error. Some * internal callers also need to know when writeback errors have occurred. * * When a writeback error occurs, most filesystems will want to call * mapping_set_error to record the error in the mapping so that it can be * reported when the application calls fsync(2). */ static inline void mapping_set_error(struct address_space *mapping, int error) { if (likely(!error)) return; /* Record in wb_err for checkers using errseq_t based tracking */ __filemap_set_wb_err(mapping, error); /* Record it in superblock */ if (mapping->host) errseq_set(&mapping->host->i_sb->s_wb_err, error); /* Record it in flags for now, for legacy callers */ if (error == -ENOSPC) set_bit(AS_ENOSPC, &mapping->flags); else set_bit(AS_EIO, &mapping->flags); } static inline void mapping_set_unevictable(struct address_space *mapping) { set_bit(AS_UNEVICTABLE, &mapping->flags); } static inline void mapping_clear_unevictable(struct address_space *mapping) { clear_bit(AS_UNEVICTABLE, &mapping->flags); } static inline bool mapping_unevictable(struct address_space *mapping) { return mapping && test_bit(AS_UNEVICTABLE, &mapping->flags); } static inline void mapping_set_exiting(struct address_space *mapping) { set_bit(AS_EXITING, &mapping->flags); } static inline int mapping_exiting(struct address_space *mapping) { return test_bit(AS_EXITING, &mapping->flags); } static inline void mapping_set_no_writeback_tags(struct address_space *mapping) { set_bit(AS_NO_WRITEBACK_TAGS, &mapping->flags); } static inline int mapping_use_writeback_tags(struct address_space *mapping) { return !test_bit(AS_NO_WRITEBACK_TAGS, &mapping->flags); } static inline bool mapping_release_always(const struct address_space *mapping) { return test_bit(AS_RELEASE_ALWAYS, &mapping->flags); } static inline void mapping_set_release_always(struct address_space *mapping) { set_bit(AS_RELEASE_ALWAYS, &mapping->flags); } static inline void mapping_clear_release_always(struct address_space *mapping) { clear_bit(AS_RELEASE_ALWAYS, &mapping->flags); } static inline bool mapping_stable_writes(const struct address_space *mapping) { return test_bit(AS_STABLE_WRITES, &mapping->flags); } static inline void mapping_set_stable_writes(struct address_space *mapping) { set_bit(AS_STABLE_WRITES, &mapping->flags); } static inline void mapping_clear_stable_writes(struct address_space *mapping) { clear_bit(AS_STABLE_WRITES, &mapping->flags); } static inline void mapping_set_inaccessible(struct address_space *mapping) { /* * It's expected inaccessible mappings are also unevictable. Compaction * migrate scanner (isolate_migratepages_block()) relies on this to * reduce page locking. */ set_bit(AS_UNEVICTABLE, &mapping->flags); set_bit(AS_INACCESSIBLE, &mapping->flags); } static inline bool mapping_inaccessible(struct address_space *mapping) { return test_bit(AS_INACCESSIBLE, &mapping->flags); } static inline void mapping_set_writeback_may_deadlock_on_reclaim(struct address_space *mapping) { set_bit(AS_WRITEBACK_MAY_DEADLOCK_ON_RECLAIM, &mapping->flags); } static inline bool mapping_writeback_may_deadlock_on_reclaim(struct address_space *mapping) { return test_bit(AS_WRITEBACK_MAY_DEADLOCK_ON_RECLAIM, &mapping->flags); } static inline gfp_t mapping_gfp_mask(struct address_space * mapping) { return mapping->gfp_mask; } /* Restricts the given gfp_mask to what the mapping allows. */ static inline gfp_t mapping_gfp_constraint(struct address_space *mapping, gfp_t gfp_mask) { return mapping_gfp_mask(mapping) & gfp_mask; } /* * This is non-atomic. Only to be used before the mapping is activated. * Probably needs a barrier... */ static inline void mapping_set_gfp_mask(struct address_space *m, gfp_t mask) { m->gfp_mask = mask; } /* * There are some parts of the kernel which assume that PMD entries * are exactly HPAGE_PMD_ORDER. Those should be fixed, but until then, * limit the maximum allocation order to PMD size. I'm not aware of any * assumptions about maximum order if THP are disabled, but 8 seems like * a good order (that's 1MB if you're using 4kB pages) */ #ifdef CONFIG_TRANSPARENT_HUGEPAGE #define PREFERRED_MAX_PAGECACHE_ORDER HPAGE_PMD_ORDER #else #define PREFERRED_MAX_PAGECACHE_ORDER 8 #endif /* * xas_split_alloc() does not support arbitrary orders. This implies no * 512MB THP on ARM64 with 64KB base page size. */ #define MAX_XAS_ORDER (XA_CHUNK_SHIFT * 2 - 1) #define MAX_PAGECACHE_ORDER min(MAX_XAS_ORDER, PREFERRED_MAX_PAGECACHE_ORDER) /* * mapping_max_folio_size_supported() - Check the max folio size supported * * The filesystem should call this function at mount time if there is a * requirement on the folio mapping size in the page cache. */ static inline size_t mapping_max_folio_size_supported(void) { if (IS_ENABLED(CONFIG_TRANSPARENT_HUGEPAGE)) return 1U << (PAGE_SHIFT + MAX_PAGECACHE_ORDER); return PAGE_SIZE; } /* * mapping_set_folio_order_range() - Set the orders supported by a file. * @mapping: The address space of the file. * @min: Minimum folio order (between 0-MAX_PAGECACHE_ORDER inclusive). * @max: Maximum folio order (between @min-MAX_PAGECACHE_ORDER inclusive). * * The filesystem should call this function in its inode constructor to * indicate which base size (min) and maximum size (max) of folio the VFS * can use to cache the contents of the file. This should only be used * if the filesystem needs special handling of folio sizes (ie there is * something the core cannot know). * Do not tune it based on, eg, i_size. * * Context: This should not be called while the inode is active as it * is non-atomic. */ static inline void mapping_set_folio_order_range(struct address_space *mapping, unsigned int min, unsigned int max) { if (!IS_ENABLED(CONFIG_TRANSPARENT_HUGEPAGE)) return; if (min > MAX_PAGECACHE_ORDER) min = MAX_PAGECACHE_ORDER; if (max > MAX_PAGECACHE_ORDER) max = MAX_PAGECACHE_ORDER; if (max < min) max = min; mapping->flags = (mapping->flags & ~AS_FOLIO_ORDER_MASK) | (min << AS_FOLIO_ORDER_MIN) | (max << AS_FOLIO_ORDER_MAX); } static inline void mapping_set_folio_min_order(struct address_space *mapping, unsigned int min) { mapping_set_folio_order_range(mapping, min, MAX_PAGECACHE_ORDER); } /** * mapping_set_large_folios() - Indicate the file supports large folios. * @mapping: The address space of the file. * * The filesystem should call this function in its inode constructor to * indicate that the VFS can use large folios to cache the contents of * the file. * * Context: This should not be called while the inode is active as it * is non-atomic. */ static inline void mapping_set_large_folios(struct address_space *mapping) { mapping_set_folio_order_range(mapping, 0, MAX_PAGECACHE_ORDER); } static inline unsigned int mapping_max_folio_order(const struct address_space *mapping) { if (!IS_ENABLED(CONFIG_TRANSPARENT_HUGEPAGE)) return 0; return (mapping->flags & AS_FOLIO_ORDER_MAX_MASK) >> AS_FOLIO_ORDER_MAX; } static inline unsigned int mapping_min_folio_order(const struct address_space *mapping) { if (!IS_ENABLED(CONFIG_TRANSPARENT_HUGEPAGE)) return 0; return (mapping->flags & AS_FOLIO_ORDER_MIN_MASK) >> AS_FOLIO_ORDER_MIN; } static inline unsigned long mapping_min_folio_nrpages(struct address_space *mapping) { return 1UL << mapping_min_folio_order(mapping); } /** * mapping_align_index() - Align index for this mapping. * @mapping: The address_space. * @index: The page index. * * The index of a folio must be naturally aligned. If you are adding a * new folio to the page cache and need to know what index to give it, * call this function. */ static inline pgoff_t mapping_align_index(struct address_space *mapping, pgoff_t index) { return round_down(index, mapping_min_folio_nrpages(mapping)); } /* * Large folio support currently depends on THP. These dependencies are * being worked on but are not yet fixed. */ static inline bool mapping_large_folio_support(struct address_space *mapping) { /* AS_FOLIO_ORDER is only reasonable for pagecache folios */ VM_WARN_ONCE((unsigned long)mapping & FOLIO_MAPPING_ANON, "Anonymous mapping always supports large folio"); return mapping_max_folio_order(mapping) > 0; } /* Return the maximum folio size for this pagecache mapping, in bytes. */ static inline size_t mapping_max_folio_size(const struct address_space *mapping) { return PAGE_SIZE << mapping_max_folio_order(mapping); } static inline int filemap_nr_thps(struct address_space *mapping) { #ifdef CONFIG_READ_ONLY_THP_FOR_FS return atomic_read(&mapping->nr_thps); #else return 0; #endif } static inline void filemap_nr_thps_inc(struct address_space *mapping) { #ifdef CONFIG_READ_ONLY_THP_FOR_FS if (!mapping_large_folio_support(mapping)) atomic_inc(&mapping->nr_thps); #else WARN_ON_ONCE(mapping_large_folio_support(mapping) == 0); #endif } static inline void filemap_nr_thps_dec(struct address_space *mapping) { #ifdef CONFIG_READ_ONLY_THP_FOR_FS if (!mapping_large_folio_support(mapping)) atomic_dec(&mapping->nr_thps); #else WARN_ON_ONCE(mapping_large_folio_support(mapping) == 0); #endif } struct address_space *folio_mapping(struct folio *); /** * folio_flush_mapping - Find the file mapping this folio belongs to. * @folio: The folio. * * For folios which are in the page cache, return the mapping that this * page belongs to. Anonymous folios return NULL, even if they're in * the swap cache. Other kinds of folio also return NULL. * * This is ONLY used by architecture cache flushing code. If you aren't * writing cache flushing code, you want either folio_mapping() or * folio_file_mapping(). */ static inline struct address_space *folio_flush_mapping(struct folio *folio) { if (unlikely(folio_test_swapcache(folio))) return NULL; return folio_mapping(folio); } /** * folio_inode - Get the host inode for this folio. * @folio: The folio. * * For folios which are in the page cache, return the inode that this folio * belongs to. * * Do not call this for folios which aren't in the page cache. */ static inline struct inode *folio_inode(struct folio *folio) { return folio->mapping->host; } /** * folio_attach_private - Attach private data to a folio. * @folio: Folio to attach data to. * @data: Data to attach to folio. * * Attaching private data to a folio increments the page's reference count. * The data must be detached before the folio will be freed. */ static inline void folio_attach_private(struct folio *folio, void *data) { folio_get(folio); folio->private = data; folio_set_private(folio); } /** * folio_change_private - Change private data on a folio. * @folio: Folio to change the data on. * @data: Data to set on the folio. * * Change the private data attached to a folio and return the old * data. The page must previously have had data attached and the data * must be detached before the folio will be freed. * * Return: Data that was previously attached to the folio. */ static inline void *folio_change_private(struct folio *folio, void *data) { void *old = folio_get_private(folio); folio->private = data; return old; } /** * folio_detach_private - Detach private data from a folio. * @folio: Folio to detach data from. * * Removes the data that was previously attached to the folio and decrements * the refcount on the page. * * Return: Data that was attached to the folio. */ static inline void *folio_detach_private(struct folio *folio) { void *data = folio_get_private(folio); if (!folio_test_private(folio)) return NULL; folio_clear_private(folio); folio->private = NULL; folio_put(folio); return data; } static inline void attach_page_private(struct page *page, void *data) { folio_attach_private(page_folio(page), data); } static inline void *detach_page_private(struct page *page) { return folio_detach_private(page_folio(page)); } #ifdef CONFIG_NUMA struct folio *filemap_alloc_folio_noprof(gfp_t gfp, unsigned int order); #else static inline struct folio *filemap_alloc_folio_noprof(gfp_t gfp, unsigned int order) { return folio_alloc_noprof(gfp, order); } #endif #define filemap_alloc_folio(...) \ alloc_hooks(filemap_alloc_folio_noprof(__VA_ARGS__)) static inline struct page *__page_cache_alloc(gfp_t gfp) { return &filemap_alloc_folio(gfp, 0)->page; } static inline gfp_t readahead_gfp_mask(struct address_space *x) { return mapping_gfp_mask(x) | __GFP_NORETRY | __GFP_NOWARN; } typedef int filler_t(struct file *, struct folio *); pgoff_t page_cache_next_miss(struct address_space *mapping, pgoff_t index, unsigned long max_scan); pgoff_t page_cache_prev_miss(struct address_space *mapping, pgoff_t index, unsigned long max_scan); /** * typedef fgf_t - Flags for getting folios from the page cache. * * Most users of the page cache will not need to use these flags; * there are convenience functions such as filemap_get_folio() and * filemap_lock_folio(). For users which need more control over exactly * what is done with the folios, these flags to __filemap_get_folio() * are available. * * * %FGP_ACCESSED - The folio will be marked accessed. * * %FGP_LOCK - The folio is returned locked. * * %FGP_CREAT - If no folio is present then a new folio is allocated, * added to the page cache and the VM's LRU list. The folio is * returned locked. * * %FGP_FOR_MMAP - The caller wants to do its own locking dance if the * folio is already in cache. If the folio was allocated, unlock it * before returning so the caller can do the same dance. * * %FGP_WRITE - The folio will be written to by the caller. * * %FGP_NOFS - __GFP_FS will get cleared in gfp. * * %FGP_NOWAIT - Don't block on the folio lock. * * %FGP_STABLE - Wait for the folio to be stable (finished writeback) * * %FGP_DONTCACHE - Uncached buffered IO * * %FGP_WRITEBEGIN - The flags to use in a filesystem write_begin() * implementation. */ typedef unsigned int __bitwise fgf_t; #define FGP_ACCESSED ((__force fgf_t)0x00000001) #define FGP_LOCK ((__force fgf_t)0x00000002) #define FGP_CREAT ((__force fgf_t)0x00000004) #define FGP_WRITE ((__force fgf_t)0x00000008) #define FGP_NOFS ((__force fgf_t)0x00000010) #define FGP_NOWAIT ((__force fgf_t)0x00000020) #define FGP_FOR_MMAP ((__force fgf_t)0x00000040) #define FGP_STABLE ((__force fgf_t)0x00000080) #define FGP_DONTCACHE ((__force fgf_t)0x00000100) #define FGF_GET_ORDER(fgf) (((__force unsigned)fgf) >> 26) /* top 6 bits */ #define FGP_WRITEBEGIN (FGP_LOCK | FGP_WRITE | FGP_CREAT | FGP_STABLE) static inline unsigned int filemap_get_order(size_t size) { unsigned int shift = ilog2(size); if (shift <= PAGE_SHIFT) return 0; return shift - PAGE_SHIFT; } /** * fgf_set_order - Encode a length in the fgf_t flags. * @size: The suggested size of the folio to create. * * The caller of __filemap_get_folio() can use this to suggest a preferred * size for the folio that is created. If there is already a folio at * the index, it will be returned, no matter what its size. If a folio * is freshly created, it may be of a different size than requested * due to alignment constraints, memory pressure, or the presence of * other folios at nearby indices. */ static inline fgf_t fgf_set_order(size_t size) { unsigned int order = filemap_get_order(size); if (!order) return 0; return (__force fgf_t)(order << 26); } void *filemap_get_entry(struct address_space *mapping, pgoff_t index); struct folio *__filemap_get_folio(struct address_space *mapping, pgoff_t index, fgf_t fgp_flags, gfp_t gfp); struct page *pagecache_get_page(struct address_space *mapping, pgoff_t index, fgf_t fgp_flags, gfp_t gfp); /** * write_begin_get_folio - Get folio for write_begin with flags. * @iocb: The kiocb passed from write_begin (may be NULL). * @mapping: The address space to search. * @index: The page cache index. * @len: Length of data being written. * * This is a helper for filesystem write_begin() implementations. * It wraps __filemap_get_folio(), setting appropriate flags in * the write begin context. * * Return: A folio or an ERR_PTR. */ static inline struct folio *write_begin_get_folio(const struct kiocb *iocb, struct address_space *mapping, pgoff_t index, size_t len) { fgf_t fgp_flags = FGP_WRITEBEGIN; fgp_flags |= fgf_set_order(len); if (iocb && iocb->ki_flags & IOCB_DONTCACHE) fgp_flags |= FGP_DONTCACHE; return __filemap_get_folio(mapping, index, fgp_flags, mapping_gfp_mask(mapping)); } /** * filemap_get_folio - Find and get a folio. * @mapping: The address_space to search. * @index: The page index. * * Looks up the page cache entry at @mapping & @index. If a folio is * present, it is returned with an increased refcount. * * Return: A folio or ERR_PTR(-ENOENT) if there is no folio in the cache for * this index. Will not return a shadow, swap or DAX entry. */ static inline struct folio *filemap_get_folio(struct address_space *mapping, pgoff_t index) { return __filemap_get_folio(mapping, index, 0, 0); } /** * filemap_lock_folio - Find and lock a folio. * @mapping: The address_space to search. * @index: The page index. * * Looks up the page cache entry at @mapping & @index. If a folio is * present, it is returned locked with an increased refcount. * * Context: May sleep. * Return: A folio or ERR_PTR(-ENOENT) if there is no folio in the cache for * this index. Will not return a shadow, swap or DAX entry. */ static inline struct folio *filemap_lock_folio(struct address_space *mapping, pgoff_t index) { return __filemap_get_folio(mapping, index, FGP_LOCK, 0); } /** * filemap_grab_folio - grab a folio from the page cache * @mapping: The address space to search * @index: The page index * * Looks up the page cache entry at @mapping & @index. If no folio is found, * a new folio is created. The folio is locked, marked as accessed, and * returned. * * Return: A found or created folio. ERR_PTR(-ENOMEM) if no folio is found * and failed to create a folio. */ static inline struct folio *filemap_grab_folio(struct address_space *mapping, pgoff_t index) { return __filemap_get_folio(mapping, index, FGP_LOCK | FGP_ACCESSED | FGP_CREAT, mapping_gfp_mask(mapping)); } /** * find_get_page - find and get a page reference * @mapping: the address_space to search * @offset: the page index * * Looks up the page cache slot at @mapping & @offset. If there is a * page cache page, it is returned with an increased refcount. * * Otherwise, %NULL is returned. */ static inline struct page *find_get_page(struct address_space *mapping, pgoff_t offset) { return pagecache_get_page(mapping, offset, 0, 0); } static inline struct page *find_get_page_flags(struct address_space *mapping, pgoff_t offset, fgf_t fgp_flags) { return pagecache_get_page(mapping, offset, fgp_flags, 0); } /** * find_lock_page - locate, pin and lock a pagecache page * @mapping: the address_space to search * @index: the page index * * Looks up the page cache entry at @mapping & @index. If there is a * page cache page, it is returned locked and with an increased * refcount. * * Context: May sleep. * Return: A struct page or %NULL if there is no page in the cache for this * index. */ static inline struct page *find_lock_page(struct address_space *mapping, pgoff_t index) { return pagecache_get_page(mapping, index, FGP_LOCK, 0); } /** * find_or_create_page - locate or add a pagecache page * @mapping: the page's address_space * @index: the page's index into the mapping * @gfp_mask: page allocation mode * * Looks up the page cache slot at @mapping & @offset. If there is a * page cache page, it is returned locked and with an increased * refcount. * * If the page is not present, a new page is allocated using @gfp_mask * and added to the page cache and the VM's LRU list. The page is * returned locked and with an increased refcount. * * On memory exhaustion, %NULL is returned. * * find_or_create_page() may sleep, even if @gfp_flags specifies an * atomic allocation! */ static inline struct page *find_or_create_page(struct address_space *mapping, pgoff_t index, gfp_t gfp_mask) { return pagecache_get_page(mapping, index, FGP_LOCK|FGP_ACCESSED|FGP_CREAT, gfp_mask); } /** * grab_cache_page_nowait - returns locked page at given index in given cache * @mapping: target address_space * @index: the page index * * Returns locked page at given index in given cache, creating it if * needed, but do not wait if the page is locked or to reclaim memory. * This is intended for speculative data generators, where the data can * be regenerated if the page couldn't be grabbed. This routine should * be safe to call while holding the lock for another page. * * Clear __GFP_FS when allocating the page to avoid recursion into the fs * and deadlock against the caller's locked page. */ static inline struct page *grab_cache_page_nowait(struct address_space *mapping, pgoff_t index) { return pagecache_get_page(mapping, index, FGP_LOCK|FGP_CREAT|FGP_NOFS|FGP_NOWAIT, mapping_gfp_mask(mapping)); } /** * folio_next_index - Get the index of the next folio. * @folio: The current folio. * * Return: The index of the folio which follows this folio in the file. */ static inline pgoff_t folio_next_index(struct folio *folio) { return folio->index + folio_nr_pages(folio); } /** * folio_file_page - The page for a particular index. * @folio: The folio which contains this index. * @index: The index we want to look up. * * Sometimes after looking up a folio in the page cache, we need to * obtain the specific page for an index (eg a page fault). * * Return: The page containing the file data for this index. */ static inline struct page *folio_file_page(struct folio *folio, pgoff_t index) { return folio_page(folio, index & (folio_nr_pages(folio) - 1)); } /** * folio_contains - Does this folio contain this index? * @folio: The folio. * @index: The page index within the file. * * Context: The caller should have the folio locked and ensure * e.g., shmem did not move this folio to the swap cache. * Return: true or false. */ static inline bool folio_contains(struct folio *folio, pgoff_t index) { VM_WARN_ON_ONCE_FOLIO(folio_test_swapcache(folio), folio); return index - folio->index < folio_nr_pages(folio); } unsigned filemap_get_folios(struct address_space *mapping, pgoff_t *start, pgoff_t end, struct folio_batch *fbatch); unsigned filemap_get_folios_contig(struct address_space *mapping, pgoff_t *start, pgoff_t end, struct folio_batch *fbatch); unsigned filemap_get_folios_tag(struct address_space *mapping, pgoff_t *start, pgoff_t end, xa_mark_t tag, struct folio_batch *fbatch); struct folio *read_cache_folio(struct address_space *, pgoff_t index, filler_t *filler, struct file *file); struct folio *mapping_read_folio_gfp(struct address_space *, pgoff_t index, gfp_t flags); struct page *read_cache_page(struct address_space *, pgoff_t index, filler_t *filler, struct file *file); extern struct page * read_cache_page_gfp(struct address_space *mapping, pgoff_t index, gfp_t gfp_mask); static inline struct page *read_mapping_page(struct address_space *mapping, pgoff_t index, struct file *file) { return read_cache_page(mapping, index, NULL, file); } static inline struct folio *read_mapping_folio(struct address_space *mapping, pgoff_t index, struct file *file) { return read_cache_folio(mapping, index, NULL, file); } /** * page_pgoff - Calculate the logical page offset of this page. * @folio: The folio containing this page. * @page: The page which we need the offset of. * * For file pages, this is the offset from the beginning of the file * in units of PAGE_SIZE. For anonymous pages, this is the offset from * the beginning of the anon_vma in units of PAGE_SIZE. This will * return nonsense for KSM pages. * * Context: Caller must have a reference on the folio or otherwise * prevent it from being split or freed. * * Return: The offset in units of PAGE_SIZE. */ static inline pgoff_t page_pgoff(const struct folio *folio, const struct page *page) { return folio->index + folio_page_idx(folio, page); } /** * folio_pos - Returns the byte position of this folio in its file. * @folio: The folio. */ static inline loff_t folio_pos(const struct folio *folio) { return ((loff_t)folio->index) * PAGE_SIZE; } /* * Return byte-offset into filesystem object for page. */ static inline loff_t page_offset(struct page *page) { struct folio *folio = page_folio(page); return folio_pos(folio) + folio_page_idx(folio, page) * PAGE_SIZE; } /* * Get the offset in PAGE_SIZE (even for hugetlb folios). */ static inline pgoff_t folio_pgoff(struct folio *folio) { return folio->index; } static inline pgoff_t linear_page_index(struct vm_area_struct *vma, unsigned long address) { pgoff_t pgoff; pgoff = (address - vma->vm_start) >> PAGE_SHIFT; pgoff += vma->vm_pgoff; return pgoff; } struct wait_page_key { struct folio *folio; int bit_nr; int page_match; }; struct wait_page_queue { struct folio *folio; int bit_nr; wait_queue_entry_t wait; }; static inline bool wake_page_match(struct wait_page_queue *wait_page, struct wait_page_key *key) { if (wait_page->folio != key->folio) return false; key->page_match = 1; if (wait_page->bit_nr != key->bit_nr) return false; return true; } void __folio_lock(struct folio *folio); int __folio_lock_killable(struct folio *folio); vm_fault_t __folio_lock_or_retry(struct folio *folio, struct vm_fault *vmf); void unlock_page(struct page *page); void folio_unlock(struct folio *folio); /** * folio_trylock() - Attempt to lock a folio. * @folio: The folio to attempt to lock. * * Sometimes it is undesirable to wait for a folio to be unlocked (eg * when the locks are being taken in the wrong order, or if making * progress through a batch of folios is more important than processing * them in order). Usually folio_lock() is the correct function to call. * * Context: Any context. * Return: Whether the lock was successfully acquired. */ static inline bool folio_trylock(struct folio *folio) { return likely(!test_and_set_bit_lock(PG_locked, folio_flags(folio, 0))); } /* * Return true if the page was successfully locked */ static inline bool trylock_page(struct page *page) { return folio_trylock(page_folio(page)); } /** * folio_lock() - Lock this folio. * @folio: The folio to lock. * * The folio lock protects against many things, probably more than it * should. It is primarily held while a folio is being brought uptodate, * either from its backing file or from swap. It is also held while a * folio is being truncated from its address_space, so holding the lock * is sufficient to keep folio->mapping stable. * * The folio lock is also held while write() is modifying the page to * provide POSIX atomicity guarantees (as long as the write does not * cross a page boundary). Other modifications to the data in the folio * do not hold the folio lock and can race with writes, eg DMA and stores * to mapped pages. * * Context: May sleep. If you need to acquire the locks of two or * more folios, they must be in order of ascending index, if they are * in the same address_space. If they are in different address_spaces, * acquire the lock of the folio which belongs to the address_space which * has the lowest address in memory first. */ static inline void folio_lock(struct folio *folio) { might_sleep(); if (!folio_trylock(folio)) __folio_lock(folio); } /** * lock_page() - Lock the folio containing this page. * @page: The page to lock. * * See folio_lock() for a description of what the lock protects. * This is a legacy function and new code should probably use folio_lock() * instead. * * Context: May sleep. Pages in the same folio share a lock, so do not * attempt to lock two pages which share a folio. */ static inline void lock_page(struct page *page) { struct folio *folio; might_sleep(); folio = page_folio(page); if (!folio_trylock(folio)) __folio_lock(folio); } /** * folio_lock_killable() - Lock this folio, interruptible by a fatal signal. * @folio: The folio to lock. * * Attempts to lock the folio, like folio_lock(), except that the sleep * to acquire the lock is interruptible by a fatal signal. * * Context: May sleep; see folio_lock(). * Return: 0 if the lock was acquired; -EINTR if a fatal signal was received. */ static inline int folio_lock_killable(struct folio *folio) { might_sleep(); if (!folio_trylock(folio)) return __folio_lock_killable(folio); return 0; } /* * folio_lock_or_retry - Lock the folio, unless this would block and the * caller indicated that it can handle a retry. * * Return value and mmap_lock implications depend on flags; see * __folio_lock_or_retry(). */ static inline vm_fault_t folio_lock_or_retry(struct folio *folio, struct vm_fault *vmf) { might_sleep(); if (!folio_trylock(folio)) return __folio_lock_or_retry(folio, vmf); return 0; } /* * This is exported only for folio_wait_locked/folio_wait_writeback, etc., * and should not be used directly. */ void folio_wait_bit(struct folio *folio, int bit_nr); int folio_wait_bit_killable(struct folio *folio, int bit_nr); /* * Wait for a folio to be unlocked. * * This must be called with the caller "holding" the folio, * ie with increased folio reference count so that the folio won't * go away during the wait. */ static inline void folio_wait_locked(struct folio *folio) { if (folio_test_locked(folio)) folio_wait_bit(folio, PG_locked); } static inline int folio_wait_locked_killable(struct folio *folio) { if (!folio_test_locked(folio)) return 0; return folio_wait_bit_killable(folio, PG_locked); } void folio_end_read(struct folio *folio, bool success); void wait_on_page_writeback(struct page *page); void folio_wait_writeback(struct folio *folio); int folio_wait_writeback_killable(struct folio *folio); void end_page_writeback(struct page *page); void folio_end_writeback(struct folio *folio); void folio_wait_stable(struct folio *folio); void __folio_mark_dirty(struct folio *folio, struct address_space *, int warn); void folio_account_cleaned(struct folio *folio, struct bdi_writeback *wb); void __folio_cancel_dirty(struct folio *folio); static inline void folio_cancel_dirty(struct folio *folio) { /* Avoid atomic ops, locking, etc. when not actually needed. */ if (folio_test_dirty(folio)) __folio_cancel_dirty(folio); } bool folio_clear_dirty_for_io(struct folio *folio); bool clear_page_dirty_for_io(struct page *page); void folio_invalidate(struct folio *folio, size_t offset, size_t length); bool noop_dirty_folio(struct address_space *mapping, struct folio *folio); #ifdef CONFIG_MIGRATION int filemap_migrate_folio(struct address_space *mapping, struct folio *dst, struct folio *src, enum migrate_mode mode); #else #define filemap_migrate_folio NULL #endif void folio_end_private_2(struct folio *folio); void folio_wait_private_2(struct folio *folio); int folio_wait_private_2_killable(struct folio *folio); /* * Fault in userspace address range. */ size_t fault_in_writeable(char __user *uaddr, size_t size); size_t fault_in_subpage_writeable(char __user *uaddr, size_t size); size_t fault_in_safe_writeable(const char __user *uaddr, size_t size); size_t fault_in_readable(const char __user *uaddr, size_t size); int add_to_page_cache_lru(struct page *page, struct address_space *mapping, pgoff_t index, gfp_t gfp); int filemap_add_folio(struct address_space *mapping, struct folio *folio, pgoff_t index, gfp_t gfp); void filemap_remove_folio(struct folio *folio); void __filemap_remove_folio(struct folio *folio, void *shadow); void replace_page_cache_folio(struct folio *old, struct folio *new); void delete_from_page_cache_batch(struct address_space *mapping, struct folio_batch *fbatch); bool filemap_release_folio(struct folio *folio, gfp_t gfp); loff_t mapping_seek_hole_data(struct address_space *, loff_t start, loff_t end, int whence); /* Must be non-static for BPF error injection */ int __filemap_add_folio(struct address_space *mapping, struct folio *folio, pgoff_t index, gfp_t gfp, void **shadowp); bool filemap_range_has_writeback(struct address_space *mapping, loff_t start_byte, loff_t end_byte); /** * filemap_range_needs_writeback - check if range potentially needs writeback * @mapping: address space within which to check * @start_byte: offset in bytes where the range starts * @end_byte: offset in bytes where the range ends (inclusive) * * Find at least one page in the range supplied, usually used to check if * direct writing in this range will trigger a writeback. Used by O_DIRECT * read/write with IOCB_NOWAIT, to see if the caller needs to do * filemap_write_and_wait_range() before proceeding. * * Return: %true if the caller should do filemap_write_and_wait_range() before * doing O_DIRECT to a page in this range, %false otherwise. */ static inline bool filemap_range_needs_writeback(struct address_space *mapping, loff_t start_byte, loff_t end_byte) { if (!mapping->nrpages) return false; if (!mapping_tagged(mapping, PAGECACHE_TAG_DIRTY) && !mapping_tagged(mapping, PAGECACHE_TAG_WRITEBACK)) return false; return filemap_range_has_writeback(mapping, start_byte, end_byte); } /** * struct readahead_control - Describes a readahead request. * * A readahead request is for consecutive pages. Filesystems which * implement the ->readahead method should call readahead_folio() or * __readahead_batch() in a loop and attempt to start reads into each * folio in the request. * * Most of the fields in this struct are private and should be accessed * by the functions below. * * @file: The file, used primarily by network filesystems for authentication. * May be NULL if invoked internally by the filesystem. * @mapping: Readahead this filesystem object. * @ra: File readahead state. May be NULL. */ struct readahead_control { struct file *file; struct address_space *mapping; struct file_ra_state *ra; /* private: use the readahead_* accessors instead */ pgoff_t _index; unsigned int _nr_pages; unsigned int _batch_count; bool dropbehind; bool _workingset; unsigned long _pflags; }; #define DEFINE_READAHEAD(ractl, f, r, m, i) \ struct readahead_control ractl = { \ .file = f, \ .mapping = m, \ .ra = r, \ ._index = i, \ } #define VM_READAHEAD_PAGES (SZ_128K / PAGE_SIZE) void page_cache_ra_unbounded(struct readahead_control *, unsigned long nr_to_read, unsigned long lookahead_count); void page_cache_sync_ra(struct readahead_control *, unsigned long req_count); void page_cache_async_ra(struct readahead_control *, struct folio *, unsigned long req_count); void readahead_expand(struct readahead_control *ractl, loff_t new_start, size_t new_len); /** * page_cache_sync_readahead - generic file readahead * @mapping: address_space which holds the pagecache and I/O vectors * @ra: file_ra_state which holds the readahead state * @file: Used by the filesystem for authentication. * @index: Index of first page to be read. * @req_count: Total number of pages being read by the caller. * * page_cache_sync_readahead() should be called when a cache miss happened: * it will submit the read. The readahead logic may decide to piggyback more * pages onto the read request if access patterns suggest it will improve * performance. */ static inline void page_cache_sync_readahead(struct address_space *mapping, struct file_ra_state *ra, struct file *file, pgoff_t index, unsigned long req_count) { DEFINE_READAHEAD(ractl, file, ra, mapping, index); page_cache_sync_ra(&ractl, req_count); } /** * page_cache_async_readahead - file readahead for marked pages * @mapping: address_space which holds the pagecache and I/O vectors * @ra: file_ra_state which holds the readahead state * @file: Used by the filesystem for authentication. * @folio: The folio which triggered the readahead call. * @req_count: Total number of pages being read by the caller. * * page_cache_async_readahead() should be called when a page is used which * is marked as PageReadahead; this is a marker to suggest that the application * has used up enough of the readahead window that we should start pulling in * more pages. */ static inline void page_cache_async_readahead(struct address_space *mapping, struct file_ra_state *ra, struct file *file, struct folio *folio, unsigned long req_count) { DEFINE_READAHEAD(ractl, file, ra, mapping, folio->index); page_cache_async_ra(&ractl, folio, req_count); } static inline struct folio *__readahead_folio(struct readahead_control *ractl) { struct folio *folio; BUG_ON(ractl->_batch_count > ractl->_nr_pages); ractl->_nr_pages -= ractl->_batch_count; ractl->_index += ractl->_batch_count; if (!ractl->_nr_pages) { ractl->_batch_count = 0; return NULL; } folio = xa_load(&ractl->mapping->i_pages, ractl->_index); VM_BUG_ON_FOLIO(!folio_test_locked(folio), folio); ractl->_batch_count = folio_nr_pages(folio); return folio; } /** * readahead_folio - Get the next folio to read. * @ractl: The current readahead request. * * Context: The folio is locked. The caller should unlock the folio once * all I/O to that folio has completed. * Return: A pointer to the next folio, or %NULL if we are done. */ static inline struct folio *readahead_folio(struct readahead_control *ractl) { struct folio *folio = __readahead_folio(ractl); if (folio) folio_put(folio); return folio; } static inline unsigned int __readahead_batch(struct readahead_control *rac, struct page **array, unsigned int array_sz) { unsigned int i = 0; XA_STATE(xas, &rac->mapping->i_pages, 0); struct folio *folio; BUG_ON(rac->_batch_count > rac->_nr_pages); rac->_nr_pages -= rac->_batch_count; rac->_index += rac->_batch_count; rac->_batch_count = 0; xas_set(&xas, rac->_index); rcu_read_lock(); xas_for_each(&xas, folio, rac->_index + rac->_nr_pages - 1) { if (xas_retry(&xas, folio)) continue; VM_BUG_ON_FOLIO(!folio_test_locked(folio), folio); array[i++] = folio_page(folio, 0); rac->_batch_count += folio_nr_pages(folio); if (i == array_sz) break; } rcu_read_unlock(); return i; } /** * readahead_pos - The byte offset into the file of this readahead request. * @rac: The readahead request. */ static inline loff_t readahead_pos(struct readahead_control *rac) { return (loff_t)rac->_index * PAGE_SIZE; } /** * readahead_length - The number of bytes in this readahead request. * @rac: The readahead request. */ static inline size_t readahead_length(struct readahead_control *rac) { return rac->_nr_pages * PAGE_SIZE; } /** * readahead_index - The index of the first page in this readahead request. * @rac: The readahead request. */ static inline pgoff_t readahead_index(struct readahead_control *rac) { return rac->_index; } /** * readahead_count - The number of pages in this readahead request. * @rac: The readahead request. */ static inline unsigned int readahead_count(struct readahead_control *rac) { return rac->_nr_pages; } /** * readahead_batch_length - The number of bytes in the current batch. * @rac: The readahead request. */ static inline size_t readahead_batch_length(struct readahead_control *rac) { return rac->_batch_count * PAGE_SIZE; } static inline unsigned long dir_pages(struct inode *inode) { return (unsigned long)(inode->i_size + PAGE_SIZE - 1) >> PAGE_SHIFT; } /** * folio_mkwrite_check_truncate - check if folio was truncated * @folio: the folio to check * @inode: the inode to check the folio against * * Return: the number of bytes in the folio up to EOF, * or -EFAULT if the folio was truncated. */ static inline ssize_t folio_mkwrite_check_truncate(struct folio *folio, struct inode *inode) { loff_t size = i_size_read(inode); pgoff_t index = size >> PAGE_SHIFT; size_t offset = offset_in_folio(folio, size); if (!folio->mapping) return -EFAULT; /* folio is wholly inside EOF */ if (folio_next_index(folio) - 1 < index) return folio_size(folio); /* folio is wholly past EOF */ if (folio->index > index || !offset) return -EFAULT; /* folio is partially inside EOF */ return offset; } /** * i_blocks_per_folio - How many blocks fit in this folio. * @inode: The inode which contains the blocks. * @folio: The folio. * * If the block size is larger than the size of this folio, return zero. * * Context: The caller should hold a refcount on the folio to prevent it * from being split. * Return: The number of filesystem blocks covered by this folio. */ static inline unsigned int i_blocks_per_folio(struct inode *inode, struct folio *folio) { return folio_size(folio) >> inode->i_blkbits; } #endif /* _LINUX_PAGEMAP_H */
584 500 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 /* SPDX-License-Identifier: GPL-2.0 */ /* rwsem.h: R/W semaphores, public interface * * Written by David Howells (dhowells@redhat.com). * Derived from asm-i386/semaphore.h */ #ifndef _LINUX_RWSEM_H #define _LINUX_RWSEM_H #include <linux/linkage.h> #include <linux/types.h> #include <linux/list.h> #include <linux/spinlock.h> #include <linux/atomic.h> #include <linux/err.h> #include <linux/cleanup.h> #ifdef CONFIG_DEBUG_LOCK_ALLOC # define __RWSEM_DEP_MAP_INIT(lockname) \ .dep_map = { \ .name = #lockname, \ .wait_type_inner = LD_WAIT_SLEEP, \ }, #else # define __RWSEM_DEP_MAP_INIT(lockname) #endif #ifndef CONFIG_PREEMPT_RT #ifdef CONFIG_RWSEM_SPIN_ON_OWNER #include <linux/osq_lock.h> #endif /* * For an uncontended rwsem, count and owner are the only fields a task * needs to touch when acquiring the rwsem. So they are put next to each * other to increase the chance that they will share the same cacheline. * * In a contended rwsem, the owner is likely the most frequently accessed * field in the structure as the optimistic waiter that holds the osq lock * will spin on owner. For an embedded rwsem, other hot fields in the * containing structure should be moved further away from the rwsem to * reduce the chance that they will share the same cacheline causing * cacheline bouncing problem. */ struct rw_semaphore { atomic_long_t count; /* * Write owner or one of the read owners as well flags regarding * the current state of the rwsem. Can be used as a speculative * check to see if the write owner is running on the cpu. */ atomic_long_t owner; #ifdef CONFIG_RWSEM_SPIN_ON_OWNER struct optimistic_spin_queue osq; /* spinner MCS lock */ #endif raw_spinlock_t wait_lock; struct list_head wait_list; #ifdef CONFIG_DEBUG_RWSEMS void *magic; #endif #ifdef CONFIG_DEBUG_LOCK_ALLOC struct lockdep_map dep_map; #endif }; #define RWSEM_UNLOCKED_VALUE 0UL #define RWSEM_WRITER_LOCKED (1UL << 0) #define __RWSEM_COUNT_INIT(name) .count = ATOMIC_LONG_INIT(RWSEM_UNLOCKED_VALUE) static inline int rwsem_is_locked(struct rw_semaphore *sem) { return atomic_long_read(&sem->count) != RWSEM_UNLOCKED_VALUE; } static inline void rwsem_assert_held_nolockdep(const struct rw_semaphore *sem) { WARN_ON(atomic_long_read(&sem->count) == RWSEM_UNLOCKED_VALUE); } static inline void rwsem_assert_held_write_nolockdep(const struct rw_semaphore *sem) { WARN_ON(!(atomic_long_read(&sem->count) & RWSEM_WRITER_LOCKED)); } /* Common initializer macros and functions */ #ifdef CONFIG_DEBUG_RWSEMS # define __RWSEM_DEBUG_INIT(lockname) .magic = &lockname, #else # define __RWSEM_DEBUG_INIT(lockname) #endif #ifdef CONFIG_RWSEM_SPIN_ON_OWNER #define __RWSEM_OPT_INIT(lockname) .osq = OSQ_LOCK_UNLOCKED, #else #define __RWSEM_OPT_INIT(lockname) #endif #define __RWSEM_INITIALIZER(name) \ { __RWSEM_COUNT_INIT(name), \ .owner = ATOMIC_LONG_INIT(0), \ __RWSEM_OPT_INIT(name) \ .wait_lock = __RAW_SPIN_LOCK_UNLOCKED(name.wait_lock),\ .wait_list = LIST_HEAD_INIT((name).wait_list), \ __RWSEM_DEBUG_INIT(name) \ __RWSEM_DEP_MAP_INIT(name) } #define DECLARE_RWSEM(name) \ struct rw_semaphore name = __RWSEM_INITIALIZER(name) extern void __init_rwsem(struct rw_semaphore *sem, const char *name, struct lock_class_key *key); #define init_rwsem(sem) \ do { \ static struct lock_class_key __key; \ \ __init_rwsem((sem), #sem, &__key); \ } while (0) /* * This is the same regardless of which rwsem implementation that is being used. * It is just a heuristic meant to be called by somebody already holding the * rwsem to see if somebody from an incompatible type is wanting access to the * lock. */ static inline int rwsem_is_contended(struct rw_semaphore *sem) { return !list_empty(&sem->wait_list); } #if defined(CONFIG_DEBUG_RWSEMS) || defined(CONFIG_DETECT_HUNG_TASK_BLOCKER) /* * Return just the real task structure pointer of the owner */ extern struct task_struct *rwsem_owner(struct rw_semaphore *sem); /* * Return true if the rwsem is owned by a reader. */ extern bool is_rwsem_reader_owned(struct rw_semaphore *sem); #endif #else /* !CONFIG_PREEMPT_RT */ #include <linux/rwbase_rt.h> struct rw_semaphore { struct rwbase_rt rwbase; #ifdef CONFIG_DEBUG_LOCK_ALLOC struct lockdep_map dep_map; #endif }; #define __RWSEM_INITIALIZER(name) \ { \ .rwbase = __RWBASE_INITIALIZER(name), \ __RWSEM_DEP_MAP_INIT(name) \ } #define DECLARE_RWSEM(lockname) \ struct rw_semaphore lockname = __RWSEM_INITIALIZER(lockname) extern void __init_rwsem(struct rw_semaphore *rwsem, const char *name, struct lock_class_key *key); #define init_rwsem(sem) \ do { \ static struct lock_class_key __key; \ \ __init_rwsem((sem), #sem, &__key); \ } while (0) static __always_inline int rwsem_is_locked(const struct rw_semaphore *sem) { return rw_base_is_locked(&sem->rwbase); } static __always_inline void rwsem_assert_held_nolockdep(const struct rw_semaphore *sem) { WARN_ON(!rwsem_is_locked(sem)); } static __always_inline void rwsem_assert_held_write_nolockdep(const struct rw_semaphore *sem) { WARN_ON(!rw_base_is_write_locked(&sem->rwbase)); } static __always_inline int rwsem_is_contended(struct rw_semaphore *sem) { return rw_base_is_contended(&sem->rwbase); } #endif /* CONFIG_PREEMPT_RT */ /* * The functions below are the same for all rwsem implementations including * the RT specific variant. */ static inline void rwsem_assert_held(const struct rw_semaphore *sem) { if (IS_ENABLED(CONFIG_LOCKDEP)) lockdep_assert_held(sem); else rwsem_assert_held_nolockdep(sem); } static inline void rwsem_assert_held_write(const struct rw_semaphore *sem) { if (IS_ENABLED(CONFIG_LOCKDEP)) lockdep_assert_held_write(sem); else rwsem_assert_held_write_nolockdep(sem); } /* * lock for reading */ extern void down_read(struct rw_semaphore *sem); extern int __must_check down_read_interruptible(struct rw_semaphore *sem); extern int __must_check down_read_killable(struct rw_semaphore *sem); /* * trylock for reading -- returns 1 if successful, 0 if contention */ extern int down_read_trylock(struct rw_semaphore *sem); /* * lock for writing */ extern void down_write(struct rw_semaphore *sem); extern int __must_check down_write_killable(struct rw_semaphore *sem); /* * trylock for writing -- returns 1 if successful, 0 if contention */ extern int down_write_trylock(struct rw_semaphore *sem); /* * release a read lock */ extern void up_read(struct rw_semaphore *sem); /* * release a write lock */ extern void up_write(struct rw_semaphore *sem); DEFINE_GUARD(rwsem_read, struct rw_semaphore *, down_read(_T), up_read(_T)) DEFINE_GUARD_COND(rwsem_read, _try, down_read_trylock(_T)) DEFINE_GUARD_COND(rwsem_read, _intr, down_read_interruptible(_T), _RET == 0) DEFINE_GUARD(rwsem_write, struct rw_semaphore *, down_write(_T), up_write(_T)) DEFINE_GUARD_COND(rwsem_write, _try, down_write_trylock(_T)) DEFINE_GUARD_COND(rwsem_write, _kill, down_write_killable(_T), _RET == 0) /* * downgrade write lock to read lock */ extern void downgrade_write(struct rw_semaphore *sem); #ifdef CONFIG_DEBUG_LOCK_ALLOC /* * nested locking. NOTE: rwsems are not allowed to recurse * (which occurs if the same task tries to acquire the same * lock instance multiple times), but multiple locks of the * same lock class might be taken, if the order of the locks * is always the same. This ordering rule can be expressed * to lockdep via the _nested() APIs, but enumerating the * subclasses that are used. (If the nesting relationship is * static then another method for expressing nested locking is * the explicit definition of lock class keys and the use of * lockdep_set_class() at lock initialization time. * See Documentation/locking/lockdep-design.rst for more details.) */ extern void down_read_nested(struct rw_semaphore *sem, int subclass); extern int __must_check down_read_killable_nested(struct rw_semaphore *sem, int subclass); extern void down_write_nested(struct rw_semaphore *sem, int subclass); extern int down_write_killable_nested(struct rw_semaphore *sem, int subclass); extern void _down_write_nest_lock(struct rw_semaphore *sem, struct lockdep_map *nest_lock); # define down_write_nest_lock(sem, nest_lock) \ do { \ typecheck(struct lockdep_map *, &(nest_lock)->dep_map); \ _down_write_nest_lock(sem, &(nest_lock)->dep_map); \ } while (0) /* * Take/release a lock when not the owner will release it. * * [ This API should be avoided as much as possible - the * proper abstraction for this case is completions. ] */ extern void down_read_non_owner(struct rw_semaphore *sem); extern void up_read_non_owner(struct rw_semaphore *sem); #else # define down_read_nested(sem, subclass) down_read(sem) # define down_read_killable_nested(sem, subclass) down_read_killable(sem) # define down_write_nest_lock(sem, nest_lock) down_write(sem) # define down_write_nested(sem, subclass) down_write(sem) # define down_write_killable_nested(sem, subclass) down_write_killable(sem) # define down_read_non_owner(sem) down_read(sem) # define up_read_non_owner(sem) up_read(sem) #endif #endif /* _LINUX_RWSEM_H */
12 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 1517 1518 1519 1520 1521 1522 1523 1524 1525 1526 1527 1528 1529 1530 1531 1532 1533 1534 1535 1536 1537 1538 1539 1540 1541 1542 1543 1544 1545 1546 1547 1548 1549 1550 1551 1552 1553 1554 1555 1556 1557 1558 1559 1560 1561 1562 1563 1564 1565 1566 1567 1568 1569 1570 1571 1572 1573 1574 1575 1576 1577 1578 1579 1580 1581 1582 1583 1584 1585 1586 1587 1588 1589 1590 1591 1592 1593 1594 1595 1596 1597 1598 1599 1600 1601 1602 1603 1604 1605 1606 1607 1608 1609 1610 1611 1612 1613 1614 1615 1616 1617 1618 1619 1620 1621 1622 1623 1624 1625 1626 1627 1628 1629 1630 1631 1632 1633 1634 1635 1636 1637 1638 1639 1640 1641 1642 1643 1644 1645 1646 1647 1648 1649 1650 1651 1652 1653 1654 1655 1656 1657 1658 1659 1660 1661 1662 1663 1664 1665 1666 1667 1668 1669 1670 1671 1672 // SPDX-License-Identifier: GPL-2.0 /* * Copyright(C) 2005-2006, Thomas Gleixner <tglx@linutronix.de> * Copyright(C) 2005-2007, Red Hat, Inc., Ingo Molnar * Copyright(C) 2006-2007 Timesys Corp., Thomas Gleixner * * NOHZ implementation for low and high resolution timers * * Started by: Thomas Gleixner and Ingo Molnar */ #include <linux/compiler.h> #include <linux/cpu.h> #include <linux/err.h> #include <linux/hrtimer.h> #include <linux/interrupt.h> #include <linux/kernel_stat.h> #include <linux/percpu.h> #include <linux/nmi.h> #include <linux/profile.h> #include <linux/sched/signal.h> #include <linux/sched/clock.h> #include <linux/sched/stat.h> #include <linux/sched/nohz.h> #include <linux/sched/loadavg.h> #include <linux/module.h> #include <linux/irq_work.h> #include <linux/posix-timers.h> #include <linux/context_tracking.h> #include <linux/mm.h> #include <asm/irq_regs.h> #include "tick-internal.h" #include <trace/events/timer.h> /* * Per-CPU nohz control structure */ static DEFINE_PER_CPU(struct tick_sched, tick_cpu_sched); struct tick_sched *tick_get_tick_sched(int cpu) { return &per_cpu(tick_cpu_sched, cpu); } /* * The time when the last jiffy update happened. Write access must hold * jiffies_lock and jiffies_seq. tick_nohz_next_event() needs to get a * consistent view of jiffies and last_jiffies_update. */ static ktime_t last_jiffies_update; /* * Must be called with interrupts disabled ! */ static void tick_do_update_jiffies64(ktime_t now) { unsigned long ticks = 1; ktime_t delta, nextp; /* * 64-bit can do a quick check without holding the jiffies lock and * without looking at the sequence count. The smp_load_acquire() * pairs with the update done later in this function. * * 32-bit cannot do that because the store of 'tick_next_period' * consists of two 32-bit stores, and the first store could be * moved by the CPU to a random point in the future. */ if (IS_ENABLED(CONFIG_64BIT)) { if (ktime_before(now, smp_load_acquire(&tick_next_period))) return; } else { unsigned int seq; /* * Avoid contention on 'jiffies_lock' and protect the quick * check with the sequence count. */ do { seq = read_seqcount_begin(&jiffies_seq); nextp = tick_next_period; } while (read_seqcount_retry(&jiffies_seq, seq)); if (ktime_before(now, nextp)) return; } /* Quick check failed, i.e. update is required. */ raw_spin_lock(&jiffies_lock); /* * Re-evaluate with the lock held. Another CPU might have done the * update already. */ if (ktime_before(now, tick_next_period)) { raw_spin_unlock(&jiffies_lock); return; } write_seqcount_begin(&jiffies_seq); delta = ktime_sub(now, tick_next_period); if (unlikely(delta >= TICK_NSEC)) { /* Slow path for long idle sleep times */ s64 incr = TICK_NSEC; ticks += ktime_divns(delta, incr); last_jiffies_update = ktime_add_ns(last_jiffies_update, incr * ticks); } else { last_jiffies_update = ktime_add_ns(last_jiffies_update, TICK_NSEC); } /* Advance jiffies to complete the 'jiffies_seq' protected job */ jiffies_64 += ticks; /* Keep the tick_next_period variable up to date */ nextp = ktime_add_ns(last_jiffies_update, TICK_NSEC); if (IS_ENABLED(CONFIG_64BIT)) { /* * Pairs with smp_load_acquire() in the lockless quick * check above, and ensures that the update to 'jiffies_64' is * not reordered vs. the store to 'tick_next_period', neither * by the compiler nor by the CPU. */ smp_store_release(&tick_next_period, nextp); } else { /* * A plain store is good enough on 32-bit, as the quick check * above is protected by the sequence count. */ tick_next_period = nextp; } /* * Release the sequence count. calc_global_load() below is not * protected by it, but 'jiffies_lock' needs to be held to prevent * concurrent invocations. */ write_seqcount_end(&jiffies_seq); calc_global_load(); raw_spin_unlock(&jiffies_lock); update_wall_time(); } /* * Initialize and return retrieve the jiffies update. */ static ktime_t tick_init_jiffy_update(void) { ktime_t period; raw_spin_lock(&jiffies_lock); write_seqcount_begin(&jiffies_seq); /* Have we started the jiffies update yet ? */ if (last_jiffies_update == 0) { u32 rem; /* * Ensure that the tick is aligned to a multiple of * TICK_NSEC. */ div_u64_rem(tick_next_period, TICK_NSEC, &rem); if (rem) tick_next_period += TICK_NSEC - rem; last_jiffies_update = tick_next_period; } period = last_jiffies_update; write_seqcount_end(&jiffies_seq); raw_spin_unlock(&jiffies_lock); return period; } static inline int tick_sched_flag_test(struct tick_sched *ts, unsigned long flag) { return !!(ts->flags & flag); } static inline void tick_sched_flag_set(struct tick_sched *ts, unsigned long flag) { lockdep_assert_irqs_disabled(); ts->flags |= flag; } static inline void tick_sched_flag_clear(struct tick_sched *ts, unsigned long flag) { lockdep_assert_irqs_disabled(); ts->flags &= ~flag; } #define MAX_STALLED_JIFFIES 5 static void tick_sched_do_timer(struct tick_sched *ts, ktime_t now) { int tick_cpu, cpu = smp_processor_id(); /* * Check if the do_timer duty was dropped. We don't care about * concurrency: This happens only when the CPU in charge went * into a long sleep. If two CPUs happen to assign themselves to * this duty, then the jiffies update is still serialized by * 'jiffies_lock'. * * If nohz_full is enabled, this should not happen because the * 'tick_do_timer_cpu' CPU never relinquishes. */ tick_cpu = READ_ONCE(tick_do_timer_cpu); if (IS_ENABLED(CONFIG_NO_HZ_COMMON) && unlikely(tick_cpu == TICK_DO_TIMER_NONE)) { #ifdef CONFIG_NO_HZ_FULL WARN_ON_ONCE(tick_nohz_full_running); #endif WRITE_ONCE(tick_do_timer_cpu, cpu); tick_cpu = cpu; } /* Check if jiffies need an update */ if (tick_cpu == cpu) tick_do_update_jiffies64(now); /* * If the jiffies update stalled for too long (timekeeper in stop_machine() * or VMEXIT'ed for several msecs), force an update. */ if (ts->last_tick_jiffies != jiffies) { ts->stalled_jiffies = 0; ts->last_tick_jiffies = READ_ONCE(jiffies); } else { if (++ts->stalled_jiffies == MAX_STALLED_JIFFIES) { tick_do_update_jiffies64(now); ts->stalled_jiffies = 0; ts->last_tick_jiffies = READ_ONCE(jiffies); } } if (tick_sched_flag_test(ts, TS_FLAG_INIDLE)) ts->got_idle_tick = 1; } static void tick_sched_handle(struct tick_sched *ts, struct pt_regs *regs) { /* * When we are idle and the tick is stopped, we have to touch * the watchdog as we might not schedule for a really long * time. This happens on completely idle SMP systems while * waiting on the login prompt. We also increment the "start of * idle" jiffy stamp so the idle accounting adjustment we do * when we go busy again does not account too many ticks. */ if (IS_ENABLED(CONFIG_NO_HZ_COMMON) && tick_sched_flag_test(ts, TS_FLAG_STOPPED)) { touch_softlockup_watchdog_sched(); if (is_idle_task(current)) ts->idle_jiffies++; /* * In case the current tick fired too early past its expected * expiration, make sure we don't bypass the next clock reprogramming * to the same deadline. */ ts->next_tick = 0; } update_process_times(user_mode(regs)); profile_tick(CPU_PROFILING); } /* * We rearm the timer until we get disabled by the idle code. * Called with interrupts disabled. */ static enum hrtimer_restart tick_nohz_handler(struct hrtimer *timer) { struct tick_sched *ts = container_of(timer, struct tick_sched, sched_timer); struct pt_regs *regs = get_irq_regs(); ktime_t now = ktime_get(); tick_sched_do_timer(ts, now); /* * Do not call when we are not in IRQ context and have * no valid 'regs' pointer */ if (regs) tick_sched_handle(ts, regs); else ts->next_tick = 0; /* * In dynticks mode, tick reprogram is deferred: * - to the idle task if in dynticks-idle * - to IRQ exit if in full-dynticks. */ if (unlikely(tick_sched_flag_test(ts, TS_FLAG_STOPPED))) return HRTIMER_NORESTART; hrtimer_forward(timer, now, TICK_NSEC); return HRTIMER_RESTART; } #ifdef CONFIG_NO_HZ_FULL cpumask_var_t tick_nohz_full_mask; EXPORT_SYMBOL_GPL(tick_nohz_full_mask); bool tick_nohz_full_running; EXPORT_SYMBOL_GPL(tick_nohz_full_running); static atomic_t tick_dep_mask; static bool check_tick_dependency(atomic_t *dep) { int val = atomic_read(dep); if (val & TICK_DEP_MASK_POSIX_TIMER) { trace_tick_stop(0, TICK_DEP_MASK_POSIX_TIMER); return true; } if (val & TICK_DEP_MASK_PERF_EVENTS) { trace_tick_stop(0, TICK_DEP_MASK_PERF_EVENTS); return true; } if (val & TICK_DEP_MASK_SCHED) { trace_tick_stop(0, TICK_DEP_MASK_SCHED); return true; } if (val & TICK_DEP_MASK_CLOCK_UNSTABLE) { trace_tick_stop(0, TICK_DEP_MASK_CLOCK_UNSTABLE); return true; } if (val & TICK_DEP_MASK_RCU) { trace_tick_stop(0, TICK_DEP_MASK_RCU); return true; } if (val & TICK_DEP_MASK_RCU_EXP) { trace_tick_stop(0, TICK_DEP_MASK_RCU_EXP); return true; } return false; } static bool can_stop_full_tick(int cpu, struct tick_sched *ts) { lockdep_assert_irqs_disabled(); if (unlikely(!cpu_online(cpu))) return false; if (check_tick_dependency(&tick_dep_mask)) return false; if (check_tick_dependency(&ts->tick_dep_mask)) return false; if (check_tick_dependency(&current->tick_dep_mask)) return false; if (check_tick_dependency(&current->signal->tick_dep_mask)) return false; return true; } static void nohz_full_kick_func(struct irq_work *work) { /* Empty, the tick restart happens on tick_nohz_irq_exit() */ } static DEFINE_PER_CPU(struct irq_work, nohz_full_kick_work) = IRQ_WORK_INIT_HARD(nohz_full_kick_func); /* * Kick this CPU if it's full dynticks in order to force it to * re-evaluate its dependency on the tick and restart it if necessary. * This kick, unlike tick_nohz_full_kick_cpu() and tick_nohz_full_kick_all(), * is NMI safe. */ static void tick_nohz_full_kick(void) { if (!tick_nohz_full_cpu(smp_processor_id())) return; irq_work_queue(this_cpu_ptr(&nohz_full_kick_work)); } /* * Kick the CPU if it's full dynticks in order to force it to * re-evaluate its dependency on the tick and restart it if necessary. */ void tick_nohz_full_kick_cpu(int cpu) { if (!tick_nohz_full_cpu(cpu)) return; irq_work_queue_on(&per_cpu(nohz_full_kick_work, cpu), cpu); } static void tick_nohz_kick_task(struct task_struct *tsk) { int cpu; /* * If the task is not running, run_posix_cpu_timers() * has nothing to elapse, and an IPI can then be optimized out. * * activate_task() STORE p->tick_dep_mask * STORE p->on_rq * __schedule() (switch to task 'p') smp_mb() (atomic_fetch_or()) * LOCK rq->lock LOAD p->on_rq * smp_mb__after_spin_lock() * tick_nohz_task_switch() * LOAD p->tick_dep_mask * * XXX given a task picks up the dependency on schedule(), should we * only care about tasks that are currently on the CPU instead of all * that are on the runqueue? * * That is, does this want to be: task_on_cpu() / task_curr()? */ if (!sched_task_on_rq(tsk)) return; /* * If the task concurrently migrates to another CPU, * we guarantee it sees the new tick dependency upon * schedule. * * set_task_cpu(p, cpu); * STORE p->cpu = @cpu * __schedule() (switch to task 'p') * LOCK rq->lock * smp_mb__after_spin_lock() STORE p->tick_dep_mask * tick_nohz_task_switch() smp_mb() (atomic_fetch_or()) * LOAD p->tick_dep_mask LOAD p->cpu */ cpu = task_cpu(tsk); preempt_disable(); if (cpu_online(cpu)) tick_nohz_full_kick_cpu(cpu); preempt_enable(); } /* * Kick all full dynticks CPUs in order to force these to re-evaluate * their dependency on the tick and restart it if necessary. */ static void tick_nohz_full_kick_all(void) { int cpu; if (!tick_nohz_full_running) return; preempt_disable(); for_each_cpu_and(cpu, tick_nohz_full_mask, cpu_online_mask) tick_nohz_full_kick_cpu(cpu); preempt_enable(); } static void tick_nohz_dep_set_all(atomic_t *dep, enum tick_dep_bits bit) { int prev; prev = atomic_fetch_or(BIT(bit), dep); if (!prev) tick_nohz_full_kick_all(); } /* * Set a global tick dependency. Used by perf events that rely on freq and * unstable clocks. */ void tick_nohz_dep_set(enum tick_dep_bits bit) { tick_nohz_dep_set_all(&tick_dep_mask, bit); } void tick_nohz_dep_clear(enum tick_dep_bits bit) { atomic_andnot(BIT(bit), &tick_dep_mask); } /* * Set per-CPU tick dependency. Used by scheduler and perf events in order to * manage event-throttling. */ void tick_nohz_dep_set_cpu(int cpu, enum tick_dep_bits bit) { int prev; struct tick_sched *ts; ts = per_cpu_ptr(&tick_cpu_sched, cpu); prev = atomic_fetch_or(BIT(bit), &ts->tick_dep_mask); if (!prev) { preempt_disable(); /* Perf needs local kick that is NMI safe */ if (cpu == smp_processor_id()) { tick_nohz_full_kick(); } else { /* Remote IRQ work not NMI-safe */ if (!WARN_ON_ONCE(in_nmi())) tick_nohz_full_kick_cpu(cpu); } preempt_enable(); } } EXPORT_SYMBOL_GPL(tick_nohz_dep_set_cpu); void tick_nohz_dep_clear_cpu(int cpu, enum tick_dep_bits bit) { struct tick_sched *ts = per_cpu_ptr(&tick_cpu_sched, cpu); atomic_andnot(BIT(bit), &ts->tick_dep_mask); } EXPORT_SYMBOL_GPL(tick_nohz_dep_clear_cpu); /* * Set a per-task tick dependency. RCU needs this. Also posix CPU timers * in order to elapse per task timers. */ void tick_nohz_dep_set_task(struct task_struct *tsk, enum tick_dep_bits bit) { if (!atomic_fetch_or(BIT(bit), &tsk->tick_dep_mask)) tick_nohz_kick_task(tsk); } EXPORT_SYMBOL_GPL(tick_nohz_dep_set_task); void tick_nohz_dep_clear_task(struct task_struct *tsk, enum tick_dep_bits bit) { atomic_andnot(BIT(bit), &tsk->tick_dep_mask); } EXPORT_SYMBOL_GPL(tick_nohz_dep_clear_task); /* * Set a per-taskgroup tick dependency. Posix CPU timers need this in order to elapse * per process timers. */ void tick_nohz_dep_set_signal(struct task_struct *tsk, enum tick_dep_bits bit) { int prev; struct signal_struct *sig = tsk->signal; prev = atomic_fetch_or(BIT(bit), &sig->tick_dep_mask); if (!prev) { struct task_struct *t; lockdep_assert_held(&tsk->sighand->siglock); __for_each_thread(sig, t) tick_nohz_kick_task(t); } } void tick_nohz_dep_clear_signal(struct signal_struct *sig, enum tick_dep_bits bit) { atomic_andnot(BIT(bit), &sig->tick_dep_mask); } /* * Re-evaluate the need for the tick as we switch the current task. * It might need the tick due to per task/process properties: * perf events, posix CPU timers, ... */ void __tick_nohz_task_switch(void) { struct tick_sched *ts; if (!tick_nohz_full_cpu(smp_processor_id())) return; ts = this_cpu_ptr(&tick_cpu_sched); if (tick_sched_flag_test(ts, TS_FLAG_STOPPED)) { if (atomic_read(&current->tick_dep_mask) || atomic_read(&current->signal->tick_dep_mask)) tick_nohz_full_kick(); } } /* Get the boot-time nohz CPU list from the kernel parameters. */ void __init tick_nohz_full_setup(cpumask_var_t cpumask) { alloc_bootmem_cpumask_var(&tick_nohz_full_mask); cpumask_copy(tick_nohz_full_mask, cpumask); tick_nohz_full_running = true; } bool tick_nohz_cpu_hotpluggable(unsigned int cpu) { /* * The 'tick_do_timer_cpu' CPU handles housekeeping duty (unbound * timers, workqueues, timekeeping, ...) on behalf of full dynticks * CPUs. It must remain online when nohz full is enabled. */ if (tick_nohz_full_running && READ_ONCE(tick_do_timer_cpu) == cpu) return false; return true; } static int tick_nohz_cpu_down(unsigned int cpu) { return tick_nohz_cpu_hotpluggable(cpu) ? 0 : -EBUSY; } void __init tick_nohz_init(void) { int cpu, ret; if (!tick_nohz_full_running) return; /* * Full dynticks uses IRQ work to drive the tick rescheduling on safe * locking contexts. But then we need IRQ work to raise its own * interrupts to avoid circular dependency on the tick. */ if (!arch_irq_work_has_interrupt()) { pr_warn("NO_HZ: Can't run full dynticks because arch doesn't support IRQ work self-IPIs\n"); cpumask_clear(tick_nohz_full_mask); tick_nohz_full_running = false; return; } if (IS_ENABLED(CONFIG_PM_SLEEP_SMP) && !IS_ENABLED(CONFIG_PM_SLEEP_SMP_NONZERO_CPU)) { cpu = smp_processor_id(); if (cpumask_test_cpu(cpu, tick_nohz_full_mask)) { pr_warn("NO_HZ: Clearing %d from nohz_full range " "for timekeeping\n", cpu); cpumask_clear_cpu(cpu, tick_nohz_full_mask); } } for_each_cpu(cpu, tick_nohz_full_mask) ct_cpu_track_user(cpu); ret = cpuhp_setup_state_nocalls(CPUHP_AP_ONLINE_DYN, "kernel/nohz:predown", NULL, tick_nohz_cpu_down); WARN_ON(ret < 0); pr_info("NO_HZ: Full dynticks CPUs: %*pbl.\n", cpumask_pr_args(tick_nohz_full_mask)); } #endif /* #ifdef CONFIG_NO_HZ_FULL */ /* * NOHZ - aka dynamic tick functionality */ #ifdef CONFIG_NO_HZ_COMMON /* * NO HZ enabled ? */ bool tick_nohz_enabled __read_mostly = true; unsigned long tick_nohz_active __read_mostly; /* * Enable / Disable tickless mode */ static int __init setup_tick_nohz(char *str) { return (kstrtobool(str, &tick_nohz_enabled) == 0); } __setup("nohz=", setup_tick_nohz); bool tick_nohz_tick_stopped(void) { struct tick_sched *ts = this_cpu_ptr(&tick_cpu_sched); return tick_sched_flag_test(ts, TS_FLAG_STOPPED); } bool tick_nohz_tick_stopped_cpu(int cpu) { struct tick_sched *ts = per_cpu_ptr(&tick_cpu_sched, cpu); return tick_sched_flag_test(ts, TS_FLAG_STOPPED); } /** * tick_nohz_update_jiffies - update jiffies when idle was interrupted * @now: current ktime_t * * Called from interrupt entry when the CPU was idle * * In case the sched_tick was stopped on this CPU, we have to check if jiffies * must be updated. Otherwise an interrupt handler could use a stale jiffy * value. We do this unconditionally on any CPU, as we don't know whether the * CPU, which has the update task assigned, is in a long sleep. */ static void tick_nohz_update_jiffies(ktime_t now) { unsigned long flags; __this_cpu_write(tick_cpu_sched.idle_waketime, now); local_irq_save(flags); tick_do_update_jiffies64(now); local_irq_restore(flags); touch_softlockup_watchdog_sched(); } static void tick_nohz_stop_idle(struct tick_sched *ts, ktime_t now) { ktime_t delta; if (WARN_ON_ONCE(!tick_sched_flag_test(ts, TS_FLAG_IDLE_ACTIVE))) return; delta = ktime_sub(now, ts->idle_entrytime); write_seqcount_begin(&ts->idle_sleeptime_seq); if (nr_iowait_cpu(smp_processor_id()) > 0) ts->iowait_sleeptime = ktime_add(ts->iowait_sleeptime, delta); else ts->idle_sleeptime = ktime_add(ts->idle_sleeptime, delta); ts->idle_entrytime = now; tick_sched_flag_clear(ts, TS_FLAG_IDLE_ACTIVE); write_seqcount_end(&ts->idle_sleeptime_seq); sched_clock_idle_wakeup_event(); } static void tick_nohz_start_idle(struct tick_sched *ts) { write_seqcount_begin(&ts->idle_sleeptime_seq); ts->idle_entrytime = ktime_get(); tick_sched_flag_set(ts, TS_FLAG_IDLE_ACTIVE); write_seqcount_end(&ts->idle_sleeptime_seq); sched_clock_idle_sleep_event(); } static u64 get_cpu_sleep_time_us(struct tick_sched *ts, ktime_t *sleeptime, bool compute_delta, u64 *last_update_time) { ktime_t now, idle; unsigned int seq; if (!tick_nohz_active) return -1; now = ktime_get(); if (last_update_time) *last_update_time = ktime_to_us(now); do { seq = read_seqcount_begin(&ts->idle_sleeptime_seq); if (tick_sched_flag_test(ts, TS_FLAG_IDLE_ACTIVE) && compute_delta) { ktime_t delta = ktime_sub(now, ts->idle_entrytime); idle = ktime_add(*sleeptime, delta); } else { idle = *sleeptime; } } while (read_seqcount_retry(&ts->idle_sleeptime_seq, seq)); return ktime_to_us(idle); } /** * get_cpu_idle_time_us - get the total idle time of a CPU * @cpu: CPU number to query * @last_update_time: variable to store update time in. Do not update * counters if NULL. * * Return the cumulative idle time (since boot) for a given * CPU, in microseconds. Note that this is partially broken due to * the counter of iowait tasks that can be remotely updated without * any synchronization. Therefore it is possible to observe backward * values within two consecutive reads. * * This time is measured via accounting rather than sampling, * and is as accurate as ktime_get() is. * * Return: -1 if NOHZ is not enabled, else total idle time of the @cpu */ u64 get_cpu_idle_time_us(int cpu, u64 *last_update_time) { struct tick_sched *ts = &per_cpu(tick_cpu_sched, cpu); return get_cpu_sleep_time_us(ts, &ts->idle_sleeptime, !nr_iowait_cpu(cpu), last_update_time); } EXPORT_SYMBOL_GPL(get_cpu_idle_time_us); /** * get_cpu_iowait_time_us - get the total iowait time of a CPU * @cpu: CPU number to query * @last_update_time: variable to store update time in. Do not update * counters if NULL. * * Return the cumulative iowait time (since boot) for a given * CPU, in microseconds. Note this is partially broken due to * the counter of iowait tasks that can be remotely updated without * any synchronization. Therefore it is possible to observe backward * values within two consecutive reads. * * This time is measured via accounting rather than sampling, * and is as accurate as ktime_get() is. * * Return: -1 if NOHZ is not enabled, else total iowait time of @cpu */ u64 get_cpu_iowait_time_us(int cpu, u64 *last_update_time) { struct tick_sched *ts = &per_cpu(tick_cpu_sched, cpu); return get_cpu_sleep_time_us(ts, &ts->iowait_sleeptime, nr_iowait_cpu(cpu), last_update_time); } EXPORT_SYMBOL_GPL(get_cpu_iowait_time_us); static void tick_nohz_restart(struct tick_sched *ts, ktime_t now) { hrtimer_cancel(&ts->sched_timer); hrtimer_set_expires(&ts->sched_timer, ts->last_tick); /* Forward the time to expire in the future */ hrtimer_forward(&ts->sched_timer, now, TICK_NSEC); if (tick_sched_flag_test(ts, TS_FLAG_HIGHRES)) { hrtimer_start_expires(&ts->sched_timer, HRTIMER_MODE_ABS_PINNED_HARD); } else { tick_program_event(hrtimer_get_expires(&ts->sched_timer), 1); } /* * Reset to make sure the next tick stop doesn't get fooled by past * cached clock deadline. */ ts->next_tick = 0; } static inline bool local_timer_softirq_pending(void) { return local_timers_pending() & BIT(TIMER_SOFTIRQ); } /* * Read jiffies and the time when jiffies were updated last */ u64 get_jiffies_update(unsigned long *basej) { unsigned long basejiff; unsigned int seq; u64 basemono; do { seq = read_seqcount_begin(&jiffies_seq); basemono = last_jiffies_update; basejiff = jiffies; } while (read_seqcount_retry(&jiffies_seq, seq)); *basej = basejiff; return basemono; } /** * tick_nohz_next_event() - return the clock monotonic based next event * @ts: pointer to tick_sched struct * @cpu: CPU number * * Return: * *%0 - When the next event is a maximum of TICK_NSEC in the future * and the tick is not stopped yet * *%next_event - Next event based on clock monotonic */ static ktime_t tick_nohz_next_event(struct tick_sched *ts, int cpu) { u64 basemono, next_tick, delta, expires; unsigned long basejiff; int tick_cpu; basemono = get_jiffies_update(&basejiff); ts->last_jiffies = basejiff; ts->timer_expires_base = basemono; /* * Keep the periodic tick, when RCU, architecture or irq_work * requests it. * Aside of that, check whether the local timer softirq is * pending. If so, its a bad idea to call get_next_timer_interrupt(), * because there is an already expired timer, so it will request * immediate expiry, which rearms the hardware timer with a * minimal delta, which brings us back to this place * immediately. Lather, rinse and repeat... */ if (rcu_needs_cpu() || arch_needs_cpu() || irq_work_needs_cpu() || local_timer_softirq_pending()) { next_tick = basemono + TICK_NSEC; } else { /* * Get the next pending timer. If high resolution * timers are enabled this only takes the timer wheel * timers into account. If high resolution timers are * disabled this also looks at the next expiring * hrtimer. */ next_tick = get_next_timer_interrupt(basejiff, basemono); ts->next_timer = next_tick; } /* Make sure next_tick is never before basemono! */ if (WARN_ON_ONCE(basemono > next_tick)) next_tick = basemono; /* * If the tick is due in the next period, keep it ticking or * force prod the timer. */ delta = next_tick - basemono; if (delta <= (u64)TICK_NSEC) { /* * We've not stopped the tick yet, and there's a timer in the * next period, so no point in stopping it either, bail. */ if (!tick_sched_flag_test(ts, TS_FLAG_STOPPED)) { ts->timer_expires = 0; goto out; } } /* * If this CPU is the one which had the do_timer() duty last, we limit * the sleep time to the timekeeping 'max_deferment' value. * Otherwise we can sleep as long as we want. */ delta = timekeeping_max_deferment(); tick_cpu = READ_ONCE(tick_do_timer_cpu); if (tick_cpu != cpu && (tick_cpu != TICK_DO_TIMER_NONE || !tick_sched_flag_test(ts, TS_FLAG_DO_TIMER_LAST))) delta = KTIME_MAX; /* Calculate the next expiry time */ if (delta < (KTIME_MAX - basemono)) expires = basemono + delta; else expires = KTIME_MAX; ts->timer_expires = min_t(u64, expires, next_tick); out: return ts->timer_expires; } static void tick_nohz_stop_tick(struct tick_sched *ts, int cpu) { struct clock_event_device *dev = __this_cpu_read(tick_cpu_device.evtdev); unsigned long basejiff = ts->last_jiffies; u64 basemono = ts->timer_expires_base; bool timer_idle = tick_sched_flag_test(ts, TS_FLAG_STOPPED); int tick_cpu; u64 expires; /* Make sure we won't be trying to stop it twice in a row. */ ts->timer_expires_base = 0; /* * Now the tick should be stopped definitely - so the timer base needs * to be marked idle as well to not miss a newly queued timer. */ expires = timer_base_try_to_set_idle(basejiff, basemono, &timer_idle); if (expires > ts->timer_expires) { /* * This path could only happen when the first timer was removed * between calculating the possible sleep length and now (when * high resolution mode is not active, timer could also be a * hrtimer). * * We have to stick to the original calculated expiry value to * not stop the tick for too long with a shallow C-state (which * was programmed by cpuidle because of an early next expiration * value). */ expires = ts->timer_expires; } /* If the timer base is not idle, retain the not yet stopped tick. */ if (!timer_idle) return; /* * If this CPU is the one which updates jiffies, then give up * the assignment and let it be taken by the CPU which runs * the tick timer next, which might be this CPU as well. If we * don't drop this here, the jiffies might be stale and * do_timer() never gets invoked. Keep track of the fact that it * was the one which had the do_timer() duty last. */ tick_cpu = READ_ONCE(tick_do_timer_cpu); if (tick_cpu == cpu) { WRITE_ONCE(tick_do_timer_cpu, TICK_DO_TIMER_NONE); tick_sched_flag_set(ts, TS_FLAG_DO_TIMER_LAST); } else if (tick_cpu != TICK_DO_TIMER_NONE) { tick_sched_flag_clear(ts, TS_FLAG_DO_TIMER_LAST); } /* Skip reprogram of event if it's not changed */ if (tick_sched_flag_test(ts, TS_FLAG_STOPPED) && (expires == ts->next_tick)) { /* Sanity check: make sure clockevent is actually programmed */ if (expires == KTIME_MAX || ts->next_tick == hrtimer_get_expires(&ts->sched_timer)) return; WARN_ONCE(1, "basemono: %llu ts->next_tick: %llu dev->next_event: %llu " "timer->active: %d timer->expires: %llu\n", basemono, ts->next_tick, dev->next_event, hrtimer_active(&ts->sched_timer), hrtimer_get_expires(&ts->sched_timer)); } /* * tick_nohz_stop_tick() can be called several times before * tick_nohz_restart_sched_tick() is called. This happens when * interrupts arrive which do not cause a reschedule. In the first * call we save the current tick time, so we can restart the * scheduler tick in tick_nohz_restart_sched_tick(). */ if (!tick_sched_flag_test(ts, TS_FLAG_STOPPED)) { calc_load_nohz_start(); quiet_vmstat(); ts->last_tick = hrtimer_get_expires(&ts->sched_timer); tick_sched_flag_set(ts, TS_FLAG_STOPPED); trace_tick_stop(1, TICK_DEP_MASK_NONE); } ts->next_tick = expires; /* * If the expiration time == KTIME_MAX, then we simply stop * the tick timer. */ if (unlikely(expires == KTIME_MAX)) { if (tick_sched_flag_test(ts, TS_FLAG_HIGHRES)) hrtimer_cancel(&ts->sched_timer); else tick_program_event(KTIME_MAX, 1); return; } if (tick_sched_flag_test(ts, TS_FLAG_HIGHRES)) { hrtimer_start(&ts->sched_timer, expires, HRTIMER_MODE_ABS_PINNED_HARD); } else { hrtimer_set_expires(&ts->sched_timer, expires); tick_program_event(expires, 1); } } static void tick_nohz_retain_tick(struct tick_sched *ts) { ts->timer_expires_base = 0; } #ifdef CONFIG_NO_HZ_FULL static void tick_nohz_full_stop_tick(struct tick_sched *ts, int cpu) { if (tick_nohz_next_event(ts, cpu)) tick_nohz_stop_tick(ts, cpu); else tick_nohz_retain_tick(ts); } #endif /* CONFIG_NO_HZ_FULL */ static void tick_nohz_restart_sched_tick(struct tick_sched *ts, ktime_t now) { /* Update jiffies first */ tick_do_update_jiffies64(now); /* * Clear the timer idle flag, so we avoid IPIs on remote queueing and * the clock forward checks in the enqueue path: */ timer_clear_idle(); calc_load_nohz_stop(); touch_softlockup_watchdog_sched(); /* Cancel the scheduled timer and restore the tick: */ tick_sched_flag_clear(ts, TS_FLAG_STOPPED); tick_nohz_restart(ts, now); } static void __tick_nohz_full_update_tick(struct tick_sched *ts, ktime_t now) { #ifdef CONFIG_NO_HZ_FULL int cpu = smp_processor_id(); if (can_stop_full_tick(cpu, ts)) tick_nohz_full_stop_tick(ts, cpu); else if (tick_sched_flag_test(ts, TS_FLAG_STOPPED)) tick_nohz_restart_sched_tick(ts, now); #endif } static void tick_nohz_full_update_tick(struct tick_sched *ts) { if (!tick_nohz_full_cpu(smp_processor_id())) return; if (!tick_sched_flag_test(ts, TS_FLAG_NOHZ)) return; __tick_nohz_full_update_tick(ts, ktime_get()); } /* * A pending softirq outside an IRQ (or softirq disabled section) context * should be waiting for ksoftirqd to handle it. Therefore we shouldn't * reach this code due to the need_resched() early check in can_stop_idle_tick(). * * However if we are between CPUHP_AP_SMPBOOT_THREADS and CPU_TEARDOWN_CPU on the * cpu_down() process, softirqs can still be raised while ksoftirqd is parked, * triggering the code below, since wakep_softirqd() is ignored. * */ static bool report_idle_softirq(void) { static int ratelimit; unsigned int pending = local_softirq_pending(); if (likely(!pending)) return false; /* Some softirqs claim to be safe against hotplug and ksoftirqd parking */ if (!cpu_active(smp_processor_id())) { pending &= ~SOFTIRQ_HOTPLUG_SAFE_MASK; if (!pending) return false; } if (ratelimit >= 10) return false; /* On RT, softirq handling may be waiting on some lock */ if (local_bh_blocked()) return false; pr_warn("NOHZ tick-stop error: local softirq work is pending, handler #%02x!!!\n", pending); ratelimit++; return true; } static bool can_stop_idle_tick(int cpu, struct tick_sched *ts) { WARN_ON_ONCE(cpu_is_offline(cpu)); if (unlikely(!tick_sched_flag_test(ts, TS_FLAG_NOHZ))) return false; if (need_resched()) return false; if (unlikely(report_idle_softirq())) return false; if (tick_nohz_full_enabled()) { int tick_cpu = READ_ONCE(tick_do_timer_cpu); /* * Keep the tick alive to guarantee timekeeping progression * if there are full dynticks CPUs around */ if (tick_cpu == cpu) return false; /* Should not happen for nohz-full */ if (WARN_ON_ONCE(tick_cpu == TICK_DO_TIMER_NONE)) return false; } return true; } /** * tick_nohz_idle_stop_tick - stop the idle tick from the idle task * * When the next event is more than a tick into the future, stop the idle tick */ void tick_nohz_idle_stop_tick(void) { struct tick_sched *ts = this_cpu_ptr(&tick_cpu_sched); int cpu = smp_processor_id(); ktime_t expires; /* * If tick_nohz_get_sleep_length() ran tick_nohz_next_event(), the * tick timer expiration time is known already. */ if (ts->timer_expires_base) expires = ts->timer_expires; else if (can_stop_idle_tick(cpu, ts)) expires = tick_nohz_next_event(ts, cpu); else return; ts->idle_calls++; if (expires > 0LL) { int was_stopped = tick_sched_flag_test(ts, TS_FLAG_STOPPED); tick_nohz_stop_tick(ts, cpu); ts->idle_sleeps++; ts->idle_expires = expires; if (!was_stopped && tick_sched_flag_test(ts, TS_FLAG_STOPPED)) { ts->idle_jiffies = ts->last_jiffies; nohz_balance_enter_idle(cpu); } } else { tick_nohz_retain_tick(ts); } } void tick_nohz_idle_retain_tick(void) { tick_nohz_retain_tick(this_cpu_ptr(&tick_cpu_sched)); } /** * tick_nohz_idle_enter - prepare for entering idle on the current CPU * * Called when we start the idle loop. */ void tick_nohz_idle_enter(void) { struct tick_sched *ts; lockdep_assert_irqs_enabled(); local_irq_disable(); ts = this_cpu_ptr(&tick_cpu_sched); WARN_ON_ONCE(ts->timer_expires_base); tick_sched_flag_set(ts, TS_FLAG_INIDLE); tick_nohz_start_idle(ts); local_irq_enable(); } /** * tick_nohz_irq_exit - Notify the tick about IRQ exit * * A timer may have been added/modified/deleted either by the current IRQ, * or by another place using this IRQ as a notification. This IRQ may have * also updated the RCU callback list. These events may require a * re-evaluation of the next tick. Depending on the context: * * 1) If the CPU is idle and no resched is pending, just proceed with idle * time accounting. The next tick will be re-evaluated on the next idle * loop iteration. * * 2) If the CPU is nohz_full: * * 2.1) If there is any tick dependency, restart the tick if stopped. * * 2.2) If there is no tick dependency, (re-)evaluate the next tick and * stop/update it accordingly. */ void tick_nohz_irq_exit(void) { struct tick_sched *ts = this_cpu_ptr(&tick_cpu_sched); if (tick_sched_flag_test(ts, TS_FLAG_INIDLE)) tick_nohz_start_idle(ts); else tick_nohz_full_update_tick(ts); } /** * tick_nohz_idle_got_tick - Check whether or not the tick handler has run * * Return: %true if the tick handler has run, otherwise %false */ bool tick_nohz_idle_got_tick(void) { struct tick_sched *ts = this_cpu_ptr(&tick_cpu_sched); if (ts->got_idle_tick) { ts->got_idle_tick = 0; return true; } return false; } /** * tick_nohz_get_next_hrtimer - return the next expiration time for the hrtimer * or the tick, whichever expires first. Note that, if the tick has been * stopped, it returns the next hrtimer. * * Called from power state control code with interrupts disabled * * Return: the next expiration time */ ktime_t tick_nohz_get_next_hrtimer(void) { return __this_cpu_read(tick_cpu_device.evtdev)->next_event; } /** * tick_nohz_get_sleep_length - return the expected length of the current sleep * @delta_next: duration until the next event if the tick cannot be stopped * * Called from power state control code with interrupts disabled. * * The return value of this function and/or the value returned by it through the * @delta_next pointer can be negative which must be taken into account by its * callers. * * Return: the expected length of the current sleep */ ktime_t tick_nohz_get_sleep_length(ktime_t *delta_next) { struct clock_event_device *dev = __this_cpu_read(tick_cpu_device.evtdev); struct tick_sched *ts = this_cpu_ptr(&tick_cpu_sched); int cpu = smp_processor_id(); /* * The idle entry time is expected to be a sufficient approximation of * the current time at this point. */ ktime_t now = ts->idle_entrytime; ktime_t next_event; WARN_ON_ONCE(!tick_sched_flag_test(ts, TS_FLAG_INIDLE)); *delta_next = ktime_sub(dev->next_event, now); if (!can_stop_idle_tick(cpu, ts)) return *delta_next; next_event = tick_nohz_next_event(ts, cpu); if (!next_event) return *delta_next; /* * If the next highres timer to expire is earlier than 'next_event', the * idle governor needs to know that. */ next_event = min_t(u64, next_event, hrtimer_next_event_without(&ts->sched_timer)); return ktime_sub(next_event, now); } /** * tick_nohz_get_idle_calls_cpu - return the current idle calls counter value * for a particular CPU. * @cpu: target CPU number * * Called from the schedutil frequency scaling governor in scheduler context. * * Return: the current idle calls counter value for @cpu */ unsigned long tick_nohz_get_idle_calls_cpu(int cpu) { struct tick_sched *ts = tick_get_tick_sched(cpu); return ts->idle_calls; } static void tick_nohz_account_idle_time(struct tick_sched *ts, ktime_t now) { unsigned long ticks; ts->idle_exittime = now; if (vtime_accounting_enabled_this_cpu()) return; /* * We stopped the tick in idle. update_process_times() would miss the * time we slept, as it does only a 1 tick accounting. * Enforce that this is accounted to idle ! */ ticks = jiffies - ts->idle_jiffies; /* * We might be one off. Do not randomly account a huge number of ticks! */ if (ticks && ticks < LONG_MAX) account_idle_ticks(ticks); } void tick_nohz_idle_restart_tick(void) { struct tick_sched *ts = this_cpu_ptr(&tick_cpu_sched); if (tick_sched_flag_test(ts, TS_FLAG_STOPPED)) { ktime_t now = ktime_get(); tick_nohz_restart_sched_tick(ts, now); tick_nohz_account_idle_time(ts, now); } } static void tick_nohz_idle_update_tick(struct tick_sched *ts, ktime_t now) { if (tick_nohz_full_cpu(smp_processor_id())) __tick_nohz_full_update_tick(ts, now); else tick_nohz_restart_sched_tick(ts, now); tick_nohz_account_idle_time(ts, now); } /** * tick_nohz_idle_exit - Update the tick upon idle task exit * * When the idle task exits, update the tick depending on the * following situations: * * 1) If the CPU is not in nohz_full mode (most cases), then * restart the tick. * * 2) If the CPU is in nohz_full mode (corner case): * 2.1) If the tick can be kept stopped (no tick dependencies) * then re-evaluate the next tick and try to keep it stopped * as long as possible. * 2.2) If the tick has dependencies, restart the tick. * */ void tick_nohz_idle_exit(void) { struct tick_sched *ts = this_cpu_ptr(&tick_cpu_sched); bool idle_active, tick_stopped; ktime_t now; local_irq_disable(); WARN_ON_ONCE(!tick_sched_flag_test(ts, TS_FLAG_INIDLE)); WARN_ON_ONCE(ts->timer_expires_base); tick_sched_flag_clear(ts, TS_FLAG_INIDLE); idle_active = tick_sched_flag_test(ts, TS_FLAG_IDLE_ACTIVE); tick_stopped = tick_sched_flag_test(ts, TS_FLAG_STOPPED); if (idle_active || tick_stopped) now = ktime_get(); if (idle_active) tick_nohz_stop_idle(ts, now); if (tick_stopped) tick_nohz_idle_update_tick(ts, now); local_irq_enable(); } /* * In low-resolution mode, the tick handler must be implemented directly * at the clockevent level. hrtimer can't be used instead, because its * infrastructure actually relies on the tick itself as a backend in * low-resolution mode (see hrtimer_run_queues()). */ static void tick_nohz_lowres_handler(struct clock_event_device *dev) { struct tick_sched *ts = this_cpu_ptr(&tick_cpu_sched); dev->next_event = KTIME_MAX; if (likely(tick_nohz_handler(&ts->sched_timer) == HRTIMER_RESTART)) tick_program_event(hrtimer_get_expires(&ts->sched_timer), 1); } static inline void tick_nohz_activate(struct tick_sched *ts) { if (!tick_nohz_enabled) return; tick_sched_flag_set(ts, TS_FLAG_NOHZ); /* One update is enough */ if (!test_and_set_bit(0, &tick_nohz_active)) timers_update_nohz(); } /** * tick_nohz_switch_to_nohz - switch to NOHZ mode */ static void tick_nohz_switch_to_nohz(void) { if (!tick_nohz_enabled) return; if (tick_switch_to_oneshot(tick_nohz_lowres_handler)) return; /* * Recycle the hrtimer in 'ts', so we can share the * highres code. */ tick_setup_sched_timer(false); } static inline void tick_nohz_irq_enter(void) { struct tick_sched *ts = this_cpu_ptr(&tick_cpu_sched); ktime_t now; if (!tick_sched_flag_test(ts, TS_FLAG_STOPPED | TS_FLAG_IDLE_ACTIVE)) return; now = ktime_get(); if (tick_sched_flag_test(ts, TS_FLAG_IDLE_ACTIVE)) tick_nohz_stop_idle(ts, now); /* * If all CPUs are idle we may need to update a stale jiffies value. * Note nohz_full is a special case: a timekeeper is guaranteed to stay * alive but it might be busy looping with interrupts disabled in some * rare case (typically stop machine). So we must make sure we have a * last resort. */ if (tick_sched_flag_test(ts, TS_FLAG_STOPPED)) tick_nohz_update_jiffies(now); } #else static inline void tick_nohz_switch_to_nohz(void) { } static inline void tick_nohz_irq_enter(void) { } static inline void tick_nohz_activate(struct tick_sched *ts) { } #endif /* CONFIG_NO_HZ_COMMON */ /* * Called from irq_enter() to notify about the possible interruption of idle() */ void tick_irq_enter(void) { tick_check_oneshot_broadcast_this_cpu(); tick_nohz_irq_enter(); } static int sched_skew_tick; static int __init skew_tick(char *str) { get_option(&str, &sched_skew_tick); return 0; } early_param("skew_tick", skew_tick); /** * tick_setup_sched_timer - setup the tick emulation timer * @hrtimer: whether to use the hrtimer or not */ void tick_setup_sched_timer(bool hrtimer) { struct tick_sched *ts = this_cpu_ptr(&tick_cpu_sched); /* Emulate tick processing via per-CPU hrtimers: */ hrtimer_setup(&ts->sched_timer, tick_nohz_handler, CLOCK_MONOTONIC, HRTIMER_MODE_ABS_HARD); if (IS_ENABLED(CONFIG_HIGH_RES_TIMERS) && hrtimer) tick_sched_flag_set(ts, TS_FLAG_HIGHRES); /* Get the next period (per-CPU) */ hrtimer_set_expires(&ts->sched_timer, tick_init_jiffy_update()); /* Offset the tick to avert 'jiffies_lock' contention. */ if (sched_skew_tick) { u64 offset = TICK_NSEC >> 1; do_div(offset, num_possible_cpus()); offset *= smp_processor_id(); hrtimer_add_expires_ns(&ts->sched_timer, offset); } hrtimer_forward_now(&ts->sched_timer, TICK_NSEC); if (IS_ENABLED(CONFIG_HIGH_RES_TIMERS) && hrtimer) hrtimer_start_expires(&ts->sched_timer, HRTIMER_MODE_ABS_PINNED_HARD); else tick_program_event(hrtimer_get_expires(&ts->sched_timer), 1); tick_nohz_activate(ts); } /* * Shut down the tick and make sure the CPU won't try to retake the timekeeping * duty before disabling IRQs in idle for the last time. */ void tick_sched_timer_dying(int cpu) { struct tick_sched *ts = &per_cpu(tick_cpu_sched, cpu); ktime_t idle_sleeptime, iowait_sleeptime; unsigned long idle_calls, idle_sleeps; /* This must happen before hrtimers are migrated! */ if (tick_sched_flag_test(ts, TS_FLAG_HIGHRES)) hrtimer_cancel(&ts->sched_timer); idle_sleeptime = ts->idle_sleeptime; iowait_sleeptime = ts->iowait_sleeptime; idle_calls = ts->idle_calls; idle_sleeps = ts->idle_sleeps; memset(ts, 0, sizeof(*ts)); ts->idle_sleeptime = idle_sleeptime; ts->iowait_sleeptime = iowait_sleeptime; ts->idle_calls = idle_calls; ts->idle_sleeps = idle_sleeps; } /* * Async notification about clocksource changes */ void tick_clock_notify(void) { int cpu; for_each_possible_cpu(cpu) set_bit(0, &per_cpu(tick_cpu_sched, cpu).check_clocks); } /* * Async notification about clock event changes */ void tick_oneshot_notify(void) { struct tick_sched *ts = this_cpu_ptr(&tick_cpu_sched); set_bit(0, &ts->check_clocks); } /* * Check if a change happened, which makes oneshot possible. * * Called cyclically from the hrtimer softirq (driven by the timer * softirq). 'allow_nohz' signals that we can switch into low-res NOHZ * mode, because high resolution timers are disabled (either compile * or runtime). Called with interrupts disabled. */ int tick_check_oneshot_change(int allow_nohz) { struct tick_sched *ts = this_cpu_ptr(&tick_cpu_sched); if (!test_and_clear_bit(0, &ts->check_clocks)) return 0; if (tick_sched_flag_test(ts, TS_FLAG_NOHZ)) return 0; if (!timekeeping_valid_for_hres() || !tick_is_oneshot_available()) return 0; if (!allow_nohz) return 1; tick_nohz_switch_to_nohz(); return 0; }
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 /* SPDX-License-Identifier: GPL-2.0-or-later */ /* * INET An implementation of the TCP/IP protocol suite for the LINUX * operating system. INET is implemented using the BSD Socket * interface as the means of communication with the user level. * * Definitions for the IP protocol. * * Version: @(#)ip.h 1.0.2 04/28/93 * * Authors: Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG> */ #ifndef _LINUX_IP_H #define _LINUX_IP_H #include <linux/skbuff.h> #include <uapi/linux/ip.h> static inline struct iphdr *ip_hdr(const struct sk_buff *skb) { return (struct iphdr *)skb_network_header(skb); } static inline struct iphdr *inner_ip_hdr(const struct sk_buff *skb) { return (struct iphdr *)skb_inner_network_header(skb); } static inline struct iphdr *ipip_hdr(const struct sk_buff *skb) { return (struct iphdr *)skb_transport_header(skb); } static inline unsigned int ip_transport_len(const struct sk_buff *skb) { return ntohs(ip_hdr(skb)->tot_len) - skb_network_header_len(skb); } static inline unsigned int iph_totlen(const struct sk_buff *skb, const struct iphdr *iph) { u32 len = ntohs(iph->tot_len); return (len || !skb_is_gso(skb) || !skb_is_gso_tcp(skb)) ? len : skb->len - skb_network_offset(skb); } static inline unsigned int skb_ip_totlen(const struct sk_buff *skb) { return iph_totlen(skb, ip_hdr(skb)); } /* IPv4 datagram length is stored into 16bit field (tot_len) */ #define IP_MAX_MTU 0xFFFFU static inline void iph_set_totlen(struct iphdr *iph, unsigned int len) { iph->tot_len = len <= IP_MAX_MTU ? htons(len) : 0; } #endif /* _LINUX_IP_H */
9 10 1 8 9 9 7 7 7 9 9 9 9 9 8 9 9 8 9 9 9 9 9 9 9 9 9 9 9 1 1 1 1 1 1 1 1 1 1 97 88 97 14 8 8 55 1 53 45 54 2 56 54 3 72 3 84 10 8 6 3 2 16 6 10 55 55 8 49 45 55 63 63 63 10 50 47 63 62 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 // SPDX-License-Identifier: GPL-2.0-only /* * VGIC MMIO handling functions */ #include <linux/bitops.h> #include <linux/bsearch.h> #include <linux/interrupt.h> #include <linux/irq.h> #include <linux/kvm.h> #include <linux/kvm_host.h> #include <kvm/iodev.h> #include <kvm/arm_arch_timer.h> #include <kvm/arm_vgic.h> #include "vgic.h" #include "vgic-mmio.h" unsigned long vgic_mmio_read_raz(struct kvm_vcpu *vcpu, gpa_t addr, unsigned int len) { return 0; } unsigned long vgic_mmio_read_rao(struct kvm_vcpu *vcpu, gpa_t addr, unsigned int len) { return -1UL; } void vgic_mmio_write_wi(struct kvm_vcpu *vcpu, gpa_t addr, unsigned int len, unsigned long val) { /* Ignore */ } int vgic_mmio_uaccess_write_wi(struct kvm_vcpu *vcpu, gpa_t addr, unsigned int len, unsigned long val) { /* Ignore */ return 0; } unsigned long vgic_mmio_read_group(struct kvm_vcpu *vcpu, gpa_t addr, unsigned int len) { u32 intid = VGIC_ADDR_TO_INTID(addr, 1); u32 value = 0; int i; /* Loop over all IRQs affected by this read */ for (i = 0; i < len * 8; i++) { struct vgic_irq *irq = vgic_get_vcpu_irq(vcpu, intid + i); if (irq->group) value |= BIT(i); vgic_put_irq(vcpu->kvm, irq); } return value; } static void vgic_update_vsgi(struct vgic_irq *irq) { WARN_ON(its_prop_update_vsgi(irq->host_irq, irq->priority, irq->group)); } void vgic_mmio_write_group(struct kvm_vcpu *vcpu, gpa_t addr, unsigned int len, unsigned long val) { u32 intid = VGIC_ADDR_TO_INTID(addr, 1); int i; unsigned long flags; for (i = 0; i < len * 8; i++) { struct vgic_irq *irq = vgic_get_vcpu_irq(vcpu, intid + i); raw_spin_lock_irqsave(&irq->irq_lock, flags); irq->group = !!(val & BIT(i)); if (irq->hw && vgic_irq_is_sgi(irq->intid)) { vgic_update_vsgi(irq); raw_spin_unlock_irqrestore(&irq->irq_lock, flags); } else { vgic_queue_irq_unlock(vcpu->kvm, irq, flags); } vgic_put_irq(vcpu->kvm, irq); } } /* * Read accesses to both GICD_ICENABLER and GICD_ISENABLER return the value * of the enabled bit, so there is only one function for both here. */ unsigned long vgic_mmio_read_enable(struct kvm_vcpu *vcpu, gpa_t addr, unsigned int len) { u32 intid = VGIC_ADDR_TO_INTID(addr, 1); u32 value = 0; int i; /* Loop over all IRQs affected by this read */ for (i = 0; i < len * 8; i++) { struct vgic_irq *irq = vgic_get_vcpu_irq(vcpu, intid + i); if (irq->enabled) value |= (1U << i); vgic_put_irq(vcpu->kvm, irq); } return value; } void vgic_mmio_write_senable(struct kvm_vcpu *vcpu, gpa_t addr, unsigned int len, unsigned long val) { u32 intid = VGIC_ADDR_TO_INTID(addr, 1); int i; unsigned long flags; for_each_set_bit(i, &val, len * 8) { struct vgic_irq *irq = vgic_get_vcpu_irq(vcpu, intid + i); raw_spin_lock_irqsave(&irq->irq_lock, flags); if (irq->hw && vgic_irq_is_sgi(irq->intid)) { if (!irq->enabled) { struct irq_data *data; irq->enabled = true; data = &irq_to_desc(irq->host_irq)->irq_data; while (irqd_irq_disabled(data)) enable_irq(irq->host_irq); } raw_spin_unlock_irqrestore(&irq->irq_lock, flags); vgic_put_irq(vcpu->kvm, irq); continue; } else if (vgic_irq_is_mapped_level(irq)) { bool was_high = irq->line_level; /* * We need to update the state of the interrupt because * the guest might have changed the state of the device * while the interrupt was disabled at the VGIC level. */ irq->line_level = vgic_get_phys_line_level(irq); /* * Deactivate the physical interrupt so the GIC will let * us know when it is asserted again. */ if (!irq->active && was_high && !irq->line_level) vgic_irq_set_phys_active(irq, false); } irq->enabled = true; vgic_queue_irq_unlock(vcpu->kvm, irq, flags); vgic_put_irq(vcpu->kvm, irq); } } void vgic_mmio_write_cenable(struct kvm_vcpu *vcpu, gpa_t addr, unsigned int len, unsigned long val) { u32 intid = VGIC_ADDR_TO_INTID(addr, 1); int i; unsigned long flags; for_each_set_bit(i, &val, len * 8) { struct vgic_irq *irq = vgic_get_vcpu_irq(vcpu, intid + i); raw_spin_lock_irqsave(&irq->irq_lock, flags); if (irq->hw && vgic_irq_is_sgi(irq->intid) && irq->enabled) disable_irq_nosync(irq->host_irq); irq->enabled = false; raw_spin_unlock_irqrestore(&irq->irq_lock, flags); vgic_put_irq(vcpu->kvm, irq); } } int vgic_uaccess_write_senable(struct kvm_vcpu *vcpu, gpa_t addr, unsigned int len, unsigned long val) { u32 intid = VGIC_ADDR_TO_INTID(addr, 1); int i; unsigned long flags; for_each_set_bit(i, &val, len * 8) { struct vgic_irq *irq = vgic_get_vcpu_irq(vcpu, intid + i); raw_spin_lock_irqsave(&irq->irq_lock, flags); irq->enabled = true; vgic_queue_irq_unlock(vcpu->kvm, irq, flags); vgic_put_irq(vcpu->kvm, irq); } return 0; } int vgic_uaccess_write_cenable(struct kvm_vcpu *vcpu, gpa_t addr, unsigned int len, unsigned long val) { u32 intid = VGIC_ADDR_TO_INTID(addr, 1); int i; unsigned long flags; for_each_set_bit(i, &val, len * 8) { struct vgic_irq *irq = vgic_get_vcpu_irq(vcpu, intid + i); raw_spin_lock_irqsave(&irq->irq_lock, flags); irq->enabled = false; raw_spin_unlock_irqrestore(&irq->irq_lock, flags); vgic_put_irq(vcpu->kvm, irq); } return 0; } static unsigned long __read_pending(struct kvm_vcpu *vcpu, gpa_t addr, unsigned int len, bool is_user) { u32 intid = VGIC_ADDR_TO_INTID(addr, 1); u32 value = 0; int i; /* Loop over all IRQs affected by this read */ for (i = 0; i < len * 8; i++) { struct vgic_irq *irq = vgic_get_vcpu_irq(vcpu, intid + i); unsigned long flags; bool val; /* * When used from userspace with a GICv3 model: * * Pending state of interrupt is latched in pending_latch * variable. Userspace will save and restore pending state * and line_level separately. * Refer to Documentation/virt/kvm/devices/arm-vgic-v3.rst * for handling of ISPENDR and ICPENDR. */ raw_spin_lock_irqsave(&irq->irq_lock, flags); if (irq->hw && vgic_irq_is_sgi(irq->intid)) { int err; val = false; err = irq_get_irqchip_state(irq->host_irq, IRQCHIP_STATE_PENDING, &val); WARN_RATELIMIT(err, "IRQ %d", irq->host_irq); } else if (!is_user && vgic_irq_is_mapped_level(irq)) { val = vgic_get_phys_line_level(irq); } else { switch (vcpu->kvm->arch.vgic.vgic_model) { case KVM_DEV_TYPE_ARM_VGIC_V3: if (is_user) { val = irq->pending_latch; break; } fallthrough; default: val = irq_is_pending(irq); break; } } value |= ((u32)val << i); raw_spin_unlock_irqrestore(&irq->irq_lock, flags); vgic_put_irq(vcpu->kvm, irq); } return value; } unsigned long vgic_mmio_read_pending(struct kvm_vcpu *vcpu, gpa_t addr, unsigned int len) { return __read_pending(vcpu, addr, len, false); } unsigned long vgic_uaccess_read_pending(struct kvm_vcpu *vcpu, gpa_t addr, unsigned int len) { return __read_pending(vcpu, addr, len, true); } static bool is_vgic_v2_sgi(struct kvm_vcpu *vcpu, struct vgic_irq *irq) { return (vgic_irq_is_sgi(irq->intid) && vcpu->kvm->arch.vgic.vgic_model == KVM_DEV_TYPE_ARM_VGIC_V2); } static void __set_pending(struct kvm_vcpu *vcpu, gpa_t addr, unsigned int len, unsigned long val, bool is_user) { u32 intid = VGIC_ADDR_TO_INTID(addr, 1); int i; unsigned long flags; for_each_set_bit(i, &val, len * 8) { struct vgic_irq *irq = vgic_get_vcpu_irq(vcpu, intid + i); /* GICD_ISPENDR0 SGI bits are WI when written from the guest. */ if (is_vgic_v2_sgi(vcpu, irq) && !is_user) { vgic_put_irq(vcpu->kvm, irq); continue; } raw_spin_lock_irqsave(&irq->irq_lock, flags); /* * GICv2 SGIs are terribly broken. We can't restore * the source of the interrupt, so just pick the vcpu * itself as the source... */ if (is_vgic_v2_sgi(vcpu, irq)) irq->source |= BIT(vcpu->vcpu_id); if (irq->hw && vgic_irq_is_sgi(irq->intid)) { /* HW SGI? Ask the GIC to inject it */ int err; err = irq_set_irqchip_state(irq->host_irq, IRQCHIP_STATE_PENDING, true); WARN_RATELIMIT(err, "IRQ %d", irq->host_irq); raw_spin_unlock_irqrestore(&irq->irq_lock, flags); vgic_put_irq(vcpu->kvm, irq); continue; } irq->pending_latch = true; if (irq->hw && !is_user) vgic_irq_set_phys_active(irq, true); vgic_queue_irq_unlock(vcpu->kvm, irq, flags); vgic_put_irq(vcpu->kvm, irq); } } void vgic_mmio_write_spending(struct kvm_vcpu *vcpu, gpa_t addr, unsigned int len, unsigned long val) { __set_pending(vcpu, addr, len, val, false); } int vgic_uaccess_write_spending(struct kvm_vcpu *vcpu, gpa_t addr, unsigned int len, unsigned long val) { __set_pending(vcpu, addr, len, val, true); return 0; } /* Must be called with irq->irq_lock held */ static void vgic_hw_irq_cpending(struct kvm_vcpu *vcpu, struct vgic_irq *irq) { irq->pending_latch = false; /* * We don't want the guest to effectively mask the physical * interrupt by doing a write to SPENDR followed by a write to * CPENDR for HW interrupts, so we clear the active state on * the physical side if the virtual interrupt is not active. * This may lead to taking an additional interrupt on the * host, but that should not be a problem as the worst that * can happen is an additional vgic injection. We also clear * the pending state to maintain proper semantics for edge HW * interrupts. */ vgic_irq_set_phys_pending(irq, false); if (!irq->active) vgic_irq_set_phys_active(irq, false); } static void __clear_pending(struct kvm_vcpu *vcpu, gpa_t addr, unsigned int len, unsigned long val, bool is_user) { u32 intid = VGIC_ADDR_TO_INTID(addr, 1); int i; unsigned long flags; for_each_set_bit(i, &val, len * 8) { struct vgic_irq *irq = vgic_get_vcpu_irq(vcpu, intid + i); /* GICD_ICPENDR0 SGI bits are WI when written from the guest. */ if (is_vgic_v2_sgi(vcpu, irq) && !is_user) { vgic_put_irq(vcpu->kvm, irq); continue; } raw_spin_lock_irqsave(&irq->irq_lock, flags); /* * More fun with GICv2 SGIs! If we're clearing one of them * from userspace, which source vcpu to clear? Let's not * even think of it, and blow the whole set. */ if (is_vgic_v2_sgi(vcpu, irq)) irq->source = 0; if (irq->hw && vgic_irq_is_sgi(irq->intid)) { /* HW SGI? Ask the GIC to clear its pending bit */ int err; err = irq_set_irqchip_state(irq->host_irq, IRQCHIP_STATE_PENDING, false); WARN_RATELIMIT(err, "IRQ %d", irq->host_irq); raw_spin_unlock_irqrestore(&irq->irq_lock, flags); vgic_put_irq(vcpu->kvm, irq); continue; } if (irq->hw && !is_user) vgic_hw_irq_cpending(vcpu, irq); else irq->pending_latch = false; raw_spin_unlock_irqrestore(&irq->irq_lock, flags); vgic_put_irq(vcpu->kvm, irq); } } void vgic_mmio_write_cpending(struct kvm_vcpu *vcpu, gpa_t addr, unsigned int len, unsigned long val) { __clear_pending(vcpu, addr, len, val, false); } int vgic_uaccess_write_cpending(struct kvm_vcpu *vcpu, gpa_t addr, unsigned int len, unsigned long val) { __clear_pending(vcpu, addr, len, val, true); return 0; } /* * If we are fiddling with an IRQ's active state, we have to make sure the IRQ * is not queued on some running VCPU's LRs, because then the change to the * active state can be overwritten when the VCPU's state is synced coming back * from the guest. * * For shared interrupts as well as GICv3 private interrupts accessed from the * non-owning CPU, we have to stop all the VCPUs because interrupts can be * migrated while we don't hold the IRQ locks and we don't want to be chasing * moving targets. * * For GICv2 private interrupts we don't have to do anything because * userspace accesses to the VGIC state already require all VCPUs to be * stopped, and only the VCPU itself can modify its private interrupts * active state, which guarantees that the VCPU is not running. */ static void vgic_access_active_prepare(struct kvm_vcpu *vcpu, u32 intid) { if ((vcpu->kvm->arch.vgic.vgic_model == KVM_DEV_TYPE_ARM_VGIC_V3 && vcpu != kvm_get_running_vcpu()) || intid >= VGIC_NR_PRIVATE_IRQS) kvm_arm_halt_guest(vcpu->kvm); } /* See vgic_access_active_prepare */ static void vgic_access_active_finish(struct kvm_vcpu *vcpu, u32 intid) { if ((vcpu->kvm->arch.vgic.vgic_model == KVM_DEV_TYPE_ARM_VGIC_V3 && vcpu != kvm_get_running_vcpu()) || intid >= VGIC_NR_PRIVATE_IRQS) kvm_arm_resume_guest(vcpu->kvm); } static unsigned long __vgic_mmio_read_active(struct kvm_vcpu *vcpu, gpa_t addr, unsigned int len) { u32 intid = VGIC_ADDR_TO_INTID(addr, 1); u32 value = 0; int i; /* Loop over all IRQs affected by this read */ for (i = 0; i < len * 8; i++) { struct vgic_irq *irq = vgic_get_vcpu_irq(vcpu, intid + i); /* * Even for HW interrupts, don't evaluate the HW state as * all the guest is interested in is the virtual state. */ if (irq->active) value |= (1U << i); vgic_put_irq(vcpu->kvm, irq); } return value; } unsigned long vgic_mmio_read_active(struct kvm_vcpu *vcpu, gpa_t addr, unsigned int len) { u32 intid = VGIC_ADDR_TO_INTID(addr, 1); u32 val; mutex_lock(&vcpu->kvm->arch.config_lock); vgic_access_active_prepare(vcpu, intid); val = __vgic_mmio_read_active(vcpu, addr, len); vgic_access_active_finish(vcpu, intid); mutex_unlock(&vcpu->kvm->arch.config_lock); return val; } unsigned long vgic_uaccess_read_active(struct kvm_vcpu *vcpu, gpa_t addr, unsigned int len) { return __vgic_mmio_read_active(vcpu, addr, len); } /* Must be called with irq->irq_lock held */ static void vgic_hw_irq_change_active(struct kvm_vcpu *vcpu, struct vgic_irq *irq, bool active, bool is_uaccess) { if (is_uaccess) return; irq->active = active; vgic_irq_set_phys_active(irq, active); } static void vgic_mmio_change_active(struct kvm_vcpu *vcpu, struct vgic_irq *irq, bool active) { unsigned long flags; struct kvm_vcpu *requester_vcpu = kvm_get_running_vcpu(); raw_spin_lock_irqsave(&irq->irq_lock, flags); if (irq->hw && !vgic_irq_is_sgi(irq->intid)) { vgic_hw_irq_change_active(vcpu, irq, active, !requester_vcpu); } else if (irq->hw && vgic_irq_is_sgi(irq->intid)) { /* * GICv4.1 VSGI feature doesn't track an active state, * so let's not kid ourselves, there is nothing we can * do here. */ irq->active = false; } else { u32 model = vcpu->kvm->arch.vgic.vgic_model; u8 active_source; irq->active = active; /* * The GICv2 architecture indicates that the source CPUID for * an SGI should be provided during an EOI which implies that * the active state is stored somewhere, but at the same time * this state is not architecturally exposed anywhere and we * have no way of knowing the right source. * * This may lead to a VCPU not being able to receive * additional instances of a particular SGI after migration * for a GICv2 VM on some GIC implementations. Oh well. */ active_source = (requester_vcpu) ? requester_vcpu->vcpu_id : 0; if (model == KVM_DEV_TYPE_ARM_VGIC_V2 && active && vgic_irq_is_sgi(irq->intid)) irq->active_source = active_source; } if (irq->active) vgic_queue_irq_unlock(vcpu->kvm, irq, flags); else raw_spin_unlock_irqrestore(&irq->irq_lock, flags); } static void __vgic_mmio_write_cactive(struct kvm_vcpu *vcpu, gpa_t addr, unsigned int len, unsigned long val) { u32 intid = VGIC_ADDR_TO_INTID(addr, 1); int i; for_each_set_bit(i, &val, len * 8) { struct vgic_irq *irq = vgic_get_vcpu_irq(vcpu, intid + i); vgic_mmio_change_active(vcpu, irq, false); vgic_put_irq(vcpu->kvm, irq); } } void vgic_mmio_write_cactive(struct kvm_vcpu *vcpu, gpa_t addr, unsigned int len, unsigned long val) { u32 intid = VGIC_ADDR_TO_INTID(addr, 1); mutex_lock(&vcpu->kvm->arch.config_lock); vgic_access_active_prepare(vcpu, intid); __vgic_mmio_write_cactive(vcpu, addr, len, val); vgic_access_active_finish(vcpu, intid); mutex_unlock(&vcpu->kvm->arch.config_lock); } int vgic_mmio_uaccess_write_cactive(struct kvm_vcpu *vcpu, gpa_t addr, unsigned int len, unsigned long val) { __vgic_mmio_write_cactive(vcpu, addr, len, val); return 0; } static void __vgic_mmio_write_sactive(struct kvm_vcpu *vcpu, gpa_t addr, unsigned int len, unsigned long val) { u32 intid = VGIC_ADDR_TO_INTID(addr, 1); int i; for_each_set_bit(i, &val, len * 8) { struct vgic_irq *irq = vgic_get_vcpu_irq(vcpu, intid + i); vgic_mmio_change_active(vcpu, irq, true); vgic_put_irq(vcpu->kvm, irq); } } void vgic_mmio_write_sactive(struct kvm_vcpu *vcpu, gpa_t addr, unsigned int len, unsigned long val) { u32 intid = VGIC_ADDR_TO_INTID(addr, 1); mutex_lock(&vcpu->kvm->arch.config_lock); vgic_access_active_prepare(vcpu, intid); __vgic_mmio_write_sactive(vcpu, addr, len, val); vgic_access_active_finish(vcpu, intid); mutex_unlock(&vcpu->kvm->arch.config_lock); } int vgic_mmio_uaccess_write_sactive(struct kvm_vcpu *vcpu, gpa_t addr, unsigned int len, unsigned long val) { __vgic_mmio_write_sactive(vcpu, addr, len, val); return 0; } unsigned long vgic_mmio_read_priority(struct kvm_vcpu *vcpu, gpa_t addr, unsigned int len) { u32 intid = VGIC_ADDR_TO_INTID(addr, 8); int i; u64 val = 0; for (i = 0; i < len; i++) { struct vgic_irq *irq = vgic_get_vcpu_irq(vcpu, intid + i); val |= (u64)irq->priority << (i * 8); vgic_put_irq(vcpu->kvm, irq); } return val; } /* * We currently don't handle changing the priority of an interrupt that * is already pending on a VCPU. If there is a need for this, we would * need to make this VCPU exit and re-evaluate the priorities, potentially * leading to this interrupt getting presented now to the guest (if it has * been masked by the priority mask before). */ void vgic_mmio_write_priority(struct kvm_vcpu *vcpu, gpa_t addr, unsigned int len, unsigned long val) { u32 intid = VGIC_ADDR_TO_INTID(addr, 8); int i; unsigned long flags; for (i = 0; i < len; i++) { struct vgic_irq *irq = vgic_get_vcpu_irq(vcpu, intid + i); raw_spin_lock_irqsave(&irq->irq_lock, flags); /* Narrow the priority range to what we actually support */ irq->priority = (val >> (i * 8)) & GENMASK(7, 8 - VGIC_PRI_BITS); if (irq->hw && vgic_irq_is_sgi(irq->intid)) vgic_update_vsgi(irq); raw_spin_unlock_irqrestore(&irq->irq_lock, flags); vgic_put_irq(vcpu->kvm, irq); } } unsigned long vgic_mmio_read_config(struct kvm_vcpu *vcpu, gpa_t addr, unsigned int len) { u32 intid = VGIC_ADDR_TO_INTID(addr, 2); u32 value = 0; int i; for (i = 0; i < len * 4; i++) { struct vgic_irq *irq = vgic_get_vcpu_irq(vcpu, intid + i); if (irq->config == VGIC_CONFIG_EDGE) value |= (2U << (i * 2)); vgic_put_irq(vcpu->kvm, irq); } return value; } void vgic_mmio_write_config(struct kvm_vcpu *vcpu, gpa_t addr, unsigned int len, unsigned long val) { u32 intid = VGIC_ADDR_TO_INTID(addr, 2); int i; unsigned long flags; for (i = 0; i < len * 4; i++) { struct vgic_irq *irq; /* * The configuration cannot be changed for SGIs in general, * for PPIs this is IMPLEMENTATION DEFINED. The arch timer * code relies on PPIs being level triggered, so we also * make them read-only here. */ if (intid + i < VGIC_NR_PRIVATE_IRQS) continue; irq = vgic_get_irq(vcpu->kvm, intid + i); raw_spin_lock_irqsave(&irq->irq_lock, flags); if (test_bit(i * 2 + 1, &val)) irq->config = VGIC_CONFIG_EDGE; else irq->config = VGIC_CONFIG_LEVEL; raw_spin_unlock_irqrestore(&irq->irq_lock, flags); vgic_put_irq(vcpu->kvm, irq); } } u32 vgic_read_irq_line_level_info(struct kvm_vcpu *vcpu, u32 intid) { int i; u32 val = 0; int nr_irqs = vcpu->kvm->arch.vgic.nr_spis + VGIC_NR_PRIVATE_IRQS; for (i = 0; i < 32; i++) { struct vgic_irq *irq; if ((intid + i) < VGIC_NR_SGIS || (intid + i) >= nr_irqs) continue; irq = vgic_get_vcpu_irq(vcpu, intid + i); if (irq->config == VGIC_CONFIG_LEVEL && irq->line_level) val |= (1U << i); vgic_put_irq(vcpu->kvm, irq); } return val; } void vgic_write_irq_line_level_info(struct kvm_vcpu *vcpu, u32 intid, const u32 val) { int i; int nr_irqs = vcpu->kvm->arch.vgic.nr_spis + VGIC_NR_PRIVATE_IRQS; unsigned long flags; for (i = 0; i < 32; i++) { struct vgic_irq *irq; bool new_level; if ((intid + i) < VGIC_NR_SGIS || (intid + i) >= nr_irqs) continue; irq = vgic_get_vcpu_irq(vcpu, intid + i); /* * Line level is set irrespective of irq type * (level or edge) to avoid dependency that VM should * restore irq config before line level. */ new_level = !!(val & (1U << i)); raw_spin_lock_irqsave(&irq->irq_lock, flags); irq->line_level = new_level; if (new_level) vgic_queue_irq_unlock(vcpu->kvm, irq, flags); else raw_spin_unlock_irqrestore(&irq->irq_lock, flags); vgic_put_irq(vcpu->kvm, irq); } } static int match_region(const void *key, const void *elt) { const unsigned int offset = (unsigned long)key; const struct vgic_register_region *region = elt; if (offset < region->reg_offset) return -1; if (offset >= region->reg_offset + region->len) return 1; return 0; } const struct vgic_register_region * vgic_find_mmio_region(const struct vgic_register_region *regions, int nr_regions, unsigned int offset) { return bsearch((void *)(uintptr_t)offset, regions, nr_regions, sizeof(regions[0]), match_region); } void vgic_set_vmcr(struct kvm_vcpu *vcpu, struct vgic_vmcr *vmcr) { if (kvm_vgic_global_state.type == VGIC_V2) vgic_v2_set_vmcr(vcpu, vmcr); else vgic_v3_set_vmcr(vcpu, vmcr); } void vgic_get_vmcr(struct kvm_vcpu *vcpu, struct vgic_vmcr *vmcr) { if (kvm_vgic_global_state.type == VGIC_V2) vgic_v2_get_vmcr(vcpu, vmcr); else vgic_v3_get_vmcr(vcpu, vmcr); } /* * kvm_mmio_read_buf() returns a value in a format where it can be converted * to a byte array and be directly observed as the guest wanted it to appear * in memory if it had done the store itself, which is LE for the GIC, as the * guest knows the GIC is always LE. * * We convert this value to the CPUs native format to deal with it as a data * value. */ unsigned long vgic_data_mmio_bus_to_host(const void *val, unsigned int len) { unsigned long data = kvm_mmio_read_buf(val, len); switch (len) { case 1: return data; case 2: return le16_to_cpu(data); case 4: return le32_to_cpu(data); default: return le64_to_cpu(data); } } /* * kvm_mmio_write_buf() expects a value in a format such that if converted to * a byte array it is observed as the guest would see it if it could perform * the load directly. Since the GIC is LE, and the guest knows this, the * guest expects a value in little endian format. * * We convert the data value from the CPUs native format to LE so that the * value is returned in the proper format. */ void vgic_data_host_to_mmio_bus(void *buf, unsigned int len, unsigned long data) { switch (len) { case 1: break; case 2: data = cpu_to_le16(data); break; case 4: data = cpu_to_le32(data); break; default: data = cpu_to_le64(data); } kvm_mmio_write_buf(buf, len, data); } static struct vgic_io_device *kvm_to_vgic_iodev(const struct kvm_io_device *dev) { return container_of(dev, struct vgic_io_device, dev); } static bool check_region(const struct kvm *kvm, const struct vgic_register_region *region, gpa_t addr, int len) { int flags, nr_irqs = kvm->arch.vgic.nr_spis + VGIC_NR_PRIVATE_IRQS; switch (len) { case sizeof(u8): flags = VGIC_ACCESS_8bit; break; case sizeof(u32): flags = VGIC_ACCESS_32bit; break; case sizeof(u64): flags = VGIC_ACCESS_64bit; break; default: return false; } if ((region->access_flags & flags) && IS_ALIGNED(addr, len)) { if (!region->bits_per_irq) return true; /* Do we access a non-allocated IRQ? */ return VGIC_ADDR_TO_INTID(addr, region->bits_per_irq) < nr_irqs; } return false; } const struct vgic_register_region * vgic_get_mmio_region(struct kvm_vcpu *vcpu, struct vgic_io_device *iodev, gpa_t addr, int len) { const struct vgic_register_region *region; region = vgic_find_mmio_region(iodev->regions, iodev->nr_regions, addr - iodev->base_addr); if (!region || !check_region(vcpu->kvm, region, addr, len)) return NULL; return region; } static int vgic_uaccess_read(struct kvm_vcpu *vcpu, struct vgic_io_device *iodev, gpa_t addr, u32 *val) { const struct vgic_register_region *region; struct kvm_vcpu *r_vcpu; region = vgic_get_mmio_region(vcpu, iodev, addr, sizeof(u32)); if (!region) { *val = 0; return 0; } r_vcpu = iodev->redist_vcpu ? iodev->redist_vcpu : vcpu; if (region->uaccess_read) *val = region->uaccess_read(r_vcpu, addr, sizeof(u32)); else *val = region->read(r_vcpu, addr, sizeof(u32)); return 0; } static int vgic_uaccess_write(struct kvm_vcpu *vcpu, struct vgic_io_device *iodev, gpa_t addr, const u32 *val) { const struct vgic_register_region *region; struct kvm_vcpu *r_vcpu; region = vgic_get_mmio_region(vcpu, iodev, addr, sizeof(u32)); if (!region) return 0; r_vcpu = iodev->redist_vcpu ? iodev->redist_vcpu : vcpu; if (region->uaccess_write) return region->uaccess_write(r_vcpu, addr, sizeof(u32), *val); region->write(r_vcpu, addr, sizeof(u32), *val); return 0; } /* * Userland access to VGIC registers. */ int vgic_uaccess(struct kvm_vcpu *vcpu, struct vgic_io_device *dev, bool is_write, int offset, u32 *val) { if (is_write) return vgic_uaccess_write(vcpu, dev, offset, val); else return vgic_uaccess_read(vcpu, dev, offset, val); } static int dispatch_mmio_read(struct kvm_vcpu *vcpu, struct kvm_io_device *dev, gpa_t addr, int len, void *val) { struct vgic_io_device *iodev = kvm_to_vgic_iodev(dev); const struct vgic_register_region *region; unsigned long data = 0; region = vgic_get_mmio_region(vcpu, iodev, addr, len); if (!region) { memset(val, 0, len); return 0; } switch (iodev->iodev_type) { case IODEV_CPUIF: data = region->read(vcpu, addr, len); break; case IODEV_DIST: data = region->read(vcpu, addr, len); break; case IODEV_REDIST: data = region->read(iodev->redist_vcpu, addr, len); break; case IODEV_ITS: data = region->its_read(vcpu->kvm, iodev->its, addr, len); break; } vgic_data_host_to_mmio_bus(val, len, data); return 0; } static int dispatch_mmio_write(struct kvm_vcpu *vcpu, struct kvm_io_device *dev, gpa_t addr, int len, const void *val) { struct vgic_io_device *iodev = kvm_to_vgic_iodev(dev); const struct vgic_register_region *region; unsigned long data = vgic_data_mmio_bus_to_host(val, len); region = vgic_get_mmio_region(vcpu, iodev, addr, len); if (!region) return 0; switch (iodev->iodev_type) { case IODEV_CPUIF: region->write(vcpu, addr, len, data); break; case IODEV_DIST: region->write(vcpu, addr, len, data); break; case IODEV_REDIST: region->write(iodev->redist_vcpu, addr, len, data); break; case IODEV_ITS: region->its_write(vcpu->kvm, iodev->its, addr, len, data); break; } return 0; } const struct kvm_io_device_ops kvm_io_gic_ops = { .read = dispatch_mmio_read, .write = dispatch_mmio_write, }; int vgic_register_dist_iodev(struct kvm *kvm, gpa_t dist_base_address, enum vgic_type type) { struct vgic_io_device *io_device = &kvm->arch.vgic.dist_iodev; unsigned int len; switch (type) { case VGIC_V2: len = vgic_v2_init_dist_iodev(io_device); break; case VGIC_V3: len = vgic_v3_init_dist_iodev(io_device); break; default: BUG(); } io_device->base_addr = dist_base_address; io_device->iodev_type = IODEV_DIST; io_device->redist_vcpu = NULL; return kvm_io_bus_register_dev(kvm, KVM_MMIO_BUS, dist_base_address, len, &io_device->dev); }
8 1125 1 2 4 2 3 6 3 18 15 3 3 3 4 2 6 2 2 1 7 1 1 3 1 4 1 1177 1 2 4 6 1 1 1 3 3 5 7 11 11 2 1 13 1 15 5 1 1086 18 1189 525 73 1118 1163 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 // SPDX-License-Identifier: GPL-2.0 /* * linux/fs/ioctl.c * * Copyright (C) 1991, 1992 Linus Torvalds */ #include <linux/syscalls.h> #include <linux/mm.h> #include <linux/capability.h> #include <linux/compat.h> #include <linux/file.h> #include <linux/fs.h> #include <linux/security.h> #include <linux/export.h> #include <linux/uaccess.h> #include <linux/writeback.h> #include <linux/buffer_head.h> #include <linux/falloc.h> #include <linux/sched/signal.h> #include <linux/fiemap.h> #include <linux/mount.h> #include <linux/fscrypt.h> #include <linux/fileattr.h> #include "internal.h" #include <asm/ioctls.h> /* So that the fiemap access checks can't overflow on 32 bit machines. */ #define FIEMAP_MAX_EXTENTS (UINT_MAX / sizeof(struct fiemap_extent)) /** * vfs_ioctl - call filesystem specific ioctl methods * @filp: open file to invoke ioctl method on * @cmd: ioctl command to execute * @arg: command-specific argument for ioctl * * Invokes filesystem specific ->unlocked_ioctl, if one exists; otherwise * returns -ENOTTY. * * Returns 0 on success, -errno on error. */ int vfs_ioctl(struct file *filp, unsigned int cmd, unsigned long arg) { int error = -ENOTTY; if (!filp->f_op->unlocked_ioctl) goto out; error = filp->f_op->unlocked_ioctl(filp, cmd, arg); if (error == -ENOIOCTLCMD) error = -ENOTTY; out: return error; } EXPORT_SYMBOL(vfs_ioctl); static int ioctl_fibmap(struct file *filp, int __user *p) { struct inode *inode = file_inode(filp); struct super_block *sb = inode->i_sb; int error, ur_block; sector_t block; if (!capable(CAP_SYS_RAWIO)) return -EPERM; error = get_user(ur_block, p); if (error) return error; if (ur_block < 0) return -EINVAL; block = ur_block; error = bmap(inode, &block); if (block > INT_MAX) { error = -ERANGE; pr_warn_ratelimited("[%s/%d] FS: %s File: %pD4 would truncate fibmap result\n", current->comm, task_pid_nr(current), sb->s_id, filp); } if (error) ur_block = 0; else ur_block = block; if (put_user(ur_block, p)) error = -EFAULT; return error; } /** * fiemap_fill_next_extent - Fiemap helper function * @fieinfo: Fiemap context passed into ->fiemap * @logical: Extent logical start offset, in bytes * @phys: Extent physical start offset, in bytes * @len: Extent length, in bytes * @flags: FIEMAP_EXTENT flags that describe this extent * * Called from file system ->fiemap callback. Will populate extent * info as passed in via arguments and copy to user memory. On * success, extent count on fieinfo is incremented. * * Returns 0 on success, -errno on error, 1 if this was the last * extent that will fit in user array. */ int fiemap_fill_next_extent(struct fiemap_extent_info *fieinfo, u64 logical, u64 phys, u64 len, u32 flags) { struct fiemap_extent extent; struct fiemap_extent __user *dest = fieinfo->fi_extents_start; /* only count the extents */ if (fieinfo->fi_extents_max == 0) { fieinfo->fi_extents_mapped++; return (flags & FIEMAP_EXTENT_LAST) ? 1 : 0; } if (fieinfo->fi_extents_mapped >= fieinfo->fi_extents_max) return 1; #define SET_UNKNOWN_FLAGS (FIEMAP_EXTENT_DELALLOC) #define SET_NO_UNMOUNTED_IO_FLAGS (FIEMAP_EXTENT_DATA_ENCRYPTED) #define SET_NOT_ALIGNED_FLAGS (FIEMAP_EXTENT_DATA_TAIL|FIEMAP_EXTENT_DATA_INLINE) if (flags & SET_UNKNOWN_FLAGS) flags |= FIEMAP_EXTENT_UNKNOWN; if (flags & SET_NO_UNMOUNTED_IO_FLAGS) flags |= FIEMAP_EXTENT_ENCODED; if (flags & SET_NOT_ALIGNED_FLAGS) flags |= FIEMAP_EXTENT_NOT_ALIGNED; memset(&extent, 0, sizeof(extent)); extent.fe_logical = logical; extent.fe_physical = phys; extent.fe_length = len; extent.fe_flags = flags; dest += fieinfo->fi_extents_mapped; if (copy_to_user(dest, &extent, sizeof(extent))) return -EFAULT; fieinfo->fi_extents_mapped++; if (fieinfo->fi_extents_mapped == fieinfo->fi_extents_max) return 1; return (flags & FIEMAP_EXTENT_LAST) ? 1 : 0; } EXPORT_SYMBOL(fiemap_fill_next_extent); /** * fiemap_prep - check validity of requested flags for fiemap * @inode: Inode to operate on * @fieinfo: Fiemap context passed into ->fiemap * @start: Start of the mapped range * @len: Length of the mapped range, can be truncated by this function. * @supported_flags: Set of fiemap flags that the file system understands * * This function must be called from each ->fiemap instance to validate the * fiemap request against the file system parameters. * * Returns 0 on success, or a negative error on failure. */ int fiemap_prep(struct inode *inode, struct fiemap_extent_info *fieinfo, u64 start, u64 *len, u32 supported_flags) { u64 maxbytes = inode->i_sb->s_maxbytes; u32 incompat_flags; int ret = 0; if (*len == 0) return -EINVAL; if (start >= maxbytes) return -EFBIG; /* * Shrink request scope to what the fs can actually handle. */ if (*len > maxbytes || (maxbytes - *len) < start) *len = maxbytes - start; supported_flags |= FIEMAP_FLAG_SYNC; supported_flags &= FIEMAP_FLAGS_COMPAT; incompat_flags = fieinfo->fi_flags & ~supported_flags; if (incompat_flags) { fieinfo->fi_flags = incompat_flags; return -EBADR; } if (fieinfo->fi_flags & FIEMAP_FLAG_SYNC) ret = filemap_write_and_wait(inode->i_mapping); return ret; } EXPORT_SYMBOL(fiemap_prep); static int ioctl_fiemap(struct file *filp, struct fiemap __user *ufiemap) { struct fiemap fiemap; struct fiemap_extent_info fieinfo = { 0, }; struct inode *inode = file_inode(filp); int error; if (!inode->i_op->fiemap) return -EOPNOTSUPP; if (copy_from_user(&fiemap, ufiemap, sizeof(fiemap))) return -EFAULT; if (fiemap.fm_extent_count > FIEMAP_MAX_EXTENTS) return -EINVAL; fieinfo.fi_flags = fiemap.fm_flags; fieinfo.fi_extents_max = fiemap.fm_extent_count; fieinfo.fi_extents_start = ufiemap->fm_extents; error = inode->i_op->fiemap(inode, &fieinfo, fiemap.fm_start, fiemap.fm_length); fiemap.fm_flags = fieinfo.fi_flags; fiemap.fm_mapped_extents = fieinfo.fi_extents_mapped; if (copy_to_user(ufiemap, &fiemap, sizeof(fiemap))) error = -EFAULT; return error; } static int ioctl_file_clone(struct file *dst_file, unsigned long srcfd, u64 off, u64 olen, u64 destoff) { CLASS(fd, src_file)(srcfd); loff_t cloned; int ret; if (fd_empty(src_file)) return -EBADF; cloned = vfs_clone_file_range(fd_file(src_file), off, dst_file, destoff, olen, 0); if (cloned < 0) ret = cloned; else if (olen && cloned != olen) ret = -EINVAL; else ret = 0; return ret; } static int ioctl_file_clone_range(struct file *file, struct file_clone_range __user *argp) { struct file_clone_range args; if (copy_from_user(&args, argp, sizeof(args))) return -EFAULT; return ioctl_file_clone(file, args.src_fd, args.src_offset, args.src_length, args.dest_offset); } /* * This provides compatibility with legacy XFS pre-allocation ioctls * which predate the fallocate syscall. * * Only the l_start, l_len and l_whence fields of the 'struct space_resv' * are used here, rest are ignored. */ static int ioctl_preallocate(struct file *filp, int mode, void __user *argp) { struct inode *inode = file_inode(filp); struct space_resv sr; if (copy_from_user(&sr, argp, sizeof(sr))) return -EFAULT; switch (sr.l_whence) { case SEEK_SET: break; case SEEK_CUR: sr.l_start += filp->f_pos; break; case SEEK_END: sr.l_start += i_size_read(inode); break; default: return -EINVAL; } return vfs_fallocate(filp, mode | FALLOC_FL_KEEP_SIZE, sr.l_start, sr.l_len); } /* on ia32 l_start is on a 32-bit boundary */ #if defined CONFIG_COMPAT && defined(CONFIG_X86_64) /* just account for different alignment */ static int compat_ioctl_preallocate(struct file *file, int mode, struct space_resv_32 __user *argp) { struct inode *inode = file_inode(file); struct space_resv_32 sr; if (copy_from_user(&sr, argp, sizeof(sr))) return -EFAULT; switch (sr.l_whence) { case SEEK_SET: break; case SEEK_CUR: sr.l_start += file->f_pos; break; case SEEK_END: sr.l_start += i_size_read(inode); break; default: return -EINVAL; } return vfs_fallocate(file, mode | FALLOC_FL_KEEP_SIZE, sr.l_start, sr.l_len); } #endif static int file_ioctl(struct file *filp, unsigned int cmd, int __user *p) { switch (cmd) { case FIBMAP: return ioctl_fibmap(filp, p); case FS_IOC_RESVSP: case FS_IOC_RESVSP64: return ioctl_preallocate(filp, 0, p); case FS_IOC_UNRESVSP: case FS_IOC_UNRESVSP64: return ioctl_preallocate(filp, FALLOC_FL_PUNCH_HOLE, p); case FS_IOC_ZERO_RANGE: return ioctl_preallocate(filp, FALLOC_FL_ZERO_RANGE, p); } return -ENOIOCTLCMD; } static int ioctl_fionbio(struct file *filp, int __user *argp) { unsigned int flag; int on, error; error = get_user(on, argp); if (error) return error; flag = O_NONBLOCK; #ifdef __sparc__ /* SunOS compatibility item. */ if (O_NONBLOCK != O_NDELAY) flag |= O_NDELAY; #endif spin_lock(&filp->f_lock); if (on) filp->f_flags |= flag; else filp->f_flags &= ~flag; spin_unlock(&filp->f_lock); return error; } static int ioctl_fioasync(unsigned int fd, struct file *filp, int __user *argp) { unsigned int flag; int on, error; error = get_user(on, argp); if (error) return error; flag = on ? FASYNC : 0; /* Did FASYNC state change ? */ if ((flag ^ filp->f_flags) & FASYNC) { if (filp->f_op->fasync) /* fasync() adjusts filp->f_flags */ error = filp->f_op->fasync(fd, filp, on); else error = -ENOTTY; } return error < 0 ? error : 0; } static int ioctl_fsfreeze(struct file *filp) { struct super_block *sb = file_inode(filp)->i_sb; if (!ns_capable(sb->s_user_ns, CAP_SYS_ADMIN)) return -EPERM; /* If filesystem doesn't support freeze feature, return. */ if (sb->s_op->freeze_fs == NULL && sb->s_op->freeze_super == NULL) return -EOPNOTSUPP; /* Freeze */ if (sb->s_op->freeze_super) return sb->s_op->freeze_super(sb, FREEZE_HOLDER_USERSPACE, NULL); return freeze_super(sb, FREEZE_HOLDER_USERSPACE, NULL); } static int ioctl_fsthaw(struct file *filp) { struct super_block *sb = file_inode(filp)->i_sb; if (!ns_capable(sb->s_user_ns, CAP_SYS_ADMIN)) return -EPERM; /* Thaw */ if (sb->s_op->thaw_super) return sb->s_op->thaw_super(sb, FREEZE_HOLDER_USERSPACE, NULL); return thaw_super(sb, FREEZE_HOLDER_USERSPACE, NULL); } static int ioctl_file_dedupe_range(struct file *file, struct file_dedupe_range __user *argp) { struct file_dedupe_range *same = NULL; int ret; unsigned long size; u16 count; if (get_user(count, &argp->dest_count)) { ret = -EFAULT; goto out; } size = offsetof(struct file_dedupe_range, info[count]); if (size > PAGE_SIZE) { ret = -ENOMEM; goto out; } same = memdup_user(argp, size); if (IS_ERR(same)) { ret = PTR_ERR(same); same = NULL; goto out; } same->dest_count = count; ret = vfs_dedupe_file_range(file, same); if (ret) goto out; ret = copy_to_user(argp, same, size); if (ret) ret = -EFAULT; out: kfree(same); return ret; } static int ioctl_getfsuuid(struct file *file, void __user *argp) { struct super_block *sb = file_inode(file)->i_sb; struct fsuuid2 u = { .len = sb->s_uuid_len, }; if (!sb->s_uuid_len) return -ENOTTY; memcpy(&u.uuid[0], &sb->s_uuid, sb->s_uuid_len); return copy_to_user(argp, &u, sizeof(u)) ? -EFAULT : 0; } static int ioctl_get_fs_sysfs_path(struct file *file, void __user *argp) { struct super_block *sb = file_inode(file)->i_sb; if (!strlen(sb->s_sysfs_name)) return -ENOTTY; struct fs_sysfs_path u = {}; u.len = scnprintf(u.name, sizeof(u.name), "%s/%s", sb->s_type->name, sb->s_sysfs_name); return copy_to_user(argp, &u, sizeof(u)) ? -EFAULT : 0; } /* * do_vfs_ioctl() is not for drivers and not intended to be EXPORT_SYMBOL()'d. * It's just a simple helper for sys_ioctl and compat_sys_ioctl. * * When you add any new common ioctls to the switches above and below, * please ensure they have compatible arguments in compat mode. * * The LSM mailing list should also be notified of any command additions or * changes, as specific LSMs may be affected. */ static int do_vfs_ioctl(struct file *filp, unsigned int fd, unsigned int cmd, unsigned long arg) { void __user *argp = (void __user *)arg; struct inode *inode = file_inode(filp); switch (cmd) { case FIOCLEX: set_close_on_exec(fd, 1); return 0; case FIONCLEX: set_close_on_exec(fd, 0); return 0; case FIONBIO: return ioctl_fionbio(filp, argp); case FIOASYNC: return ioctl_fioasync(fd, filp, argp); case FIOQSIZE: if (S_ISDIR(inode->i_mode) || (S_ISREG(inode->i_mode) && !IS_ANON_FILE(inode)) || S_ISLNK(inode->i_mode)) { loff_t res = inode_get_bytes(inode); return copy_to_user(argp, &res, sizeof(res)) ? -EFAULT : 0; } return -ENOTTY; case FIFREEZE: return ioctl_fsfreeze(filp); case FITHAW: return ioctl_fsthaw(filp); case FS_IOC_FIEMAP: return ioctl_fiemap(filp, argp); case FIGETBSZ: /* anon_bdev filesystems may not have a block size */ if (!inode->i_sb->s_blocksize) return -EINVAL; return put_user(inode->i_sb->s_blocksize, (int __user *)argp); case FICLONE: return ioctl_file_clone(filp, arg, 0, 0, 0); case FICLONERANGE: return ioctl_file_clone_range(filp, argp); case FIDEDUPERANGE: return ioctl_file_dedupe_range(filp, argp); case FIONREAD: if (!S_ISREG(inode->i_mode) || IS_ANON_FILE(inode)) return vfs_ioctl(filp, cmd, arg); return put_user(i_size_read(inode) - filp->f_pos, (int __user *)argp); case FS_IOC_GETFLAGS: return ioctl_getflags(filp, argp); case FS_IOC_SETFLAGS: return ioctl_setflags(filp, argp); case FS_IOC_FSGETXATTR: return ioctl_fsgetxattr(filp, argp); case FS_IOC_FSSETXATTR: return ioctl_fssetxattr(filp, argp); case FS_IOC_GETFSUUID: return ioctl_getfsuuid(filp, argp); case FS_IOC_GETFSSYSFSPATH: return ioctl_get_fs_sysfs_path(filp, argp); default: if (S_ISREG(inode->i_mode) && !IS_ANON_FILE(inode)) return file_ioctl(filp, cmd, argp); break; } return -ENOIOCTLCMD; } SYSCALL_DEFINE3(ioctl, unsigned int, fd, unsigned int, cmd, unsigned long, arg) { CLASS(fd, f)(fd); int error; if (fd_empty(f)) return -EBADF; error = security_file_ioctl(fd_file(f), cmd, arg); if (error) return error; error = do_vfs_ioctl(fd_file(f), fd, cmd, arg); if (error == -ENOIOCTLCMD) error = vfs_ioctl(fd_file(f), cmd, arg); return error; } #ifdef CONFIG_COMPAT /** * compat_ptr_ioctl - generic implementation of .compat_ioctl file operation * @file: The file to operate on. * @cmd: The ioctl command number. * @arg: The argument to the ioctl. * * This is not normally called as a function, but instead set in struct * file_operations as * * .compat_ioctl = compat_ptr_ioctl, * * On most architectures, the compat_ptr_ioctl() just passes all arguments * to the corresponding ->ioctl handler. The exception is arch/s390, where * compat_ptr() clears the top bit of a 32-bit pointer value, so user space * pointers to the second 2GB alias the first 2GB, as is the case for * native 32-bit s390 user space. * * The compat_ptr_ioctl() function must therefore be used only with ioctl * functions that either ignore the argument or pass a pointer to a * compatible data type. * * If any ioctl command handled by fops->unlocked_ioctl passes a plain * integer instead of a pointer, or any of the passed data types * is incompatible between 32-bit and 64-bit architectures, a proper * handler is required instead of compat_ptr_ioctl. */ long compat_ptr_ioctl(struct file *file, unsigned int cmd, unsigned long arg) { if (!file->f_op->unlocked_ioctl) return -ENOIOCTLCMD; return file->f_op->unlocked_ioctl(file, cmd, (unsigned long)compat_ptr(arg)); } EXPORT_SYMBOL(compat_ptr_ioctl); COMPAT_SYSCALL_DEFINE3(ioctl, unsigned int, fd, unsigned int, cmd, compat_ulong_t, arg) { CLASS(fd, f)(fd); int error; if (fd_empty(f)) return -EBADF; error = security_file_ioctl_compat(fd_file(f), cmd, arg); if (error) return error; switch (cmd) { /* FICLONE takes an int argument, so don't use compat_ptr() */ case FICLONE: error = ioctl_file_clone(fd_file(f), arg, 0, 0, 0); break; #if defined(CONFIG_X86_64) /* these get messy on amd64 due to alignment differences */ case FS_IOC_RESVSP_32: case FS_IOC_RESVSP64_32: error = compat_ioctl_preallocate(fd_file(f), 0, compat_ptr(arg)); break; case FS_IOC_UNRESVSP_32: case FS_IOC_UNRESVSP64_32: error = compat_ioctl_preallocate(fd_file(f), FALLOC_FL_PUNCH_HOLE, compat_ptr(arg)); break; case FS_IOC_ZERO_RANGE_32: error = compat_ioctl_preallocate(fd_file(f), FALLOC_FL_ZERO_RANGE, compat_ptr(arg)); break; #endif /* * These access 32-bit values anyway so no further handling is * necessary. */ case FS_IOC32_GETFLAGS: case FS_IOC32_SETFLAGS: cmd = (cmd == FS_IOC32_GETFLAGS) ? FS_IOC_GETFLAGS : FS_IOC_SETFLAGS; fallthrough; /* * everything else in do_vfs_ioctl() takes either a compatible * pointer argument or no argument -- call it with a modified * argument. */ default: error = do_vfs_ioctl(fd_file(f), fd, cmd, (unsigned long)compat_ptr(arg)); if (error != -ENOIOCTLCMD) break; if (fd_file(f)->f_op->compat_ioctl) error = fd_file(f)->f_op->compat_ioctl(fd_file(f), cmd, arg); if (error == -ENOIOCTLCMD) error = -ENOTTY; break; } return error; } #endif
135 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 /* SPDX-License-Identifier: GPL-2.0 */ #ifndef __SHMEM_FS_H #define __SHMEM_FS_H #include <linux/file.h> #include <linux/swap.h> #include <linux/mempolicy.h> #include <linux/pagemap.h> #include <linux/percpu_counter.h> #include <linux/xattr.h> #include <linux/fs_parser.h> #include <linux/userfaultfd_k.h> struct swap_iocb; /* inode in-kernel data */ #ifdef CONFIG_TMPFS_QUOTA #define SHMEM_MAXQUOTAS 2 #endif struct shmem_inode_info { spinlock_t lock; unsigned int seals; /* shmem seals */ unsigned long flags; unsigned long alloced; /* data pages alloced to file */ unsigned long swapped; /* subtotal assigned to swap */ union { struct offset_ctx dir_offsets; /* stable directory offsets */ struct { struct list_head shrinklist; /* shrinkable hpage inodes */ struct list_head swaplist; /* chain of maybes on swap */ }; }; struct timespec64 i_crtime; /* file creation time */ struct shared_policy policy; /* NUMA memory alloc policy */ struct simple_xattrs xattrs; /* list of xattrs */ pgoff_t fallocend; /* highest fallocate endindex */ unsigned int fsflags; /* for FS_IOC_[SG]ETFLAGS */ atomic_t stop_eviction; /* hold when working on inode */ #ifdef CONFIG_TMPFS_QUOTA struct dquot __rcu *i_dquot[MAXQUOTAS]; #endif struct inode vfs_inode; }; #define SHMEM_FL_USER_VISIBLE (FS_FL_USER_VISIBLE | FS_CASEFOLD_FL) #define SHMEM_FL_USER_MODIFIABLE \ (FS_IMMUTABLE_FL | FS_APPEND_FL | FS_NODUMP_FL | FS_NOATIME_FL | FS_CASEFOLD_FL) #define SHMEM_FL_INHERITED (FS_NODUMP_FL | FS_NOATIME_FL | FS_CASEFOLD_FL) struct shmem_quota_limits { qsize_t usrquota_bhardlimit; /* Default user quota block hard limit */ qsize_t usrquota_ihardlimit; /* Default user quota inode hard limit */ qsize_t grpquota_bhardlimit; /* Default group quota block hard limit */ qsize_t grpquota_ihardlimit; /* Default group quota inode hard limit */ }; struct shmem_sb_info { unsigned long max_blocks; /* How many blocks are allowed */ struct percpu_counter used_blocks; /* How many are allocated */ unsigned long max_inodes; /* How many inodes are allowed */ unsigned long free_ispace; /* How much ispace left for allocation */ raw_spinlock_t stat_lock; /* Serialize shmem_sb_info changes */ umode_t mode; /* Mount mode for root directory */ unsigned char huge; /* Whether to try for hugepages */ kuid_t uid; /* Mount uid for root directory */ kgid_t gid; /* Mount gid for root directory */ bool full_inums; /* If i_ino should be uint or ino_t */ bool noswap; /* ignores VM reclaim / swap requests */ ino_t next_ino; /* The next per-sb inode number to use */ ino_t __percpu *ino_batch; /* The next per-cpu inode number to use */ struct mempolicy *mpol; /* default memory policy for mappings */ spinlock_t shrinklist_lock; /* Protects shrinklist */ struct list_head shrinklist; /* List of shinkable inodes */ unsigned long shrinklist_len; /* Length of shrinklist */ struct shmem_quota_limits qlimits; /* Default quota limits */ }; static inline struct shmem_inode_info *SHMEM_I(struct inode *inode) { return container_of(inode, struct shmem_inode_info, vfs_inode); } /* * Functions in mm/shmem.c called directly from elsewhere: */ extern const struct fs_parameter_spec shmem_fs_parameters[]; extern void shmem_init(void); extern int shmem_init_fs_context(struct fs_context *fc); extern struct file *shmem_file_setup(const char *name, loff_t size, unsigned long flags); extern struct file *shmem_kernel_file_setup(const char *name, loff_t size, unsigned long flags); extern struct file *shmem_file_setup_with_mnt(struct vfsmount *mnt, const char *name, loff_t size, unsigned long flags); extern int shmem_zero_setup(struct vm_area_struct *); extern unsigned long shmem_get_unmapped_area(struct file *, unsigned long addr, unsigned long len, unsigned long pgoff, unsigned long flags); extern int shmem_lock(struct file *file, int lock, struct ucounts *ucounts); #ifdef CONFIG_SHMEM bool shmem_mapping(struct address_space *mapping); #else static inline bool shmem_mapping(struct address_space *mapping) { return false; } #endif /* CONFIG_SHMEM */ void shmem_unlock_mapping(struct address_space *mapping); struct page *shmem_read_mapping_page_gfp(struct address_space *mapping, pgoff_t index, gfp_t gfp_mask); int shmem_writeout(struct folio *folio, struct swap_iocb **plug, struct list_head *folio_list); void shmem_truncate_range(struct inode *inode, loff_t start, loff_t end); int shmem_unuse(unsigned int type); #ifdef CONFIG_TRANSPARENT_HUGEPAGE unsigned long shmem_allowable_huge_orders(struct inode *inode, struct vm_area_struct *vma, pgoff_t index, loff_t write_end, bool shmem_huge_force); bool shmem_hpage_pmd_enabled(void); #else static inline unsigned long shmem_allowable_huge_orders(struct inode *inode, struct vm_area_struct *vma, pgoff_t index, loff_t write_end, bool shmem_huge_force) { return 0; } static inline bool shmem_hpage_pmd_enabled(void) { return false; } #endif #ifdef CONFIG_SHMEM extern unsigned long shmem_swap_usage(struct vm_area_struct *vma); #else static inline unsigned long shmem_swap_usage(struct vm_area_struct *vma) { return 0; } #endif extern unsigned long shmem_partial_swap_usage(struct address_space *mapping, pgoff_t start, pgoff_t end); /* Flag allocation requirements to shmem_get_folio */ enum sgp_type { SGP_READ, /* don't exceed i_size, don't allocate page */ SGP_NOALLOC, /* similar, but fail on hole or use fallocated page */ SGP_CACHE, /* don't exceed i_size, may allocate page */ SGP_WRITE, /* may exceed i_size, may allocate !Uptodate page */ SGP_FALLOC, /* like SGP_WRITE, but make existing page Uptodate */ }; int shmem_get_folio(struct inode *inode, pgoff_t index, loff_t write_end, struct folio **foliop, enum sgp_type sgp); struct folio *shmem_read_folio_gfp(struct address_space *mapping, pgoff_t index, gfp_t gfp); static inline struct folio *shmem_read_folio(struct address_space *mapping, pgoff_t index) { return shmem_read_folio_gfp(mapping, index, mapping_gfp_mask(mapping)); } static inline struct page *shmem_read_mapping_page( struct address_space *mapping, pgoff_t index) { return shmem_read_mapping_page_gfp(mapping, index, mapping_gfp_mask(mapping)); } static inline bool shmem_file(struct file *file) { if (!IS_ENABLED(CONFIG_SHMEM)) return false; if (!file || !file->f_mapping) return false; return shmem_mapping(file->f_mapping); } /* * If fallocate(FALLOC_FL_KEEP_SIZE) has been used, there may be pages * beyond i_size's notion of EOF, which fallocate has committed to reserving: * which split_huge_page() must therefore not delete. This use of a single * "fallocend" per inode errs on the side of not deleting a reservation when * in doubt: there are plenty of cases when it preserves unreserved pages. */ static inline pgoff_t shmem_fallocend(struct inode *inode, pgoff_t eof) { return max(eof, SHMEM_I(inode)->fallocend); } extern bool shmem_charge(struct inode *inode, long pages); extern void shmem_uncharge(struct inode *inode, long pages); #ifdef CONFIG_USERFAULTFD #ifdef CONFIG_SHMEM extern int shmem_mfill_atomic_pte(pmd_t *dst_pmd, struct vm_area_struct *dst_vma, unsigned long dst_addr, unsigned long src_addr, uffd_flags_t flags, struct folio **foliop); #else /* !CONFIG_SHMEM */ #define shmem_mfill_atomic_pte(dst_pmd, dst_vma, dst_addr, \ src_addr, flags, foliop) ({ BUG(); 0; }) #endif /* CONFIG_SHMEM */ #endif /* CONFIG_USERFAULTFD */ /* * Used space is stored as unsigned 64-bit value in bytes but * quota core supports only signed 64-bit values so use that * as a limit */ #define SHMEM_QUOTA_MAX_SPC_LIMIT 0x7fffffffffffffffLL /* 2^63-1 */ #define SHMEM_QUOTA_MAX_INO_LIMIT 0x7fffffffffffffffLL #ifdef CONFIG_TMPFS_QUOTA extern const struct dquot_operations shmem_quota_operations; extern struct quota_format_type shmem_quota_format; #endif /* CONFIG_TMPFS_QUOTA */ #endif
4 4 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 /* SPDX-License-Identifier: GPL-2.0 */ /* Copyright 2011-2014 Autronica Fire and Security AS * * 2011-2014 Arvid Brodin, arvid.brodin@alten.se * * include file for HSR and PRP. */ #ifndef __HSR_SLAVE_H #define __HSR_SLAVE_H #include <linux/skbuff.h> #include <linux/netdevice.h> #include <linux/rtnetlink.h> #include "hsr_main.h" int hsr_add_port(struct hsr_priv *hsr, struct net_device *dev, enum hsr_port_type pt, struct netlink_ext_ack *extack); void hsr_del_port(struct hsr_port *port); bool hsr_port_exists(const struct net_device *dev); static inline struct hsr_port *hsr_port_get_rtnl(const struct net_device *dev) { ASSERT_RTNL(); return hsr_port_exists(dev) ? rtnl_dereference(dev->rx_handler_data) : NULL; } static inline struct hsr_port *hsr_port_get_rcu(const struct net_device *dev) { return hsr_port_exists(dev) ? rcu_dereference(dev->rx_handler_data) : NULL; } bool hsr_invalid_dan_ingress_frame(__be16 protocol); #endif /* __HSR_SLAVE_H */
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 /* SPDX-License-Identifier: GPL-2.0 */ #ifndef _DELAYED_CALL_H #define _DELAYED_CALL_H /* * Poor man's closures; I wish we could've done them sanely polymorphic, * but... */ struct delayed_call { void (*fn)(void *); void *arg; }; #define DEFINE_DELAYED_CALL(name) struct delayed_call name = {NULL, NULL} /* I really wish we had closures with sane typechecking... */ static inline void set_delayed_call(struct delayed_call *call, void (*fn)(void *), void *arg) { call->fn = fn; call->arg = arg; } static inline void do_delayed_call(struct delayed_call *call) { if (call->fn) call->fn(call->arg); } static inline void clear_delayed_call(struct delayed_call *call) { call->fn = NULL; } #endif
138 30 122 122 121 30 29 30 29 30 30 186 187 187 187 187 24 24 24 187 187 186 184 187 186 186 384 384 386 383 385 387 149 16 307 28 27 11 225 323 326 386 386 6 384 353 13 347 149 71 143 149 149 148 149 149 149 149 227 203 227 227 122 112 37 201 122 204 121 31 31 31 10 31 31 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 // SPDX-License-Identifier: GPL-2.0-only /* * linux/mm/swap.c * * Copyright (C) 1991, 1992, 1993, 1994 Linus Torvalds */ /* * This file contains the default values for the operation of the * Linux VM subsystem. Fine-tuning documentation can be found in * Documentation/admin-guide/sysctl/vm.rst. * Started 18.12.91 * Swap aging added 23.2.95, Stephen Tweedie. * Buffermem limits added 12.3.98, Rik van Riel. */ #include <linux/mm.h> #include <linux/sched.h> #include <linux/kernel_stat.h> #include <linux/swap.h> #include <linux/mman.h> #include <linux/pagemap.h> #include <linux/pagevec.h> #include <linux/init.h> #include <linux/export.h> #include <linux/mm_inline.h> #include <linux/percpu_counter.h> #include <linux/memremap.h> #include <linux/percpu.h> #include <linux/cpu.h> #include <linux/notifier.h> #include <linux/backing-dev.h> #include <linux/memcontrol.h> #include <linux/gfp.h> #include <linux/uio.h> #include <linux/hugetlb.h> #include <linux/page_idle.h> #include <linux/local_lock.h> #include <linux/buffer_head.h> #include "internal.h" #define CREATE_TRACE_POINTS #include <trace/events/pagemap.h> /* How many pages do we try to swap or page in/out together? As a power of 2 */ int page_cluster; static const int page_cluster_max = 31; struct cpu_fbatches { /* * The following folio batches are grouped together because they are protected * by disabling preemption (and interrupts remain enabled). */ local_lock_t lock; struct folio_batch lru_add; struct folio_batch lru_deactivate_file; struct folio_batch lru_deactivate; struct folio_batch lru_lazyfree; #ifdef CONFIG_SMP struct folio_batch lru_activate; #endif /* Protecting the following batches which require disabling interrupts */ local_lock_t lock_irq; struct folio_batch lru_move_tail; }; static DEFINE_PER_CPU(struct cpu_fbatches, cpu_fbatches) = { .lock = INIT_LOCAL_LOCK(lock), .lock_irq = INIT_LOCAL_LOCK(lock_irq), }; static void __page_cache_release(struct folio *folio, struct lruvec **lruvecp, unsigned long *flagsp) { if (folio_test_lru(folio)) { folio_lruvec_relock_irqsave(folio, lruvecp, flagsp); lruvec_del_folio(*lruvecp, folio); __folio_clear_lru_flags(folio); } } /* * This path almost never happens for VM activity - pages are normally freed * in batches. But it gets used by networking - and for compound pages. */ static void page_cache_release(struct folio *folio) { struct lruvec *lruvec = NULL; unsigned long flags; __page_cache_release(folio, &lruvec, &flags); if (lruvec) unlock_page_lruvec_irqrestore(lruvec, flags); } void __folio_put(struct folio *folio) { if (unlikely(folio_is_zone_device(folio))) { free_zone_device_folio(folio); return; } if (folio_test_hugetlb(folio)) { free_huge_folio(folio); return; } page_cache_release(folio); folio_unqueue_deferred_split(folio); mem_cgroup_uncharge(folio); free_frozen_pages(&folio->page, folio_order(folio)); } EXPORT_SYMBOL(__folio_put); typedef void (*move_fn_t)(struct lruvec *lruvec, struct folio *folio); static void lru_add(struct lruvec *lruvec, struct folio *folio) { int was_unevictable = folio_test_clear_unevictable(folio); long nr_pages = folio_nr_pages(folio); VM_BUG_ON_FOLIO(folio_test_lru(folio), folio); /* * Is an smp_mb__after_atomic() still required here, before * folio_evictable() tests the mlocked flag, to rule out the possibility * of stranding an evictable folio on an unevictable LRU? I think * not, because __munlock_folio() only clears the mlocked flag * while the LRU lock is held. * * (That is not true of __page_cache_release(), and not necessarily * true of folios_put(): but those only clear the mlocked flag after * folio_put_testzero() has excluded any other users of the folio.) */ if (folio_evictable(folio)) { if (was_unevictable) __count_vm_events(UNEVICTABLE_PGRESCUED, nr_pages); } else { folio_clear_active(folio); folio_set_unevictable(folio); /* * folio->mlock_count = !!folio_test_mlocked(folio)? * But that leaves __mlock_folio() in doubt whether another * actor has already counted the mlock or not. Err on the * safe side, underestimate, let page reclaim fix it, rather * than leaving a page on the unevictable LRU indefinitely. */ folio->mlock_count = 0; if (!was_unevictable) __count_vm_events(UNEVICTABLE_PGCULLED, nr_pages); } lruvec_add_folio(lruvec, folio); trace_mm_lru_insertion(folio); } static void folio_batch_move_lru(struct folio_batch *fbatch, move_fn_t move_fn) { int i; struct lruvec *lruvec = NULL; unsigned long flags = 0; for (i = 0; i < folio_batch_count(fbatch); i++) { struct folio *folio = fbatch->folios[i]; folio_lruvec_relock_irqsave(folio, &lruvec, &flags); move_fn(lruvec, folio); folio_set_lru(folio); } if (lruvec) unlock_page_lruvec_irqrestore(lruvec, flags); folios_put(fbatch); } static void __folio_batch_add_and_move(struct folio_batch __percpu *fbatch, struct folio *folio, move_fn_t move_fn, bool on_lru, bool disable_irq) { unsigned long flags; if (on_lru && !folio_test_clear_lru(folio)) return; folio_get(folio); if (disable_irq) local_lock_irqsave(&cpu_fbatches.lock_irq, flags); else local_lock(&cpu_fbatches.lock); if (!folio_batch_add(this_cpu_ptr(fbatch), folio) || folio_test_large(folio) || lru_cache_disabled()) folio_batch_move_lru(this_cpu_ptr(fbatch), move_fn); if (disable_irq) local_unlock_irqrestore(&cpu_fbatches.lock_irq, flags); else local_unlock(&cpu_fbatches.lock); } #define folio_batch_add_and_move(folio, op, on_lru) \ __folio_batch_add_and_move( \ &cpu_fbatches.op, \ folio, \ op, \ on_lru, \ offsetof(struct cpu_fbatches, op) >= offsetof(struct cpu_fbatches, lock_irq) \ ) static void lru_move_tail(struct lruvec *lruvec, struct folio *folio) { if (folio_test_unevictable(folio)) return; lruvec_del_folio(lruvec, folio); folio_clear_active(folio); lruvec_add_folio_tail(lruvec, folio); __count_vm_events(PGROTATED, folio_nr_pages(folio)); } /* * Writeback is about to end against a folio which has been marked for * immediate reclaim. If it still appears to be reclaimable, move it * to the tail of the inactive list. * * folio_rotate_reclaimable() must disable IRQs, to prevent nasty races. */ void folio_rotate_reclaimable(struct folio *folio) { if (folio_test_locked(folio) || folio_test_dirty(folio) || folio_test_unevictable(folio)) return; folio_batch_add_and_move(folio, lru_move_tail, true); } void lru_note_cost_unlock_irq(struct lruvec *lruvec, bool file, unsigned int nr_io, unsigned int nr_rotated) __releases(lruvec->lru_lock) { unsigned long cost; /* * Reflect the relative cost of incurring IO and spending CPU * time on rotations. This doesn't attempt to make a precise * comparison, it just says: if reloads are about comparable * between the LRU lists, or rotations are overwhelmingly * different between them, adjust scan balance for CPU work. */ cost = nr_io * SWAP_CLUSTER_MAX + nr_rotated; if (!cost) { spin_unlock_irq(&lruvec->lru_lock); return; } for (;;) { unsigned long lrusize; /* Record cost event */ if (file) lruvec->file_cost += cost; else lruvec->anon_cost += cost; /* * Decay previous events * * Because workloads change over time (and to avoid * overflow) we keep these statistics as a floating * average, which ends up weighing recent refaults * more than old ones. */ lrusize = lruvec_page_state(lruvec, NR_INACTIVE_ANON) + lruvec_page_state(lruvec, NR_ACTIVE_ANON) + lruvec_page_state(lruvec, NR_INACTIVE_FILE) + lruvec_page_state(lruvec, NR_ACTIVE_FILE); if (lruvec->file_cost + lruvec->anon_cost > lrusize / 4) { lruvec->file_cost /= 2; lruvec->anon_cost /= 2; } spin_unlock_irq(&lruvec->lru_lock); lruvec = parent_lruvec(lruvec); if (!lruvec) break; spin_lock_irq(&lruvec->lru_lock); } } void lru_note_cost_refault(struct folio *folio) { struct lruvec *lruvec; lruvec = folio_lruvec_lock_irq(folio); lru_note_cost_unlock_irq(lruvec, folio_is_file_lru(folio), folio_nr_pages(folio), 0); } static void lru_activate(struct lruvec *lruvec, struct folio *folio) { long nr_pages = folio_nr_pages(folio); if (folio_test_active(folio) || folio_test_unevictable(folio)) return; lruvec_del_folio(lruvec, folio); folio_set_active(folio); lruvec_add_folio(lruvec, folio); trace_mm_lru_activate(folio); __count_vm_events(PGACTIVATE, nr_pages); count_memcg_events(lruvec_memcg(lruvec), PGACTIVATE, nr_pages); } #ifdef CONFIG_SMP static void folio_activate_drain(int cpu) { struct folio_batch *fbatch = &per_cpu(cpu_fbatches.lru_activate, cpu); if (folio_batch_count(fbatch)) folio_batch_move_lru(fbatch, lru_activate); } void folio_activate(struct folio *folio) { if (folio_test_active(folio) || folio_test_unevictable(folio)) return; folio_batch_add_and_move(folio, lru_activate, true); } #else static inline void folio_activate_drain(int cpu) { } void folio_activate(struct folio *folio) { struct lruvec *lruvec; if (!folio_test_clear_lru(folio)) return; lruvec = folio_lruvec_lock_irq(folio); lru_activate(lruvec, folio); unlock_page_lruvec_irq(lruvec); folio_set_lru(folio); } #endif static void __lru_cache_activate_folio(struct folio *folio) { struct folio_batch *fbatch; int i; local_lock(&cpu_fbatches.lock); fbatch = this_cpu_ptr(&cpu_fbatches.lru_add); /* * Search backwards on the optimistic assumption that the folio being * activated has just been added to this batch. Note that only * the local batch is examined as a !LRU folio could be in the * process of being released, reclaimed, migrated or on a remote * batch that is currently being drained. Furthermore, marking * a remote batch's folio active potentially hits a race where * a folio is marked active just after it is added to the inactive * list causing accounting errors and BUG_ON checks to trigger. */ for (i = folio_batch_count(fbatch) - 1; i >= 0; i--) { struct folio *batch_folio = fbatch->folios[i]; if (batch_folio == folio) { folio_set_active(folio); break; } } local_unlock(&cpu_fbatches.lock); } #ifdef CONFIG_LRU_GEN static void lru_gen_inc_refs(struct folio *folio) { unsigned long new_flags, old_flags = READ_ONCE(folio->flags); if (folio_test_unevictable(folio)) return; /* see the comment on LRU_REFS_FLAGS */ if (!folio_test_referenced(folio)) { set_mask_bits(&folio->flags, LRU_REFS_MASK, BIT(PG_referenced)); return; } do { if ((old_flags & LRU_REFS_MASK) == LRU_REFS_MASK) { if (!folio_test_workingset(folio)) folio_set_workingset(folio); return; } new_flags = old_flags + BIT(LRU_REFS_PGOFF); } while (!try_cmpxchg(&folio->flags, &old_flags, new_flags)); } static bool lru_gen_clear_refs(struct folio *folio) { struct lru_gen_folio *lrugen; int gen = folio_lru_gen(folio); int type = folio_is_file_lru(folio); if (gen < 0) return true; set_mask_bits(&folio->flags, LRU_REFS_FLAGS | BIT(PG_workingset), 0); lrugen = &folio_lruvec(folio)->lrugen; /* whether can do without shuffling under the LRU lock */ return gen == lru_gen_from_seq(READ_ONCE(lrugen->min_seq[type])); } #else /* !CONFIG_LRU_GEN */ static void lru_gen_inc_refs(struct folio *folio) { } static bool lru_gen_clear_refs(struct folio *folio) { return false; } #endif /* CONFIG_LRU_GEN */ /** * folio_mark_accessed - Mark a folio as having seen activity. * @folio: The folio to mark. * * This function will perform one of the following transitions: * * * inactive,unreferenced -> inactive,referenced * * inactive,referenced -> active,unreferenced * * active,unreferenced -> active,referenced * * When a newly allocated folio is not yet visible, so safe for non-atomic ops, * __folio_set_referenced() may be substituted for folio_mark_accessed(). */ void folio_mark_accessed(struct folio *folio) { if (folio_test_dropbehind(folio)) return; if (lru_gen_enabled()) { lru_gen_inc_refs(folio); return; } if (!folio_test_referenced(folio)) { folio_set_referenced(folio); } else if (folio_test_unevictable(folio)) { /* * Unevictable pages are on the "LRU_UNEVICTABLE" list. But, * this list is never rotated or maintained, so marking an * unevictable page accessed has no effect. */ } else if (!folio_test_active(folio)) { /* * If the folio is on the LRU, queue it for activation via * cpu_fbatches.lru_activate. Otherwise, assume the folio is in a * folio_batch, mark it active and it'll be moved to the active * LRU on the next drain. */ if (folio_test_lru(folio)) folio_activate(folio); else __lru_cache_activate_folio(folio); folio_clear_referenced(folio); workingset_activation(folio); } if (folio_test_idle(folio)) folio_clear_idle(folio); } EXPORT_SYMBOL(folio_mark_accessed); /** * folio_add_lru - Add a folio to an LRU list. * @folio: The folio to be added to the LRU. * * Queue the folio for addition to the LRU. The decision on whether * to add the page to the [in]active [file|anon] list is deferred until the * folio_batch is drained. This gives a chance for the caller of folio_add_lru() * have the folio added to the active list using folio_mark_accessed(). */ void folio_add_lru(struct folio *folio) { VM_BUG_ON_FOLIO(folio_test_active(folio) && folio_test_unevictable(folio), folio); VM_BUG_ON_FOLIO(folio_test_lru(folio), folio); /* see the comment in lru_gen_folio_seq() */ if (lru_gen_enabled() && !folio_test_unevictable(folio) && lru_gen_in_fault() && !(current->flags & PF_MEMALLOC)) folio_set_active(folio); folio_batch_add_and_move(folio, lru_add, false); } EXPORT_SYMBOL(folio_add_lru); /** * folio_add_lru_vma() - Add a folio to the appropate LRU list for this VMA. * @folio: The folio to be added to the LRU. * @vma: VMA in which the folio is mapped. * * If the VMA is mlocked, @folio is added to the unevictable list. * Otherwise, it is treated the same way as folio_add_lru(). */ void folio_add_lru_vma(struct folio *folio, struct vm_area_struct *vma) { VM_BUG_ON_FOLIO(folio_test_lru(folio), folio); if (unlikely((vma->vm_flags & (VM_LOCKED | VM_SPECIAL)) == VM_LOCKED)) mlock_new_folio(folio); else folio_add_lru(folio); } /* * If the folio cannot be invalidated, it is moved to the * inactive list to speed up its reclaim. It is moved to the * head of the list, rather than the tail, to give the flusher * threads some time to write it out, as this is much more * effective than the single-page writeout from reclaim. * * If the folio isn't mapped and dirty/writeback, the folio * could be reclaimed asap using the reclaim flag. * * 1. active, mapped folio -> none * 2. active, dirty/writeback folio -> inactive, head, reclaim * 3. inactive, mapped folio -> none * 4. inactive, dirty/writeback folio -> inactive, head, reclaim * 5. inactive, clean -> inactive, tail * 6. Others -> none * * In 4, it moves to the head of the inactive list so the folio is * written out by flusher threads as this is much more efficient * than the single-page writeout from reclaim. */ static void lru_deactivate_file(struct lruvec *lruvec, struct folio *folio) { bool active = folio_test_active(folio) || lru_gen_enabled(); long nr_pages = folio_nr_pages(folio); if (folio_test_unevictable(folio)) return; /* Some processes are using the folio */ if (folio_mapped(folio)) return; lruvec_del_folio(lruvec, folio); folio_clear_active(folio); folio_clear_referenced(folio); if (folio_test_writeback(folio) || folio_test_dirty(folio)) { /* * Setting the reclaim flag could race with * folio_end_writeback() and confuse readahead. But the * race window is _really_ small and it's not a critical * problem. */ lruvec_add_folio(lruvec, folio); folio_set_reclaim(folio); } else { /* * The folio's writeback ended while it was in the batch. * We move that folio to the tail of the inactive list. */ lruvec_add_folio_tail(lruvec, folio); __count_vm_events(PGROTATED, nr_pages); } if (active) { __count_vm_events(PGDEACTIVATE, nr_pages); count_memcg_events(lruvec_memcg(lruvec), PGDEACTIVATE, nr_pages); } } static void lru_deactivate(struct lruvec *lruvec, struct folio *folio) { long nr_pages = folio_nr_pages(folio); if (folio_test_unevictable(folio) || !(folio_test_active(folio) || lru_gen_enabled())) return; lruvec_del_folio(lruvec, folio); folio_clear_active(folio); folio_clear_referenced(folio); lruvec_add_folio(lruvec, folio); __count_vm_events(PGDEACTIVATE, nr_pages); count_memcg_events(lruvec_memcg(lruvec), PGDEACTIVATE, nr_pages); } static void lru_lazyfree(struct lruvec *lruvec, struct folio *folio) { long nr_pages = folio_nr_pages(folio); if (!folio_test_anon(folio) || !folio_test_swapbacked(folio) || folio_test_swapcache(folio) || folio_test_unevictable(folio)) return; lruvec_del_folio(lruvec, folio); folio_clear_active(folio); if (lru_gen_enabled()) lru_gen_clear_refs(folio); else folio_clear_referenced(folio); /* * Lazyfree folios are clean anonymous folios. They have * the swapbacked flag cleared, to distinguish them from normal * anonymous folios */ folio_clear_swapbacked(folio); lruvec_add_folio(lruvec, folio); __count_vm_events(PGLAZYFREE, nr_pages); count_memcg_events(lruvec_memcg(lruvec), PGLAZYFREE, nr_pages); } /* * Drain pages out of the cpu's folio_batch. * Either "cpu" is the current CPU, and preemption has already been * disabled; or "cpu" is being hot-unplugged, and is already dead. */ void lru_add_drain_cpu(int cpu) { struct cpu_fbatches *fbatches = &per_cpu(cpu_fbatches, cpu); struct folio_batch *fbatch = &fbatches->lru_add; if (folio_batch_count(fbatch)) folio_batch_move_lru(fbatch, lru_add); fbatch = &fbatches->lru_move_tail; /* Disabling interrupts below acts as a compiler barrier. */ if (data_race(folio_batch_count(fbatch))) { unsigned long flags; /* No harm done if a racing interrupt already did this */ local_lock_irqsave(&cpu_fbatches.lock_irq, flags); folio_batch_move_lru(fbatch, lru_move_tail); local_unlock_irqrestore(&cpu_fbatches.lock_irq, flags); } fbatch = &fbatches->lru_deactivate_file; if (folio_batch_count(fbatch)) folio_batch_move_lru(fbatch, lru_deactivate_file); fbatch = &fbatches->lru_deactivate; if (folio_batch_count(fbatch)) folio_batch_move_lru(fbatch, lru_deactivate); fbatch = &fbatches->lru_lazyfree; if (folio_batch_count(fbatch)) folio_batch_move_lru(fbatch, lru_lazyfree); folio_activate_drain(cpu); } /** * deactivate_file_folio() - Deactivate a file folio. * @folio: Folio to deactivate. * * This function hints to the VM that @folio is a good reclaim candidate, * for example if its invalidation fails due to the folio being dirty * or under writeback. * * Context: Caller holds a reference on the folio. */ void deactivate_file_folio(struct folio *folio) { /* Deactivating an unevictable folio will not accelerate reclaim */ if (folio_test_unevictable(folio)) return; if (lru_gen_enabled() && lru_gen_clear_refs(folio)) return; folio_batch_add_and_move(folio, lru_deactivate_file, true); } /* * folio_deactivate - deactivate a folio * @folio: folio to deactivate * * folio_deactivate() moves @folio to the inactive list if @folio was on the * active list and was not unevictable. This is done to accelerate the * reclaim of @folio. */ void folio_deactivate(struct folio *folio) { if (folio_test_unevictable(folio)) return; if (lru_gen_enabled() ? lru_gen_clear_refs(folio) : !folio_test_active(folio)) return; folio_batch_add_and_move(folio, lru_deactivate, true); } /** * folio_mark_lazyfree - make an anon folio lazyfree * @folio: folio to deactivate * * folio_mark_lazyfree() moves @folio to the inactive file list. * This is done to accelerate the reclaim of @folio. */ void folio_mark_lazyfree(struct folio *folio) { if (!folio_test_anon(folio) || !folio_test_swapbacked(folio) || folio_test_swapcache(folio) || folio_test_unevictable(folio)) return; folio_batch_add_and_move(folio, lru_lazyfree, true); } void lru_add_drain(void) { local_lock(&cpu_fbatches.lock); lru_add_drain_cpu(smp_processor_id()); local_unlock(&cpu_fbatches.lock); mlock_drain_local(); } /* * It's called from per-cpu workqueue context in SMP case so * lru_add_drain_cpu and invalidate_bh_lrus_cpu should run on * the same cpu. It shouldn't be a problem in !SMP case since * the core is only one and the locks will disable preemption. */ static void lru_add_and_bh_lrus_drain(void) { local_lock(&cpu_fbatches.lock); lru_add_drain_cpu(smp_processor_id()); local_unlock(&cpu_fbatches.lock); invalidate_bh_lrus_cpu(); mlock_drain_local(); } void lru_add_drain_cpu_zone(struct zone *zone) { local_lock(&cpu_fbatches.lock); lru_add_drain_cpu(smp_processor_id()); drain_local_pages(zone); local_unlock(&cpu_fbatches.lock); mlock_drain_local(); } #ifdef CONFIG_SMP static DEFINE_PER_CPU(struct work_struct, lru_add_drain_work); static void lru_add_drain_per_cpu(struct work_struct *dummy) { lru_add_and_bh_lrus_drain(); } static bool cpu_needs_drain(unsigned int cpu) { struct cpu_fbatches *fbatches = &per_cpu(cpu_fbatches, cpu); /* Check these in order of likelihood that they're not zero */ return folio_batch_count(&fbatches->lru_add) || folio_batch_count(&fbatches->lru_move_tail) || folio_batch_count(&fbatches->lru_deactivate_file) || folio_batch_count(&fbatches->lru_deactivate) || folio_batch_count(&fbatches->lru_lazyfree) || folio_batch_count(&fbatches->lru_activate) || need_mlock_drain(cpu) || has_bh_in_lru(cpu, NULL); } /* * Doesn't need any cpu hotplug locking because we do rely on per-cpu * kworkers being shut down before our page_alloc_cpu_dead callback is * executed on the offlined cpu. * Calling this function with cpu hotplug locks held can actually lead * to obscure indirect dependencies via WQ context. */ static inline void __lru_add_drain_all(bool force_all_cpus) { /* * lru_drain_gen - Global pages generation number * * (A) Definition: global lru_drain_gen = x implies that all generations * 0 < n <= x are already *scheduled* for draining. * * This is an optimization for the highly-contended use case where a * user space workload keeps constantly generating a flow of pages for * each CPU. */ static unsigned int lru_drain_gen; static struct cpumask has_work; static DEFINE_MUTEX(lock); unsigned cpu, this_gen; /* * Make sure nobody triggers this path before mm_percpu_wq is fully * initialized. */ if (WARN_ON(!mm_percpu_wq)) return; /* * Guarantee folio_batch counter stores visible by this CPU * are visible to other CPUs before loading the current drain * generation. */ smp_mb(); /* * (B) Locally cache global LRU draining generation number * * The read barrier ensures that the counter is loaded before the mutex * is taken. It pairs with smp_mb() inside the mutex critical section * at (D). */ this_gen = smp_load_acquire(&lru_drain_gen); mutex_lock(&lock); /* * (C) Exit the draining operation if a newer generation, from another * lru_add_drain_all(), was already scheduled for draining. Check (A). */ if (unlikely(this_gen != lru_drain_gen && !force_all_cpus)) goto done; /* * (D) Increment global generation number * * Pairs with smp_load_acquire() at (B), outside of the critical * section. Use a full memory barrier to guarantee that the * new global drain generation number is stored before loading * folio_batch counters. * * This pairing must be done here, before the for_each_online_cpu loop * below which drains the page vectors. * * Let x, y, and z represent some system CPU numbers, where x < y < z. * Assume CPU #z is in the middle of the for_each_online_cpu loop * below and has already reached CPU #y's per-cpu data. CPU #x comes * along, adds some pages to its per-cpu vectors, then calls * lru_add_drain_all(). * * If the paired barrier is done at any later step, e.g. after the * loop, CPU #x will just exit at (C) and miss flushing out all of its * added pages. */ WRITE_ONCE(lru_drain_gen, lru_drain_gen + 1); smp_mb(); cpumask_clear(&has_work); for_each_online_cpu(cpu) { struct work_struct *work = &per_cpu(lru_add_drain_work, cpu); if (cpu_needs_drain(cpu)) { INIT_WORK(work, lru_add_drain_per_cpu); queue_work_on(cpu, mm_percpu_wq, work); __cpumask_set_cpu(cpu, &has_work); } } for_each_cpu(cpu, &has_work) flush_work(&per_cpu(lru_add_drain_work, cpu)); done: mutex_unlock(&lock); } void lru_add_drain_all(void) { __lru_add_drain_all(false); } #else void lru_add_drain_all(void) { lru_add_drain(); } #endif /* CONFIG_SMP */ atomic_t lru_disable_count = ATOMIC_INIT(0); /* * lru_cache_disable() needs to be called before we start compiling * a list of folios to be migrated using folio_isolate_lru(). * It drains folios on LRU cache and then disable on all cpus until * lru_cache_enable is called. * * Must be paired with a call to lru_cache_enable(). */ void lru_cache_disable(void) { atomic_inc(&lru_disable_count); /* * Readers of lru_disable_count are protected by either disabling * preemption or rcu_read_lock: * * preempt_disable, local_irq_disable [bh_lru_lock()] * rcu_read_lock [rt_spin_lock CONFIG_PREEMPT_RT] * preempt_disable [local_lock !CONFIG_PREEMPT_RT] * * Since v5.1 kernel, synchronize_rcu() is guaranteed to wait on * preempt_disable() regions of code. So any CPU which sees * lru_disable_count = 0 will have exited the critical * section when synchronize_rcu() returns. */ synchronize_rcu_expedited(); #ifdef CONFIG_SMP __lru_add_drain_all(true); #else lru_add_and_bh_lrus_drain(); #endif } /** * folios_put_refs - Reduce the reference count on a batch of folios. * @folios: The folios. * @refs: The number of refs to subtract from each folio. * * Like folio_put(), but for a batch of folios. This is more efficient * than writing the loop yourself as it will optimise the locks which need * to be taken if the folios are freed. The folios batch is returned * empty and ready to be reused for another batch; there is no need * to reinitialise it. If @refs is NULL, we subtract one from each * folio refcount. * * Context: May be called in process or interrupt context, but not in NMI * context. May be called while holding a spinlock. */ void folios_put_refs(struct folio_batch *folios, unsigned int *refs) { int i, j; struct lruvec *lruvec = NULL; unsigned long flags = 0; for (i = 0, j = 0; i < folios->nr; i++) { struct folio *folio = folios->folios[i]; unsigned int nr_refs = refs ? refs[i] : 1; if (is_huge_zero_folio(folio)) continue; if (folio_is_zone_device(folio)) { if (lruvec) { unlock_page_lruvec_irqrestore(lruvec, flags); lruvec = NULL; } if (folio_ref_sub_and_test(folio, nr_refs)) free_zone_device_folio(folio); continue; } if (!folio_ref_sub_and_test(folio, nr_refs)) continue; /* hugetlb has its own memcg */ if (folio_test_hugetlb(folio)) { if (lruvec) { unlock_page_lruvec_irqrestore(lruvec, flags); lruvec = NULL; } free_huge_folio(folio); continue; } folio_unqueue_deferred_split(folio); __page_cache_release(folio, &lruvec, &flags); if (j != i) folios->folios[j] = folio; j++; } if (lruvec) unlock_page_lruvec_irqrestore(lruvec, flags); if (!j) { folio_batch_reinit(folios); return; } folios->nr = j; mem_cgroup_uncharge_folios(folios); free_unref_folios(folios); } EXPORT_SYMBOL(folios_put_refs); /** * release_pages - batched put_page() * @arg: array of pages to release * @nr: number of pages * * Decrement the reference count on all the pages in @arg. If it * fell to zero, remove the page from the LRU and free it. * * Note that the argument can be an array of pages, encoded pages, * or folio pointers. We ignore any encoded bits, and turn any of * them into just a folio that gets free'd. */ void release_pages(release_pages_arg arg, int nr) { struct folio_batch fbatch; int refs[PAGEVEC_SIZE]; struct encoded_page **encoded = arg.encoded_pages; int i; folio_batch_init(&fbatch); for (i = 0; i < nr; i++) { /* Turn any of the argument types into a folio */ struct folio *folio = page_folio(encoded_page_ptr(encoded[i])); /* Is our next entry actually "nr_pages" -> "nr_refs" ? */ refs[fbatch.nr] = 1; if (unlikely(encoded_page_flags(encoded[i]) & ENCODED_PAGE_BIT_NR_PAGES_NEXT)) refs[fbatch.nr] = encoded_nr_pages(encoded[++i]); if (folio_batch_add(&fbatch, folio) > 0) continue; folios_put_refs(&fbatch, refs); } if (fbatch.nr) folios_put_refs(&fbatch, refs); } EXPORT_SYMBOL(release_pages); /* * The folios which we're about to release may be in the deferred lru-addition * queues. That would prevent them from really being freed right now. That's * OK from a correctness point of view but is inefficient - those folios may be * cache-warm and we want to give them back to the page allocator ASAP. * * So __folio_batch_release() will drain those queues here. * folio_batch_move_lru() calls folios_put() directly to avoid * mutual recursion. */ void __folio_batch_release(struct folio_batch *fbatch) { if (!fbatch->percpu_pvec_drained) { lru_add_drain(); fbatch->percpu_pvec_drained = true; } folios_put(fbatch); } EXPORT_SYMBOL(__folio_batch_release); /** * folio_batch_remove_exceptionals() - Prune non-folios from a batch. * @fbatch: The batch to prune * * find_get_entries() fills a batch with both folios and shadow/swap/DAX * entries. This function prunes all the non-folio entries from @fbatch * without leaving holes, so that it can be passed on to folio-only batch * operations. */ void folio_batch_remove_exceptionals(struct folio_batch *fbatch) { unsigned int i, j; for (i = 0, j = 0; i < folio_batch_count(fbatch); i++) { struct folio *folio = fbatch->folios[i]; if (!xa_is_value(folio)) fbatch->folios[j++] = folio; } fbatch->nr = j; } static const struct ctl_table swap_sysctl_table[] = { { .procname = "page-cluster", .data = &page_cluster, .maxlen = sizeof(int), .mode = 0644, .proc_handler = proc_dointvec_minmax, .extra1 = SYSCTL_ZERO, .extra2 = (void *)&page_cluster_max, } }; /* * Perform any setup for the swap system */ void __init swap_setup(void) { unsigned long megs = totalram_pages() >> (20 - PAGE_SHIFT); /* Use a smaller cluster for small-memory machines */ if (megs < 16) page_cluster = 2; else page_cluster = 3; /* * Right now other parts of the system means that we * _really_ don't want to cluster much more */ register_sysctl_init("vm", swap_sysctl_table); }
1 1 181 181 4 55 55 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 /* SPDX-License-Identifier: GPL-2.0 */ #ifndef __ASM_GENERIC_PGALLOC_H #define __ASM_GENERIC_PGALLOC_H #ifdef CONFIG_MMU #define GFP_PGTABLE_KERNEL (GFP_KERNEL | __GFP_ZERO) #define GFP_PGTABLE_USER (GFP_PGTABLE_KERNEL | __GFP_ACCOUNT) /** * __pte_alloc_one_kernel - allocate memory for a PTE-level kernel page table * @mm: the mm_struct of the current context * * This function is intended for architectures that need * anything beyond simple page allocation. * * Return: pointer to the allocated memory or %NULL on error */ static inline pte_t *__pte_alloc_one_kernel_noprof(struct mm_struct *mm) { struct ptdesc *ptdesc = pagetable_alloc_noprof(GFP_PGTABLE_KERNEL & ~__GFP_HIGHMEM, 0); if (!ptdesc) return NULL; if (!pagetable_pte_ctor(mm, ptdesc)) { pagetable_free(ptdesc); return NULL; } return ptdesc_address(ptdesc); } #define __pte_alloc_one_kernel(...) alloc_hooks(__pte_alloc_one_kernel_noprof(__VA_ARGS__)) #ifndef __HAVE_ARCH_PTE_ALLOC_ONE_KERNEL /** * pte_alloc_one_kernel - allocate memory for a PTE-level kernel page table * @mm: the mm_struct of the current context * * Return: pointer to the allocated memory or %NULL on error */ static inline pte_t *pte_alloc_one_kernel_noprof(struct mm_struct *mm) { return __pte_alloc_one_kernel_noprof(mm); } #define pte_alloc_one_kernel(...) alloc_hooks(pte_alloc_one_kernel_noprof(__VA_ARGS__)) #endif /** * pte_free_kernel - free PTE-level kernel page table memory * @mm: the mm_struct of the current context * @pte: pointer to the memory containing the page table */ static inline void pte_free_kernel(struct mm_struct *mm, pte_t *pte) { pagetable_dtor_free(virt_to_ptdesc(pte)); } /** * __pte_alloc_one - allocate memory for a PTE-level user page table * @mm: the mm_struct of the current context * @gfp: GFP flags to use for the allocation * * Allocate memory for a page table and ptdesc and runs pagetable_pte_ctor(). * * This function is intended for architectures that need * anything beyond simple page allocation or must have custom GFP flags. * * Return: `struct page` referencing the ptdesc or %NULL on error */ static inline pgtable_t __pte_alloc_one_noprof(struct mm_struct *mm, gfp_t gfp) { struct ptdesc *ptdesc; ptdesc = pagetable_alloc_noprof(gfp, 0); if (!ptdesc) return NULL; if (!pagetable_pte_ctor(mm, ptdesc)) { pagetable_free(ptdesc); return NULL; } return ptdesc_page(ptdesc); } #define __pte_alloc_one(...) alloc_hooks(__pte_alloc_one_noprof(__VA_ARGS__)) #ifndef __HAVE_ARCH_PTE_ALLOC_ONE /** * pte_alloc_one - allocate a page for PTE-level user page table * @mm: the mm_struct of the current context * * Allocate memory for a page table and ptdesc and runs pagetable_pte_ctor(). * * Return: `struct page` referencing the ptdesc or %NULL on error */ static inline pgtable_t pte_alloc_one_noprof(struct mm_struct *mm) { return __pte_alloc_one_noprof(mm, GFP_PGTABLE_USER); } #define pte_alloc_one(...) alloc_hooks(pte_alloc_one_noprof(__VA_ARGS__)) #endif /* * Should really implement gc for free page table pages. This could be * done with a reference count in struct page. */ /** * pte_free - free PTE-level user page table memory * @mm: the mm_struct of the current context * @pte_page: the `struct page` referencing the ptdesc */ static inline void pte_free(struct mm_struct *mm, struct page *pte_page) { struct ptdesc *ptdesc = page_ptdesc(pte_page); pagetable_dtor_free(ptdesc); } #if CONFIG_PGTABLE_LEVELS > 2 #ifndef __HAVE_ARCH_PMD_ALLOC_ONE /** * pmd_alloc_one - allocate memory for a PMD-level page table * @mm: the mm_struct of the current context * * Allocate memory for a page table and ptdesc and runs pagetable_pmd_ctor(). * * Allocations use %GFP_PGTABLE_USER in user context and * %GFP_PGTABLE_KERNEL in kernel context. * * Return: pointer to the allocated memory or %NULL on error */ static inline pmd_t *pmd_alloc_one_noprof(struct mm_struct *mm, unsigned long addr) { struct ptdesc *ptdesc; gfp_t gfp = GFP_PGTABLE_USER; if (mm == &init_mm) gfp = GFP_PGTABLE_KERNEL; ptdesc = pagetable_alloc_noprof(gfp, 0); if (!ptdesc) return NULL; if (!pagetable_pmd_ctor(mm, ptdesc)) { pagetable_free(ptdesc); return NULL; } return ptdesc_address(ptdesc); } #define pmd_alloc_one(...) alloc_hooks(pmd_alloc_one_noprof(__VA_ARGS__)) #endif #ifndef __HAVE_ARCH_PMD_FREE static inline void pmd_free(struct mm_struct *mm, pmd_t *pmd) { struct ptdesc *ptdesc = virt_to_ptdesc(pmd); BUG_ON((unsigned long)pmd & (PAGE_SIZE-1)); pagetable_dtor_free(ptdesc); } #endif #endif /* CONFIG_PGTABLE_LEVELS > 2 */ #if CONFIG_PGTABLE_LEVELS > 3 static inline pud_t *__pud_alloc_one_noprof(struct mm_struct *mm, unsigned long addr) { gfp_t gfp = GFP_PGTABLE_USER; struct ptdesc *ptdesc; if (mm == &init_mm) gfp = GFP_PGTABLE_KERNEL; gfp &= ~__GFP_HIGHMEM; ptdesc = pagetable_alloc_noprof(gfp, 0); if (!ptdesc) return NULL; pagetable_pud_ctor(ptdesc); return ptdesc_address(ptdesc); } #define __pud_alloc_one(...) alloc_hooks(__pud_alloc_one_noprof(__VA_ARGS__)) #ifndef __HAVE_ARCH_PUD_ALLOC_ONE /** * pud_alloc_one - allocate memory for a PUD-level page table * @mm: the mm_struct of the current context * * Allocate memory for a page table using %GFP_PGTABLE_USER for user context * and %GFP_PGTABLE_KERNEL for kernel context. * * Return: pointer to the allocated memory or %NULL on error */ static inline pud_t *pud_alloc_one_noprof(struct mm_struct *mm, unsigned long addr) { return __pud_alloc_one_noprof(mm, addr); } #define pud_alloc_one(...) alloc_hooks(pud_alloc_one_noprof(__VA_ARGS__)) #endif static inline void __pud_free(struct mm_struct *mm, pud_t *pud) { struct ptdesc *ptdesc = virt_to_ptdesc(pud); BUG_ON((unsigned long)pud & (PAGE_SIZE-1)); pagetable_dtor_free(ptdesc); } #ifndef __HAVE_ARCH_PUD_FREE static inline void pud_free(struct mm_struct *mm, pud_t *pud) { __pud_free(mm, pud); } #endif #endif /* CONFIG_PGTABLE_LEVELS > 3 */ #if CONFIG_PGTABLE_LEVELS > 4 static inline p4d_t *__p4d_alloc_one_noprof(struct mm_struct *mm, unsigned long addr) { gfp_t gfp = GFP_PGTABLE_USER; struct ptdesc *ptdesc; if (mm == &init_mm) gfp = GFP_PGTABLE_KERNEL; gfp &= ~__GFP_HIGHMEM; ptdesc = pagetable_alloc_noprof(gfp, 0); if (!ptdesc) return NULL; pagetable_p4d_ctor(ptdesc); return ptdesc_address(ptdesc); } #define __p4d_alloc_one(...) alloc_hooks(__p4d_alloc_one_noprof(__VA_ARGS__)) #ifndef __HAVE_ARCH_P4D_ALLOC_ONE static inline p4d_t *p4d_alloc_one_noprof(struct mm_struct *mm, unsigned long addr) { return __p4d_alloc_one_noprof(mm, addr); } #define p4d_alloc_one(...) alloc_hooks(p4d_alloc_one_noprof(__VA_ARGS__)) #endif static inline void __p4d_free(struct mm_struct *mm, p4d_t *p4d) { struct ptdesc *ptdesc = virt_to_ptdesc(p4d); BUG_ON((unsigned long)p4d & (PAGE_SIZE-1)); pagetable_dtor_free(ptdesc); } #ifndef __HAVE_ARCH_P4D_FREE static inline void p4d_free(struct mm_struct *mm, p4d_t *p4d) { if (!mm_p4d_folded(mm)) __p4d_free(mm, p4d); } #endif #endif /* CONFIG_PGTABLE_LEVELS > 4 */ static inline pgd_t *__pgd_alloc_noprof(struct mm_struct *mm, unsigned int order) { gfp_t gfp = GFP_PGTABLE_USER; struct ptdesc *ptdesc; if (mm == &init_mm) gfp = GFP_PGTABLE_KERNEL; gfp &= ~__GFP_HIGHMEM; ptdesc = pagetable_alloc_noprof(gfp, order); if (!ptdesc) return NULL; pagetable_pgd_ctor(ptdesc); return ptdesc_address(ptdesc); } #define __pgd_alloc(...) alloc_hooks(__pgd_alloc_noprof(__VA_ARGS__)) static inline void __pgd_free(struct mm_struct *mm, pgd_t *pgd) { struct ptdesc *ptdesc = virt_to_ptdesc(pgd); BUG_ON((unsigned long)pgd & (PAGE_SIZE-1)); pagetable_dtor_free(ptdesc); } #ifndef __HAVE_ARCH_PGD_FREE static inline void pgd_free(struct mm_struct *mm, pgd_t *pgd) { __pgd_free(mm, pgd); } #endif #endif /* CONFIG_MMU */ #endif /* __ASM_GENERIC_PGALLOC_H */
305 305 305 305 304 305 305 303 306 1 304 306 11 11 1 1 135 135 203 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 // SPDX-License-Identifier: GPL-2.0-only /* * Copyright (C) 2015 - ARM Ltd * Author: Marc Zyngier <marc.zyngier@arm.com> */ #include <linux/irqflags.h> #include <asm/kvm_hyp.h> #include <asm/kvm_mmu.h> #include <asm/tlbflush.h> struct tlb_inv_context { struct kvm_s2_mmu *mmu; unsigned long flags; u64 tcr; u64 sctlr; }; static void enter_vmid_context(struct kvm_s2_mmu *mmu, struct tlb_inv_context *cxt) { struct kvm_vcpu *vcpu = kvm_get_running_vcpu(); u64 val; local_irq_save(cxt->flags); if (vcpu && mmu != vcpu->arch.hw_mmu) cxt->mmu = vcpu->arch.hw_mmu; else cxt->mmu = NULL; if (cpus_have_final_cap(ARM64_WORKAROUND_SPECULATIVE_AT)) { /* * For CPUs that are affected by ARM errata 1165522 or 1530923, * we cannot trust stage-1 to be in a correct state at that * point. Since we do not want to force a full load of the * vcpu state, we prevent the EL1 page-table walker to * allocate new TLBs. This is done by setting the EPD bits * in the TCR_EL1 register. We also need to prevent it to * allocate IPA->PA walks, so we enable the S1 MMU... */ val = cxt->tcr = read_sysreg_el1(SYS_TCR); val |= TCR_EPD1_MASK | TCR_EPD0_MASK; write_sysreg_el1(val, SYS_TCR); val = cxt->sctlr = read_sysreg_el1(SYS_SCTLR); val |= SCTLR_ELx_M; write_sysreg_el1(val, SYS_SCTLR); } /* * With VHE enabled, we have HCR_EL2.{E2H,TGE} = {1,1}, and * most TLB operations target EL2/EL0. In order to affect the * guest TLBs (EL1/EL0), we need to change one of these two * bits. Changing E2H is impossible (goodbye TTBR1_EL2), so * let's flip TGE before executing the TLB operation. * * ARM erratum 1165522 requires some special handling (again), * as we need to make sure both stages of translation are in * place before clearing TGE. __load_stage2() already * has an ISB in order to deal with this. */ __load_stage2(mmu, mmu->arch); val = read_sysreg(hcr_el2); val &= ~HCR_TGE; write_sysreg_hcr(val); isb(); } static void exit_vmid_context(struct tlb_inv_context *cxt) { /* * We're done with the TLB operation, let's restore the host's * view of HCR_EL2. */ write_sysreg_hcr(HCR_HOST_VHE_FLAGS); isb(); /* ... and the stage-2 MMU context that we switched away from */ if (cxt->mmu) __load_stage2(cxt->mmu, cxt->mmu->arch); if (cpus_have_final_cap(ARM64_WORKAROUND_SPECULATIVE_AT)) { /* Restore the registers to what they were */ write_sysreg_el1(cxt->tcr, SYS_TCR); write_sysreg_el1(cxt->sctlr, SYS_SCTLR); } local_irq_restore(cxt->flags); } void __kvm_tlb_flush_vmid_ipa(struct kvm_s2_mmu *mmu, phys_addr_t ipa, int level) { struct tlb_inv_context cxt; dsb(ishst); /* Switch to requested VMID */ enter_vmid_context(mmu, &cxt); /* * We could do so much better if we had the VA as well. * Instead, we invalidate Stage-2 for this IPA, and the * whole of Stage-1. Weep... */ ipa >>= 12; __tlbi_level(ipas2e1is, ipa, level); /* * We have to ensure completion of the invalidation at Stage-2, * since a table walk on another CPU could refill a TLB with a * complete (S1 + S2) walk based on the old Stage-2 mapping if * the Stage-1 invalidation happened first. */ dsb(ish); __tlbi(vmalle1is); dsb(ish); isb(); exit_vmid_context(&cxt); } void __kvm_tlb_flush_vmid_ipa_nsh(struct kvm_s2_mmu *mmu, phys_addr_t ipa, int level) { struct tlb_inv_context cxt; dsb(nshst); /* Switch to requested VMID */ enter_vmid_context(mmu, &cxt); /* * We could do so much better if we had the VA as well. * Instead, we invalidate Stage-2 for this IPA, and the * whole of Stage-1. Weep... */ ipa >>= 12; __tlbi_level(ipas2e1, ipa, level); /* * We have to ensure completion of the invalidation at Stage-2, * since a table walk on another CPU could refill a TLB with a * complete (S1 + S2) walk based on the old Stage-2 mapping if * the Stage-1 invalidation happened first. */ dsb(nsh); __tlbi(vmalle1); dsb(nsh); isb(); exit_vmid_context(&cxt); } void __kvm_tlb_flush_vmid_range(struct kvm_s2_mmu *mmu, phys_addr_t start, unsigned long pages) { struct tlb_inv_context cxt; unsigned long stride; /* * Since the range of addresses may not be mapped at * the same level, assume the worst case as PAGE_SIZE */ stride = PAGE_SIZE; start = round_down(start, stride); dsb(ishst); /* Switch to requested VMID */ enter_vmid_context(mmu, &cxt); __flush_s2_tlb_range_op(ipas2e1is, start, pages, stride, TLBI_TTL_UNKNOWN); dsb(ish); __tlbi(vmalle1is); dsb(ish); isb(); exit_vmid_context(&cxt); } void __kvm_tlb_flush_vmid(struct kvm_s2_mmu *mmu) { struct tlb_inv_context cxt; dsb(ishst); /* Switch to requested VMID */ enter_vmid_context(mmu, &cxt); __tlbi(vmalls12e1is); dsb(ish); isb(); exit_vmid_context(&cxt); } void __kvm_flush_cpu_context(struct kvm_s2_mmu *mmu) { struct tlb_inv_context cxt; /* Switch to requested VMID */ enter_vmid_context(mmu, &cxt); __tlbi(vmalle1); asm volatile("ic iallu"); dsb(nsh); isb(); exit_vmid_context(&cxt); } void __kvm_flush_vm_context(void) { dsb(ishst); __tlbi(alle1is); dsb(ish); } /* * TLB invalidation emulation for NV. For any given instruction, we * perform the following transformtions: * * - a TLBI targeting EL2 S1 is remapped to EL1 S1 * - a non-shareable TLBI is upgraded to being inner-shareable * - an outer-shareable TLBI is also mapped to inner-shareable * - an nXS TLBI is upgraded to XS */ int __kvm_tlbi_s1e2(struct kvm_s2_mmu *mmu, u64 va, u64 sys_encoding) { struct tlb_inv_context cxt; int ret = 0; /* * The guest will have provided its own DSB ISHST before trapping. * If it hasn't, that's its own problem, and we won't paper over it * (plus, there is plenty of extra synchronisation before we even * get here...). */ if (mmu) enter_vmid_context(mmu, &cxt); switch (sys_encoding) { case OP_TLBI_ALLE2: case OP_TLBI_ALLE2IS: case OP_TLBI_ALLE2OS: case OP_TLBI_VMALLE1: case OP_TLBI_VMALLE1IS: case OP_TLBI_VMALLE1OS: case OP_TLBI_ALLE2NXS: case OP_TLBI_ALLE2ISNXS: case OP_TLBI_ALLE2OSNXS: case OP_TLBI_VMALLE1NXS: case OP_TLBI_VMALLE1ISNXS: case OP_TLBI_VMALLE1OSNXS: __tlbi(vmalle1is); break; case OP_TLBI_VAE2: case OP_TLBI_VAE2IS: case OP_TLBI_VAE2OS: case OP_TLBI_VAE1: case OP_TLBI_VAE1IS: case OP_TLBI_VAE1OS: case OP_TLBI_VAE2NXS: case OP_TLBI_VAE2ISNXS: case OP_TLBI_VAE2OSNXS: case OP_TLBI_VAE1NXS: case OP_TLBI_VAE1ISNXS: case OP_TLBI_VAE1OSNXS: __tlbi(vae1is, va); break; case OP_TLBI_VALE2: case OP_TLBI_VALE2IS: case OP_TLBI_VALE2OS: case OP_TLBI_VALE1: case OP_TLBI_VALE1IS: case OP_TLBI_VALE1OS: case OP_TLBI_VALE2NXS: case OP_TLBI_VALE2ISNXS: case OP_TLBI_VALE2OSNXS: case OP_TLBI_VALE1NXS: case OP_TLBI_VALE1ISNXS: case OP_TLBI_VALE1OSNXS: __tlbi(vale1is, va); break; case OP_TLBI_ASIDE1: case OP_TLBI_ASIDE1IS: case OP_TLBI_ASIDE1OS: case OP_TLBI_ASIDE1NXS: case OP_TLBI_ASIDE1ISNXS: case OP_TLBI_ASIDE1OSNXS: __tlbi(aside1is, va); break; case OP_TLBI_VAAE1: case OP_TLBI_VAAE1IS: case OP_TLBI_VAAE1OS: case OP_TLBI_VAAE1NXS: case OP_TLBI_VAAE1ISNXS: case OP_TLBI_VAAE1OSNXS: __tlbi(vaae1is, va); break; case OP_TLBI_VAALE1: case OP_TLBI_VAALE1IS: case OP_TLBI_VAALE1OS: case OP_TLBI_VAALE1NXS: case OP_TLBI_VAALE1ISNXS: case OP_TLBI_VAALE1OSNXS: __tlbi(vaale1is, va); break; case OP_TLBI_RVAE2: case OP_TLBI_RVAE2IS: case OP_TLBI_RVAE2OS: case OP_TLBI_RVAE1: case OP_TLBI_RVAE1IS: case OP_TLBI_RVAE1OS: case OP_TLBI_RVAE2NXS: case OP_TLBI_RVAE2ISNXS: case OP_TLBI_RVAE2OSNXS: case OP_TLBI_RVAE1NXS: case OP_TLBI_RVAE1ISNXS: case OP_TLBI_RVAE1OSNXS: __tlbi(rvae1is, va); break; case OP_TLBI_RVALE2: case OP_TLBI_RVALE2IS: case OP_TLBI_RVALE2OS: case OP_TLBI_RVALE1: case OP_TLBI_RVALE1IS: case OP_TLBI_RVALE1OS: case OP_TLBI_RVALE2NXS: case OP_TLBI_RVALE2ISNXS: case OP_TLBI_RVALE2OSNXS: case OP_TLBI_RVALE1NXS: case OP_TLBI_RVALE1ISNXS: case OP_TLBI_RVALE1OSNXS: __tlbi(rvale1is, va); break; case OP_TLBI_RVAAE1: case OP_TLBI_RVAAE1IS: case OP_TLBI_RVAAE1OS: case OP_TLBI_RVAAE1NXS: case OP_TLBI_RVAAE1ISNXS: case OP_TLBI_RVAAE1OSNXS: __tlbi(rvaae1is, va); break; case OP_TLBI_RVAALE1: case OP_TLBI_RVAALE1IS: case OP_TLBI_RVAALE1OS: case OP_TLBI_RVAALE1NXS: case OP_TLBI_RVAALE1ISNXS: case OP_TLBI_RVAALE1OSNXS: __tlbi(rvaale1is, va); break; default: ret = -EINVAL; } dsb(ish); isb(); if (mmu) exit_vmid_context(&cxt); return ret; }
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 /* SPDX-License-Identifier: GPL-2.0 */ #ifndef _ASM_GENERIC_HUGETLB_H #define _ASM_GENERIC_HUGETLB_H #include <linux/swap.h> #include <linux/swapops.h> static inline unsigned long huge_pte_write(pte_t pte) { return pte_write(pte); } static inline unsigned long huge_pte_dirty(pte_t pte) { return pte_dirty(pte); } static inline pte_t huge_pte_mkwrite(pte_t pte) { return pte_mkwrite_novma(pte); } #ifndef __HAVE_ARCH_HUGE_PTE_WRPROTECT static inline pte_t huge_pte_wrprotect(pte_t pte) { return pte_wrprotect(pte); } #endif static inline pte_t huge_pte_mkdirty(pte_t pte) { return pte_mkdirty(pte); } static inline pte_t huge_pte_modify(pte_t pte, pgprot_t newprot) { return pte_modify(pte, newprot); } #ifndef __HAVE_ARCH_HUGE_PTE_MKUFFD_WP static inline pte_t huge_pte_mkuffd_wp(pte_t pte) { return huge_pte_wrprotect(pte_mkuffd_wp(pte)); } #endif #ifndef __HAVE_ARCH_HUGE_PTE_CLEAR_UFFD_WP static inline pte_t huge_pte_clear_uffd_wp(pte_t pte) { return pte_clear_uffd_wp(pte); } #endif #ifndef __HAVE_ARCH_HUGE_PTE_UFFD_WP static inline int huge_pte_uffd_wp(pte_t pte) { return pte_uffd_wp(pte); } #endif #ifndef __HAVE_ARCH_HUGE_PTE_CLEAR static inline void huge_pte_clear(struct mm_struct *mm, unsigned long addr, pte_t *ptep, unsigned long sz) { pte_clear(mm, addr, ptep); } #endif #ifndef __HAVE_ARCH_HUGE_SET_HUGE_PTE_AT static inline void set_huge_pte_at(struct mm_struct *mm, unsigned long addr, pte_t *ptep, pte_t pte, unsigned long sz) { set_pte_at(mm, addr, ptep, pte); } #endif #ifndef __HAVE_ARCH_HUGE_PTEP_GET_AND_CLEAR static inline pte_t huge_ptep_get_and_clear(struct mm_struct *mm, unsigned long addr, pte_t *ptep, unsigned long sz) { return ptep_get_and_clear(mm, addr, ptep); } #endif #ifndef __HAVE_ARCH_HUGE_PTEP_CLEAR_FLUSH static inline pte_t huge_ptep_clear_flush(struct vm_area_struct *vma, unsigned long addr, pte_t *ptep) { return ptep_clear_flush(vma, addr, ptep); } #endif #ifndef __HAVE_ARCH_HUGE_PTE_NONE static inline int huge_pte_none(pte_t pte) { return pte_none(pte); } #endif /* Please refer to comments above pte_none_mostly() for the usage */ #ifndef __HAVE_ARCH_HUGE_PTE_NONE_MOSTLY static inline int huge_pte_none_mostly(pte_t pte) { return huge_pte_none(pte) || is_pte_marker(pte); } #endif #ifndef __HAVE_ARCH_HUGE_PTEP_SET_WRPROTECT static inline void huge_ptep_set_wrprotect(struct mm_struct *mm, unsigned long addr, pte_t *ptep) { ptep_set_wrprotect(mm, addr, ptep); } #endif #ifndef __HAVE_ARCH_HUGE_PTEP_SET_ACCESS_FLAGS static inline int huge_ptep_set_access_flags(struct vm_area_struct *vma, unsigned long addr, pte_t *ptep, pte_t pte, int dirty) { return ptep_set_access_flags(vma, addr, ptep, pte, dirty); } #endif #ifndef __HAVE_ARCH_HUGE_PTEP_GET static inline pte_t huge_ptep_get(struct mm_struct *mm, unsigned long addr, pte_t *ptep) { return ptep_get(ptep); } #endif #ifndef __HAVE_ARCH_GIGANTIC_PAGE_RUNTIME_SUPPORTED static inline bool gigantic_page_runtime_supported(void) { return IS_ENABLED(CONFIG_ARCH_HAS_GIGANTIC_PAGE); } #endif /* __HAVE_ARCH_GIGANTIC_PAGE_RUNTIME_SUPPORTED */ #endif /* _ASM_GENERIC_HUGETLB_H */
9 9 9 9 9 4 5 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 // SPDX-License-Identifier: GPL-2.0-or-later /* * INET An implementation of the TCP/IP protocol suite for the LINUX * operating system. INET is implemented using the BSD Socket * interface as the means of communication with the user level. * * Ethernet-type device handling. * * Version: @(#)eth.c 1.0.7 05/25/93 * * Authors: Ross Biro * Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG> * Mark Evans, <evansmp@uhura.aston.ac.uk> * Florian La Roche, <rzsfl@rz.uni-sb.de> * Alan Cox, <gw4pts@gw4pts.ampr.org> * * Fixes: * Mr Linux : Arp problems * Alan Cox : Generic queue tidyup (very tiny here) * Alan Cox : eth_header ntohs should be htons * Alan Cox : eth_rebuild_header missing an htons and * minor other things. * Tegge : Arp bug fixes. * Florian : Removed many unnecessary functions, code cleanup * and changes for new arp and skbuff. * Alan Cox : Redid header building to reflect new format. * Alan Cox : ARP only when compiled with CONFIG_INET * Greg Page : 802.2 and SNAP stuff. * Alan Cox : MAC layer pointers/new format. * Paul Gortmaker : eth_copy_and_sum shouldn't csum padding. * Alan Cox : Protect against forwarding explosions with * older network drivers and IFF_ALLMULTI. * Christer Weinigel : Better rebuild header message. * Andrew Morton : 26Feb01: kill ether_setup() - use netdev_boot_setup(). */ #include <linux/module.h> #include <linux/types.h> #include <linux/kernel.h> #include <linux/string.h> #include <linux/mm.h> #include <linux/socket.h> #include <linux/in.h> #include <linux/inet.h> #include <linux/ip.h> #include <linux/netdevice.h> #include <linux/nvmem-consumer.h> #include <linux/etherdevice.h> #include <linux/skbuff.h> #include <linux/errno.h> #include <linux/init.h> #include <linux/if_ether.h> #include <linux/of_net.h> #include <linux/pci.h> #include <linux/property.h> #include <net/dst.h> #include <net/arp.h> #include <net/sock.h> #include <net/ipv6.h> #include <net/ip.h> #include <net/dsa.h> #include <net/flow_dissector.h> #include <net/gro.h> #include <linux/uaccess.h> #include <net/pkt_sched.h> /** * eth_header - create the Ethernet header * @skb: buffer to alter * @dev: source device * @type: Ethernet type field * @daddr: destination address (NULL leave destination address) * @saddr: source address (NULL use device source address) * @len: packet length (<= skb->len) * * * Set the protocol type. For a packet of type ETH_P_802_3/2 we put the length * in here instead. */ int eth_header(struct sk_buff *skb, struct net_device *dev, unsigned short type, const void *daddr, const void *saddr, unsigned int len) { struct ethhdr *eth = skb_push(skb, ETH_HLEN); if (type != ETH_P_802_3 && type != ETH_P_802_2) eth->h_proto = htons(type); else eth->h_proto = htons(len); /* * Set the source hardware address. */ if (!saddr) saddr = dev->dev_addr; memcpy(eth->h_source, saddr, ETH_ALEN); if (daddr) { memcpy(eth->h_dest, daddr, ETH_ALEN); return ETH_HLEN; } /* * Anyway, the loopback-device should never use this function... */ if (dev->flags & (IFF_LOOPBACK | IFF_NOARP)) { eth_zero_addr(eth->h_dest); return ETH_HLEN; } return -ETH_HLEN; } EXPORT_SYMBOL(eth_header); /** * eth_get_headlen - determine the length of header for an ethernet frame * @dev: pointer to network device * @data: pointer to start of frame * @len: total length of frame * * Make a best effort attempt to pull the length for all of the headers for * a given frame in a linear buffer. */ u32 eth_get_headlen(const struct net_device *dev, const void *data, u32 len) { const unsigned int flags = FLOW_DISSECTOR_F_PARSE_1ST_FRAG; const struct ethhdr *eth = (const struct ethhdr *)data; struct flow_keys_basic keys; /* this should never happen, but better safe than sorry */ if (unlikely(len < sizeof(*eth))) return len; /* parse any remaining L2/L3 headers, check for L4 */ if (!skb_flow_dissect_flow_keys_basic(dev_net(dev), NULL, &keys, data, eth->h_proto, sizeof(*eth), len, flags)) return max_t(u32, keys.control.thoff, sizeof(*eth)); /* parse for any L4 headers */ return min_t(u32, __skb_get_poff(NULL, data, &keys, len), len); } EXPORT_SYMBOL(eth_get_headlen); /** * eth_type_trans - determine the packet's protocol ID. * @skb: received socket data * @dev: receiving network device * * The rule here is that we * assume 802.3 if the type field is short enough to be a length. * This is normal practice and works for any 'now in use' protocol. */ __be16 eth_type_trans(struct sk_buff *skb, struct net_device *dev) { unsigned short _service_access_point; const unsigned short *sap; const struct ethhdr *eth; skb->dev = dev; skb_reset_mac_header(skb); eth = eth_skb_pull_mac(skb); eth_skb_pkt_type(skb, dev); /* * Some variants of DSA tagging don't have an ethertype field * at all, so we check here whether one of those tagging * variants has been configured on the receiving interface, * and if so, set skb->protocol without looking at the packet. */ if (unlikely(netdev_uses_dsa(dev))) return htons(ETH_P_XDSA); if (likely(eth_proto_is_802_3(eth->h_proto))) return eth->h_proto; /* * This is a magic hack to spot IPX packets. Older Novell breaks * the protocol design and runs IPX over 802.3 without an 802.2 LLC * layer. We look for FFFF which isn't a used 802.2 SSAP/DSAP. This * won't work for fault tolerant netware but does for the rest. */ sap = skb_header_pointer(skb, 0, sizeof(*sap), &_service_access_point); if (sap && *sap == 0xFFFF) return htons(ETH_P_802_3); /* * Real 802.2 LLC */ return htons(ETH_P_802_2); } EXPORT_SYMBOL(eth_type_trans); /** * eth_header_parse - extract hardware address from packet * @skb: packet to extract header from * @haddr: destination buffer */ int eth_header_parse(const struct sk_buff *skb, unsigned char *haddr) { const struct ethhdr *eth = eth_hdr(skb); memcpy(haddr, eth->h_source, ETH_ALEN); return ETH_ALEN; } EXPORT_SYMBOL(eth_header_parse); /** * eth_header_cache - fill cache entry from neighbour * @neigh: source neighbour * @hh: destination cache entry * @type: Ethernet type field * * Create an Ethernet header template from the neighbour. */ int eth_header_cache(const struct neighbour *neigh, struct hh_cache *hh, __be16 type) { struct ethhdr *eth; const struct net_device *dev = neigh->dev; eth = (struct ethhdr *) (((u8 *) hh->hh_data) + (HH_DATA_OFF(sizeof(*eth)))); if (type == htons(ETH_P_802_3)) return -1; eth->h_proto = type; memcpy(eth->h_source, dev->dev_addr, ETH_ALEN); memcpy(eth->h_dest, neigh->ha, ETH_ALEN); /* Pairs with READ_ONCE() in neigh_resolve_output(), * neigh_hh_output() and neigh_update_hhs(). */ smp_store_release(&hh->hh_len, ETH_HLEN); return 0; } EXPORT_SYMBOL(eth_header_cache); /** * eth_header_cache_update - update cache entry * @hh: destination cache entry * @dev: network device * @haddr: new hardware address * * Called by Address Resolution module to notify changes in address. */ void eth_header_cache_update(struct hh_cache *hh, const struct net_device *dev, const unsigned char *haddr) { memcpy(((u8 *) hh->hh_data) + HH_DATA_OFF(sizeof(struct ethhdr)), haddr, ETH_ALEN); } EXPORT_SYMBOL(eth_header_cache_update); /** * eth_header_parse_protocol - extract protocol from L2 header * @skb: packet to extract protocol from */ __be16 eth_header_parse_protocol(const struct sk_buff *skb) { const struct ethhdr *eth = eth_hdr(skb); return eth->h_proto; } EXPORT_SYMBOL(eth_header_parse_protocol); /** * eth_prepare_mac_addr_change - prepare for mac change * @dev: network device * @p: socket address */ int eth_prepare_mac_addr_change(struct net_device *dev, void *p) { struct sockaddr *addr = p; if (!(dev->priv_flags & IFF_LIVE_ADDR_CHANGE) && netif_running(dev)) return -EBUSY; if (!is_valid_ether_addr(addr->sa_data)) return -EADDRNOTAVAIL; return 0; } EXPORT_SYMBOL(eth_prepare_mac_addr_change); /** * eth_commit_mac_addr_change - commit mac change * @dev: network device * @p: socket address */ void eth_commit_mac_addr_change(struct net_device *dev, void *p) { struct sockaddr *addr = p; eth_hw_addr_set(dev, addr->sa_data); } EXPORT_SYMBOL(eth_commit_mac_addr_change); /** * eth_mac_addr - set new Ethernet hardware address * @dev: network device * @p: socket address * * Change hardware address of device. * * This doesn't change hardware matching, so needs to be overridden * for most real devices. */ int eth_mac_addr(struct net_device *dev, void *p) { int ret; ret = eth_prepare_mac_addr_change(dev, p); if (ret < 0) return ret; eth_commit_mac_addr_change(dev, p); return 0; } EXPORT_SYMBOL(eth_mac_addr); int eth_validate_addr(struct net_device *dev) { if (!is_valid_ether_addr(dev->dev_addr)) return -EADDRNOTAVAIL; return 0; } EXPORT_SYMBOL(eth_validate_addr); const struct header_ops eth_header_ops ____cacheline_aligned = { .create = eth_header, .parse = eth_header_parse, .cache = eth_header_cache, .cache_update = eth_header_cache_update, .parse_protocol = eth_header_parse_protocol, }; /** * ether_setup - setup Ethernet network device * @dev: network device * * Fill in the fields of the device structure with Ethernet-generic values. */ void ether_setup(struct net_device *dev) { dev->header_ops = &eth_header_ops; dev->type = ARPHRD_ETHER; dev->hard_header_len = ETH_HLEN; dev->min_header_len = ETH_HLEN; dev->mtu = ETH_DATA_LEN; dev->min_mtu = ETH_MIN_MTU; dev->max_mtu = ETH_DATA_LEN; dev->addr_len = ETH_ALEN; dev->tx_queue_len = DEFAULT_TX_QUEUE_LEN; dev->flags = IFF_BROADCAST|IFF_MULTICAST; dev->priv_flags |= IFF_TX_SKB_SHARING; eth_broadcast_addr(dev->broadcast); } EXPORT_SYMBOL(ether_setup); /** * alloc_etherdev_mqs - Allocates and sets up an Ethernet device * @sizeof_priv: Size of additional driver-private structure to be allocated * for this Ethernet device * @txqs: The number of TX queues this device has. * @rxqs: The number of RX queues this device has. * * Fill in the fields of the device structure with Ethernet-generic * values. Basically does everything except registering the device. * * Constructs a new net device, complete with a private data area of * size (sizeof_priv). A 32-byte (not bit) alignment is enforced for * this private data area. */ struct net_device *alloc_etherdev_mqs(int sizeof_priv, unsigned int txqs, unsigned int rxqs) { return alloc_netdev_mqs(sizeof_priv, "eth%d", NET_NAME_ENUM, ether_setup, txqs, rxqs); } EXPORT_SYMBOL(alloc_etherdev_mqs); ssize_t sysfs_format_mac(char *buf, const unsigned char *addr, int len) { return sysfs_emit(buf, "%*phC\n", len, addr); } EXPORT_SYMBOL(sysfs_format_mac); struct sk_buff *eth_gro_receive(struct list_head *head, struct sk_buff *skb) { const struct packet_offload *ptype; unsigned int hlen, off_eth; struct sk_buff *pp = NULL; struct ethhdr *eh, *eh2; struct sk_buff *p; __be16 type; int flush = 1; off_eth = skb_gro_offset(skb); hlen = off_eth + sizeof(*eh); eh = skb_gro_header(skb, hlen, off_eth); if (unlikely(!eh)) goto out; flush = 0; list_for_each_entry(p, head, list) { if (!NAPI_GRO_CB(p)->same_flow) continue; eh2 = (struct ethhdr *)(p->data + off_eth); if (compare_ether_header(eh, eh2)) { NAPI_GRO_CB(p)->same_flow = 0; continue; } } type = eh->h_proto; ptype = gro_find_receive_by_type(type); if (ptype == NULL) { flush = 1; goto out; } skb_gro_pull(skb, sizeof(*eh)); skb_gro_postpull_rcsum(skb, eh, sizeof(*eh)); pp = indirect_call_gro_receive_inet(ptype->callbacks.gro_receive, ipv6_gro_receive, inet_gro_receive, head, skb); out: skb_gro_flush_final(skb, pp, flush); return pp; } EXPORT_SYMBOL(eth_gro_receive); int eth_gro_complete(struct sk_buff *skb, int nhoff) { struct ethhdr *eh = (struct ethhdr *)(skb->data + nhoff); __be16 type = eh->h_proto; struct packet_offload *ptype; int err = -ENOSYS; if (skb->encapsulation) skb_set_inner_mac_header(skb, nhoff); ptype = gro_find_complete_by_type(type); if (ptype != NULL) err = INDIRECT_CALL_INET(ptype->callbacks.gro_complete, ipv6_gro_complete, inet_gro_complete, skb, nhoff + sizeof(*eh)); return err; } EXPORT_SYMBOL(eth_gro_complete); static struct packet_offload eth_packet_offload __read_mostly = { .type = cpu_to_be16(ETH_P_TEB), .priority = 10, .callbacks = { .gro_receive = eth_gro_receive, .gro_complete = eth_gro_complete, }, }; static int __init eth_offload_init(void) { dev_add_offload(&eth_packet_offload); return 0; } fs_initcall(eth_offload_init); unsigned char * __weak arch_get_platform_mac_address(void) { return NULL; } int eth_platform_get_mac_address(struct device *dev, u8 *mac_addr) { unsigned char *addr; int ret; ret = of_get_mac_address(dev->of_node, mac_addr); if (!ret) return 0; addr = arch_get_platform_mac_address(); if (!addr) return -ENODEV; ether_addr_copy(mac_addr, addr); return 0; } EXPORT_SYMBOL(eth_platform_get_mac_address); /** * platform_get_ethdev_address - Set netdev's MAC address from a given device * @dev: Pointer to the device * @netdev: Pointer to netdev to write the address to * * Wrapper around eth_platform_get_mac_address() which writes the address * directly to netdev->dev_addr. */ int platform_get_ethdev_address(struct device *dev, struct net_device *netdev) { u8 addr[ETH_ALEN] __aligned(2); int ret; ret = eth_platform_get_mac_address(dev, addr); if (!ret) eth_hw_addr_set(netdev, addr); return ret; } EXPORT_SYMBOL(platform_get_ethdev_address); /** * nvmem_get_mac_address - Obtain the MAC address from an nvmem cell named * 'mac-address' associated with given device. * * @dev: Device with which the mac-address cell is associated. * @addrbuf: Buffer to which the MAC address will be copied on success. * * Returns 0 on success or a negative error number on failure. */ int nvmem_get_mac_address(struct device *dev, void *addrbuf) { struct nvmem_cell *cell; const void *mac; size_t len; cell = nvmem_cell_get(dev, "mac-address"); if (IS_ERR(cell)) return PTR_ERR(cell); mac = nvmem_cell_read(cell, &len); nvmem_cell_put(cell); if (IS_ERR(mac)) return PTR_ERR(mac); if (len != ETH_ALEN || !is_valid_ether_addr(mac)) { kfree(mac); return -EINVAL; } ether_addr_copy(addrbuf, mac); kfree(mac); return 0; } static int fwnode_get_mac_addr(struct fwnode_handle *fwnode, const char *name, char *addr) { int ret; ret = fwnode_property_read_u8_array(fwnode, name, addr, ETH_ALEN); if (ret) return ret; if (!is_valid_ether_addr(addr)) return -EINVAL; return 0; } /** * fwnode_get_mac_address - Get the MAC from the firmware node * @fwnode: Pointer to the firmware node * @addr: Address of buffer to store the MAC in * * Search the firmware node for the best MAC address to use. 'mac-address' is * checked first, because that is supposed to contain to "most recent" MAC * address. If that isn't set, then 'local-mac-address' is checked next, * because that is the default address. If that isn't set, then the obsolete * 'address' is checked, just in case we're using an old device tree. * * Note that the 'address' property is supposed to contain a virtual address of * the register set, but some DTS files have redefined that property to be the * MAC address. * * All-zero MAC addresses are rejected, because those could be properties that * exist in the firmware tables, but were not updated by the firmware. For * example, the DTS could define 'mac-address' and 'local-mac-address', with * zero MAC addresses. Some older U-Boots only initialized 'local-mac-address'. * In this case, the real MAC is in 'local-mac-address', and 'mac-address' * exists but is all zeros. */ int fwnode_get_mac_address(struct fwnode_handle *fwnode, char *addr) { if (!fwnode_get_mac_addr(fwnode, "mac-address", addr) || !fwnode_get_mac_addr(fwnode, "local-mac-address", addr) || !fwnode_get_mac_addr(fwnode, "address", addr)) return 0; return -ENOENT; } EXPORT_SYMBOL(fwnode_get_mac_address); /** * device_get_mac_address - Get the MAC for a given device * @dev: Pointer to the device * @addr: Address of buffer to store the MAC in */ int device_get_mac_address(struct device *dev, char *addr) { return fwnode_get_mac_address(dev_fwnode(dev), addr); } EXPORT_SYMBOL(device_get_mac_address); /** * device_get_ethdev_address - Set netdev's MAC address from a given device * @dev: Pointer to the device * @netdev: Pointer to netdev to write the address to * * Wrapper around device_get_mac_address() which writes the address * directly to netdev->dev_addr. */ int device_get_ethdev_address(struct device *dev, struct net_device *netdev) { u8 addr[ETH_ALEN]; int ret; ret = device_get_mac_address(dev, addr); if (!ret) eth_hw_addr_set(netdev, addr); return ret; } EXPORT_SYMBOL(device_get_ethdev_address);
241 241 241 3 3 2 2 1 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 1517 1518 1519 1520 1521 1522 1523 1524 1525 1526 1527 1528 1529 1530 1531 1532 1533 1534 1535 1536 1537 1538 1539 1540 1541 1542 1543 1544 1545 1546 1547 1548 1549 1550 1551 1552 1553 1554 1555 1556 1557 // SPDX-License-Identifier: GPL-2.0-only #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt #include <linux/workqueue.h> #include <linux/rtnetlink.h> #include <linux/cache.h> #include <linux/slab.h> #include <linux/list.h> #include <linux/delay.h> #include <linux/sched.h> #include <linux/idr.h> #include <linux/rculist.h> #include <linux/nsproxy.h> #include <linux/fs.h> #include <linux/proc_ns.h> #include <linux/file.h> #include <linux/export.h> #include <linux/user_namespace.h> #include <linux/net_namespace.h> #include <linux/sched/task.h> #include <linux/uidgid.h> #include <linux/proc_fs.h> #include <net/aligned_data.h> #include <net/sock.h> #include <net/netlink.h> #include <net/net_namespace.h> #include <net/netns/generic.h> /* * Our network namespace constructor/destructor lists */ static LIST_HEAD(pernet_list); static struct list_head *first_device = &pernet_list; LIST_HEAD(net_namespace_list); EXPORT_SYMBOL_GPL(net_namespace_list); /* Protects net_namespace_list. Nests iside rtnl_lock() */ DECLARE_RWSEM(net_rwsem); EXPORT_SYMBOL_GPL(net_rwsem); #ifdef CONFIG_KEYS static struct key_tag init_net_key_domain = { .usage = REFCOUNT_INIT(1) }; #endif struct net init_net; EXPORT_SYMBOL(init_net); static bool init_net_initialized; /* * pernet_ops_rwsem: protects: pernet_list, net_generic_ids, * init_net_initialized and first_device pointer. * This is internal net namespace object. Please, don't use it * outside. */ DECLARE_RWSEM(pernet_ops_rwsem); #define MIN_PERNET_OPS_ID \ ((sizeof(struct net_generic) + sizeof(void *) - 1) / sizeof(void *)) #define INITIAL_NET_GEN_PTRS 13 /* +1 for len +2 for rcu_head */ static unsigned int max_gen_ptrs = INITIAL_NET_GEN_PTRS; static struct net_generic *net_alloc_generic(void) { unsigned int gen_ptrs = READ_ONCE(max_gen_ptrs); unsigned int generic_size; struct net_generic *ng; generic_size = offsetof(struct net_generic, ptr[gen_ptrs]); ng = kzalloc(generic_size, GFP_KERNEL); if (ng) ng->s.len = gen_ptrs; return ng; } static int net_assign_generic(struct net *net, unsigned int id, void *data) { struct net_generic *ng, *old_ng; BUG_ON(id < MIN_PERNET_OPS_ID); old_ng = rcu_dereference_protected(net->gen, lockdep_is_held(&pernet_ops_rwsem)); if (old_ng->s.len > id) { old_ng->ptr[id] = data; return 0; } ng = net_alloc_generic(); if (!ng) return -ENOMEM; /* * Some synchronisation notes: * * The net_generic explores the net->gen array inside rcu * read section. Besides once set the net->gen->ptr[x] * pointer never changes (see rules in netns/generic.h). * * That said, we simply duplicate this array and schedule * the old copy for kfree after a grace period. */ memcpy(&ng->ptr[MIN_PERNET_OPS_ID], &old_ng->ptr[MIN_PERNET_OPS_ID], (old_ng->s.len - MIN_PERNET_OPS_ID) * sizeof(void *)); ng->ptr[id] = data; rcu_assign_pointer(net->gen, ng); kfree_rcu(old_ng, s.rcu); return 0; } static int ops_init(const struct pernet_operations *ops, struct net *net) { struct net_generic *ng; int err = -ENOMEM; void *data = NULL; if (ops->id) { data = kzalloc(ops->size, GFP_KERNEL); if (!data) goto out; err = net_assign_generic(net, *ops->id, data); if (err) goto cleanup; } err = 0; if (ops->init) err = ops->init(net); if (!err) return 0; if (ops->id) { ng = rcu_dereference_protected(net->gen, lockdep_is_held(&pernet_ops_rwsem)); ng->ptr[*ops->id] = NULL; } cleanup: kfree(data); out: return err; } static void ops_pre_exit_list(const struct pernet_operations *ops, struct list_head *net_exit_list) { struct net *net; if (ops->pre_exit) { list_for_each_entry(net, net_exit_list, exit_list) ops->pre_exit(net); } } static void ops_exit_rtnl_list(const struct list_head *ops_list, const struct pernet_operations *ops, struct list_head *net_exit_list) { const struct pernet_operations *saved_ops = ops; LIST_HEAD(dev_kill_list); struct net *net; rtnl_lock(); list_for_each_entry(net, net_exit_list, exit_list) { __rtnl_net_lock(net); ops = saved_ops; list_for_each_entry_continue_reverse(ops, ops_list, list) { if (ops->exit_rtnl) ops->exit_rtnl(net, &dev_kill_list); } __rtnl_net_unlock(net); } unregister_netdevice_many(&dev_kill_list); rtnl_unlock(); } static void ops_exit_list(const struct pernet_operations *ops, struct list_head *net_exit_list) { if (ops->exit) { struct net *net; list_for_each_entry(net, net_exit_list, exit_list) { ops->exit(net); cond_resched(); } } if (ops->exit_batch) ops->exit_batch(net_exit_list); } static void ops_free_list(const struct pernet_operations *ops, struct list_head *net_exit_list) { struct net *net; if (ops->id) { list_for_each_entry(net, net_exit_list, exit_list) kfree(net_generic(net, *ops->id)); } } static void ops_undo_list(const struct list_head *ops_list, const struct pernet_operations *ops, struct list_head *net_exit_list, bool expedite_rcu) { const struct pernet_operations *saved_ops; bool hold_rtnl = false; if (!ops) ops = list_entry(ops_list, typeof(*ops), list); saved_ops = ops; list_for_each_entry_continue_reverse(ops, ops_list, list) { hold_rtnl |= !!ops->exit_rtnl; ops_pre_exit_list(ops, net_exit_list); } /* Another CPU might be rcu-iterating the list, wait for it. * This needs to be before calling the exit() notifiers, so the * rcu_barrier() after ops_undo_list() isn't sufficient alone. * Also the pre_exit() and exit() methods need this barrier. */ if (expedite_rcu) synchronize_rcu_expedited(); else synchronize_rcu(); if (hold_rtnl) ops_exit_rtnl_list(ops_list, saved_ops, net_exit_list); ops = saved_ops; list_for_each_entry_continue_reverse(ops, ops_list, list) ops_exit_list(ops, net_exit_list); ops = saved_ops; list_for_each_entry_continue_reverse(ops, ops_list, list) ops_free_list(ops, net_exit_list); } static void ops_undo_single(struct pernet_operations *ops, struct list_head *net_exit_list) { LIST_HEAD(ops_list); list_add(&ops->list, &ops_list); ops_undo_list(&ops_list, NULL, net_exit_list, false); list_del(&ops->list); } /* should be called with nsid_lock held */ static int alloc_netid(struct net *net, struct net *peer, int reqid) { int min = 0, max = 0; if (reqid >= 0) { min = reqid; max = reqid + 1; } return idr_alloc(&net->netns_ids, peer, min, max, GFP_ATOMIC); } /* This function is used by idr_for_each(). If net is equal to peer, the * function returns the id so that idr_for_each() stops. Because we cannot * returns the id 0 (idr_for_each() will not stop), we return the magic value * NET_ID_ZERO (-1) for it. */ #define NET_ID_ZERO -1 static int net_eq_idr(int id, void *net, void *peer) { if (net_eq(net, peer)) return id ? : NET_ID_ZERO; return 0; } /* Must be called from RCU-critical section or with nsid_lock held */ static int __peernet2id(const struct net *net, struct net *peer) { int id = idr_for_each(&net->netns_ids, net_eq_idr, peer); /* Magic value for id 0. */ if (id == NET_ID_ZERO) return 0; if (id > 0) return id; return NETNSA_NSID_NOT_ASSIGNED; } static void rtnl_net_notifyid(struct net *net, int cmd, int id, u32 portid, struct nlmsghdr *nlh, gfp_t gfp); /* This function returns the id of a peer netns. If no id is assigned, one will * be allocated and returned. */ int peernet2id_alloc(struct net *net, struct net *peer, gfp_t gfp) { int id; if (refcount_read(&net->ns.count) == 0) return NETNSA_NSID_NOT_ASSIGNED; spin_lock(&net->nsid_lock); id = __peernet2id(net, peer); if (id >= 0) { spin_unlock(&net->nsid_lock); return id; } /* When peer is obtained from RCU lists, we may race with * its cleanup. Check whether it's alive, and this guarantees * we never hash a peer back to net->netns_ids, after it has * just been idr_remove()'d from there in cleanup_net(). */ if (!maybe_get_net(peer)) { spin_unlock(&net->nsid_lock); return NETNSA_NSID_NOT_ASSIGNED; } id = alloc_netid(net, peer, -1); spin_unlock(&net->nsid_lock); put_net(peer); if (id < 0) return NETNSA_NSID_NOT_ASSIGNED; rtnl_net_notifyid(net, RTM_NEWNSID, id, 0, NULL, gfp); return id; } EXPORT_SYMBOL_GPL(peernet2id_alloc); /* This function returns, if assigned, the id of a peer netns. */ int peernet2id(const struct net *net, struct net *peer) { int id; rcu_read_lock(); id = __peernet2id(net, peer); rcu_read_unlock(); return id; } EXPORT_SYMBOL(peernet2id); /* This function returns true is the peer netns has an id assigned into the * current netns. */ bool peernet_has_id(const struct net *net, struct net *peer) { return peernet2id(net, peer) >= 0; } struct net *get_net_ns_by_id(const struct net *net, int id) { struct net *peer; if (id < 0) return NULL; rcu_read_lock(); peer = idr_find(&net->netns_ids, id); if (peer) peer = maybe_get_net(peer); rcu_read_unlock(); return peer; } EXPORT_SYMBOL_GPL(get_net_ns_by_id); static __net_init void preinit_net_sysctl(struct net *net) { net->core.sysctl_somaxconn = SOMAXCONN; /* Limits per socket sk_omem_alloc usage. * TCP zerocopy regular usage needs 128 KB. */ net->core.sysctl_optmem_max = 128 * 1024; net->core.sysctl_txrehash = SOCK_TXREHASH_ENABLED; net->core.sysctl_tstamp_allow_data = 1; } /* init code that must occur even if setup_net() is not called. */ static __net_init void preinit_net(struct net *net, struct user_namespace *user_ns) { refcount_set(&net->passive, 1); refcount_set(&net->ns.count, 1); ref_tracker_dir_init(&net->refcnt_tracker, 128, "net_refcnt"); ref_tracker_dir_init(&net->notrefcnt_tracker, 128, "net_notrefcnt"); get_random_bytes(&net->hash_mix, sizeof(u32)); net->dev_base_seq = 1; net->user_ns = user_ns; idr_init(&net->netns_ids); spin_lock_init(&net->nsid_lock); mutex_init(&net->ipv4.ra_mutex); #ifdef CONFIG_DEBUG_NET_SMALL_RTNL mutex_init(&net->rtnl_mutex); lock_set_cmp_fn(&net->rtnl_mutex, rtnl_net_lock_cmp_fn, NULL); #endif INIT_LIST_HEAD(&net->ptype_all); INIT_LIST_HEAD(&net->ptype_specific); preinit_net_sysctl(net); } /* * setup_net runs the initializers for the network namespace object. */ static __net_init int setup_net(struct net *net) { /* Must be called with pernet_ops_rwsem held */ const struct pernet_operations *ops; LIST_HEAD(net_exit_list); int error = 0; net->net_cookie = atomic64_inc_return(&net_aligned_data.net_cookie); list_for_each_entry(ops, &pernet_list, list) { error = ops_init(ops, net); if (error < 0) goto out_undo; } down_write(&net_rwsem); list_add_tail_rcu(&net->list, &net_namespace_list); up_write(&net_rwsem); out: return error; out_undo: /* Walk through the list backwards calling the exit functions * for the pernet modules whose init functions did not fail. */ list_add(&net->exit_list, &net_exit_list); ops_undo_list(&pernet_list, ops, &net_exit_list, false); rcu_barrier(); goto out; } #ifdef CONFIG_NET_NS static struct ucounts *inc_net_namespaces(struct user_namespace *ns) { return inc_ucount(ns, current_euid(), UCOUNT_NET_NAMESPACES); } static void dec_net_namespaces(struct ucounts *ucounts) { dec_ucount(ucounts, UCOUNT_NET_NAMESPACES); } static struct kmem_cache *net_cachep __ro_after_init; static struct workqueue_struct *netns_wq; static struct net *net_alloc(void) { struct net *net = NULL; struct net_generic *ng; ng = net_alloc_generic(); if (!ng) goto out; net = kmem_cache_zalloc(net_cachep, GFP_KERNEL); if (!net) goto out_free; #ifdef CONFIG_KEYS net->key_domain = kzalloc(sizeof(struct key_tag), GFP_KERNEL); if (!net->key_domain) goto out_free_2; refcount_set(&net->key_domain->usage, 1); #endif rcu_assign_pointer(net->gen, ng); out: return net; #ifdef CONFIG_KEYS out_free_2: kmem_cache_free(net_cachep, net); net = NULL; #endif out_free: kfree(ng); goto out; } static LLIST_HEAD(defer_free_list); static void net_complete_free(void) { struct llist_node *kill_list; struct net *net, *next; /* Get the list of namespaces to free from last round. */ kill_list = llist_del_all(&defer_free_list); llist_for_each_entry_safe(net, next, kill_list, defer_free_list) kmem_cache_free(net_cachep, net); } void net_passive_dec(struct net *net) { if (refcount_dec_and_test(&net->passive)) { kfree(rcu_access_pointer(net->gen)); /* There should not be any trackers left there. */ ref_tracker_dir_exit(&net->notrefcnt_tracker); /* Wait for an extra rcu_barrier() before final free. */ llist_add(&net->defer_free_list, &defer_free_list); } } void net_drop_ns(void *p) { struct net *net = (struct net *)p; if (net) net_passive_dec(net); } struct net *copy_net_ns(unsigned long flags, struct user_namespace *user_ns, struct net *old_net) { struct ucounts *ucounts; struct net *net; int rv; if (!(flags & CLONE_NEWNET)) return get_net(old_net); ucounts = inc_net_namespaces(user_ns); if (!ucounts) return ERR_PTR(-ENOSPC); net = net_alloc(); if (!net) { rv = -ENOMEM; goto dec_ucounts; } preinit_net(net, user_ns); net->ucounts = ucounts; get_user_ns(user_ns); rv = down_read_killable(&pernet_ops_rwsem); if (rv < 0) goto put_userns; rv = setup_net(net); up_read(&pernet_ops_rwsem); if (rv < 0) { put_userns: #ifdef CONFIG_KEYS key_remove_domain(net->key_domain); #endif put_user_ns(user_ns); net_passive_dec(net); dec_ucounts: dec_net_namespaces(ucounts); return ERR_PTR(rv); } return net; } /** * net_ns_get_ownership - get sysfs ownership data for @net * @net: network namespace in question (can be NULL) * @uid: kernel user ID for sysfs objects * @gid: kernel group ID for sysfs objects * * Returns the uid/gid pair of root in the user namespace associated with the * given network namespace. */ void net_ns_get_ownership(const struct net *net, kuid_t *uid, kgid_t *gid) { if (net) { kuid_t ns_root_uid = make_kuid(net->user_ns, 0); kgid_t ns_root_gid = make_kgid(net->user_ns, 0); if (uid_valid(ns_root_uid)) *uid = ns_root_uid; if (gid_valid(ns_root_gid)) *gid = ns_root_gid; } else { *uid = GLOBAL_ROOT_UID; *gid = GLOBAL_ROOT_GID; } } EXPORT_SYMBOL_GPL(net_ns_get_ownership); static void unhash_nsid(struct net *net, struct net *last) { struct net *tmp; /* This function is only called from cleanup_net() work, * and this work is the only process, that may delete * a net from net_namespace_list. So, when the below * is executing, the list may only grow. Thus, we do not * use for_each_net_rcu() or net_rwsem. */ for_each_net(tmp) { int id; spin_lock(&tmp->nsid_lock); id = __peernet2id(tmp, net); if (id >= 0) idr_remove(&tmp->netns_ids, id); spin_unlock(&tmp->nsid_lock); if (id >= 0) rtnl_net_notifyid(tmp, RTM_DELNSID, id, 0, NULL, GFP_KERNEL); if (tmp == last) break; } spin_lock(&net->nsid_lock); idr_destroy(&net->netns_ids); spin_unlock(&net->nsid_lock); } static LLIST_HEAD(cleanup_list); struct task_struct *cleanup_net_task; static void cleanup_net(struct work_struct *work) { struct llist_node *net_kill_list; struct net *net, *tmp, *last; LIST_HEAD(net_exit_list); WRITE_ONCE(cleanup_net_task, current); /* Atomically snapshot the list of namespaces to cleanup */ net_kill_list = llist_del_all(&cleanup_list); down_read(&pernet_ops_rwsem); /* Don't let anyone else find us. */ down_write(&net_rwsem); llist_for_each_entry(net, net_kill_list, cleanup_list) list_del_rcu(&net->list); /* Cache last net. After we unlock rtnl, no one new net * added to net_namespace_list can assign nsid pointer * to a net from net_kill_list (see peernet2id_alloc()). * So, we skip them in unhash_nsid(). * * Note, that unhash_nsid() does not delete nsid links * between net_kill_list's nets, as they've already * deleted from net_namespace_list. But, this would be * useless anyway, as netns_ids are destroyed there. */ last = list_last_entry(&net_namespace_list, struct net, list); up_write(&net_rwsem); llist_for_each_entry(net, net_kill_list, cleanup_list) { unhash_nsid(net, last); list_add_tail(&net->exit_list, &net_exit_list); } ops_undo_list(&pernet_list, NULL, &net_exit_list, true); up_read(&pernet_ops_rwsem); /* Ensure there are no outstanding rcu callbacks using this * network namespace. */ rcu_barrier(); net_complete_free(); /* Finally it is safe to free my network namespace structure */ list_for_each_entry_safe(net, tmp, &net_exit_list, exit_list) { list_del_init(&net->exit_list); dec_net_namespaces(net->ucounts); #ifdef CONFIG_KEYS key_remove_domain(net->key_domain); #endif put_user_ns(net->user_ns); net_passive_dec(net); } WRITE_ONCE(cleanup_net_task, NULL); } /** * net_ns_barrier - wait until concurrent net_cleanup_work is done * * cleanup_net runs from work queue and will first remove namespaces * from the global list, then run net exit functions. * * Call this in module exit path to make sure that all netns * ->exit ops have been invoked before the function is removed. */ void net_ns_barrier(void) { down_write(&pernet_ops_rwsem); up_write(&pernet_ops_rwsem); } EXPORT_SYMBOL(net_ns_barrier); static DECLARE_WORK(net_cleanup_work, cleanup_net); void __put_net(struct net *net) { ref_tracker_dir_exit(&net->refcnt_tracker); /* Cleanup the network namespace in process context */ if (llist_add(&net->cleanup_list, &cleanup_list)) queue_work(netns_wq, &net_cleanup_work); } EXPORT_SYMBOL_GPL(__put_net); /** * get_net_ns - increment the refcount of the network namespace * @ns: common namespace (net) * * Returns the net's common namespace or ERR_PTR() if ref is zero. */ struct ns_common *get_net_ns(struct ns_common *ns) { struct net *net; net = maybe_get_net(container_of(ns, struct net, ns)); if (net) return &net->ns; return ERR_PTR(-EINVAL); } EXPORT_SYMBOL_GPL(get_net_ns); struct net *get_net_ns_by_fd(int fd) { CLASS(fd, f)(fd); if (fd_empty(f)) return ERR_PTR(-EBADF); if (proc_ns_file(fd_file(f))) { struct ns_common *ns = get_proc_ns(file_inode(fd_file(f))); if (ns->ops == &netns_operations) return get_net(container_of(ns, struct net, ns)); } return ERR_PTR(-EINVAL); } EXPORT_SYMBOL_GPL(get_net_ns_by_fd); #endif struct net *get_net_ns_by_pid(pid_t pid) { struct task_struct *tsk; struct net *net; /* Lookup the network namespace */ net = ERR_PTR(-ESRCH); rcu_read_lock(); tsk = find_task_by_vpid(pid); if (tsk) { struct nsproxy *nsproxy; task_lock(tsk); nsproxy = tsk->nsproxy; if (nsproxy) net = get_net(nsproxy->net_ns); task_unlock(tsk); } rcu_read_unlock(); return net; } EXPORT_SYMBOL_GPL(get_net_ns_by_pid); #ifdef CONFIG_NET_NS_REFCNT_TRACKER static void net_ns_net_debugfs(struct net *net) { ref_tracker_dir_symlink(&net->refcnt_tracker, "netns-%llx-%u-refcnt", net->net_cookie, net->ns.inum); ref_tracker_dir_symlink(&net->notrefcnt_tracker, "netns-%llx-%u-notrefcnt", net->net_cookie, net->ns.inum); } static int __init init_net_debugfs(void) { ref_tracker_dir_debugfs(&init_net.refcnt_tracker); ref_tracker_dir_debugfs(&init_net.notrefcnt_tracker); net_ns_net_debugfs(&init_net); return 0; } late_initcall(init_net_debugfs); #else static void net_ns_net_debugfs(struct net *net) { } #endif static __net_init int net_ns_net_init(struct net *net) { #ifdef CONFIG_NET_NS net->ns.ops = &netns_operations; #endif net->ns.inum = PROC_NET_INIT_INO; if (net != &init_net) { int ret = ns_alloc_inum(&net->ns); if (ret) return ret; } net_ns_net_debugfs(net); return 0; } static __net_exit void net_ns_net_exit(struct net *net) { /* * Initial network namespace doesn't exit so we don't need any * special checks here. */ ns_free_inum(&net->ns); } static struct pernet_operations __net_initdata net_ns_ops = { .init = net_ns_net_init, .exit = net_ns_net_exit, }; static const struct nla_policy rtnl_net_policy[NETNSA_MAX + 1] = { [NETNSA_NONE] = { .type = NLA_UNSPEC }, [NETNSA_NSID] = { .type = NLA_S32 }, [NETNSA_PID] = { .type = NLA_U32 }, [NETNSA_FD] = { .type = NLA_U32 }, [NETNSA_TARGET_NSID] = { .type = NLA_S32 }, }; static int rtnl_net_newid(struct sk_buff *skb, struct nlmsghdr *nlh, struct netlink_ext_ack *extack) { struct net *net = sock_net(skb->sk); struct nlattr *tb[NETNSA_MAX + 1]; struct nlattr *nla; struct net *peer; int nsid, err; err = nlmsg_parse_deprecated(nlh, sizeof(struct rtgenmsg), tb, NETNSA_MAX, rtnl_net_policy, extack); if (err < 0) return err; if (!tb[NETNSA_NSID]) { NL_SET_ERR_MSG(extack, "nsid is missing"); return -EINVAL; } nsid = nla_get_s32(tb[NETNSA_NSID]); if (tb[NETNSA_PID]) { peer = get_net_ns_by_pid(nla_get_u32(tb[NETNSA_PID])); nla = tb[NETNSA_PID]; } else if (tb[NETNSA_FD]) { peer = get_net_ns_by_fd(nla_get_u32(tb[NETNSA_FD])); nla = tb[NETNSA_FD]; } else { NL_SET_ERR_MSG(extack, "Peer netns reference is missing"); return -EINVAL; } if (IS_ERR(peer)) { NL_SET_BAD_ATTR(extack, nla); NL_SET_ERR_MSG(extack, "Peer netns reference is invalid"); return PTR_ERR(peer); } spin_lock(&net->nsid_lock); if (__peernet2id(net, peer) >= 0) { spin_unlock(&net->nsid_lock); err = -EEXIST; NL_SET_BAD_ATTR(extack, nla); NL_SET_ERR_MSG(extack, "Peer netns already has a nsid assigned"); goto out; } err = alloc_netid(net, peer, nsid); spin_unlock(&net->nsid_lock); if (err >= 0) { rtnl_net_notifyid(net, RTM_NEWNSID, err, NETLINK_CB(skb).portid, nlh, GFP_KERNEL); err = 0; } else if (err == -ENOSPC && nsid >= 0) { err = -EEXIST; NL_SET_BAD_ATTR(extack, tb[NETNSA_NSID]); NL_SET_ERR_MSG(extack, "The specified nsid is already used"); } out: put_net(peer); return err; } static int rtnl_net_get_size(void) { return NLMSG_ALIGN(sizeof(struct rtgenmsg)) + nla_total_size(sizeof(s32)) /* NETNSA_NSID */ + nla_total_size(sizeof(s32)) /* NETNSA_CURRENT_NSID */ ; } struct net_fill_args { u32 portid; u32 seq; int flags; int cmd; int nsid; bool add_ref; int ref_nsid; }; static int rtnl_net_fill(struct sk_buff *skb, struct net_fill_args *args) { struct nlmsghdr *nlh; struct rtgenmsg *rth; nlh = nlmsg_put(skb, args->portid, args->seq, args->cmd, sizeof(*rth), args->flags); if (!nlh) return -EMSGSIZE; rth = nlmsg_data(nlh); rth->rtgen_family = AF_UNSPEC; if (nla_put_s32(skb, NETNSA_NSID, args->nsid)) goto nla_put_failure; if (args->add_ref && nla_put_s32(skb, NETNSA_CURRENT_NSID, args->ref_nsid)) goto nla_put_failure; nlmsg_end(skb, nlh); return 0; nla_put_failure: nlmsg_cancel(skb, nlh); return -EMSGSIZE; } static int rtnl_net_valid_getid_req(struct sk_buff *skb, const struct nlmsghdr *nlh, struct nlattr **tb, struct netlink_ext_ack *extack) { int i, err; if (!netlink_strict_get_check(skb)) return nlmsg_parse_deprecated(nlh, sizeof(struct rtgenmsg), tb, NETNSA_MAX, rtnl_net_policy, extack); err = nlmsg_parse_deprecated_strict(nlh, sizeof(struct rtgenmsg), tb, NETNSA_MAX, rtnl_net_policy, extack); if (err) return err; for (i = 0; i <= NETNSA_MAX; i++) { if (!tb[i]) continue; switch (i) { case NETNSA_PID: case NETNSA_FD: case NETNSA_NSID: case NETNSA_TARGET_NSID: break; default: NL_SET_ERR_MSG(extack, "Unsupported attribute in peer netns getid request"); return -EINVAL; } } return 0; } static int rtnl_net_getid(struct sk_buff *skb, struct nlmsghdr *nlh, struct netlink_ext_ack *extack) { struct net *net = sock_net(skb->sk); struct nlattr *tb[NETNSA_MAX + 1]; struct net_fill_args fillargs = { .portid = NETLINK_CB(skb).portid, .seq = nlh->nlmsg_seq, .cmd = RTM_NEWNSID, }; struct net *peer, *target = net; struct nlattr *nla; struct sk_buff *msg; int err; err = rtnl_net_valid_getid_req(skb, nlh, tb, extack); if (err < 0) return err; if (tb[NETNSA_PID]) { peer = get_net_ns_by_pid(nla_get_u32(tb[NETNSA_PID])); nla = tb[NETNSA_PID]; } else if (tb[NETNSA_FD]) { peer = get_net_ns_by_fd(nla_get_u32(tb[NETNSA_FD])); nla = tb[NETNSA_FD]; } else if (tb[NETNSA_NSID]) { peer = get_net_ns_by_id(net, nla_get_s32(tb[NETNSA_NSID])); if (!peer) peer = ERR_PTR(-ENOENT); nla = tb[NETNSA_NSID]; } else { NL_SET_ERR_MSG(extack, "Peer netns reference is missing"); return -EINVAL; } if (IS_ERR(peer)) { NL_SET_BAD_ATTR(extack, nla); NL_SET_ERR_MSG(extack, "Peer netns reference is invalid"); return PTR_ERR(peer); } if (tb[NETNSA_TARGET_NSID]) { int id = nla_get_s32(tb[NETNSA_TARGET_NSID]); target = rtnl_get_net_ns_capable(NETLINK_CB(skb).sk, id); if (IS_ERR(target)) { NL_SET_BAD_ATTR(extack, tb[NETNSA_TARGET_NSID]); NL_SET_ERR_MSG(extack, "Target netns reference is invalid"); err = PTR_ERR(target); goto out; } fillargs.add_ref = true; fillargs.ref_nsid = peernet2id(net, peer); } msg = nlmsg_new(rtnl_net_get_size(), GFP_KERNEL); if (!msg) { err = -ENOMEM; goto out; } fillargs.nsid = peernet2id(target, peer); err = rtnl_net_fill(msg, &fillargs); if (err < 0) goto err_out; err = rtnl_unicast(msg, net, NETLINK_CB(skb).portid); goto out; err_out: nlmsg_free(msg); out: if (fillargs.add_ref) put_net(target); put_net(peer); return err; } struct rtnl_net_dump_cb { struct net *tgt_net; struct net *ref_net; struct sk_buff *skb; struct net_fill_args fillargs; int idx; int s_idx; }; /* Runs in RCU-critical section. */ static int rtnl_net_dumpid_one(int id, void *peer, void *data) { struct rtnl_net_dump_cb *net_cb = (struct rtnl_net_dump_cb *)data; int ret; if (net_cb->idx < net_cb->s_idx) goto cont; net_cb->fillargs.nsid = id; if (net_cb->fillargs.add_ref) net_cb->fillargs.ref_nsid = __peernet2id(net_cb->ref_net, peer); ret = rtnl_net_fill(net_cb->skb, &net_cb->fillargs); if (ret < 0) return ret; cont: net_cb->idx++; return 0; } static int rtnl_valid_dump_net_req(const struct nlmsghdr *nlh, struct sock *sk, struct rtnl_net_dump_cb *net_cb, struct netlink_callback *cb) { struct netlink_ext_ack *extack = cb->extack; struct nlattr *tb[NETNSA_MAX + 1]; int err, i; err = nlmsg_parse_deprecated_strict(nlh, sizeof(struct rtgenmsg), tb, NETNSA_MAX, rtnl_net_policy, extack); if (err < 0) return err; for (i = 0; i <= NETNSA_MAX; i++) { if (!tb[i]) continue; if (i == NETNSA_TARGET_NSID) { struct net *net; net = rtnl_get_net_ns_capable(sk, nla_get_s32(tb[i])); if (IS_ERR(net)) { NL_SET_BAD_ATTR(extack, tb[i]); NL_SET_ERR_MSG(extack, "Invalid target network namespace id"); return PTR_ERR(net); } net_cb->fillargs.add_ref = true; net_cb->ref_net = net_cb->tgt_net; net_cb->tgt_net = net; } else { NL_SET_BAD_ATTR(extack, tb[i]); NL_SET_ERR_MSG(extack, "Unsupported attribute in dump request"); return -EINVAL; } } return 0; } static int rtnl_net_dumpid(struct sk_buff *skb, struct netlink_callback *cb) { struct rtnl_net_dump_cb net_cb = { .tgt_net = sock_net(skb->sk), .skb = skb, .fillargs = { .portid = NETLINK_CB(cb->skb).portid, .seq = cb->nlh->nlmsg_seq, .flags = NLM_F_MULTI, .cmd = RTM_NEWNSID, }, .idx = 0, .s_idx = cb->args[0], }; int err = 0; if (cb->strict_check) { err = rtnl_valid_dump_net_req(cb->nlh, skb->sk, &net_cb, cb); if (err < 0) goto end; } rcu_read_lock(); idr_for_each(&net_cb.tgt_net->netns_ids, rtnl_net_dumpid_one, &net_cb); rcu_read_unlock(); cb->args[0] = net_cb.idx; end: if (net_cb.fillargs.add_ref) put_net(net_cb.tgt_net); return err; } static void rtnl_net_notifyid(struct net *net, int cmd, int id, u32 portid, struct nlmsghdr *nlh, gfp_t gfp) { struct net_fill_args fillargs = { .portid = portid, .seq = nlh ? nlh->nlmsg_seq : 0, .cmd = cmd, .nsid = id, }; struct sk_buff *msg; int err = -ENOMEM; msg = nlmsg_new(rtnl_net_get_size(), gfp); if (!msg) goto out; err = rtnl_net_fill(msg, &fillargs); if (err < 0) goto err_out; rtnl_notify(msg, net, portid, RTNLGRP_NSID, nlh, gfp); return; err_out: nlmsg_free(msg); out: rtnl_set_sk_err(net, RTNLGRP_NSID, err); } #ifdef CONFIG_NET_NS static void __init netns_ipv4_struct_check(void) { /* TX readonly hotpath cache lines */ CACHELINE_ASSERT_GROUP_MEMBER(struct netns_ipv4, netns_ipv4_read_tx, sysctl_tcp_early_retrans); CACHELINE_ASSERT_GROUP_MEMBER(struct netns_ipv4, netns_ipv4_read_tx, sysctl_tcp_tso_win_divisor); CACHELINE_ASSERT_GROUP_MEMBER(struct netns_ipv4, netns_ipv4_read_tx, sysctl_tcp_tso_rtt_log); CACHELINE_ASSERT_GROUP_MEMBER(struct netns_ipv4, netns_ipv4_read_tx, sysctl_tcp_autocorking); CACHELINE_ASSERT_GROUP_MEMBER(struct netns_ipv4, netns_ipv4_read_tx, sysctl_tcp_min_snd_mss); CACHELINE_ASSERT_GROUP_MEMBER(struct netns_ipv4, netns_ipv4_read_tx, sysctl_tcp_notsent_lowat); CACHELINE_ASSERT_GROUP_MEMBER(struct netns_ipv4, netns_ipv4_read_tx, sysctl_tcp_limit_output_bytes); CACHELINE_ASSERT_GROUP_MEMBER(struct netns_ipv4, netns_ipv4_read_tx, sysctl_tcp_min_rtt_wlen); CACHELINE_ASSERT_GROUP_MEMBER(struct netns_ipv4, netns_ipv4_read_tx, sysctl_tcp_wmem); CACHELINE_ASSERT_GROUP_MEMBER(struct netns_ipv4, netns_ipv4_read_tx, sysctl_ip_fwd_use_pmtu); CACHELINE_ASSERT_GROUP_SIZE(struct netns_ipv4, netns_ipv4_read_tx, 33); /* TXRX readonly hotpath cache lines */ CACHELINE_ASSERT_GROUP_MEMBER(struct netns_ipv4, netns_ipv4_read_txrx, sysctl_tcp_moderate_rcvbuf); CACHELINE_ASSERT_GROUP_SIZE(struct netns_ipv4, netns_ipv4_read_txrx, 1); /* RX readonly hotpath cache line */ CACHELINE_ASSERT_GROUP_MEMBER(struct netns_ipv4, netns_ipv4_read_rx, sysctl_ip_early_demux); CACHELINE_ASSERT_GROUP_MEMBER(struct netns_ipv4, netns_ipv4_read_rx, sysctl_tcp_early_demux); CACHELINE_ASSERT_GROUP_MEMBER(struct netns_ipv4, netns_ipv4_read_rx, sysctl_tcp_l3mdev_accept); CACHELINE_ASSERT_GROUP_MEMBER(struct netns_ipv4, netns_ipv4_read_rx, sysctl_tcp_reordering); CACHELINE_ASSERT_GROUP_MEMBER(struct netns_ipv4, netns_ipv4_read_rx, sysctl_tcp_rmem); CACHELINE_ASSERT_GROUP_SIZE(struct netns_ipv4, netns_ipv4_read_rx, 22); } #endif static const struct rtnl_msg_handler net_ns_rtnl_msg_handlers[] __initconst = { {.msgtype = RTM_NEWNSID, .doit = rtnl_net_newid, .flags = RTNL_FLAG_DOIT_UNLOCKED}, {.msgtype = RTM_GETNSID, .doit = rtnl_net_getid, .dumpit = rtnl_net_dumpid, .flags = RTNL_FLAG_DOIT_UNLOCKED | RTNL_FLAG_DUMP_UNLOCKED}, }; void __init net_ns_init(void) { struct net_generic *ng; #ifdef CONFIG_NET_NS netns_ipv4_struct_check(); net_cachep = kmem_cache_create("net_namespace", sizeof(struct net), SMP_CACHE_BYTES, SLAB_PANIC|SLAB_ACCOUNT, NULL); /* Create workqueue for cleanup */ netns_wq = create_singlethread_workqueue("netns"); if (!netns_wq) panic("Could not create netns workq"); #endif ng = net_alloc_generic(); if (!ng) panic("Could not allocate generic netns"); rcu_assign_pointer(init_net.gen, ng); #ifdef CONFIG_KEYS init_net.key_domain = &init_net_key_domain; #endif preinit_net(&init_net, &init_user_ns); down_write(&pernet_ops_rwsem); if (setup_net(&init_net)) panic("Could not setup the initial network namespace"); init_net_initialized = true; up_write(&pernet_ops_rwsem); if (register_pernet_subsys(&net_ns_ops)) panic("Could not register network namespace subsystems"); rtnl_register_many(net_ns_rtnl_msg_handlers); } #ifdef CONFIG_NET_NS static int __register_pernet_operations(struct list_head *list, struct pernet_operations *ops) { LIST_HEAD(net_exit_list); struct net *net; int error; list_add_tail(&ops->list, list); if (ops->init || ops->id) { /* We held write locked pernet_ops_rwsem, and parallel * setup_net() and cleanup_net() are not possible. */ for_each_net(net) { error = ops_init(ops, net); if (error) goto out_undo; list_add_tail(&net->exit_list, &net_exit_list); } } return 0; out_undo: /* If I have an error cleanup all namespaces I initialized */ list_del(&ops->list); ops_undo_single(ops, &net_exit_list); return error; } static void __unregister_pernet_operations(struct pernet_operations *ops) { LIST_HEAD(net_exit_list); struct net *net; /* See comment in __register_pernet_operations() */ for_each_net(net) list_add_tail(&net->exit_list, &net_exit_list); list_del(&ops->list); ops_undo_single(ops, &net_exit_list); } #else static int __register_pernet_operations(struct list_head *list, struct pernet_operations *ops) { if (!init_net_initialized) { list_add_tail(&ops->list, list); return 0; } return ops_init(ops, &init_net); } static void __unregister_pernet_operations(struct pernet_operations *ops) { if (!init_net_initialized) { list_del(&ops->list); } else { LIST_HEAD(net_exit_list); list_add(&init_net.exit_list, &net_exit_list); ops_undo_single(ops, &net_exit_list); } } #endif /* CONFIG_NET_NS */ static DEFINE_IDA(net_generic_ids); static int register_pernet_operations(struct list_head *list, struct pernet_operations *ops) { int error; if (WARN_ON(!!ops->id ^ !!ops->size)) return -EINVAL; if (ops->id) { error = ida_alloc_min(&net_generic_ids, MIN_PERNET_OPS_ID, GFP_KERNEL); if (error < 0) return error; *ops->id = error; /* This does not require READ_ONCE as writers already hold * pernet_ops_rwsem. But WRITE_ONCE is needed to protect * net_alloc_generic. */ WRITE_ONCE(max_gen_ptrs, max(max_gen_ptrs, *ops->id + 1)); } error = __register_pernet_operations(list, ops); if (error) { rcu_barrier(); if (ops->id) ida_free(&net_generic_ids, *ops->id); } return error; } static void unregister_pernet_operations(struct pernet_operations *ops) { __unregister_pernet_operations(ops); rcu_barrier(); if (ops->id) ida_free(&net_generic_ids, *ops->id); } /** * register_pernet_subsys - register a network namespace subsystem * @ops: pernet operations structure for the subsystem * * Register a subsystem which has init and exit functions * that are called when network namespaces are created and * destroyed respectively. * * When registered all network namespace init functions are * called for every existing network namespace. Allowing kernel * modules to have a race free view of the set of network namespaces. * * When a new network namespace is created all of the init * methods are called in the order in which they were registered. * * When a network namespace is destroyed all of the exit methods * are called in the reverse of the order with which they were * registered. */ int register_pernet_subsys(struct pernet_operations *ops) { int error; down_write(&pernet_ops_rwsem); error = register_pernet_operations(first_device, ops); up_write(&pernet_ops_rwsem); return error; } EXPORT_SYMBOL_GPL(register_pernet_subsys); /** * unregister_pernet_subsys - unregister a network namespace subsystem * @ops: pernet operations structure to manipulate * * Remove the pernet operations structure from the list to be * used when network namespaces are created or destroyed. In * addition run the exit method for all existing network * namespaces. */ void unregister_pernet_subsys(struct pernet_operations *ops) { down_write(&pernet_ops_rwsem); unregister_pernet_operations(ops); up_write(&pernet_ops_rwsem); } EXPORT_SYMBOL_GPL(unregister_pernet_subsys); /** * register_pernet_device - register a network namespace device * @ops: pernet operations structure for the subsystem * * Register a device which has init and exit functions * that are called when network namespaces are created and * destroyed respectively. * * When registered all network namespace init functions are * called for every existing network namespace. Allowing kernel * modules to have a race free view of the set of network namespaces. * * When a new network namespace is created all of the init * methods are called in the order in which they were registered. * * When a network namespace is destroyed all of the exit methods * are called in the reverse of the order with which they were * registered. */ int register_pernet_device(struct pernet_operations *ops) { int error; down_write(&pernet_ops_rwsem); error = register_pernet_operations(&pernet_list, ops); if (!error && (first_device == &pernet_list)) first_device = &ops->list; up_write(&pernet_ops_rwsem); return error; } EXPORT_SYMBOL_GPL(register_pernet_device); /** * unregister_pernet_device - unregister a network namespace netdevice * @ops: pernet operations structure to manipulate * * Remove the pernet operations structure from the list to be * used when network namespaces are created or destroyed. In * addition run the exit method for all existing network * namespaces. */ void unregister_pernet_device(struct pernet_operations *ops) { down_write(&pernet_ops_rwsem); if (&ops->list == first_device) first_device = first_device->next; unregister_pernet_operations(ops); up_write(&pernet_ops_rwsem); } EXPORT_SYMBOL_GPL(unregister_pernet_device); #ifdef CONFIG_NET_NS static struct ns_common *netns_get(struct task_struct *task) { struct net *net = NULL; struct nsproxy *nsproxy; task_lock(task); nsproxy = task->nsproxy; if (nsproxy) net = get_net(nsproxy->net_ns); task_unlock(task); return net ? &net->ns : NULL; } static inline struct net *to_net_ns(struct ns_common *ns) { return container_of(ns, struct net, ns); } static void netns_put(struct ns_common *ns) { put_net(to_net_ns(ns)); } static int netns_install(struct nsset *nsset, struct ns_common *ns) { struct nsproxy *nsproxy = nsset->nsproxy; struct net *net = to_net_ns(ns); if (!ns_capable(net->user_ns, CAP_SYS_ADMIN) || !ns_capable(nsset->cred->user_ns, CAP_SYS_ADMIN)) return -EPERM; put_net(nsproxy->net_ns); nsproxy->net_ns = get_net(net); return 0; } static struct user_namespace *netns_owner(struct ns_common *ns) { return to_net_ns(ns)->user_ns; } const struct proc_ns_operations netns_operations = { .name = "net", .type = CLONE_NEWNET, .get = netns_get, .put = netns_put, .install = netns_install, .owner = netns_owner, }; #endif
8 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 /* SPDX-License-Identifier: GPL-2.0-or-later */ /* * INET An implementation of the TCP/IP protocol suite for the LINUX * operating system. INET is implemented using the BSD Socket * interface as the means of communication with the user level. * * Global definitions for the Ethernet IEEE 802.3 interface. * * Version: @(#)if_ether.h 1.0.1a 02/08/94 * * Author: Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG> * Donald Becker, <becker@super.org> * Alan Cox, <alan@lxorguk.ukuu.org.uk> * Steve Whitehouse, <gw7rrm@eeshack3.swan.ac.uk> */ #ifndef _LINUX_IF_ETHER_H #define _LINUX_IF_ETHER_H #include <linux/skbuff.h> #include <uapi/linux/if_ether.h> /* XX:XX:XX:XX:XX:XX */ #define MAC_ADDR_STR_LEN (3 * ETH_ALEN - 1) static inline struct ethhdr *eth_hdr(const struct sk_buff *skb) { return (struct ethhdr *)skb_mac_header(skb); } /* Prefer this version in TX path, instead of * skb_reset_mac_header() + eth_hdr() */ static inline struct ethhdr *skb_eth_hdr(const struct sk_buff *skb) { return (struct ethhdr *)skb->data; } static inline struct ethhdr *inner_eth_hdr(const struct sk_buff *skb) { return (struct ethhdr *)skb_inner_mac_header(skb); } int eth_header_parse(const struct sk_buff *skb, unsigned char *haddr); extern ssize_t sysfs_format_mac(char *buf, const unsigned char *addr, int len); #endif /* _LINUX_IF_ETHER_H */
1 1 1 1 1 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 /* SPDX-License-Identifier: GPL-2.0 */ #ifndef _FUTEX_H #define _FUTEX_H #include <linux/futex.h> #include <linux/rtmutex.h> #include <linux/sched/wake_q.h> #include <linux/compat.h> #include <linux/uaccess.h> #include <linux/cleanup.h> #ifdef CONFIG_PREEMPT_RT #include <linux/rcuwait.h> #endif #include <asm/futex.h> /* * Futex flags used to encode options to functions and preserve them across * restarts. */ #define FLAGS_SIZE_8 0x0000 #define FLAGS_SIZE_16 0x0001 #define FLAGS_SIZE_32 0x0002 #define FLAGS_SIZE_64 0x0003 #define FLAGS_SIZE_MASK 0x0003 #ifdef CONFIG_MMU # define FLAGS_SHARED 0x0010 #else /* * NOMMU does not have per process address space. Let the compiler optimize * code away. */ # define FLAGS_SHARED 0x0000 #endif #define FLAGS_CLOCKRT 0x0020 #define FLAGS_HAS_TIMEOUT 0x0040 #define FLAGS_NUMA 0x0080 #define FLAGS_STRICT 0x0100 #define FLAGS_MPOL 0x0200 /* FUTEX_ to FLAGS_ */ static inline unsigned int futex_to_flags(unsigned int op) { unsigned int flags = FLAGS_SIZE_32; if (!(op & FUTEX_PRIVATE_FLAG)) flags |= FLAGS_SHARED; if (op & FUTEX_CLOCK_REALTIME) flags |= FLAGS_CLOCKRT; return flags; } #define FUTEX2_VALID_MASK (FUTEX2_SIZE_MASK | FUTEX2_NUMA | FUTEX2_MPOL | FUTEX2_PRIVATE) /* FUTEX2_ to FLAGS_ */ static inline unsigned int futex2_to_flags(unsigned int flags2) { unsigned int flags = flags2 & FUTEX2_SIZE_MASK; if (!(flags2 & FUTEX2_PRIVATE)) flags |= FLAGS_SHARED; if (flags2 & FUTEX2_NUMA) flags |= FLAGS_NUMA; if (flags2 & FUTEX2_MPOL) flags |= FLAGS_MPOL; return flags; } static inline unsigned int futex_size(unsigned int flags) { return 1 << (flags & FLAGS_SIZE_MASK); } static inline bool futex_flags_valid(unsigned int flags) { /* Only 64bit futexes for 64bit code */ if (!IS_ENABLED(CONFIG_64BIT) || in_compat_syscall()) { if ((flags & FLAGS_SIZE_MASK) == FLAGS_SIZE_64) return false; } /* Only 32bit futexes are implemented -- for now */ if ((flags & FLAGS_SIZE_MASK) != FLAGS_SIZE_32) return false; /* * Must be able to represent both FUTEX_NO_NODE and every valid nodeid * in a futex word. */ if (flags & FLAGS_NUMA) { int bits = 8 * futex_size(flags); u64 max = ~0ULL; max >>= 64 - bits; if (nr_node_ids >= max) return false; } return true; } static inline bool futex_validate_input(unsigned int flags, u64 val) { int bits = 8 * futex_size(flags); if (bits < 64 && (val >> bits)) return false; return true; } #ifdef CONFIG_FAIL_FUTEX extern bool should_fail_futex(bool fshared); #else static inline bool should_fail_futex(bool fshared) { return false; } #endif /* * Hash buckets are shared by all the futex_keys that hash to the same * location. Each key may have multiple futex_q structures, one for each task * waiting on a futex. */ struct futex_hash_bucket { atomic_t waiters; spinlock_t lock; struct plist_head chain; struct futex_private_hash *priv; } ____cacheline_aligned_in_smp; /* * Priority Inheritance state: */ struct futex_pi_state { /* * list of 'owned' pi_state instances - these have to be * cleaned up in do_exit() if the task exits prematurely: */ struct list_head list; /* * The PI object: */ struct rt_mutex_base pi_mutex; struct task_struct *owner; refcount_t refcount; union futex_key key; } __randomize_layout; struct futex_q; typedef void (futex_wake_fn)(struct wake_q_head *wake_q, struct futex_q *q); /** * struct futex_q - The hashed futex queue entry, one per waiting task * @list: priority-sorted list of tasks waiting on this futex * @task: the task waiting on the futex * @lock_ptr: the hash bucket lock * @wake: the wake handler for this queue * @wake_data: data associated with the wake handler * @key: the key the futex is hashed on * @pi_state: optional priority inheritance state * @rt_waiter: rt_waiter storage for use with requeue_pi * @requeue_pi_key: the requeue_pi target futex key * @bitset: bitset for the optional bitmasked wakeup * @requeue_state: State field for futex_requeue_pi() * @drop_hb_ref: Waiter should drop the extra hash bucket reference if true * @requeue_wait: RCU wait for futex_requeue_pi() (RT only) * * We use this hashed waitqueue, instead of a normal wait_queue_entry_t, so * we can wake only the relevant ones (hashed queues may be shared). * * A futex_q has a woken state, just like tasks have TASK_RUNNING. * It is considered woken when plist_node_empty(&q->list) || q->lock_ptr == 0. * The order of wakeup is always to make the first condition true, then * the second. * * PI futexes are typically woken before they are removed from the hash list via * the rt_mutex code. See futex_unqueue_pi(). */ struct futex_q { struct plist_node list; struct task_struct *task; spinlock_t *lock_ptr; futex_wake_fn *wake; void *wake_data; union futex_key key; struct futex_pi_state *pi_state; struct rt_mutex_waiter *rt_waiter; union futex_key *requeue_pi_key; u32 bitset; atomic_t requeue_state; bool drop_hb_ref; #ifdef CONFIG_PREEMPT_RT struct rcuwait requeue_wait; #endif } __randomize_layout; extern const struct futex_q futex_q_init; enum futex_access { FUTEX_READ, FUTEX_WRITE }; extern int get_futex_key(u32 __user *uaddr, unsigned int flags, union futex_key *key, enum futex_access rw); extern void futex_q_lockptr_lock(struct futex_q *q); extern struct hrtimer_sleeper * futex_setup_timer(ktime_t *time, struct hrtimer_sleeper *timeout, int flags, u64 range_ns); extern struct futex_hash_bucket *futex_hash(union futex_key *key); #ifdef CONFIG_FUTEX_PRIVATE_HASH extern void futex_hash_get(struct futex_hash_bucket *hb); extern void futex_hash_put(struct futex_hash_bucket *hb); extern struct futex_private_hash *futex_private_hash(void); extern void futex_private_hash_put(struct futex_private_hash *fph); #else /* !CONFIG_FUTEX_PRIVATE_HASH */ static inline void futex_hash_get(struct futex_hash_bucket *hb) { } static inline void futex_hash_put(struct futex_hash_bucket *hb) { } static inline struct futex_private_hash *futex_private_hash(void) { return NULL; } static inline void futex_private_hash_put(struct futex_private_hash *fph) { } #endif DEFINE_CLASS(hb, struct futex_hash_bucket *, if (_T) futex_hash_put(_T), futex_hash(key), union futex_key *key); DEFINE_CLASS(private_hash, struct futex_private_hash *, if (_T) futex_private_hash_put(_T), futex_private_hash(), void); /** * futex_match - Check whether two futex keys are equal * @key1: Pointer to key1 * @key2: Pointer to key2 * * Return 1 if two futex_keys are equal, 0 otherwise. */ static inline int futex_match(union futex_key *key1, union futex_key *key2) { return (key1 && key2 && key1->both.word == key2->both.word && key1->both.ptr == key2->both.ptr && key1->both.offset == key2->both.offset); } extern int futex_wait_setup(u32 __user *uaddr, u32 val, unsigned int flags, struct futex_q *q, union futex_key *key2, struct task_struct *task); extern void futex_do_wait(struct futex_q *q, struct hrtimer_sleeper *timeout); extern bool __futex_wake_mark(struct futex_q *q); extern void futex_wake_mark(struct wake_q_head *wake_q, struct futex_q *q); extern int fault_in_user_writeable(u32 __user *uaddr); extern struct futex_q *futex_top_waiter(struct futex_hash_bucket *hb, union futex_key *key); static inline int futex_cmpxchg_value_locked(u32 *curval, u32 __user *uaddr, u32 uval, u32 newval) { int ret; pagefault_disable(); ret = futex_atomic_cmpxchg_inatomic(curval, uaddr, uval, newval); pagefault_enable(); return ret; } /* * This does a plain atomic user space read, and the user pointer has * already been verified earlier by get_futex_key() to be both aligned * and actually in user space, just like futex_atomic_cmpxchg_inatomic(). * * We still want to avoid any speculation, and while __get_user() is * the traditional model for this, it's actually slower than doing * this manually these days. * * We could just have a per-architecture special function for it, * the same way we do futex_atomic_cmpxchg_inatomic(), but rather * than force everybody to do that, write it out long-hand using * the low-level user-access infrastructure. * * This looks a bit overkill, but generally just results in a couple * of instructions. */ static __always_inline int futex_get_value(u32 *dest, u32 __user *from) { u32 val; if (can_do_masked_user_access()) from = masked_user_access_begin(from); else if (!user_read_access_begin(from, sizeof(*from))) return -EFAULT; unsafe_get_user(val, from, Efault); user_read_access_end(); *dest = val; return 0; Efault: user_read_access_end(); return -EFAULT; } static __always_inline int futex_put_value(u32 val, u32 __user *to) { if (can_do_masked_user_access()) to = masked_user_access_begin(to); else if (!user_write_access_begin(to, sizeof(*to))) return -EFAULT; unsafe_put_user(val, to, Efault); user_write_access_end(); return 0; Efault: user_write_access_end(); return -EFAULT; } static inline int futex_get_value_locked(u32 *dest, u32 __user *from) { int ret; pagefault_disable(); ret = futex_get_value(dest, from); pagefault_enable(); return ret; } extern void __futex_unqueue(struct futex_q *q); extern void __futex_queue(struct futex_q *q, struct futex_hash_bucket *hb, struct task_struct *task); extern int futex_unqueue(struct futex_q *q); /** * futex_queue() - Enqueue the futex_q on the futex_hash_bucket * @q: The futex_q to enqueue * @hb: The destination hash bucket * @task: Task queueing this futex * * The hb->lock must be held by the caller, and is released here. A call to * futex_queue() is typically paired with exactly one call to futex_unqueue(). The * exceptions involve the PI related operations, which may use futex_unqueue_pi() * or nothing if the unqueue is done as part of the wake process and the unqueue * state is implicit in the state of woken task (see futex_wait_requeue_pi() for * an example). * * Note that @task may be NULL, for async usage of futexes. */ static inline void futex_queue(struct futex_q *q, struct futex_hash_bucket *hb, struct task_struct *task) __releases(&hb->lock) { __futex_queue(q, hb, task); spin_unlock(&hb->lock); } extern void futex_unqueue_pi(struct futex_q *q); extern void wait_for_owner_exiting(int ret, struct task_struct *exiting); /* * Reflects a new waiter being added to the waitqueue. */ static inline void futex_hb_waiters_inc(struct futex_hash_bucket *hb) { #ifdef CONFIG_SMP atomic_inc(&hb->waiters); /* * Full barrier (A), see the ordering comment above. */ smp_mb__after_atomic(); #endif } /* * Reflects a waiter being removed from the waitqueue by wakeup * paths. */ static inline void futex_hb_waiters_dec(struct futex_hash_bucket *hb) { #ifdef CONFIG_SMP atomic_dec(&hb->waiters); #endif } static inline int futex_hb_waiters_pending(struct futex_hash_bucket *hb) { #ifdef CONFIG_SMP /* * Full barrier (B), see the ordering comment above. */ smp_mb(); return atomic_read(&hb->waiters); #else return 1; #endif } extern void futex_q_lock(struct futex_q *q, struct futex_hash_bucket *hb); extern void futex_q_unlock(struct futex_hash_bucket *hb); extern int futex_lock_pi_atomic(u32 __user *uaddr, struct futex_hash_bucket *hb, union futex_key *key, struct futex_pi_state **ps, struct task_struct *task, struct task_struct **exiting, int set_waiters); extern int refill_pi_state_cache(void); extern void get_pi_state(struct futex_pi_state *pi_state); extern void put_pi_state(struct futex_pi_state *pi_state); extern int fixup_pi_owner(u32 __user *uaddr, struct futex_q *q, int locked); /* * Express the locking dependencies for lockdep: */ static inline void double_lock_hb(struct futex_hash_bucket *hb1, struct futex_hash_bucket *hb2) { if (hb1 > hb2) swap(hb1, hb2); spin_lock(&hb1->lock); if (hb1 != hb2) spin_lock_nested(&hb2->lock, SINGLE_DEPTH_NESTING); } static inline void double_unlock_hb(struct futex_hash_bucket *hb1, struct futex_hash_bucket *hb2) { spin_unlock(&hb1->lock); if (hb1 != hb2) spin_unlock(&hb2->lock); } /* syscalls */ extern int futex_wait_requeue_pi(u32 __user *uaddr, unsigned int flags, u32 val, ktime_t *abs_time, u32 bitset, u32 __user *uaddr2); extern int futex_requeue(u32 __user *uaddr1, unsigned int flags1, u32 __user *uaddr2, unsigned int flags2, int nr_wake, int nr_requeue, u32 *cmpval, int requeue_pi); extern int __futex_wait(u32 __user *uaddr, unsigned int flags, u32 val, struct hrtimer_sleeper *to, u32 bitset); extern int futex_wait(u32 __user *uaddr, unsigned int flags, u32 val, ktime_t *abs_time, u32 bitset); /** * struct futex_vector - Auxiliary struct for futex_waitv() * @w: Userspace provided data * @q: Kernel side data * * Struct used to build an array with all data need for futex_waitv() */ struct futex_vector { struct futex_waitv w; struct futex_q q; }; extern int futex_parse_waitv(struct futex_vector *futexv, struct futex_waitv __user *uwaitv, unsigned int nr_futexes, futex_wake_fn *wake, void *wake_data); extern int futex_wait_multiple_setup(struct futex_vector *vs, int count, int *woken); extern int futex_unqueue_multiple(struct futex_vector *v, int count); extern int futex_wait_multiple(struct futex_vector *vs, unsigned int count, struct hrtimer_sleeper *to); extern int futex_wake(u32 __user *uaddr, unsigned int flags, int nr_wake, u32 bitset); extern int futex_wake_op(u32 __user *uaddr1, unsigned int flags, u32 __user *uaddr2, int nr_wake, int nr_wake2, int op); extern int futex_unlock_pi(u32 __user *uaddr, unsigned int flags); extern int futex_lock_pi(u32 __user *uaddr, unsigned int flags, ktime_t *time, int trylock); #endif /* _FUTEX_H */
1424 270 12 561 703 766 662 1200 740 738 135 1185 588 125 144 1447 1447 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 /* SPDX-License-Identifier: GPL-2.0-only */ /* * Based on arch/arm/include/asm/atomic.h * * Copyright (C) 1996 Russell King. * Copyright (C) 2002 Deep Blue Solutions Ltd. * Copyright (C) 2012 ARM Ltd. */ #ifndef __ASM_ATOMIC_H #define __ASM_ATOMIC_H #include <linux/compiler.h> #include <linux/types.h> #include <asm/barrier.h> #include <asm/cmpxchg.h> #include <asm/lse.h> #define ATOMIC_OP(op) \ static __always_inline void arch_##op(int i, atomic_t *v) \ { \ __lse_ll_sc_body(op, i, v); \ } ATOMIC_OP(atomic_andnot) ATOMIC_OP(atomic_or) ATOMIC_OP(atomic_xor) ATOMIC_OP(atomic_add) ATOMIC_OP(atomic_and) ATOMIC_OP(atomic_sub) #undef ATOMIC_OP #define ATOMIC_FETCH_OP(name, op) \ static __always_inline int arch_##op##name(int i, atomic_t *v) \ { \ return __lse_ll_sc_body(op##name, i, v); \ } #define ATOMIC_FETCH_OPS(op) \ ATOMIC_FETCH_OP(_relaxed, op) \ ATOMIC_FETCH_OP(_acquire, op) \ ATOMIC_FETCH_OP(_release, op) \ ATOMIC_FETCH_OP( , op) ATOMIC_FETCH_OPS(atomic_fetch_andnot) ATOMIC_FETCH_OPS(atomic_fetch_or) ATOMIC_FETCH_OPS(atomic_fetch_xor) ATOMIC_FETCH_OPS(atomic_fetch_add) ATOMIC_FETCH_OPS(atomic_fetch_and) ATOMIC_FETCH_OPS(atomic_fetch_sub) ATOMIC_FETCH_OPS(atomic_add_return) ATOMIC_FETCH_OPS(atomic_sub_return) #undef ATOMIC_FETCH_OP #undef ATOMIC_FETCH_OPS #define ATOMIC64_OP(op) \ static __always_inline void arch_##op(long i, atomic64_t *v) \ { \ __lse_ll_sc_body(op, i, v); \ } ATOMIC64_OP(atomic64_andnot) ATOMIC64_OP(atomic64_or) ATOMIC64_OP(atomic64_xor) ATOMIC64_OP(atomic64_add) ATOMIC64_OP(atomic64_and) ATOMIC64_OP(atomic64_sub) #undef ATOMIC64_OP #define ATOMIC64_FETCH_OP(name, op) \ static __always_inline long arch_##op##name(long i, atomic64_t *v) \ { \ return __lse_ll_sc_body(op##name, i, v); \ } #define ATOMIC64_FETCH_OPS(op) \ ATOMIC64_FETCH_OP(_relaxed, op) \ ATOMIC64_FETCH_OP(_acquire, op) \ ATOMIC64_FETCH_OP(_release, op) \ ATOMIC64_FETCH_OP( , op) ATOMIC64_FETCH_OPS(atomic64_fetch_andnot) ATOMIC64_FETCH_OPS(atomic64_fetch_or) ATOMIC64_FETCH_OPS(atomic64_fetch_xor) ATOMIC64_FETCH_OPS(atomic64_fetch_add) ATOMIC64_FETCH_OPS(atomic64_fetch_and) ATOMIC64_FETCH_OPS(atomic64_fetch_sub) ATOMIC64_FETCH_OPS(atomic64_add_return) ATOMIC64_FETCH_OPS(atomic64_sub_return) #undef ATOMIC64_FETCH_OP #undef ATOMIC64_FETCH_OPS static __always_inline long arch_atomic64_dec_if_positive(atomic64_t *v) { return __lse_ll_sc_body(atomic64_dec_if_positive, v); } #define arch_atomic_read(v) __READ_ONCE((v)->counter) #define arch_atomic_set(v, i) __WRITE_ONCE(((v)->counter), (i)) #define arch_atomic_add_return_relaxed arch_atomic_add_return_relaxed #define arch_atomic_add_return_acquire arch_atomic_add_return_acquire #define arch_atomic_add_return_release arch_atomic_add_return_release #define arch_atomic_add_return arch_atomic_add_return #define arch_atomic_sub_return_relaxed arch_atomic_sub_return_relaxed #define arch_atomic_sub_return_acquire arch_atomic_sub_return_acquire #define arch_atomic_sub_return_release arch_atomic_sub_return_release #define arch_atomic_sub_return arch_atomic_sub_return #define arch_atomic_fetch_add_relaxed arch_atomic_fetch_add_relaxed #define arch_atomic_fetch_add_acquire arch_atomic_fetch_add_acquire #define arch_atomic_fetch_add_release arch_atomic_fetch_add_release #define arch_atomic_fetch_add arch_atomic_fetch_add #define arch_atomic_fetch_sub_relaxed arch_atomic_fetch_sub_relaxed #define arch_atomic_fetch_sub_acquire arch_atomic_fetch_sub_acquire #define arch_atomic_fetch_sub_release arch_atomic_fetch_sub_release #define arch_atomic_fetch_sub arch_atomic_fetch_sub #define arch_atomic_fetch_and_relaxed arch_atomic_fetch_and_relaxed #define arch_atomic_fetch_and_acquire arch_atomic_fetch_and_acquire #define arch_atomic_fetch_and_release arch_atomic_fetch_and_release #define arch_atomic_fetch_and arch_atomic_fetch_and #define arch_atomic_fetch_andnot_relaxed arch_atomic_fetch_andnot_relaxed #define arch_atomic_fetch_andnot_acquire arch_atomic_fetch_andnot_acquire #define arch_atomic_fetch_andnot_release arch_atomic_fetch_andnot_release #define arch_atomic_fetch_andnot arch_atomic_fetch_andnot #define arch_atomic_fetch_or_relaxed arch_atomic_fetch_or_relaxed #define arch_atomic_fetch_or_acquire arch_atomic_fetch_or_acquire #define arch_atomic_fetch_or_release arch_atomic_fetch_or_release #define arch_atomic_fetch_or arch_atomic_fetch_or #define arch_atomic_fetch_xor_relaxed arch_atomic_fetch_xor_relaxed #define arch_atomic_fetch_xor_acquire arch_atomic_fetch_xor_acquire #define arch_atomic_fetch_xor_release arch_atomic_fetch_xor_release #define arch_atomic_fetch_xor arch_atomic_fetch_xor #define arch_atomic_andnot arch_atomic_andnot /* * 64-bit arch_atomic operations. */ #define ATOMIC64_INIT ATOMIC_INIT #define arch_atomic64_read arch_atomic_read #define arch_atomic64_set arch_atomic_set #define arch_atomic64_add_return_relaxed arch_atomic64_add_return_relaxed #define arch_atomic64_add_return_acquire arch_atomic64_add_return_acquire #define arch_atomic64_add_return_release arch_atomic64_add_return_release #define arch_atomic64_add_return arch_atomic64_add_return #define arch_atomic64_sub_return_relaxed arch_atomic64_sub_return_relaxed #define arch_atomic64_sub_return_acquire arch_atomic64_sub_return_acquire #define arch_atomic64_sub_return_release arch_atomic64_sub_return_release #define arch_atomic64_sub_return arch_atomic64_sub_return #define arch_atomic64_fetch_add_relaxed arch_atomic64_fetch_add_relaxed #define arch_atomic64_fetch_add_acquire arch_atomic64_fetch_add_acquire #define arch_atomic64_fetch_add_release arch_atomic64_fetch_add_release #define arch_atomic64_fetch_add arch_atomic64_fetch_add #define arch_atomic64_fetch_sub_relaxed arch_atomic64_fetch_sub_relaxed #define arch_atomic64_fetch_sub_acquire arch_atomic64_fetch_sub_acquire #define arch_atomic64_fetch_sub_release arch_atomic64_fetch_sub_release #define arch_atomic64_fetch_sub arch_atomic64_fetch_sub #define arch_atomic64_fetch_and_relaxed arch_atomic64_fetch_and_relaxed #define arch_atomic64_fetch_and_acquire arch_atomic64_fetch_and_acquire #define arch_atomic64_fetch_and_release arch_atomic64_fetch_and_release #define arch_atomic64_fetch_and arch_atomic64_fetch_and #define arch_atomic64_fetch_andnot_relaxed arch_atomic64_fetch_andnot_relaxed #define arch_atomic64_fetch_andnot_acquire arch_atomic64_fetch_andnot_acquire #define arch_atomic64_fetch_andnot_release arch_atomic64_fetch_andnot_release #define arch_atomic64_fetch_andnot arch_atomic64_fetch_andnot #define arch_atomic64_fetch_or_relaxed arch_atomic64_fetch_or_relaxed #define arch_atomic64_fetch_or_acquire arch_atomic64_fetch_or_acquire #define arch_atomic64_fetch_or_release arch_atomic64_fetch_or_release #define arch_atomic64_fetch_or arch_atomic64_fetch_or #define arch_atomic64_fetch_xor_relaxed arch_atomic64_fetch_xor_relaxed #define arch_atomic64_fetch_xor_acquire arch_atomic64_fetch_xor_acquire #define arch_atomic64_fetch_xor_release arch_atomic64_fetch_xor_release #define arch_atomic64_fetch_xor arch_atomic64_fetch_xor #define arch_atomic64_andnot arch_atomic64_andnot #define arch_atomic64_dec_if_positive arch_atomic64_dec_if_positive #endif /* __ASM_ATOMIC_H */
4 4 3 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 1517 1518 1519 1520 1521 1522 1523 1524 1525 1526 1527 1528 1529 1530 1531 1532 1533 1534 1535 1536 1537 1538 1539 1540 1541 1542 1543 1544 1545 1546 1547 1548 1549 1550 1551 1552 1553 1554 1555 1556 1557 1558 1559 1560 1561 1562 1563 1564 1565 1566 1567 1568 1569 1570 1571 1572 1573 1574 1575 1576 1577 1578 1579 1580 1581 1582 1583 1584 1585 1586 1587 1588 1589 1590 1591 1592 1593 1594 1595 1596 1597 1598 1599 1600 1601 1602 1603 1604 1605 1606 1607 1608 1609 1610 1611 1612 1613 1614 1615 1616 1617 1618 1619 1620 1621 1622 1623 1624 1625 1626 1627 1628 1629 1630 1631 1632 1633 1634 1635 1636 1637 1638 1639 1640 1641 1642 1643 1644 1645 1646 1647 1648 1649 1650 1651 1652 1653 1654 1655 1656 1657 1658 1659 1660 1661 1662 1663 1664 1665 1666 1667 1668 1669 1670 1671 1672 1673 1674 1675 1676 1677 1678 1679 1680 1681 1682 1683 1684 1685 1686 1687 1688 1689 1690 1691 1692 1693 1694 1695 1696 1697 1698 1699 1700 1701 1702 1703 1704 1705 1706 1707 1708 1709 1710 1711 1712 1713 1714 1715 1716 1717 1718 1719 1720 1721 1722 1723 1724 1725 1726 1727 1728 1729 1730 1731 1732 1733 1734 1735 1736 1737 1738 1739 1740 1741 1742 1743 1744 1745 1746 1747 1748 1749 1750 1751 1752 1753 1754 1755 1756 1757 1758 1759 1760 1761 1762 1763 1764 1765 1766 1767 1768 1769 1770 1771 1772 1773 1774 1775 1776 1777 1778 1779 1780 1781 1782 1783 1784 1785 1786 1787 1788 1789 1790 1791 1792 1793 1794 1795 1796 1797 1798 1799 1800 1801 1802 1803 1804 1805 1806 1807 1808 1809 1810 1811 1812 1813 1814 1815 1816 1817 1818 1819 1820 1821 1822 1823 1824 1825 1826 1827 1828 1829 1830 1831 1832 1833 1834 1835 1836 1837 1838 1839 1840 1841 1842 1843 1844 1845 1846 1847 1848 1849 1850 1851 1852 1853 1854 1855 1856 1857 1858 1859 1860 1861 1862 1863 1864 1865 1866 1867 1868 1869 1870 1871 1872 1873 1874 1875 1876 1877 1878 1879 1880 1881 1882 1883 1884 1885 1886 1887 1888 1889 1890 1891 1892 1893 1894 1895 1896 1897 1898 1899 1900 1901 1902 1903 1904 1905 1906 1907 1908 1909 1910 1911 1912 1913 1914 1915 1916 1917 1918 1919 1920 1921 1922 1923 1924 1925 1926 1927 1928 1929 1930 1931 1932 1933 1934 1935 1936 1937 1938 1939 1940 1941 1942 1943 1944 1945 1946 1947 1948 1949 1950 1951 1952 1953 1954 1955 1956 1957 1958 1959 1960 1961 1962 1963 1964 1965 1966 1967 1968 1969 1970 1971 1972 1973 1974 1975 1976 1977 1978 1979 1980 1981 1982 1983 1984 1985 1986 1987 1988 1989 1990 1991 1992 1993 1994 1995 1996 1997 1998 1999 2000 2001 2002 2003 2004 2005 2006 2007 2008 2009 2010 2011 2012 2013 2014 2015 2016 2017 2018 2019 2020 2021 2022 2023 2024 2025 2026 2027 2028 2029 2030 2031 2032 2033 2034 2035 2036 2037 2038 2039 2040 2041 2042 2043 2044 2045 2046 2047 2048 2049 2050 2051 2052 2053 2054 2055 2056 2057 2058 2059 2060 2061 2062 2063 2064 2065 2066 2067 2068 2069 2070 2071 2072 2073 2074 2075 2076 2077 2078 2079 2080 2081 2082 2083 2084 2085 2086 2087 2088 2089 2090 2091 2092 2093 2094 2095 2096 2097 2098 2099 2100 2101 2102 2103 2104 2105 2106 2107 2108 2109 2110 2111 2112 2113 2114 2115 2116 2117 2118 2119 2120 2121 2122 2123 2124 2125 2126 2127 2128 2129 2130 2131 2132 2133 2134 2135 2136 2137 2138 2139 2140 2141 2142 2143 2144 2145 2146 2147 2148 2149 2150 2151 2152 2153 2154 2155 2156 2157 2158 2159 2160 2161 2162 2163 2164 2165 2166 2167 2168 2169 2170 2171 2172 2173 2174 2175 2176 2177 2178 2179 2180 2181 2182 2183 2184 2185 2186 2187 2188 2189 2190 2191 2192 2193 2194 2195 2196 2197 2198 2199 2200 2201 2202 2203 2204 2205 2206 2207 2208 2209 2210 2211 2212 2213 2214 2215 2216 2217 2218 2219 2220 2221 2222 2223 2224 2225 2226 2227 2228 2229 2230 2231 2232 2233 2234 2235 2236 2237 2238 2239 2240 2241 2242 2243 2244 2245 2246 2247 2248 2249 2250 2251 2252 2253 2254 2255 2256 2257 2258 2259 2260 2261 2262 2263 2264 2265 2266 2267 2268 2269 2270 2271 2272 2273 2274 2275 2276 2277 2278 2279 2280 2281 2282 2283 2284 2285 2286 2287 2288 2289 2290 2291 2292 2293 2294 2295 2296 2297 2298 2299 2300 2301 2302 2303 2304 2305 2306 2307 2308 2309 2310 2311 2312 2313 2314 2315 2316 2317 2318 2319 2320 2321 2322 2323 2324 2325 2326 2327 2328 2329 2330 2331 2332 2333 2334 2335 2336 2337 2338 2339 2340 2341 2342 2343 2344 2345 2346 2347 2348 2349 2350 2351 2352 2353 2354 2355 2356 2357 2358 2359 2360 2361 2362 2363 2364 2365 2366 2367 2368 2369 2370 2371 2372 2373 2374 2375 2376 2377 2378 2379 2380 2381 2382 2383 2384 2385 2386 2387 2388 2389 2390 2391 2392 2393 2394 2395 2396 2397 2398 2399 2400 2401 2402 2403 2404 2405 2406 2407 2408 2409 2410 2411 2412 2413 2414 2415 2416 2417 2418 2419 2420 2421 2422 2423 2424 2425 2426 2427 2428 2429 2430 2431 2432 2433 2434 2435 2436 2437 2438 2439 2440 2441 2442 2443 2444 2445 2446 2447 2448 2449 2450 2451 2452 2453 2454 2455 2456 2457 2458 2459 2460 2461 2462 2463 2464 2465 2466 2467 2468 2469 2470 2471 2472 2473 2474 2475 2476 2477 2478 2479 2480 2481 2482 2483 2484 2485 2486 2487 2488 2489 2490 2491 2492 2493 2494 2495 2496 2497 2498 2499 2500 2501 2502 2503 2504 2505 2506 2507 2508 2509 2510 2511 2512 2513 2514 2515 2516 2517 2518 2519 2520 2521 2522 2523 2524 2525 2526 2527 2528 2529 2530 2531 2532 2533 2534 2535 2536 2537 2538 2539 2540 2541 2542 2543 2544 2545 2546 2547 2548 2549 2550 2551 2552 2553 2554 2555 2556 2557 2558 2559 2560 2561 2562 2563 2564 2565 2566 2567 2568 2569 2570 2571 2572 2573 2574 2575 2576 2577 2578 2579 2580 2581 2582 2583 2584 2585 2586 2587 2588 2589 2590 2591 2592 2593 2594 2595 2596 2597 2598 2599 2600 2601 2602 2603 2604 2605 2606 2607 2608 2609 2610 2611 2612 2613 2614 2615 2616 2617 2618 2619 2620 2621 2622 2623 2624 2625 2626 2627 2628 2629 2630 2631 2632 2633 2634 2635 2636 2637 2638 2639 2640 2641 2642 2643 2644 2645 2646 2647 2648 2649 2650 2651 2652 2653 2654 2655 2656 2657 2658 2659 2660 2661 2662 2663 2664 2665 2666 2667 2668 2669 2670 2671 2672 2673 2674 2675 2676 2677 2678 2679 2680 2681 2682 2683 2684 2685 2686 2687 2688 2689 2690 2691 2692 2693 2694 2695 2696 2697 2698 2699 2700 2701 2702 2703 2704 2705 2706 2707 2708 2709 2710 2711 2712 2713 2714 2715 2716 2717 2718 2719 2720 2721 2722 2723 2724 2725 2726 2727 2728 2729 2730 2731 2732 2733 2734 2735 2736 2737 2738 2739 2740 2741 2742 2743 2744 2745 2746 2747 2748 2749 2750 2751 2752 2753 2754 2755 2756 2757 2758 2759 2760 2761 2762 2763 2764 2765 2766 2767 2768 2769 2770 2771 2772 2773 2774 2775 2776 2777 2778 2779 2780 2781 2782 2783 2784 2785 2786 2787 2788 2789 2790 2791 2792 2793 2794 2795 2796 2797 2798 2799 2800 2801 2802 2803 2804 2805 2806 2807 2808 2809 2810 2811 2812 2813 2814 2815 2816 2817 2818 2819 2820 2821 2822 2823 2824 2825 2826 2827 2828 2829 2830 2831 2832 2833 2834 2835 2836 2837 2838 2839 2840 2841 2842 2843 2844 2845 2846 2847 2848 2849 2850 2851 2852 2853 2854 2855 2856 2857 2858 2859 2860 2861 2862 2863 2864 2865 2866 2867 2868 2869 2870 2871 2872 2873 2874 2875 2876 2877 2878 2879 2880 2881 2882 2883 2884 2885 2886 2887 2888 2889 2890 2891 2892 2893 2894 2895 2896 2897 2898 2899 2900 2901 2902 2903 2904 2905 2906 2907 2908 2909 2910 2911 2912 2913 2914 2915 2916 2917 2918 2919 2920 2921 2922 2923 2924 2925 2926 2927 2928 2929 2930 2931 2932 2933 2934 2935 2936 2937 2938 2939 2940 2941 2942 2943 2944 2945 2946 2947 2948 2949 2950 2951 2952 2953 2954 2955 2956 2957 2958 2959 2960 2961 2962 2963 2964 2965 2966 2967 2968 2969 2970 2971 2972 2973 2974 2975 2976 2977 2978 2979 2980 2981 2982 2983 2984 2985 2986 2987 2988 2989 2990 2991 2992 2993 2994 2995 2996 2997 2998 2999 3000 3001 3002 3003 3004 3005 3006 3007 3008 3009 3010 3011 3012 3013 3014 3015 3016 3017 3018 3019 3020 3021 3022 3023 3024 3025 3026 3027 3028 3029 3030 3031 3032 3033 3034 3035 3036 3037 3038 3039 3040 3041 3042 3043 3044 3045 3046 3047 3048 3049 3050 3051 3052 3053 3054 3055 3056 3057 3058 3059 3060 3061 3062 3063 3064 3065 3066 3067 3068 3069 3070 3071 3072 3073 3074 3075 3076 3077 3078 3079 3080 3081 3082 3083 3084 3085 3086 3087 3088 3089 3090 3091 3092 3093 3094 3095 3096 3097 3098 3099 3100 3101 3102 3103 3104 3105 3106 3107 3108 3109 3110 3111 3112 3113 3114 3115 3116 3117 3118 3119 3120 3121 3122 3123 3124 3125 3126 3127 3128 3129 3130 3131 3132 3133 3134 3135 3136 3137 3138 3139 3140 3141 3142 3143 3144 3145 3146 3147 3148 3149 3150 3151 3152 3153 3154 3155 3156 3157 3158 3159 3160 3161 3162 3163 3164 3165 3166 3167 3168 3169 3170 3171 3172 3173 3174 3175 3176 3177 3178 3179 3180 3181 3182 3183 3184 3185 3186 3187 3188 3189 3190 3191 3192 3193 3194 3195 3196 3197 3198 3199 3200 3201 3202 3203 3204 3205 3206 3207 3208 3209 3210 3211 3212 3213 3214 3215 3216 3217 3218 3219 3220 3221 3222 3223 3224 3225 3226 3227 3228 3229 3230 3231 3232 3233 3234 3235 3236 3237 3238 3239 3240 3241 3242 3243 3244 3245 3246 3247 3248 3249 3250 3251 3252 3253 3254 3255 3256 3257 3258 3259 3260 3261 3262 3263 3264 3265 3266 3267 3268 3269 3270 3271 3272 3273 3274 3275 3276 3277 3278 3279 3280 3281 3282 3283 3284 3285 3286 3287 3288 3289 3290 3291 3292 3293 3294 3295 3296 3297 3298 3299 3300 3301 3302 3303 3304 3305 3306 3307 3308 3309 3310 3311 3312 3313 3314 3315 3316 3317 3318 3319 3320 3321 3322 3323 3324 3325 3326 3327 3328 3329 3330 3331 3332 3333 3334 3335 3336 3337 3338 3339 3340 3341 3342 3343 3344 3345 3346 3347 3348 3349 3350 3351 3352 3353 3354 3355 3356 3357 3358 3359 3360 3361 3362 3363 3364 3365 3366 3367 3368 3369 3370 3371 3372 3373 3374 3375 3376 3377 3378 3379 3380 3381 3382 3383 3384 3385 3386 3387 3388 3389 3390 3391 3392 3393 3394 3395 3396 3397 3398 3399 3400 3401 3402 3403 3404 3405 3406 3407 3408 3409 3410 3411 3412 3413 3414 3415 3416 3417 3418 3419 3420 3421 3422 3423 3424 3425 3426 3427 3428 3429 3430 3431 3432 3433 3434 3435 3436 3437 3438 3439 3440 3441 3442 3443 3444 3445 3446 3447 3448 3449 3450 3451 3452 3453 3454 3455 3456 3457 3458 3459 3460 3461 3462 3463 3464 3465 3466 3467 3468 3469 3470 3471 3472 3473 3474 3475 3476 3477 3478 3479 3480 3481 3482 3483 3484 3485 3486 3487 3488 3489 3490 3491 3492 3493 3494 3495 3496 3497 3498 3499 3500 3501 3502 3503 3504 3505 3506 3507 3508 3509 3510 3511 3512 3513 3514 3515 3516 3517 3518 3519 3520 3521 3522 3523 3524 3525 3526 3527 3528 3529 3530 3531 3532 3533 3534 3535 3536 3537 3538 3539 3540 3541 3542 3543 3544 3545 3546 3547 3548 3549 3550 3551 3552 3553 3554 3555 3556 3557 3558 3559 3560 3561 3562 3563 3564 3565 3566 3567 3568 3569 3570 3571 3572 3573 3574 3575 3576 3577 3578 3579 3580 3581 3582 3583 3584 3585 3586 3587 3588 3589 3590 3591 3592 3593 3594 3595 3596 3597 3598 3599 3600 3601 3602 3603 3604 3605 3606 3607 3608 3609 3610 3611 3612 3613 3614 3615 3616 3617 3618 3619 3620 3621 3622 3623 3624 3625 3626 3627 3628 3629 3630 3631 3632 3633 3634 3635 3636 3637 3638 3639 3640 3641 3642 3643 3644 3645 3646 3647 3648 3649 3650 3651 3652 3653 3654 3655 3656 3657 3658 3659 3660 3661 3662 3663 3664 3665 3666 3667 3668 3669 3670 3671 3672 3673 3674 3675 3676 3677 3678 3679 3680 3681 3682 3683 3684 3685 3686 3687 3688 3689 3690 3691 3692 3693 3694 3695 3696 3697 3698 3699 3700 3701 3702 3703 3704 3705 3706 3707 3708 3709 3710 3711 3712 3713 3714 3715 3716 3717 3718 3719 3720 3721 3722 3723 3724 3725 3726 3727 3728 3729 3730 3731 3732 3733 3734 3735 3736 3737 3738 3739 3740 3741 3742 3743 3744 3745 3746 3747 3748 3749 3750 3751 3752 3753 3754 3755 3756 3757 3758 3759 3760 3761 3762 3763 3764 3765 3766 3767 3768 3769 3770 3771 3772 3773 3774 3775 3776 3777 3778 3779 3780 3781 3782 3783 3784 3785 3786 3787 3788 3789 3790 3791 3792 3793 3794 3795 3796 3797 3798 3799 3800 3801 3802 3803 3804 3805 3806 3807 3808 3809 3810 3811 3812 3813 3814 3815 3816 3817 3818 3819 3820 3821 3822 3823 3824 3825 3826 3827 3828 3829 3830 3831 3832 3833 3834 3835 3836 3837 3838 3839 3840 3841 3842 3843 3844 3845 3846 3847 3848 3849 3850 3851 3852 3853 3854 3855 3856 3857 3858 3859 3860 3861 3862 3863 3864 3865 3866 3867 3868 3869 3870 3871 3872 3873 3874 3875 3876 3877 // SPDX-License-Identifier: GPL-2.0-or-later /* * net/dsa/user.c - user device handling * Copyright (c) 2008-2009 Marvell Semiconductor */ #include <linux/list.h> #include <linux/etherdevice.h> #include <linux/netdevice.h> #include <linux/phy.h> #include <linux/phy_fixed.h> #include <linux/phylink.h> #include <linux/of_net.h> #include <linux/of_mdio.h> #include <linux/mdio.h> #include <net/rtnetlink.h> #include <net/pkt_cls.h> #include <net/selftests.h> #include <net/tc_act/tc_mirred.h> #include <linux/if_bridge.h> #include <linux/if_hsr.h> #include <net/dcbnl.h> #include <linux/netpoll.h> #include <linux/string.h> #include "conduit.h" #include "dsa.h" #include "netlink.h" #include "port.h" #include "switch.h" #include "tag.h" #include "user.h" struct dsa_switchdev_event_work { struct net_device *dev; struct net_device *orig_dev; struct work_struct work; unsigned long event; /* Specific for SWITCHDEV_FDB_ADD_TO_DEVICE and * SWITCHDEV_FDB_DEL_TO_DEVICE */ unsigned char addr[ETH_ALEN]; u16 vid; bool host_addr; }; enum dsa_standalone_event { DSA_UC_ADD, DSA_UC_DEL, DSA_MC_ADD, DSA_MC_DEL, }; struct dsa_standalone_event_work { struct work_struct work; struct net_device *dev; enum dsa_standalone_event event; unsigned char addr[ETH_ALEN]; u16 vid; }; struct dsa_host_vlan_rx_filtering_ctx { struct net_device *dev; const unsigned char *addr; enum dsa_standalone_event event; }; static bool dsa_switch_supports_uc_filtering(struct dsa_switch *ds) { return ds->ops->port_fdb_add && ds->ops->port_fdb_del && ds->fdb_isolation && !ds->vlan_filtering_is_global && !ds->needs_standalone_vlan_filtering; } static bool dsa_switch_supports_mc_filtering(struct dsa_switch *ds) { return ds->ops->port_mdb_add && ds->ops->port_mdb_del && ds->fdb_isolation && !ds->vlan_filtering_is_global && !ds->needs_standalone_vlan_filtering; } static void dsa_user_standalone_event_work(struct work_struct *work) { struct dsa_standalone_event_work *standalone_work = container_of(work, struct dsa_standalone_event_work, work); const unsigned char *addr = standalone_work->addr; struct net_device *dev = standalone_work->dev; struct dsa_port *dp = dsa_user_to_port(dev); struct switchdev_obj_port_mdb mdb; struct dsa_switch *ds = dp->ds; u16 vid = standalone_work->vid; int err; switch (standalone_work->event) { case DSA_UC_ADD: err = dsa_port_standalone_host_fdb_add(dp, addr, vid); if (err) { dev_err(ds->dev, "port %d failed to add %pM vid %d to fdb: %d\n", dp->index, addr, vid, err); break; } break; case DSA_UC_DEL: err = dsa_port_standalone_host_fdb_del(dp, addr, vid); if (err) { dev_err(ds->dev, "port %d failed to delete %pM vid %d from fdb: %d\n", dp->index, addr, vid, err); } break; case DSA_MC_ADD: ether_addr_copy(mdb.addr, addr); mdb.vid = vid; err = dsa_port_standalone_host_mdb_add(dp, &mdb); if (err) { dev_err(ds->dev, "port %d failed to add %pM vid %d to mdb: %d\n", dp->index, addr, vid, err); break; } break; case DSA_MC_DEL: ether_addr_copy(mdb.addr, addr); mdb.vid = vid; err = dsa_port_standalone_host_mdb_del(dp, &mdb); if (err) { dev_err(ds->dev, "port %d failed to delete %pM vid %d from mdb: %d\n", dp->index, addr, vid, err); } break; } kfree(standalone_work); } static int dsa_user_schedule_standalone_work(struct net_device *dev, enum dsa_standalone_event event, const unsigned char *addr, u16 vid) { struct dsa_standalone_event_work *standalone_work; standalone_work = kzalloc(sizeof(*standalone_work), GFP_ATOMIC); if (!standalone_work) return -ENOMEM; INIT_WORK(&standalone_work->work, dsa_user_standalone_event_work); standalone_work->event = event; standalone_work->dev = dev; ether_addr_copy(standalone_work->addr, addr); standalone_work->vid = vid; dsa_schedule_work(&standalone_work->work); return 0; } static int dsa_user_host_vlan_rx_filtering(void *arg, int vid) { struct dsa_host_vlan_rx_filtering_ctx *ctx = arg; return dsa_user_schedule_standalone_work(ctx->dev, ctx->event, ctx->addr, vid); } static int dsa_user_vlan_for_each(struct net_device *dev, int (*cb)(void *arg, int vid), void *arg) { struct dsa_port *dp = dsa_user_to_port(dev); struct dsa_vlan *v; int err; lockdep_assert_held(&dev->addr_list_lock); err = cb(arg, 0); if (err) return err; list_for_each_entry(v, &dp->user_vlans, list) { err = cb(arg, v->vid); if (err) return err; } return 0; } static int dsa_user_sync_uc(struct net_device *dev, const unsigned char *addr) { struct net_device *conduit = dsa_user_to_conduit(dev); struct dsa_port *dp = dsa_user_to_port(dev); struct dsa_host_vlan_rx_filtering_ctx ctx = { .dev = dev, .addr = addr, .event = DSA_UC_ADD, }; dev_uc_add(conduit, addr); if (!dsa_switch_supports_uc_filtering(dp->ds)) return 0; return dsa_user_vlan_for_each(dev, dsa_user_host_vlan_rx_filtering, &ctx); } static int dsa_user_unsync_uc(struct net_device *dev, const unsigned char *addr) { struct net_device *conduit = dsa_user_to_conduit(dev); struct dsa_port *dp = dsa_user_to_port(dev); struct dsa_host_vlan_rx_filtering_ctx ctx = { .dev = dev, .addr = addr, .event = DSA_UC_DEL, }; dev_uc_del(conduit, addr); if (!dsa_switch_supports_uc_filtering(dp->ds)) return 0; return dsa_user_vlan_for_each(dev, dsa_user_host_vlan_rx_filtering, &ctx); } static int dsa_user_sync_mc(struct net_device *dev, const unsigned char *addr) { struct net_device *conduit = dsa_user_to_conduit(dev); struct dsa_port *dp = dsa_user_to_port(dev); struct dsa_host_vlan_rx_filtering_ctx ctx = { .dev = dev, .addr = addr, .event = DSA_MC_ADD, }; dev_mc_add(conduit, addr); if (!dsa_switch_supports_mc_filtering(dp->ds)) return 0; return dsa_user_vlan_for_each(dev, dsa_user_host_vlan_rx_filtering, &ctx); } static int dsa_user_unsync_mc(struct net_device *dev, const unsigned char *addr) { struct net_device *conduit = dsa_user_to_conduit(dev); struct dsa_port *dp = dsa_user_to_port(dev); struct dsa_host_vlan_rx_filtering_ctx ctx = { .dev = dev, .addr = addr, .event = DSA_MC_DEL, }; dev_mc_del(conduit, addr); if (!dsa_switch_supports_mc_filtering(dp->ds)) return 0; return dsa_user_vlan_for_each(dev, dsa_user_host_vlan_rx_filtering, &ctx); } void dsa_user_sync_ha(struct net_device *dev) { struct dsa_port *dp = dsa_user_to_port(dev); struct dsa_switch *ds = dp->ds; struct netdev_hw_addr *ha; netif_addr_lock_bh(dev); netdev_for_each_synced_mc_addr(ha, dev) dsa_user_sync_mc(dev, ha->addr); netdev_for_each_synced_uc_addr(ha, dev) dsa_user_sync_uc(dev, ha->addr); netif_addr_unlock_bh(dev); if (dsa_switch_supports_uc_filtering(ds) || dsa_switch_supports_mc_filtering(ds)) dsa_flush_workqueue(); } void dsa_user_unsync_ha(struct net_device *dev) { struct dsa_port *dp = dsa_user_to_port(dev); struct dsa_switch *ds = dp->ds; struct netdev_hw_addr *ha; netif_addr_lock_bh(dev); netdev_for_each_synced_uc_addr(ha, dev) dsa_user_unsync_uc(dev, ha->addr); netdev_for_each_synced_mc_addr(ha, dev) dsa_user_unsync_mc(dev, ha->addr); netif_addr_unlock_bh(dev); if (dsa_switch_supports_uc_filtering(ds) || dsa_switch_supports_mc_filtering(ds)) dsa_flush_workqueue(); } /* user mii_bus handling ***************************************************/ static int dsa_user_phy_read(struct mii_bus *bus, int addr, int reg) { struct dsa_switch *ds = bus->priv; if (ds->phys_mii_mask & (1 << addr)) return ds->ops->phy_read(ds, addr, reg); return 0xffff; } static int dsa_user_phy_write(struct mii_bus *bus, int addr, int reg, u16 val) { struct dsa_switch *ds = bus->priv; if (ds->phys_mii_mask & (1 << addr)) return ds->ops->phy_write(ds, addr, reg, val); return 0; } void dsa_user_mii_bus_init(struct dsa_switch *ds) { ds->user_mii_bus->priv = (void *)ds; ds->user_mii_bus->name = "dsa user smi"; ds->user_mii_bus->read = dsa_user_phy_read; ds->user_mii_bus->write = dsa_user_phy_write; snprintf(ds->user_mii_bus->id, MII_BUS_ID_SIZE, "dsa-%d.%d", ds->dst->index, ds->index); ds->user_mii_bus->parent = ds->dev; ds->user_mii_bus->phy_mask = ~ds->phys_mii_mask; } /* user device handling ****************************************************/ static int dsa_user_get_iflink(const struct net_device *dev) { return READ_ONCE(dsa_user_to_conduit(dev)->ifindex); } int dsa_user_host_uc_install(struct net_device *dev, const u8 *addr) { struct net_device *conduit = dsa_user_to_conduit(dev); struct dsa_port *dp = dsa_user_to_port(dev); struct dsa_switch *ds = dp->ds; int err; if (dsa_switch_supports_uc_filtering(ds)) { err = dsa_port_standalone_host_fdb_add(dp, addr, 0); if (err) goto out; } if (!ether_addr_equal(addr, conduit->dev_addr)) { err = dev_uc_add(conduit, addr); if (err < 0) goto del_host_addr; } return 0; del_host_addr: if (dsa_switch_supports_uc_filtering(ds)) dsa_port_standalone_host_fdb_del(dp, addr, 0); out: return err; } void dsa_user_host_uc_uninstall(struct net_device *dev) { struct net_device *conduit = dsa_user_to_conduit(dev); struct dsa_port *dp = dsa_user_to_port(dev); struct dsa_switch *ds = dp->ds; if (!ether_addr_equal(dev->dev_addr, conduit->dev_addr)) dev_uc_del(conduit, dev->dev_addr); if (dsa_switch_supports_uc_filtering(ds)) dsa_port_standalone_host_fdb_del(dp, dev->dev_addr, 0); } static int dsa_user_open(struct net_device *dev) { struct net_device *conduit = dsa_user_to_conduit(dev); struct dsa_port *dp = dsa_user_to_port(dev); int err; err = dev_open(conduit, NULL); if (err < 0) { netdev_err(dev, "failed to open conduit %s\n", conduit->name); goto out; } err = dsa_user_host_uc_install(dev, dev->dev_addr); if (err) goto out; err = dsa_port_enable_rt(dp, dev->phydev); if (err) goto out_del_host_uc; return 0; out_del_host_uc: dsa_user_host_uc_uninstall(dev); out: return err; } static int dsa_user_close(struct net_device *dev) { struct dsa_port *dp = dsa_user_to_port(dev); dsa_port_disable_rt(dp); dsa_user_host_uc_uninstall(dev); return 0; } static void dsa_user_manage_host_flood(struct net_device *dev) { bool mc = dev->flags & (IFF_PROMISC | IFF_ALLMULTI); struct dsa_port *dp = dsa_user_to_port(dev); bool uc = dev->flags & IFF_PROMISC; dsa_port_set_host_flood(dp, uc, mc); } static void dsa_user_change_rx_flags(struct net_device *dev, int change) { struct net_device *conduit = dsa_user_to_conduit(dev); struct dsa_port *dp = dsa_user_to_port(dev); struct dsa_switch *ds = dp->ds; if (change & IFF_ALLMULTI) dev_set_allmulti(conduit, dev->flags & IFF_ALLMULTI ? 1 : -1); if (change & IFF_PROMISC) dev_set_promiscuity(conduit, dev->flags & IFF_PROMISC ? 1 : -1); if (dsa_switch_supports_uc_filtering(ds) && dsa_switch_supports_mc_filtering(ds)) dsa_user_manage_host_flood(dev); } static void dsa_user_set_rx_mode(struct net_device *dev) { __dev_mc_sync(dev, dsa_user_sync_mc, dsa_user_unsync_mc); __dev_uc_sync(dev, dsa_user_sync_uc, dsa_user_unsync_uc); } static int dsa_user_set_mac_address(struct net_device *dev, void *a) { struct dsa_port *dp = dsa_user_to_port(dev); struct dsa_switch *ds = dp->ds; struct sockaddr *addr = a; int err; if (!is_valid_ether_addr(addr->sa_data)) return -EADDRNOTAVAIL; if (ds->ops->port_set_mac_address) { err = ds->ops->port_set_mac_address(ds, dp->index, addr->sa_data); if (err) return err; } /* If the port is down, the address isn't synced yet to hardware or * to the DSA conduit, so there is nothing to change. */ if (!(dev->flags & IFF_UP)) goto out_change_dev_addr; err = dsa_user_host_uc_install(dev, addr->sa_data); if (err) return err; dsa_user_host_uc_uninstall(dev); out_change_dev_addr: eth_hw_addr_set(dev, addr->sa_data); return 0; } struct dsa_user_dump_ctx { struct net_device *dev; struct sk_buff *skb; struct netlink_callback *cb; int idx; }; static int dsa_user_port_fdb_do_dump(const unsigned char *addr, u16 vid, bool is_static, void *data) { struct dsa_user_dump_ctx *dump = data; struct ndo_fdb_dump_context *ctx = (void *)dump->cb->ctx; u32 portid = NETLINK_CB(dump->cb->skb).portid; u32 seq = dump->cb->nlh->nlmsg_seq; struct nlmsghdr *nlh; struct ndmsg *ndm; if (dump->idx < ctx->fdb_idx) goto skip; nlh = nlmsg_put(dump->skb, portid, seq, RTM_NEWNEIGH, sizeof(*ndm), NLM_F_MULTI); if (!nlh) return -EMSGSIZE; ndm = nlmsg_data(nlh); ndm->ndm_family = AF_BRIDGE; ndm->ndm_pad1 = 0; ndm->ndm_pad2 = 0; ndm->ndm_flags = NTF_SELF; ndm->ndm_type = 0; ndm->ndm_ifindex = dump->dev->ifindex; ndm->ndm_state = is_static ? NUD_NOARP : NUD_REACHABLE; if (nla_put(dump->skb, NDA_LLADDR, ETH_ALEN, addr)) goto nla_put_failure; if (vid && nla_put_u16(dump->skb, NDA_VLAN, vid)) goto nla_put_failure; nlmsg_end(dump->skb, nlh); skip: dump->idx++; return 0; nla_put_failure: nlmsg_cancel(dump->skb, nlh); return -EMSGSIZE; } static int dsa_user_fdb_dump(struct sk_buff *skb, struct netlink_callback *cb, struct net_device *dev, struct net_device *filter_dev, int *idx) { struct dsa_port *dp = dsa_user_to_port(dev); struct dsa_user_dump_ctx dump = { .dev = dev, .skb = skb, .cb = cb, .idx = *idx, }; int err; err = dsa_port_fdb_dump(dp, dsa_user_port_fdb_do_dump, &dump); *idx = dump.idx; return err; } static int dsa_user_ioctl(struct net_device *dev, struct ifreq *ifr, int cmd) { struct dsa_user_priv *p = netdev_priv(dev); return phylink_mii_ioctl(p->dp->pl, ifr, cmd); } static int dsa_user_port_attr_set(struct net_device *dev, const void *ctx, const struct switchdev_attr *attr, struct netlink_ext_ack *extack) { struct dsa_port *dp = dsa_user_to_port(dev); int ret; if (ctx && ctx != dp) return 0; switch (attr->id) { case SWITCHDEV_ATTR_ID_PORT_STP_STATE: if (!dsa_port_offloads_bridge_port(dp, attr->orig_dev)) return -EOPNOTSUPP; ret = dsa_port_set_state(dp, attr->u.stp_state, true); break; case SWITCHDEV_ATTR_ID_PORT_MST_STATE: if (!dsa_port_offloads_bridge_port(dp, attr->orig_dev)) return -EOPNOTSUPP; ret = dsa_port_set_mst_state(dp, &attr->u.mst_state, extack); break; case SWITCHDEV_ATTR_ID_BRIDGE_VLAN_FILTERING: if (!dsa_port_offloads_bridge_dev(dp, attr->orig_dev)) return -EOPNOTSUPP; ret = dsa_port_vlan_filtering(dp, attr->u.vlan_filtering, extack); break; case SWITCHDEV_ATTR_ID_BRIDGE_AGEING_TIME: if (!dsa_port_offloads_bridge_dev(dp, attr->orig_dev)) return -EOPNOTSUPP; ret = dsa_port_ageing_time(dp, attr->u.ageing_time); break; case SWITCHDEV_ATTR_ID_BRIDGE_MST: if (!dsa_port_offloads_bridge_dev(dp, attr->orig_dev)) return -EOPNOTSUPP; ret = dsa_port_mst_enable(dp, attr->u.mst, extack); break; case SWITCHDEV_ATTR_ID_PORT_PRE_BRIDGE_FLAGS: if (!dsa_port_offloads_bridge_port(dp, attr->orig_dev)) return -EOPNOTSUPP; ret = dsa_port_pre_bridge_flags(dp, attr->u.brport_flags, extack); break; case SWITCHDEV_ATTR_ID_PORT_BRIDGE_FLAGS: if (!dsa_port_offloads_bridge_port(dp, attr->orig_dev)) return -EOPNOTSUPP; ret = dsa_port_bridge_flags(dp, attr->u.brport_flags, extack); break; case SWITCHDEV_ATTR_ID_VLAN_MSTI: if (!dsa_port_offloads_bridge_dev(dp, attr->orig_dev)) return -EOPNOTSUPP; ret = dsa_port_vlan_msti(dp, &attr->u.vlan_msti); break; default: ret = -EOPNOTSUPP; break; } return ret; } /* Must be called under rcu_read_lock() */ static int dsa_user_vlan_check_for_8021q_uppers(struct net_device *user, const struct switchdev_obj_port_vlan *vlan) { struct net_device *upper_dev; struct list_head *iter; netdev_for_each_upper_dev_rcu(user, upper_dev, iter) { u16 vid; if (!is_vlan_dev(upper_dev)) continue; vid = vlan_dev_vlan_id(upper_dev); if (vid == vlan->vid) return -EBUSY; } return 0; } static int dsa_user_vlan_add(struct net_device *dev, const struct switchdev_obj *obj, struct netlink_ext_ack *extack) { struct dsa_port *dp = dsa_user_to_port(dev); struct switchdev_obj_port_vlan *vlan; int err; if (dsa_port_skip_vlan_configuration(dp)) { NL_SET_ERR_MSG_MOD(extack, "skipping configuration of VLAN"); return 0; } vlan = SWITCHDEV_OBJ_PORT_VLAN(obj); /* Deny adding a bridge VLAN when there is already an 802.1Q upper with * the same VID. */ if (br_vlan_enabled(dsa_port_bridge_dev_get(dp))) { rcu_read_lock(); err = dsa_user_vlan_check_for_8021q_uppers(dev, vlan); rcu_read_unlock(); if (err) { NL_SET_ERR_MSG_MOD(extack, "Port already has a VLAN upper with this VID"); return err; } } return dsa_port_vlan_add(dp, vlan, extack); } /* Offload a VLAN installed on the bridge or on a foreign interface by * installing it as a VLAN towards the CPU port. */ static int dsa_user_host_vlan_add(struct net_device *dev, const struct switchdev_obj *obj, struct netlink_ext_ack *extack) { struct dsa_port *dp = dsa_user_to_port(dev); struct switchdev_obj_port_vlan vlan; /* Do nothing if this is a software bridge */ if (!dp->bridge) return -EOPNOTSUPP; if (dsa_port_skip_vlan_configuration(dp)) { NL_SET_ERR_MSG_MOD(extack, "skipping configuration of VLAN"); return 0; } vlan = *SWITCHDEV_OBJ_PORT_VLAN(obj); /* Even though drivers often handle CPU membership in special ways, * it doesn't make sense to program a PVID, so clear this flag. */ vlan.flags &= ~BRIDGE_VLAN_INFO_PVID; return dsa_port_host_vlan_add(dp, &vlan, extack); } static int dsa_user_port_obj_add(struct net_device *dev, const void *ctx, const struct switchdev_obj *obj, struct netlink_ext_ack *extack) { struct dsa_port *dp = dsa_user_to_port(dev); int err; if (ctx && ctx != dp) return 0; switch (obj->id) { case SWITCHDEV_OBJ_ID_PORT_MDB: if (!dsa_port_offloads_bridge_port(dp, obj->orig_dev)) return -EOPNOTSUPP; err = dsa_port_mdb_add(dp, SWITCHDEV_OBJ_PORT_MDB(obj)); break; case SWITCHDEV_OBJ_ID_HOST_MDB: if (!dsa_port_offloads_bridge_dev(dp, obj->orig_dev)) return -EOPNOTSUPP; err = dsa_port_bridge_host_mdb_add(dp, SWITCHDEV_OBJ_PORT_MDB(obj)); break; case SWITCHDEV_OBJ_ID_PORT_VLAN: if (dsa_port_offloads_bridge_port(dp, obj->orig_dev)) err = dsa_user_vlan_add(dev, obj, extack); else err = dsa_user_host_vlan_add(dev, obj, extack); break; case SWITCHDEV_OBJ_ID_MRP: if (!dsa_port_offloads_bridge_dev(dp, obj->orig_dev)) return -EOPNOTSUPP; err = dsa_port_mrp_add(dp, SWITCHDEV_OBJ_MRP(obj)); break; case SWITCHDEV_OBJ_ID_RING_ROLE_MRP: if (!dsa_port_offloads_bridge_dev(dp, obj->orig_dev)) return -EOPNOTSUPP; err = dsa_port_mrp_add_ring_role(dp, SWITCHDEV_OBJ_RING_ROLE_MRP(obj)); break; default: err = -EOPNOTSUPP; break; } return err; } static int dsa_user_vlan_del(struct net_device *dev, const struct switchdev_obj *obj) { struct dsa_port *dp = dsa_user_to_port(dev); struct switchdev_obj_port_vlan *vlan; if (dsa_port_skip_vlan_configuration(dp)) return 0; vlan = SWITCHDEV_OBJ_PORT_VLAN(obj); return dsa_port_vlan_del(dp, vlan); } static int dsa_user_host_vlan_del(struct net_device *dev, const struct switchdev_obj *obj) { struct dsa_port *dp = dsa_user_to_port(dev); struct switchdev_obj_port_vlan *vlan; /* Do nothing if this is a software bridge */ if (!dp->bridge) return -EOPNOTSUPP; if (dsa_port_skip_vlan_configuration(dp)) return 0; vlan = SWITCHDEV_OBJ_PORT_VLAN(obj); return dsa_port_host_vlan_del(dp, vlan); } static int dsa_user_port_obj_del(struct net_device *dev, const void *ctx, const struct switchdev_obj *obj) { struct dsa_port *dp = dsa_user_to_port(dev); int err; if (ctx && ctx != dp) return 0; switch (obj->id) { case SWITCHDEV_OBJ_ID_PORT_MDB: if (!dsa_port_offloads_bridge_port(dp, obj->orig_dev)) return -EOPNOTSUPP; err = dsa_port_mdb_del(dp, SWITCHDEV_OBJ_PORT_MDB(obj)); break; case SWITCHDEV_OBJ_ID_HOST_MDB: if (!dsa_port_offloads_bridge_dev(dp, obj->orig_dev)) return -EOPNOTSUPP; err = dsa_port_bridge_host_mdb_del(dp, SWITCHDEV_OBJ_PORT_MDB(obj)); break; case SWITCHDEV_OBJ_ID_PORT_VLAN: if (dsa_port_offloads_bridge_port(dp, obj->orig_dev)) err = dsa_user_vlan_del(dev, obj); else err = dsa_user_host_vlan_del(dev, obj); break; case SWITCHDEV_OBJ_ID_MRP: if (!dsa_port_offloads_bridge_dev(dp, obj->orig_dev)) return -EOPNOTSUPP; err = dsa_port_mrp_del(dp, SWITCHDEV_OBJ_MRP(obj)); break; case SWITCHDEV_OBJ_ID_RING_ROLE_MRP: if (!dsa_port_offloads_bridge_dev(dp, obj->orig_dev)) return -EOPNOTSUPP; err = dsa_port_mrp_del_ring_role(dp, SWITCHDEV_OBJ_RING_ROLE_MRP(obj)); break; default: err = -EOPNOTSUPP; break; } return err; } static netdev_tx_t dsa_user_netpoll_send_skb(struct net_device *dev, struct sk_buff *skb) { #ifdef CONFIG_NET_POLL_CONTROLLER struct dsa_user_priv *p = netdev_priv(dev); return netpoll_send_skb(p->netpoll, skb); #else BUG(); return NETDEV_TX_OK; #endif } static void dsa_skb_tx_timestamp(struct dsa_user_priv *p, struct sk_buff *skb) { struct dsa_switch *ds = p->dp->ds; if (!(skb_shinfo(skb)->tx_flags & SKBTX_HW_TSTAMP_NOBPF)) return; if (!ds->ops->port_txtstamp) return; ds->ops->port_txtstamp(ds, p->dp->index, skb); } netdev_tx_t dsa_enqueue_skb(struct sk_buff *skb, struct net_device *dev) { /* SKB for netpoll still need to be mangled with the protocol-specific * tag to be successfully transmitted */ if (unlikely(netpoll_tx_running(dev))) return dsa_user_netpoll_send_skb(dev, skb); /* Queue the SKB for transmission on the parent interface, but * do not modify its EtherType */ skb->dev = dsa_user_to_conduit(dev); dev_queue_xmit(skb); return NETDEV_TX_OK; } EXPORT_SYMBOL_GPL(dsa_enqueue_skb); static netdev_tx_t dsa_user_xmit(struct sk_buff *skb, struct net_device *dev) { struct dsa_user_priv *p = netdev_priv(dev); struct sk_buff *nskb; dev_sw_netstats_tx_add(dev, 1, skb->len); memset(skb->cb, 0, sizeof(skb->cb)); /* Handle tx timestamp if any */ dsa_skb_tx_timestamp(p, skb); if (skb_ensure_writable_head_tail(skb, dev)) { dev_kfree_skb_any(skb); return NETDEV_TX_OK; } /* needed_tailroom should still be 'warm' in the cache line from * skb_ensure_writable_head_tail(), which has also ensured that * padding is safe. */ if (dev->needed_tailroom) eth_skb_pad(skb); /* Transmit function may have to reallocate the original SKB, * in which case it must have freed it. Only free it here on error. */ nskb = p->xmit(skb, dev); if (!nskb) { kfree_skb(skb); return NETDEV_TX_OK; } return dsa_enqueue_skb(nskb, dev); } /* ethtool operations *******************************************************/ static void dsa_user_get_drvinfo(struct net_device *dev, struct ethtool_drvinfo *drvinfo) { strscpy(drvinfo->driver, "dsa", sizeof(drvinfo->driver)); strscpy(drvinfo->fw_version, "N/A", sizeof(drvinfo->fw_version)); strscpy(drvinfo->bus_info, "platform", sizeof(drvinfo->bus_info)); } static int dsa_user_get_regs_len(struct net_device *dev) { struct dsa_port *dp = dsa_user_to_port(dev); struct dsa_switch *ds = dp->ds; if (ds->ops->get_regs_len) return ds->ops->get_regs_len(ds, dp->index); return -EOPNOTSUPP; } static void dsa_user_get_regs(struct net_device *dev, struct ethtool_regs *regs, void *_p) { struct dsa_port *dp = dsa_user_to_port(dev); struct dsa_switch *ds = dp->ds; if (ds->ops->get_regs) ds->ops->get_regs(ds, dp->index, regs, _p); } static int dsa_user_nway_reset(struct net_device *dev) { struct dsa_port *dp = dsa_user_to_port(dev); return phylink_ethtool_nway_reset(dp->pl); } static int dsa_user_get_eeprom_len(struct net_device *dev) { struct dsa_port *dp = dsa_user_to_port(dev); struct dsa_switch *ds = dp->ds; if (ds->cd && ds->cd->eeprom_len) return ds->cd->eeprom_len; if (ds->ops->get_eeprom_len) return ds->ops->get_eeprom_len(ds); return 0; } static int dsa_user_get_eeprom(struct net_device *dev, struct ethtool_eeprom *eeprom, u8 *data) { struct dsa_port *dp = dsa_user_to_port(dev); struct dsa_switch *ds = dp->ds; if (ds->ops->get_eeprom) return ds->ops->get_eeprom(ds, eeprom, data); return -EOPNOTSUPP; } static int dsa_user_set_eeprom(struct net_device *dev, struct ethtool_eeprom *eeprom, u8 *data) { struct dsa_port *dp = dsa_user_to_port(dev); struct dsa_switch *ds = dp->ds; if (ds->ops->set_eeprom) return ds->ops->set_eeprom(ds, eeprom, data); return -EOPNOTSUPP; } static void dsa_user_get_strings(struct net_device *dev, uint32_t stringset, uint8_t *data) { struct dsa_port *dp = dsa_user_to_port(dev); struct dsa_switch *ds = dp->ds; if (stringset == ETH_SS_STATS) { ethtool_puts(&data, "tx_packets"); ethtool_puts(&data, "tx_bytes"); ethtool_puts(&data, "rx_packets"); ethtool_puts(&data, "rx_bytes"); if (ds->ops->get_strings) ds->ops->get_strings(ds, dp->index, stringset, data); } else if (stringset == ETH_SS_TEST) { net_selftest_get_strings(data); } } static void dsa_user_get_ethtool_stats(struct net_device *dev, struct ethtool_stats *stats, uint64_t *data) { struct dsa_port *dp = dsa_user_to_port(dev); struct dsa_switch *ds = dp->ds; struct pcpu_sw_netstats *s; unsigned int start; int i; for_each_possible_cpu(i) { u64 tx_packets, tx_bytes, rx_packets, rx_bytes; s = per_cpu_ptr(dev->tstats, i); do { start = u64_stats_fetch_begin(&s->syncp); tx_packets = u64_stats_read(&s->tx_packets); tx_bytes = u64_stats_read(&s->tx_bytes); rx_packets = u64_stats_read(&s->rx_packets); rx_bytes = u64_stats_read(&s->rx_bytes); } while (u64_stats_fetch_retry(&s->syncp, start)); data[0] += tx_packets; data[1] += tx_bytes; data[2] += rx_packets; data[3] += rx_bytes; } if (ds->ops->get_ethtool_stats) ds->ops->get_ethtool_stats(ds, dp->index, data + 4); } static int dsa_user_get_sset_count(struct net_device *dev, int sset) { struct dsa_port *dp = dsa_user_to_port(dev); struct dsa_switch *ds = dp->ds; if (sset == ETH_SS_STATS) { int count = 0; if (ds->ops->get_sset_count) { count = ds->ops->get_sset_count(ds, dp->index, sset); if (count < 0) return count; } return count + 4; } else if (sset == ETH_SS_TEST) { return net_selftest_get_count(); } return -EOPNOTSUPP; } static void dsa_user_get_eth_phy_stats(struct net_device *dev, struct ethtool_eth_phy_stats *phy_stats) { struct dsa_port *dp = dsa_user_to_port(dev); struct dsa_switch *ds = dp->ds; if (ds->ops->get_eth_phy_stats) ds->ops->get_eth_phy_stats(ds, dp->index, phy_stats); } static void dsa_user_get_eth_mac_stats(struct net_device *dev, struct ethtool_eth_mac_stats *mac_stats) { struct dsa_port *dp = dsa_user_to_port(dev); struct dsa_switch *ds = dp->ds; if (ds->ops->get_eth_mac_stats) ds->ops->get_eth_mac_stats(ds, dp->index, mac_stats); } static void dsa_user_get_eth_ctrl_stats(struct net_device *dev, struct ethtool_eth_ctrl_stats *ctrl_stats) { struct dsa_port *dp = dsa_user_to_port(dev); struct dsa_switch *ds = dp->ds; if (ds->ops->get_eth_ctrl_stats) ds->ops->get_eth_ctrl_stats(ds, dp->index, ctrl_stats); } static void dsa_user_get_rmon_stats(struct net_device *dev, struct ethtool_rmon_stats *rmon_stats, const struct ethtool_rmon_hist_range **ranges) { struct dsa_port *dp = dsa_user_to_port(dev); struct dsa_switch *ds = dp->ds; if (ds->ops->get_rmon_stats) ds->ops->get_rmon_stats(ds, dp->index, rmon_stats, ranges); } static void dsa_user_get_ts_stats(struct net_device *dev, struct ethtool_ts_stats *ts_stats) { struct dsa_port *dp = dsa_user_to_port(dev); struct dsa_switch *ds = dp->ds; if (ds->ops->get_ts_stats) ds->ops->get_ts_stats(ds, dp->index, ts_stats); } static void dsa_user_net_selftest(struct net_device *ndev, struct ethtool_test *etest, u64 *buf) { struct dsa_port *dp = dsa_user_to_port(ndev); struct dsa_switch *ds = dp->ds; if (ds->ops->self_test) { ds->ops->self_test(ds, dp->index, etest, buf); return; } net_selftest(ndev, etest, buf); } static int dsa_user_get_mm(struct net_device *dev, struct ethtool_mm_state *state) { struct dsa_port *dp = dsa_user_to_port(dev); struct dsa_switch *ds = dp->ds; if (!ds->ops->get_mm) return -EOPNOTSUPP; return ds->ops->get_mm(ds, dp->index, state); } static int dsa_user_set_mm(struct net_device *dev, struct ethtool_mm_cfg *cfg, struct netlink_ext_ack *extack) { struct dsa_port *dp = dsa_user_to_port(dev); struct dsa_switch *ds = dp->ds; if (!ds->ops->set_mm) return -EOPNOTSUPP; return ds->ops->set_mm(ds, dp->index, cfg, extack); } static void dsa_user_get_mm_stats(struct net_device *dev, struct ethtool_mm_stats *stats) { struct dsa_port *dp = dsa_user_to_port(dev); struct dsa_switch *ds = dp->ds; if (ds->ops->get_mm_stats) ds->ops->get_mm_stats(ds, dp->index, stats); } static void dsa_user_get_wol(struct net_device *dev, struct ethtool_wolinfo *w) { struct dsa_port *dp = dsa_user_to_port(dev); struct dsa_switch *ds = dp->ds; phylink_ethtool_get_wol(dp->pl, w); if (ds->ops->get_wol) ds->ops->get_wol(ds, dp->index, w); } static int dsa_user_set_wol(struct net_device *dev, struct ethtool_wolinfo *w) { struct dsa_port *dp = dsa_user_to_port(dev); struct dsa_switch *ds = dp->ds; int ret = -EOPNOTSUPP; phylink_ethtool_set_wol(dp->pl, w); if (ds->ops->set_wol) ret = ds->ops->set_wol(ds, dp->index, w); return ret; } static int dsa_user_set_eee(struct net_device *dev, struct ethtool_keee *e) { struct dsa_port *dp = dsa_user_to_port(dev); struct dsa_switch *ds = dp->ds; int ret; /* Check whether the switch supports EEE */ if (!ds->ops->support_eee || !ds->ops->support_eee(ds, dp->index)) return -EOPNOTSUPP; /* If the port is using phylink managed EEE, then an unimplemented * set_mac_eee() is permissible. */ if (!phylink_mac_implements_lpi(ds->phylink_mac_ops)) { /* Port's PHY and MAC both need to be EEE capable */ if (!dev->phydev) return -ENODEV; if (!ds->ops->set_mac_eee) return -EOPNOTSUPP; ret = ds->ops->set_mac_eee(ds, dp->index, e); if (ret) return ret; } else if (ds->ops->set_mac_eee) { ret = ds->ops->set_mac_eee(ds, dp->index, e); if (ret) return ret; } return phylink_ethtool_set_eee(dp->pl, e); } static int dsa_user_get_eee(struct net_device *dev, struct ethtool_keee *e) { struct dsa_port *dp = dsa_user_to_port(dev); struct dsa_switch *ds = dp->ds; /* Check whether the switch supports EEE */ if (!ds->ops->support_eee || !ds->ops->support_eee(ds, dp->index)) return -EOPNOTSUPP; /* Port's PHY and MAC both need to be EEE capable */ if (!dev->phydev) return -ENODEV; return phylink_ethtool_get_eee(dp->pl, e); } static int dsa_user_get_link_ksettings(struct net_device *dev, struct ethtool_link_ksettings *cmd) { struct dsa_port *dp = dsa_user_to_port(dev); return phylink_ethtool_ksettings_get(dp->pl, cmd); } static int dsa_user_set_link_ksettings(struct net_device *dev, const struct ethtool_link_ksettings *cmd) { struct dsa_port *dp = dsa_user_to_port(dev); return phylink_ethtool_ksettings_set(dp->pl, cmd); } static void dsa_user_get_pause_stats(struct net_device *dev, struct ethtool_pause_stats *pause_stats) { struct dsa_port *dp = dsa_user_to_port(dev); struct dsa_switch *ds = dp->ds; if (ds->ops->get_pause_stats) ds->ops->get_pause_stats(ds, dp->index, pause_stats); } static void dsa_user_get_pauseparam(struct net_device *dev, struct ethtool_pauseparam *pause) { struct dsa_port *dp = dsa_user_to_port(dev); phylink_ethtool_get_pauseparam(dp->pl, pause); } static int dsa_user_set_pauseparam(struct net_device *dev, struct ethtool_pauseparam *pause) { struct dsa_port *dp = dsa_user_to_port(dev); return phylink_ethtool_set_pauseparam(dp->pl, pause); } #ifdef CONFIG_NET_POLL_CONTROLLER static int dsa_user_netpoll_setup(struct net_device *dev) { struct net_device *conduit = dsa_user_to_conduit(dev); struct dsa_user_priv *p = netdev_priv(dev); struct netpoll *netpoll; int err = 0; netpoll = kzalloc(sizeof(*netpoll), GFP_KERNEL); if (!netpoll) return -ENOMEM; err = __netpoll_setup(netpoll, conduit); if (err) { kfree(netpoll); goto out; } p->netpoll = netpoll; out: return err; } static void dsa_user_netpoll_cleanup(struct net_device *dev) { struct dsa_user_priv *p = netdev_priv(dev); struct netpoll *netpoll = p->netpoll; if (!netpoll) return; p->netpoll = NULL; __netpoll_free(netpoll); } static void dsa_user_poll_controller(struct net_device *dev) { } #endif static struct dsa_mall_tc_entry * dsa_user_mall_tc_entry_find(struct net_device *dev, unsigned long cookie) { struct dsa_user_priv *p = netdev_priv(dev); struct dsa_mall_tc_entry *mall_tc_entry; list_for_each_entry(mall_tc_entry, &p->mall_tc_list, list) if (mall_tc_entry->cookie == cookie) return mall_tc_entry; return NULL; } static int dsa_user_add_cls_matchall_mirred(struct net_device *dev, struct tc_cls_matchall_offload *cls, bool ingress, bool ingress_target) { struct netlink_ext_ack *extack = cls->common.extack; struct dsa_port *dp = dsa_user_to_port(dev); struct dsa_user_priv *p = netdev_priv(dev); struct dsa_mall_mirror_tc_entry *mirror; struct dsa_mall_tc_entry *mall_tc_entry; struct dsa_switch *ds = dp->ds; struct flow_action_entry *act; struct dsa_port *to_dp; int err; if (cls->common.protocol != htons(ETH_P_ALL)) { NL_SET_ERR_MSG_MOD(extack, "Can only offload \"protocol all\" matchall filter"); return -EOPNOTSUPP; } if (!ds->ops->port_mirror_add) { NL_SET_ERR_MSG_MOD(extack, "Switch does not support mirroring operation"); return -EOPNOTSUPP; } if (!flow_action_basic_hw_stats_check(&cls->rule->action, extack)) return -EOPNOTSUPP; act = &cls->rule->action.entries[0]; if (!act->dev) return -EINVAL; if (dsa_user_dev_check(act->dev)) { if (ingress_target) { /* We can only fulfill this using software assist */ if (cls->common.skip_sw) { NL_SET_ERR_MSG_MOD(extack, "Can only mirred to ingress of DSA user port if filter also runs in software"); return -EOPNOTSUPP; } to_dp = dp->cpu_dp; } else { to_dp = dsa_user_to_port(act->dev); } } else { /* Handle mirroring to foreign target ports as a mirror towards * the CPU. The software tc rule will take the packets from * there. */ if (cls->common.skip_sw) { NL_SET_ERR_MSG_MOD(extack, "Can only mirred to CPU if filter also runs in software"); return -EOPNOTSUPP; } to_dp = dp->cpu_dp; } if (dp->ds != to_dp->ds) { NL_SET_ERR_MSG_MOD(extack, "Cross-chip mirroring not implemented"); return -EOPNOTSUPP; } mall_tc_entry = kzalloc(sizeof(*mall_tc_entry), GFP_KERNEL); if (!mall_tc_entry) return -ENOMEM; mall_tc_entry->cookie = cls->cookie; mall_tc_entry->type = DSA_PORT_MALL_MIRROR; mirror = &mall_tc_entry->mirror; mirror->to_local_port = to_dp->index; mirror->ingress = ingress; err = ds->ops->port_mirror_add(ds, dp->index, mirror, ingress, extack); if (err) { kfree(mall_tc_entry); return err; } list_add_tail(&mall_tc_entry->list, &p->mall_tc_list); return err; } static int dsa_user_add_cls_matchall_police(struct net_device *dev, struct tc_cls_matchall_offload *cls, bool ingress) { struct netlink_ext_ack *extack = cls->common.extack; struct dsa_port *dp = dsa_user_to_port(dev); struct dsa_user_priv *p = netdev_priv(dev); struct dsa_mall_policer_tc_entry *policer; struct dsa_mall_tc_entry *mall_tc_entry; struct dsa_switch *ds = dp->ds; struct flow_action_entry *act; int err; if (!ds->ops->port_policer_add) { NL_SET_ERR_MSG_MOD(extack, "Policing offload not implemented"); return -EOPNOTSUPP; } if (!ingress) { NL_SET_ERR_MSG_MOD(extack, "Only supported on ingress qdisc"); return -EOPNOTSUPP; } if (!flow_action_basic_hw_stats_check(&cls->rule->action, extack)) return -EOPNOTSUPP; list_for_each_entry(mall_tc_entry, &p->mall_tc_list, list) { if (mall_tc_entry->type == DSA_PORT_MALL_POLICER) { NL_SET_ERR_MSG_MOD(extack, "Only one port policer allowed"); return -EEXIST; } } act = &cls->rule->action.entries[0]; mall_tc_entry = kzalloc(sizeof(*mall_tc_entry), GFP_KERNEL); if (!mall_tc_entry) return -ENOMEM; mall_tc_entry->cookie = cls->cookie; mall_tc_entry->type = DSA_PORT_MALL_POLICER; policer = &mall_tc_entry->policer; policer->rate_bytes_per_sec = act->police.rate_bytes_ps; policer->burst = act->police.burst; err = ds->ops->port_policer_add(ds, dp->index, policer); if (err) { kfree(mall_tc_entry); return err; } list_add_tail(&mall_tc_entry->list, &p->mall_tc_list); return err; } static int dsa_user_add_cls_matchall(struct net_device *dev, struct tc_cls_matchall_offload *cls, bool ingress) { const struct flow_action *action = &cls->rule->action; struct netlink_ext_ack *extack = cls->common.extack; if (!flow_offload_has_one_action(action)) { NL_SET_ERR_MSG_MOD(extack, "Cannot offload matchall filter with more than one action"); return -EOPNOTSUPP; } switch (action->entries[0].id) { case FLOW_ACTION_MIRRED: return dsa_user_add_cls_matchall_mirred(dev, cls, ingress, false); case FLOW_ACTION_MIRRED_INGRESS: return dsa_user_add_cls_matchall_mirred(dev, cls, ingress, true); case FLOW_ACTION_POLICE: return dsa_user_add_cls_matchall_police(dev, cls, ingress); default: NL_SET_ERR_MSG_MOD(extack, "Unknown action"); break; } return -EOPNOTSUPP; } static void dsa_user_del_cls_matchall(struct net_device *dev, struct tc_cls_matchall_offload *cls) { struct dsa_port *dp = dsa_user_to_port(dev); struct dsa_mall_tc_entry *mall_tc_entry; struct dsa_switch *ds = dp->ds; mall_tc_entry = dsa_user_mall_tc_entry_find(dev, cls->cookie); if (!mall_tc_entry) return; list_del(&mall_tc_entry->list); switch (mall_tc_entry->type) { case DSA_PORT_MALL_MIRROR: if (ds->ops->port_mirror_del) ds->ops->port_mirror_del(ds, dp->index, &mall_tc_entry->mirror); break; case DSA_PORT_MALL_POLICER: if (ds->ops->port_policer_del) ds->ops->port_policer_del(ds, dp->index); break; default: WARN_ON(1); } kfree(mall_tc_entry); } static int dsa_user_setup_tc_cls_matchall(struct net_device *dev, struct tc_cls_matchall_offload *cls, bool ingress) { if (cls->common.chain_index) return -EOPNOTSUPP; switch (cls->command) { case TC_CLSMATCHALL_REPLACE: return dsa_user_add_cls_matchall(dev, cls, ingress); case TC_CLSMATCHALL_DESTROY: dsa_user_del_cls_matchall(dev, cls); return 0; default: return -EOPNOTSUPP; } } static int dsa_user_add_cls_flower(struct net_device *dev, struct flow_cls_offload *cls, bool ingress) { struct dsa_port *dp = dsa_user_to_port(dev); struct dsa_switch *ds = dp->ds; int port = dp->index; if (!ds->ops->cls_flower_add) return -EOPNOTSUPP; return ds->ops->cls_flower_add(ds, port, cls, ingress); } static int dsa_user_del_cls_flower(struct net_device *dev, struct flow_cls_offload *cls, bool ingress) { struct dsa_port *dp = dsa_user_to_port(dev); struct dsa_switch *ds = dp->ds; int port = dp->index; if (!ds->ops->cls_flower_del) return -EOPNOTSUPP; return ds->ops->cls_flower_del(ds, port, cls, ingress); } static int dsa_user_stats_cls_flower(struct net_device *dev, struct flow_cls_offload *cls, bool ingress) { struct dsa_port *dp = dsa_user_to_port(dev); struct dsa_switch *ds = dp->ds; int port = dp->index; if (!ds->ops->cls_flower_stats) return -EOPNOTSUPP; return ds->ops->cls_flower_stats(ds, port, cls, ingress); } static int dsa_user_setup_tc_cls_flower(struct net_device *dev, struct flow_cls_offload *cls, bool ingress) { switch (cls->command) { case FLOW_CLS_REPLACE: return dsa_user_add_cls_flower(dev, cls, ingress); case FLOW_CLS_DESTROY: return dsa_user_del_cls_flower(dev, cls, ingress); case FLOW_CLS_STATS: return dsa_user_stats_cls_flower(dev, cls, ingress); default: return -EOPNOTSUPP; } } static int dsa_user_setup_tc_block_cb(enum tc_setup_type type, void *type_data, void *cb_priv, bool ingress) { struct net_device *dev = cb_priv; if (!tc_can_offload(dev)) return -EOPNOTSUPP; switch (type) { case TC_SETUP_CLSMATCHALL: return dsa_user_setup_tc_cls_matchall(dev, type_data, ingress); case TC_SETUP_CLSFLOWER: return dsa_user_setup_tc_cls_flower(dev, type_data, ingress); default: return -EOPNOTSUPP; } } static int dsa_user_setup_tc_block_cb_ig(enum tc_setup_type type, void *type_data, void *cb_priv) { return dsa_user_setup_tc_block_cb(type, type_data, cb_priv, true); } static int dsa_user_setup_tc_block_cb_eg(enum tc_setup_type type, void *type_data, void *cb_priv) { return dsa_user_setup_tc_block_cb(type, type_data, cb_priv, false); } static LIST_HEAD(dsa_user_block_cb_list); static int dsa_user_setup_tc_block(struct net_device *dev, struct flow_block_offload *f) { struct flow_block_cb *block_cb; flow_setup_cb_t *cb; if (f->binder_type == FLOW_BLOCK_BINDER_TYPE_CLSACT_INGRESS) cb = dsa_user_setup_tc_block_cb_ig; else if (f->binder_type == FLOW_BLOCK_BINDER_TYPE_CLSACT_EGRESS) cb = dsa_user_setup_tc_block_cb_eg; else return -EOPNOTSUPP; f->driver_block_list = &dsa_user_block_cb_list; switch (f->command) { case FLOW_BLOCK_BIND: if (flow_block_cb_is_busy(cb, dev, &dsa_user_block_cb_list)) return -EBUSY; block_cb = flow_block_cb_alloc(cb, dev, dev, NULL); if (IS_ERR(block_cb)) return PTR_ERR(block_cb); flow_block_cb_add(block_cb, f); list_add_tail(&block_cb->driver_list, &dsa_user_block_cb_list); return 0; case FLOW_BLOCK_UNBIND: block_cb = flow_block_cb_lookup(f->block, cb, dev); if (!block_cb) return -ENOENT; flow_block_cb_remove(block_cb, f); list_del(&block_cb->driver_list); return 0; default: return -EOPNOTSUPP; } } static int dsa_user_setup_ft_block(struct dsa_switch *ds, int port, void *type_data) { struct net_device *conduit = dsa_port_to_conduit(dsa_to_port(ds, port)); if (!conduit->netdev_ops->ndo_setup_tc) return -EOPNOTSUPP; return conduit->netdev_ops->ndo_setup_tc(conduit, TC_SETUP_FT, type_data); } static int dsa_user_setup_tc(struct net_device *dev, enum tc_setup_type type, void *type_data) { struct dsa_port *dp = dsa_user_to_port(dev); struct dsa_switch *ds = dp->ds; switch (type) { case TC_SETUP_BLOCK: return dsa_user_setup_tc_block(dev, type_data); case TC_SETUP_FT: return dsa_user_setup_ft_block(ds, dp->index, type_data); default: break; } if (!ds->ops->port_setup_tc) return -EOPNOTSUPP; return ds->ops->port_setup_tc(ds, dp->index, type, type_data); } static int dsa_user_get_rxnfc(struct net_device *dev, struct ethtool_rxnfc *nfc, u32 *rule_locs) { struct dsa_port *dp = dsa_user_to_port(dev); struct dsa_switch *ds = dp->ds; if (!ds->ops->get_rxnfc) return -EOPNOTSUPP; return ds->ops->get_rxnfc(ds, dp->index, nfc, rule_locs); } static int dsa_user_set_rxnfc(struct net_device *dev, struct ethtool_rxnfc *nfc) { struct dsa_port *dp = dsa_user_to_port(dev); struct dsa_switch *ds = dp->ds; if (!ds->ops->set_rxnfc) return -EOPNOTSUPP; return ds->ops->set_rxnfc(ds, dp->index, nfc); } static int dsa_user_get_ts_info(struct net_device *dev, struct kernel_ethtool_ts_info *ts) { struct dsa_user_priv *p = netdev_priv(dev); struct dsa_switch *ds = p->dp->ds; if (!ds->ops->get_ts_info) return -EOPNOTSUPP; return ds->ops->get_ts_info(ds, p->dp->index, ts); } static int dsa_user_vlan_rx_add_vid(struct net_device *dev, __be16 proto, u16 vid) { struct dsa_port *dp = dsa_user_to_port(dev); struct switchdev_obj_port_vlan vlan = { .obj.id = SWITCHDEV_OBJ_ID_PORT_VLAN, .vid = vid, /* This API only allows programming tagged, non-PVID VIDs */ .flags = 0, }; struct netlink_ext_ack extack = {0}; struct dsa_switch *ds = dp->ds; struct netdev_hw_addr *ha; struct dsa_vlan *v; int ret; /* User port... */ ret = dsa_port_vlan_add(dp, &vlan, &extack); if (ret) { if (extack._msg) netdev_err(dev, "%s\n", extack._msg); return ret; } /* And CPU port... */ ret = dsa_port_host_vlan_add(dp, &vlan, &extack); if (ret) { if (extack._msg) netdev_err(dev, "CPU port %d: %s\n", dp->cpu_dp->index, extack._msg); return ret; } if (!dsa_switch_supports_uc_filtering(ds) && !dsa_switch_supports_mc_filtering(ds)) return 0; v = kzalloc(sizeof(*v), GFP_KERNEL); if (!v) { ret = -ENOMEM; goto rollback; } netif_addr_lock_bh(dev); v->vid = vid; list_add_tail(&v->list, &dp->user_vlans); if (dsa_switch_supports_mc_filtering(ds)) { netdev_for_each_synced_mc_addr(ha, dev) { dsa_user_schedule_standalone_work(dev, DSA_MC_ADD, ha->addr, vid); } } if (dsa_switch_supports_uc_filtering(ds)) { netdev_for_each_synced_uc_addr(ha, dev) { dsa_user_schedule_standalone_work(dev, DSA_UC_ADD, ha->addr, vid); } } netif_addr_unlock_bh(dev); dsa_flush_workqueue(); return 0; rollback: dsa_port_host_vlan_del(dp, &vlan); dsa_port_vlan_del(dp, &vlan); return ret; } static int dsa_user_vlan_rx_kill_vid(struct net_device *dev, __be16 proto, u16 vid) { struct dsa_port *dp = dsa_user_to_port(dev); struct switchdev_obj_port_vlan vlan = { .vid = vid, /* This API only allows programming tagged, non-PVID VIDs */ .flags = 0, }; struct dsa_switch *ds = dp->ds; struct netdev_hw_addr *ha; struct dsa_vlan *v; int err; err = dsa_port_vlan_del(dp, &vlan); if (err) return err; err = dsa_port_host_vlan_del(dp, &vlan); if (err) return err; if (!dsa_switch_supports_uc_filtering(ds) && !dsa_switch_supports_mc_filtering(ds)) return 0; netif_addr_lock_bh(dev); v = dsa_vlan_find(&dp->user_vlans, &vlan); if (!v) { netif_addr_unlock_bh(dev); return -ENOENT; } list_del(&v->list); kfree(v); if (dsa_switch_supports_mc_filtering(ds)) { netdev_for_each_synced_mc_addr(ha, dev) { dsa_user_schedule_standalone_work(dev, DSA_MC_DEL, ha->addr, vid); } } if (dsa_switch_supports_uc_filtering(ds)) { netdev_for_each_synced_uc_addr(ha, dev) { dsa_user_schedule_standalone_work(dev, DSA_UC_DEL, ha->addr, vid); } } netif_addr_unlock_bh(dev); dsa_flush_workqueue(); return 0; } static int dsa_user_restore_vlan(struct net_device *vdev, int vid, void *arg) { __be16 proto = vdev ? vlan_dev_vlan_proto(vdev) : htons(ETH_P_8021Q); return dsa_user_vlan_rx_add_vid(arg, proto, vid); } static int dsa_user_clear_vlan(struct net_device *vdev, int vid, void *arg) { __be16 proto = vdev ? vlan_dev_vlan_proto(vdev) : htons(ETH_P_8021Q); return dsa_user_vlan_rx_kill_vid(arg, proto, vid); } /* Keep the VLAN RX filtering list in sync with the hardware only if VLAN * filtering is enabled. The baseline is that only ports that offload a * VLAN-aware bridge are VLAN-aware, and standalone ports are VLAN-unaware, * but there are exceptions for quirky hardware. * * If ds->vlan_filtering_is_global = true, then standalone ports which share * the same switch with other ports that offload a VLAN-aware bridge are also * inevitably VLAN-aware. * * To summarize, a DSA switch port offloads: * * - If standalone (this includes software bridge, software LAG): * - if ds->needs_standalone_vlan_filtering = true, OR if * (ds->vlan_filtering_is_global = true AND there are bridges spanning * this switch chip which have vlan_filtering=1) * - the 8021q upper VLANs * - else (standalone VLAN filtering is not needed, VLAN filtering is not * global, or it is, but no port is under a VLAN-aware bridge): * - no VLAN (any 8021q upper is a software VLAN) * * - If under a vlan_filtering=0 bridge which it offload: * - if ds->configure_vlan_while_not_filtering = true (default): * - the bridge VLANs. These VLANs are committed to hardware but inactive. * - else (deprecated): * - no VLAN. The bridge VLANs are not restored when VLAN awareness is * enabled, so this behavior is broken and discouraged. * * - If under a vlan_filtering=1 bridge which it offload: * - the bridge VLANs * - the 8021q upper VLANs */ int dsa_user_manage_vlan_filtering(struct net_device *user, bool vlan_filtering) { int err; if (vlan_filtering) { user->features |= NETIF_F_HW_VLAN_CTAG_FILTER; err = vlan_for_each(user, dsa_user_restore_vlan, user); if (err) { vlan_for_each(user, dsa_user_clear_vlan, user); user->features &= ~NETIF_F_HW_VLAN_CTAG_FILTER; return err; } } else { err = vlan_for_each(user, dsa_user_clear_vlan, user); if (err) return err; user->features &= ~NETIF_F_HW_VLAN_CTAG_FILTER; } return 0; } struct dsa_hw_port { struct list_head list; struct net_device *dev; int old_mtu; }; static int dsa_hw_port_list_set_mtu(struct list_head *hw_port_list, int mtu) { const struct dsa_hw_port *p; int err; list_for_each_entry(p, hw_port_list, list) { if (p->dev->mtu == mtu) continue; err = dev_set_mtu(p->dev, mtu); if (err) goto rollback; } return 0; rollback: list_for_each_entry_continue_reverse(p, hw_port_list, list) { if (p->dev->mtu == p->old_mtu) continue; if (dev_set_mtu(p->dev, p->old_mtu)) netdev_err(p->dev, "Failed to restore MTU\n"); } return err; } static void dsa_hw_port_list_free(struct list_head *hw_port_list) { struct dsa_hw_port *p, *n; list_for_each_entry_safe(p, n, hw_port_list, list) kfree(p); } /* Make the hardware datapath to/from @dev limited to a common MTU */ static void dsa_bridge_mtu_normalization(struct dsa_port *dp) { struct list_head hw_port_list; struct dsa_switch_tree *dst; int min_mtu = ETH_MAX_MTU; struct dsa_port *other_dp; int err; if (!dp->ds->mtu_enforcement_ingress) return; if (!dp->bridge) return; INIT_LIST_HEAD(&hw_port_list); /* Populate the list of ports that are part of the same bridge * as the newly added/modified port */ list_for_each_entry(dst, &dsa_tree_list, list) { list_for_each_entry(other_dp, &dst->ports, list) { struct dsa_hw_port *hw_port; struct net_device *user; if (other_dp->type != DSA_PORT_TYPE_USER) continue; if (!dsa_port_bridge_same(dp, other_dp)) continue; if (!other_dp->ds->mtu_enforcement_ingress) continue; user = other_dp->user; if (min_mtu > user->mtu) min_mtu = user->mtu; hw_port = kzalloc(sizeof(*hw_port), GFP_KERNEL); if (!hw_port) goto out; hw_port->dev = user; hw_port->old_mtu = user->mtu; list_add(&hw_port->list, &hw_port_list); } } /* Attempt to configure the entire hardware bridge to the newly added * interface's MTU first, regardless of whether the intention of the * user was to raise or lower it. */ err = dsa_hw_port_list_set_mtu(&hw_port_list, dp->user->mtu); if (!err) goto out; /* Clearly that didn't work out so well, so just set the minimum MTU on * all hardware bridge ports now. If this fails too, then all ports will * still have their old MTU rolled back anyway. */ dsa_hw_port_list_set_mtu(&hw_port_list, min_mtu); out: dsa_hw_port_list_free(&hw_port_list); } int dsa_user_change_mtu(struct net_device *dev, int new_mtu) { struct net_device *conduit = dsa_user_to_conduit(dev); struct dsa_port *dp = dsa_user_to_port(dev); struct dsa_port *cpu_dp = dp->cpu_dp; struct dsa_switch *ds = dp->ds; struct dsa_port *other_dp; int largest_mtu = 0; int new_conduit_mtu; int old_conduit_mtu; int mtu_limit; int overhead; int cpu_mtu; int err; if (!ds->ops->port_change_mtu) return -EOPNOTSUPP; dsa_tree_for_each_user_port(other_dp, ds->dst) { int user_mtu; /* During probe, this function will be called for each user * device, while not all of them have been allocated. That's * ok, it doesn't change what the maximum is, so ignore it. */ if (!other_dp->user) continue; /* Pretend that we already applied the setting, which we * actually haven't (still haven't done all integrity checks) */ if (dp == other_dp) user_mtu = new_mtu; else user_mtu = other_dp->user->mtu; if (largest_mtu < user_mtu) largest_mtu = user_mtu; } overhead = dsa_tag_protocol_overhead(cpu_dp->tag_ops); mtu_limit = min_t(int, conduit->max_mtu, dev->max_mtu + overhead); old_conduit_mtu = conduit->mtu; new_conduit_mtu = largest_mtu + overhead; if (new_conduit_mtu > mtu_limit) return -ERANGE; /* If the conduit MTU isn't over limit, there's no need to check the CPU * MTU, since that surely isn't either. */ cpu_mtu = largest_mtu; /* Start applying stuff */ if (new_conduit_mtu != old_conduit_mtu) { err = dev_set_mtu(conduit, new_conduit_mtu); if (err < 0) goto out_conduit_failed; /* We only need to propagate the MTU of the CPU port to * upstream switches, so emit a notifier which updates them. */ err = dsa_port_mtu_change(cpu_dp, cpu_mtu); if (err) goto out_cpu_failed; } err = ds->ops->port_change_mtu(ds, dp->index, new_mtu); if (err) goto out_port_failed; WRITE_ONCE(dev->mtu, new_mtu); dsa_bridge_mtu_normalization(dp); return 0; out_port_failed: if (new_conduit_mtu != old_conduit_mtu) dsa_port_mtu_change(cpu_dp, old_conduit_mtu - overhead); out_cpu_failed: if (new_conduit_mtu != old_conduit_mtu) dev_set_mtu(conduit, old_conduit_mtu); out_conduit_failed: return err; } static int __maybe_unused dsa_user_dcbnl_set_apptrust(struct net_device *dev, u8 *sel, int nsel) { struct dsa_port *dp = dsa_user_to_port(dev); struct dsa_switch *ds = dp->ds; int port = dp->index; if (!ds->ops->port_set_apptrust) return -EOPNOTSUPP; return ds->ops->port_set_apptrust(ds, port, sel, nsel); } static int __maybe_unused dsa_user_dcbnl_get_apptrust(struct net_device *dev, u8 *sel, int *nsel) { struct dsa_port *dp = dsa_user_to_port(dev); struct dsa_switch *ds = dp->ds; int port = dp->index; if (!ds->ops->port_get_apptrust) return -EOPNOTSUPP; return ds->ops->port_get_apptrust(ds, port, sel, nsel); } static int __maybe_unused dsa_user_dcbnl_set_default_prio(struct net_device *dev, struct dcb_app *app) { struct dsa_port *dp = dsa_user_to_port(dev); struct dsa_switch *ds = dp->ds; unsigned long mask, new_prio; int err, port = dp->index; if (!ds->ops->port_set_default_prio) return -EOPNOTSUPP; err = dcb_ieee_setapp(dev, app); if (err) return err; mask = dcb_ieee_getapp_mask(dev, app); new_prio = __fls(mask); err = ds->ops->port_set_default_prio(ds, port, new_prio); if (err) { dcb_ieee_delapp(dev, app); return err; } return 0; } /* Update the DSCP prio entries on all user ports of the switch in case * the switch supports global DSCP prio instead of per port DSCP prios. */ static int dsa_user_dcbnl_ieee_global_dscp_setdel(struct net_device *dev, struct dcb_app *app, bool del) { int (*setdel)(struct net_device *dev, struct dcb_app *app); struct dsa_port *dp = dsa_user_to_port(dev); struct dsa_switch *ds = dp->ds; struct dsa_port *other_dp; int err, restore_err; if (del) setdel = dcb_ieee_delapp; else setdel = dcb_ieee_setapp; dsa_switch_for_each_user_port(other_dp, ds) { struct net_device *user = other_dp->user; if (!user || user == dev) continue; err = setdel(user, app); if (err) goto err_try_to_restore; } return 0; err_try_to_restore: /* Revert logic to restore previous state of app entries */ if (!del) setdel = dcb_ieee_delapp; else setdel = dcb_ieee_setapp; dsa_switch_for_each_user_port_continue_reverse(other_dp, ds) { struct net_device *user = other_dp->user; if (!user || user == dev) continue; restore_err = setdel(user, app); if (restore_err) netdev_err(user, "Failed to restore DSCP prio entry configuration\n"); } return err; } static int __maybe_unused dsa_user_dcbnl_add_dscp_prio(struct net_device *dev, struct dcb_app *app) { struct dsa_port *dp = dsa_user_to_port(dev); struct dsa_switch *ds = dp->ds; unsigned long mask, new_prio; int err, port = dp->index; u8 dscp = app->protocol; if (!ds->ops->port_add_dscp_prio) return -EOPNOTSUPP; if (dscp >= 64) { netdev_err(dev, "DSCP APP entry with protocol value %u is invalid\n", dscp); return -EINVAL; } err = dcb_ieee_setapp(dev, app); if (err) return err; mask = dcb_ieee_getapp_mask(dev, app); new_prio = __fls(mask); err = ds->ops->port_add_dscp_prio(ds, port, dscp, new_prio); if (err) { dcb_ieee_delapp(dev, app); return err; } if (!ds->dscp_prio_mapping_is_global) return 0; err = dsa_user_dcbnl_ieee_global_dscp_setdel(dev, app, false); if (err) { if (ds->ops->port_del_dscp_prio) ds->ops->port_del_dscp_prio(ds, port, dscp, new_prio); dcb_ieee_delapp(dev, app); return err; } return 0; } static int __maybe_unused dsa_user_dcbnl_ieee_setapp(struct net_device *dev, struct dcb_app *app) { switch (app->selector) { case IEEE_8021QAZ_APP_SEL_ETHERTYPE: switch (app->protocol) { case 0: return dsa_user_dcbnl_set_default_prio(dev, app); default: return -EOPNOTSUPP; } break; case IEEE_8021QAZ_APP_SEL_DSCP: return dsa_user_dcbnl_add_dscp_prio(dev, app); default: return -EOPNOTSUPP; } } static int __maybe_unused dsa_user_dcbnl_del_default_prio(struct net_device *dev, struct dcb_app *app) { struct dsa_port *dp = dsa_user_to_port(dev); struct dsa_switch *ds = dp->ds; unsigned long mask, new_prio; int err, port = dp->index; if (!ds->ops->port_set_default_prio) return -EOPNOTSUPP; err = dcb_ieee_delapp(dev, app); if (err) return err; mask = dcb_ieee_getapp_mask(dev, app); new_prio = mask ? __fls(mask) : 0; err = ds->ops->port_set_default_prio(ds, port, new_prio); if (err) { dcb_ieee_setapp(dev, app); return err; } return 0; } static int __maybe_unused dsa_user_dcbnl_del_dscp_prio(struct net_device *dev, struct dcb_app *app) { struct dsa_port *dp = dsa_user_to_port(dev); struct dsa_switch *ds = dp->ds; int err, port = dp->index; u8 dscp = app->protocol; if (!ds->ops->port_del_dscp_prio) return -EOPNOTSUPP; err = dcb_ieee_delapp(dev, app); if (err) return err; err = ds->ops->port_del_dscp_prio(ds, port, dscp, app->priority); if (err) { dcb_ieee_setapp(dev, app); return err; } if (!ds->dscp_prio_mapping_is_global) return 0; err = dsa_user_dcbnl_ieee_global_dscp_setdel(dev, app, true); if (err) { if (ds->ops->port_add_dscp_prio) ds->ops->port_add_dscp_prio(ds, port, dscp, app->priority); dcb_ieee_setapp(dev, app); return err; } return 0; } static int __maybe_unused dsa_user_dcbnl_ieee_delapp(struct net_device *dev, struct dcb_app *app) { switch (app->selector) { case IEEE_8021QAZ_APP_SEL_ETHERTYPE: switch (app->protocol) { case 0: return dsa_user_dcbnl_del_default_prio(dev, app); default: return -EOPNOTSUPP; } break; case IEEE_8021QAZ_APP_SEL_DSCP: return dsa_user_dcbnl_del_dscp_prio(dev, app); default: return -EOPNOTSUPP; } } /* Pre-populate the DCB application priority table with the priorities * configured during switch setup, which we read from hardware here. */ static int dsa_user_dcbnl_init(struct net_device *dev) { struct dsa_port *dp = dsa_user_to_port(dev); struct dsa_switch *ds = dp->ds; int port = dp->index; int err; if (ds->ops->port_get_default_prio) { int prio = ds->ops->port_get_default_prio(ds, port); struct dcb_app app = { .selector = IEEE_8021QAZ_APP_SEL_ETHERTYPE, .protocol = 0, .priority = prio, }; if (prio < 0) return prio; err = dcb_ieee_setapp(dev, &app); if (err) return err; } if (ds->ops->port_get_dscp_prio) { int protocol; for (protocol = 0; protocol < 64; protocol++) { struct dcb_app app = { .selector = IEEE_8021QAZ_APP_SEL_DSCP, .protocol = protocol, }; int prio; prio = ds->ops->port_get_dscp_prio(ds, port, protocol); if (prio == -EOPNOTSUPP) continue; if (prio < 0) return prio; app.priority = prio; err = dcb_ieee_setapp(dev, &app); if (err) return err; } } return 0; } static const struct ethtool_ops dsa_user_ethtool_ops = { .get_drvinfo = dsa_user_get_drvinfo, .get_regs_len = dsa_user_get_regs_len, .get_regs = dsa_user_get_regs, .nway_reset = dsa_user_nway_reset, .get_link = ethtool_op_get_link, .get_eeprom_len = dsa_user_get_eeprom_len, .get_eeprom = dsa_user_get_eeprom, .set_eeprom = dsa_user_set_eeprom, .get_strings = dsa_user_get_strings, .get_ethtool_stats = dsa_user_get_ethtool_stats, .get_sset_count = dsa_user_get_sset_count, .get_eth_phy_stats = dsa_user_get_eth_phy_stats, .get_eth_mac_stats = dsa_user_get_eth_mac_stats, .get_eth_ctrl_stats = dsa_user_get_eth_ctrl_stats, .get_rmon_stats = dsa_user_get_rmon_stats, .get_ts_stats = dsa_user_get_ts_stats, .set_wol = dsa_user_set_wol, .get_wol = dsa_user_get_wol, .set_eee = dsa_user_set_eee, .get_eee = dsa_user_get_eee, .get_link_ksettings = dsa_user_get_link_ksettings, .set_link_ksettings = dsa_user_set_link_ksettings, .get_pause_stats = dsa_user_get_pause_stats, .get_pauseparam = dsa_user_get_pauseparam, .set_pauseparam = dsa_user_set_pauseparam, .get_rxnfc = dsa_user_get_rxnfc, .set_rxnfc = dsa_user_set_rxnfc, .get_ts_info = dsa_user_get_ts_info, .self_test = dsa_user_net_selftest, .get_mm = dsa_user_get_mm, .set_mm = dsa_user_set_mm, .get_mm_stats = dsa_user_get_mm_stats, }; static const struct dcbnl_rtnl_ops __maybe_unused dsa_user_dcbnl_ops = { .ieee_setapp = dsa_user_dcbnl_ieee_setapp, .ieee_delapp = dsa_user_dcbnl_ieee_delapp, .dcbnl_setapptrust = dsa_user_dcbnl_set_apptrust, .dcbnl_getapptrust = dsa_user_dcbnl_get_apptrust, }; static void dsa_user_get_stats64(struct net_device *dev, struct rtnl_link_stats64 *s) { struct dsa_port *dp = dsa_user_to_port(dev); struct dsa_switch *ds = dp->ds; if (ds->ops->get_stats64) ds->ops->get_stats64(ds, dp->index, s); else dev_get_tstats64(dev, s); } static int dsa_user_fill_forward_path(struct net_device_path_ctx *ctx, struct net_device_path *path) { struct dsa_port *dp = dsa_user_to_port(ctx->dev); struct net_device *conduit = dsa_port_to_conduit(dp); struct dsa_port *cpu_dp = dp->cpu_dp; path->dev = ctx->dev; path->type = DEV_PATH_DSA; path->dsa.proto = cpu_dp->tag_ops->proto; path->dsa.port = dp->index; ctx->dev = conduit; return 0; } static int dsa_user_hwtstamp_get(struct net_device *dev, struct kernel_hwtstamp_config *cfg) { struct dsa_port *dp = dsa_user_to_port(dev); struct dsa_switch *ds = dp->ds; if (!ds->ops->port_hwtstamp_get) return -EOPNOTSUPP; return ds->ops->port_hwtstamp_get(ds, dp->index, cfg); } static int dsa_user_hwtstamp_set(struct net_device *dev, struct kernel_hwtstamp_config *cfg, struct netlink_ext_ack *extack) { struct dsa_port *dp = dsa_user_to_port(dev); struct dsa_switch *ds = dp->ds; if (!ds->ops->port_hwtstamp_set) return -EOPNOTSUPP; return ds->ops->port_hwtstamp_set(ds, dp->index, cfg, extack); } static const struct net_device_ops dsa_user_netdev_ops = { .ndo_open = dsa_user_open, .ndo_stop = dsa_user_close, .ndo_start_xmit = dsa_user_xmit, .ndo_change_rx_flags = dsa_user_change_rx_flags, .ndo_set_rx_mode = dsa_user_set_rx_mode, .ndo_set_mac_address = dsa_user_set_mac_address, .ndo_fdb_dump = dsa_user_fdb_dump, .ndo_eth_ioctl = dsa_user_ioctl, .ndo_get_iflink = dsa_user_get_iflink, #ifdef CONFIG_NET_POLL_CONTROLLER .ndo_netpoll_setup = dsa_user_netpoll_setup, .ndo_netpoll_cleanup = dsa_user_netpoll_cleanup, .ndo_poll_controller = dsa_user_poll_controller, #endif .ndo_setup_tc = dsa_user_setup_tc, .ndo_get_stats64 = dsa_user_get_stats64, .ndo_vlan_rx_add_vid = dsa_user_vlan_rx_add_vid, .ndo_vlan_rx_kill_vid = dsa_user_vlan_rx_kill_vid, .ndo_change_mtu = dsa_user_change_mtu, .ndo_fill_forward_path = dsa_user_fill_forward_path, .ndo_hwtstamp_get = dsa_user_hwtstamp_get, .ndo_hwtstamp_set = dsa_user_hwtstamp_set, }; static const struct device_type dsa_type = { .name = "dsa", }; void dsa_port_phylink_mac_change(struct dsa_switch *ds, int port, bool up) { const struct dsa_port *dp = dsa_to_port(ds, port); if (dp->pl) phylink_mac_change(dp->pl, up); } EXPORT_SYMBOL_GPL(dsa_port_phylink_mac_change); static void dsa_user_phylink_fixed_state(struct phylink_config *config, struct phylink_link_state *state) { struct dsa_port *dp = dsa_phylink_to_port(config); struct dsa_switch *ds = dp->ds; /* No need to check that this operation is valid, the callback would * not be called if it was not. */ ds->ops->phylink_fixed_state(ds, dp->index, state); } /* user device setup *******************************************************/ static int dsa_user_phy_connect(struct net_device *user_dev, int addr, u32 flags) { struct dsa_port *dp = dsa_user_to_port(user_dev); struct dsa_switch *ds = dp->ds; user_dev->phydev = mdiobus_get_phy(ds->user_mii_bus, addr); if (!user_dev->phydev) { netdev_err(user_dev, "no phy at %d\n", addr); return -ENODEV; } user_dev->phydev->dev_flags |= flags; return phylink_connect_phy(dp->pl, user_dev->phydev); } static int dsa_user_phy_setup(struct net_device *user_dev) { struct dsa_port *dp = dsa_user_to_port(user_dev); struct device_node *port_dn = dp->dn; struct dsa_switch *ds = dp->ds; u32 phy_flags = 0; int ret; dp->pl_config.dev = &user_dev->dev; dp->pl_config.type = PHYLINK_NETDEV; /* The get_fixed_state callback takes precedence over polling the * link GPIO in PHYLINK (see phylink_get_fixed_state). Only set * this if the switch provides such a callback. */ if (ds->ops->phylink_fixed_state) { dp->pl_config.get_fixed_state = dsa_user_phylink_fixed_state; dp->pl_config.poll_fixed_state = true; } ret = dsa_port_phylink_create(dp); if (ret) return ret; if (ds->ops->get_phy_flags) phy_flags = ds->ops->get_phy_flags(ds, dp->index); ret = phylink_of_phy_connect(dp->pl, port_dn, phy_flags); if (ret == -ENODEV && ds->user_mii_bus) { /* We could not connect to a designated PHY or SFP, so try to * use the switch internal MDIO bus instead */ ret = dsa_user_phy_connect(user_dev, dp->index, phy_flags); } if (ret) { netdev_err(user_dev, "failed to connect to PHY: %pe\n", ERR_PTR(ret)); dsa_port_phylink_destroy(dp); } return ret; } void dsa_user_setup_tagger(struct net_device *user) { struct dsa_port *dp = dsa_user_to_port(user); struct net_device *conduit = dsa_port_to_conduit(dp); struct dsa_user_priv *p = netdev_priv(user); const struct dsa_port *cpu_dp = dp->cpu_dp; const struct dsa_switch *ds = dp->ds; user->needed_headroom = cpu_dp->tag_ops->needed_headroom; user->needed_tailroom = cpu_dp->tag_ops->needed_tailroom; /* Try to save one extra realloc later in the TX path (in the conduit) * by also inheriting the conduit's needed headroom and tailroom. * The 8021q driver also does this. */ user->needed_headroom += conduit->needed_headroom; user->needed_tailroom += conduit->needed_tailroom; p->xmit = cpu_dp->tag_ops->xmit; user->features = conduit->vlan_features | NETIF_F_HW_TC; user->hw_features |= NETIF_F_HW_TC; if (user->needed_tailroom) user->features &= ~(NETIF_F_SG | NETIF_F_FRAGLIST); if (ds->needs_standalone_vlan_filtering) user->features |= NETIF_F_HW_VLAN_CTAG_FILTER; user->lltx = true; } int dsa_user_suspend(struct net_device *user_dev) { struct dsa_port *dp = dsa_user_to_port(user_dev); if (!netif_running(user_dev)) return 0; netif_device_detach(user_dev); rtnl_lock(); phylink_stop(dp->pl); rtnl_unlock(); return 0; } int dsa_user_resume(struct net_device *user_dev) { struct dsa_port *dp = dsa_user_to_port(user_dev); if (!netif_running(user_dev)) return 0; netif_device_attach(user_dev); rtnl_lock(); phylink_start(dp->pl); rtnl_unlock(); return 0; } int dsa_user_create(struct dsa_port *port) { struct net_device *conduit = dsa_port_to_conduit(port); struct dsa_switch *ds = port->ds; struct net_device *user_dev; struct dsa_user_priv *p; const char *name; int assign_type; int ret; if (!ds->num_tx_queues) ds->num_tx_queues = 1; if (port->name) { name = port->name; assign_type = NET_NAME_PREDICTABLE; } else { name = "eth%d"; assign_type = NET_NAME_ENUM; } user_dev = alloc_netdev_mqs(sizeof(struct dsa_user_priv), name, assign_type, ether_setup, ds->num_tx_queues, 1); if (user_dev == NULL) return -ENOMEM; user_dev->rtnl_link_ops = &dsa_link_ops; user_dev->ethtool_ops = &dsa_user_ethtool_ops; #if IS_ENABLED(CONFIG_DCB) user_dev->dcbnl_ops = &dsa_user_dcbnl_ops; #endif if (!is_zero_ether_addr(port->mac)) eth_hw_addr_set(user_dev, port->mac); else eth_hw_addr_inherit(user_dev, conduit); user_dev->priv_flags |= IFF_NO_QUEUE; if (dsa_switch_supports_uc_filtering(ds)) user_dev->priv_flags |= IFF_UNICAST_FLT; user_dev->netdev_ops = &dsa_user_netdev_ops; if (ds->ops->port_max_mtu) user_dev->max_mtu = ds->ops->port_max_mtu(ds, port->index); SET_NETDEV_DEVTYPE(user_dev, &dsa_type); SET_NETDEV_DEV(user_dev, port->ds->dev); SET_NETDEV_DEVLINK_PORT(user_dev, &port->devlink_port); user_dev->dev.of_node = port->dn; user_dev->vlan_features = conduit->vlan_features; p = netdev_priv(user_dev); user_dev->pcpu_stat_type = NETDEV_PCPU_STAT_TSTATS; ret = gro_cells_init(&p->gcells, user_dev); if (ret) goto out_free; p->dp = port; INIT_LIST_HEAD(&p->mall_tc_list); port->user = user_dev; dsa_user_setup_tagger(user_dev); netif_carrier_off(user_dev); ret = dsa_user_phy_setup(user_dev); if (ret) { netdev_err(user_dev, "error %d setting up PHY for tree %d, switch %d, port %d\n", ret, ds->dst->index, ds->index, port->index); goto out_gcells; } rtnl_lock(); ret = dsa_user_change_mtu(user_dev, ETH_DATA_LEN); if (ret && ret != -EOPNOTSUPP) dev_warn(ds->dev, "nonfatal error %d setting MTU to %d on port %d\n", ret, ETH_DATA_LEN, port->index); ret = register_netdevice(user_dev); if (ret) { netdev_err(conduit, "error %d registering interface %s\n", ret, user_dev->name); rtnl_unlock(); goto out_phy; } if (IS_ENABLED(CONFIG_DCB)) { ret = dsa_user_dcbnl_init(user_dev); if (ret) { netdev_err(user_dev, "failed to initialize DCB: %pe\n", ERR_PTR(ret)); rtnl_unlock(); goto out_unregister; } } ret = netdev_upper_dev_link(conduit, user_dev, NULL); rtnl_unlock(); if (ret) goto out_unregister; return 0; out_unregister: unregister_netdev(user_dev); out_phy: rtnl_lock(); phylink_disconnect_phy(p->dp->pl); rtnl_unlock(); dsa_port_phylink_destroy(p->dp); out_gcells: gro_cells_destroy(&p->gcells); out_free: free_netdev(user_dev); port->user = NULL; return ret; } void dsa_user_destroy(struct net_device *user_dev) { struct net_device *conduit = dsa_user_to_conduit(user_dev); struct dsa_port *dp = dsa_user_to_port(user_dev); struct dsa_user_priv *p = netdev_priv(user_dev); netif_carrier_off(user_dev); rtnl_lock(); netdev_upper_dev_unlink(conduit, user_dev); unregister_netdevice(user_dev); phylink_disconnect_phy(dp->pl); rtnl_unlock(); dsa_port_phylink_destroy(dp); gro_cells_destroy(&p->gcells); free_netdev(user_dev); } int dsa_user_change_conduit(struct net_device *dev, struct net_device *conduit, struct netlink_ext_ack *extack) { struct net_device *old_conduit = dsa_user_to_conduit(dev); struct dsa_port *dp = dsa_user_to_port(dev); struct dsa_switch *ds = dp->ds; struct net_device *upper; struct list_head *iter; int err; if (conduit == old_conduit) return 0; if (!ds->ops->port_change_conduit) { NL_SET_ERR_MSG_MOD(extack, "Driver does not support changing DSA conduit"); return -EOPNOTSUPP; } if (!netdev_uses_dsa(conduit)) { NL_SET_ERR_MSG_MOD(extack, "Interface not eligible as DSA conduit"); return -EOPNOTSUPP; } netdev_for_each_upper_dev_rcu(conduit, upper, iter) { if (dsa_user_dev_check(upper)) continue; if (netif_is_bridge_master(upper)) continue; NL_SET_ERR_MSG_MOD(extack, "Cannot join conduit with unknown uppers"); return -EOPNOTSUPP; } /* Since we allow live-changing the DSA conduit, plus we auto-open the * DSA conduit when the user port opens => we need to ensure that the * new DSA conduit is open too. */ if (dev->flags & IFF_UP) { err = dev_open(conduit, extack); if (err) return err; } netdev_upper_dev_unlink(old_conduit, dev); err = netdev_upper_dev_link(conduit, dev, extack); if (err) goto out_revert_old_conduit_unlink; err = dsa_port_change_conduit(dp, conduit, extack); if (err) goto out_revert_conduit_link; /* Update the MTU of the new CPU port through cross-chip notifiers */ err = dsa_user_change_mtu(dev, dev->mtu); if (err && err != -EOPNOTSUPP) { netdev_warn(dev, "nonfatal error updating MTU with new conduit: %pe\n", ERR_PTR(err)); } return 0; out_revert_conduit_link: netdev_upper_dev_unlink(conduit, dev); out_revert_old_conduit_unlink: netdev_upper_dev_link(old_conduit, dev, NULL); return err; } bool dsa_user_dev_check(const struct net_device *dev) { return dev->netdev_ops == &dsa_user_netdev_ops; } EXPORT_SYMBOL_GPL(dsa_user_dev_check); static int dsa_user_changeupper(struct net_device *dev, struct netdev_notifier_changeupper_info *info) { struct netlink_ext_ack *extack; int err = NOTIFY_DONE; struct dsa_port *dp; if (!dsa_user_dev_check(dev)) return err; dp = dsa_user_to_port(dev); extack = netdev_notifier_info_to_extack(&info->info); if (netif_is_bridge_master(info->upper_dev)) { if (info->linking) { err = dsa_port_bridge_join(dp, info->upper_dev, extack); if (!err) dsa_bridge_mtu_normalization(dp); if (err == -EOPNOTSUPP) { NL_SET_ERR_MSG_WEAK_MOD(extack, "Offloading not supported"); err = 0; } err = notifier_from_errno(err); } else { dsa_port_bridge_leave(dp, info->upper_dev); err = NOTIFY_OK; } } else if (netif_is_lag_master(info->upper_dev)) { if (info->linking) { err = dsa_port_lag_join(dp, info->upper_dev, info->upper_info, extack); if (err == -EOPNOTSUPP) { NL_SET_ERR_MSG_WEAK_MOD(extack, "Offloading not supported"); err = 0; } err = notifier_from_errno(err); } else { dsa_port_lag_leave(dp, info->upper_dev); err = NOTIFY_OK; } } else if (is_hsr_master(info->upper_dev)) { if (info->linking) { err = dsa_port_hsr_join(dp, info->upper_dev, extack); if (err == -EOPNOTSUPP) { NL_SET_ERR_MSG_WEAK_MOD(extack, "Offloading not supported"); err = 0; } err = notifier_from_errno(err); } else { dsa_port_hsr_leave(dp, info->upper_dev); err = NOTIFY_OK; } } return err; } static int dsa_user_prechangeupper(struct net_device *dev, struct netdev_notifier_changeupper_info *info) { struct dsa_port *dp; if (!dsa_user_dev_check(dev)) return NOTIFY_DONE; dp = dsa_user_to_port(dev); if (netif_is_bridge_master(info->upper_dev) && !info->linking) dsa_port_pre_bridge_leave(dp, info->upper_dev); else if (netif_is_lag_master(info->upper_dev) && !info->linking) dsa_port_pre_lag_leave(dp, info->upper_dev); /* dsa_port_pre_hsr_leave is not yet necessary since hsr devices cannot * meaningfully placed under a bridge yet */ return NOTIFY_DONE; } static int dsa_user_lag_changeupper(struct net_device *dev, struct netdev_notifier_changeupper_info *info) { struct net_device *lower; struct list_head *iter; int err = NOTIFY_DONE; struct dsa_port *dp; if (!netif_is_lag_master(dev)) return err; netdev_for_each_lower_dev(dev, lower, iter) { if (!dsa_user_dev_check(lower)) continue; dp = dsa_user_to_port(lower); if (!dp->lag) /* Software LAG */ continue; err = dsa_user_changeupper(lower, info); if (notifier_to_errno(err)) break; } return err; } /* Same as dsa_user_lag_changeupper() except that it calls * dsa_user_prechangeupper() */ static int dsa_user_lag_prechangeupper(struct net_device *dev, struct netdev_notifier_changeupper_info *info) { struct net_device *lower; struct list_head *iter; int err = NOTIFY_DONE; struct dsa_port *dp; if (!netif_is_lag_master(dev)) return err; netdev_for_each_lower_dev(dev, lower, iter) { if (!dsa_user_dev_check(lower)) continue; dp = dsa_user_to_port(lower); if (!dp->lag) /* Software LAG */ continue; err = dsa_user_prechangeupper(lower, info); if (notifier_to_errno(err)) break; } return err; } static int dsa_prevent_bridging_8021q_upper(struct net_device *dev, struct netdev_notifier_changeupper_info *info) { struct netlink_ext_ack *ext_ack; struct net_device *user, *br; struct dsa_port *dp; ext_ack = netdev_notifier_info_to_extack(&info->info); if (!is_vlan_dev(dev)) return NOTIFY_DONE; user = vlan_dev_real_dev(dev); if (!dsa_user_dev_check(user)) return NOTIFY_DONE; dp = dsa_user_to_port(user); br = dsa_port_bridge_dev_get(dp); if (!br) return NOTIFY_DONE; /* Deny enslaving a VLAN device into a VLAN-aware bridge */ if (br_vlan_enabled(br) && netif_is_bridge_master(info->upper_dev) && info->linking) { NL_SET_ERR_MSG_MOD(ext_ack, "Cannot make VLAN device join VLAN-aware bridge"); return notifier_from_errno(-EINVAL); } return NOTIFY_DONE; } static int dsa_user_check_8021q_upper(struct net_device *dev, struct netdev_notifier_changeupper_info *info) { struct dsa_port *dp = dsa_user_to_port(dev); struct net_device *br = dsa_port_bridge_dev_get(dp); struct bridge_vlan_info br_info; struct netlink_ext_ack *extack; int err = NOTIFY_DONE; u16 vid; if (!br || !br_vlan_enabled(br)) return NOTIFY_DONE; extack = netdev_notifier_info_to_extack(&info->info); vid = vlan_dev_vlan_id(info->upper_dev); /* br_vlan_get_info() returns -EINVAL or -ENOENT if the * device, respectively the VID is not found, returning * 0 means success, which is a failure for us here. */ err = br_vlan_get_info(br, vid, &br_info); if (err == 0) { NL_SET_ERR_MSG_MOD(extack, "This VLAN is already configured by the bridge"); return notifier_from_errno(-EBUSY); } return NOTIFY_DONE; } static int dsa_user_prechangeupper_sanity_check(struct net_device *dev, struct netdev_notifier_changeupper_info *info) { struct dsa_switch *ds; struct dsa_port *dp; int err; if (!dsa_user_dev_check(dev)) return dsa_prevent_bridging_8021q_upper(dev, info); dp = dsa_user_to_port(dev); ds = dp->ds; if (ds->ops->port_prechangeupper) { err = ds->ops->port_prechangeupper(ds, dp->index, info); if (err) return notifier_from_errno(err); } if (is_vlan_dev(info->upper_dev)) return dsa_user_check_8021q_upper(dev, info); return NOTIFY_DONE; } /* To be eligible as a DSA conduit, a LAG must have all lower interfaces be * eligible DSA conduits. Additionally, all LAG slaves must be DSA conduits of * switches in the same switch tree. */ static int dsa_lag_conduit_validate(struct net_device *lag_dev, struct netlink_ext_ack *extack) { struct net_device *lower1, *lower2; struct list_head *iter1, *iter2; netdev_for_each_lower_dev(lag_dev, lower1, iter1) { netdev_for_each_lower_dev(lag_dev, lower2, iter2) { if (!netdev_uses_dsa(lower1) || !netdev_uses_dsa(lower2)) { NL_SET_ERR_MSG_MOD(extack, "All LAG ports must be eligible as DSA conduits"); return notifier_from_errno(-EINVAL); } if (lower1 == lower2) continue; if (!dsa_port_tree_same(lower1->dsa_ptr, lower2->dsa_ptr)) { NL_SET_ERR_MSG_MOD(extack, "LAG contains DSA conduits of disjoint switch trees"); return notifier_from_errno(-EINVAL); } } } return NOTIFY_DONE; } static int dsa_conduit_prechangeupper_sanity_check(struct net_device *conduit, struct netdev_notifier_changeupper_info *info) { struct netlink_ext_ack *extack = netdev_notifier_info_to_extack(&info->info); if (!netdev_uses_dsa(conduit)) return NOTIFY_DONE; if (!info->linking) return NOTIFY_DONE; /* Allow DSA switch uppers */ if (dsa_user_dev_check(info->upper_dev)) return NOTIFY_DONE; /* Allow bridge uppers of DSA conduits, subject to further * restrictions in dsa_bridge_prechangelower_sanity_check() */ if (netif_is_bridge_master(info->upper_dev)) return NOTIFY_DONE; /* Allow LAG uppers, subject to further restrictions in * dsa_lag_conduit_prechangelower_sanity_check() */ if (netif_is_lag_master(info->upper_dev)) return dsa_lag_conduit_validate(info->upper_dev, extack); NL_SET_ERR_MSG_MOD(extack, "DSA conduit cannot join unknown upper interfaces"); return notifier_from_errno(-EBUSY); } static int dsa_lag_conduit_prechangelower_sanity_check(struct net_device *dev, struct netdev_notifier_changeupper_info *info) { struct netlink_ext_ack *extack = netdev_notifier_info_to_extack(&info->info); struct net_device *lag_dev = info->upper_dev; struct net_device *lower; struct list_head *iter; if (!netdev_uses_dsa(lag_dev) || !netif_is_lag_master(lag_dev)) return NOTIFY_DONE; if (!info->linking) return NOTIFY_DONE; if (!netdev_uses_dsa(dev)) { NL_SET_ERR_MSG(extack, "Only DSA conduits can join a LAG DSA conduit"); return notifier_from_errno(-EINVAL); } netdev_for_each_lower_dev(lag_dev, lower, iter) { if (!dsa_port_tree_same(dev->dsa_ptr, lower->dsa_ptr)) { NL_SET_ERR_MSG(extack, "Interface is DSA conduit for a different switch tree than this LAG"); return notifier_from_errno(-EINVAL); } break; } return NOTIFY_DONE; } /* Don't allow bridging of DSA conduits, since the bridge layer rx_handler * prevents the DSA fake ethertype handler to be invoked, so we don't get the * chance to strip off and parse the DSA switch tag protocol header (the bridge * layer just returns RX_HANDLER_CONSUMED, stopping RX processing for these * frames). * The only case where that would not be an issue is when bridging can already * be offloaded, such as when the DSA conduit is itself a DSA or plain switchdev * port, and is bridged only with other ports from the same hardware device. */ static int dsa_bridge_prechangelower_sanity_check(struct net_device *new_lower, struct netdev_notifier_changeupper_info *info) { struct net_device *br = info->upper_dev; struct netlink_ext_ack *extack; struct net_device *lower; struct list_head *iter; if (!netif_is_bridge_master(br)) return NOTIFY_DONE; if (!info->linking) return NOTIFY_DONE; extack = netdev_notifier_info_to_extack(&info->info); netdev_for_each_lower_dev(br, lower, iter) { if (!netdev_uses_dsa(new_lower) && !netdev_uses_dsa(lower)) continue; if (!netdev_port_same_parent_id(lower, new_lower)) { NL_SET_ERR_MSG(extack, "Cannot do software bridging with a DSA conduit"); return notifier_from_errno(-EINVAL); } } return NOTIFY_DONE; } static void dsa_tree_migrate_ports_from_lag_conduit(struct dsa_switch_tree *dst, struct net_device *lag_dev) { struct net_device *new_conduit = dsa_tree_find_first_conduit(dst); struct dsa_port *dp; int err; dsa_tree_for_each_user_port(dp, dst) { if (dsa_port_to_conduit(dp) != lag_dev) continue; err = dsa_user_change_conduit(dp->user, new_conduit, NULL); if (err) { netdev_err(dp->user, "failed to restore conduit to %s: %pe\n", new_conduit->name, ERR_PTR(err)); } } } static int dsa_conduit_lag_join(struct net_device *conduit, struct net_device *lag_dev, struct netdev_lag_upper_info *uinfo, struct netlink_ext_ack *extack) { struct dsa_port *cpu_dp = conduit->dsa_ptr; struct dsa_switch_tree *dst = cpu_dp->dst; struct dsa_port *dp; int err; err = dsa_conduit_lag_setup(lag_dev, cpu_dp, uinfo, extack); if (err) return err; dsa_tree_for_each_user_port(dp, dst) { if (dsa_port_to_conduit(dp) != conduit) continue; err = dsa_user_change_conduit(dp->user, lag_dev, extack); if (err) goto restore; } return 0; restore: dsa_tree_for_each_user_port_continue_reverse(dp, dst) { if (dsa_port_to_conduit(dp) != lag_dev) continue; err = dsa_user_change_conduit(dp->user, conduit, NULL); if (err) { netdev_err(dp->user, "failed to restore conduit to %s: %pe\n", conduit->name, ERR_PTR(err)); } } dsa_conduit_lag_teardown(lag_dev, conduit->dsa_ptr); return err; } static void dsa_conduit_lag_leave(struct net_device *conduit, struct net_device *lag_dev) { struct dsa_port *dp, *cpu_dp = lag_dev->dsa_ptr; struct dsa_switch_tree *dst = cpu_dp->dst; struct dsa_port *new_cpu_dp = NULL; struct net_device *lower; struct list_head *iter; netdev_for_each_lower_dev(lag_dev, lower, iter) { if (netdev_uses_dsa(lower)) { new_cpu_dp = lower->dsa_ptr; break; } } if (new_cpu_dp) { /* Update the CPU port of the user ports still under the LAG * so that dsa_port_to_conduit() continues to work properly */ dsa_tree_for_each_user_port(dp, dst) if (dsa_port_to_conduit(dp) == lag_dev) dp->cpu_dp = new_cpu_dp; /* Update the index of the virtual CPU port to match the lowest * physical CPU port */ lag_dev->dsa_ptr = new_cpu_dp; wmb(); } else { /* If the LAG DSA conduit has no ports left, migrate back all * user ports to the first physical CPU port */ dsa_tree_migrate_ports_from_lag_conduit(dst, lag_dev); } /* This DSA conduit has left its LAG in any case, so let * the CPU port leave the hardware LAG as well */ dsa_conduit_lag_teardown(lag_dev, conduit->dsa_ptr); } static int dsa_conduit_changeupper(struct net_device *dev, struct netdev_notifier_changeupper_info *info) { struct netlink_ext_ack *extack; int err = NOTIFY_DONE; if (!netdev_uses_dsa(dev)) return err; extack = netdev_notifier_info_to_extack(&info->info); if (netif_is_lag_master(info->upper_dev)) { if (info->linking) { err = dsa_conduit_lag_join(dev, info->upper_dev, info->upper_info, extack); err = notifier_from_errno(err); } else { dsa_conduit_lag_leave(dev, info->upper_dev); err = NOTIFY_OK; } } return err; } static int dsa_user_netdevice_event(struct notifier_block *nb, unsigned long event, void *ptr) { struct net_device *dev = netdev_notifier_info_to_dev(ptr); switch (event) { case NETDEV_PRECHANGEUPPER: { struct netdev_notifier_changeupper_info *info = ptr; int err; err = dsa_user_prechangeupper_sanity_check(dev, info); if (notifier_to_errno(err)) return err; err = dsa_conduit_prechangeupper_sanity_check(dev, info); if (notifier_to_errno(err)) return err; err = dsa_lag_conduit_prechangelower_sanity_check(dev, info); if (notifier_to_errno(err)) return err; err = dsa_bridge_prechangelower_sanity_check(dev, info); if (notifier_to_errno(err)) return err; err = dsa_user_prechangeupper(dev, ptr); if (notifier_to_errno(err)) return err; err = dsa_user_lag_prechangeupper(dev, ptr); if (notifier_to_errno(err)) return err; break; } case NETDEV_CHANGEUPPER: { int err; err = dsa_user_changeupper(dev, ptr); if (notifier_to_errno(err)) return err; err = dsa_user_lag_changeupper(dev, ptr); if (notifier_to_errno(err)) return err; err = dsa_conduit_changeupper(dev, ptr); if (notifier_to_errno(err)) return err; break; } case NETDEV_CHANGELOWERSTATE: { struct netdev_notifier_changelowerstate_info *info = ptr; struct dsa_port *dp; int err = 0; if (dsa_user_dev_check(dev)) { dp = dsa_user_to_port(dev); err = dsa_port_lag_change(dp, info->lower_state_info); } /* Mirror LAG port events on DSA conduits that are in * a LAG towards their respective switch CPU ports */ if (netdev_uses_dsa(dev)) { dp = dev->dsa_ptr; err = dsa_port_lag_change(dp, info->lower_state_info); } return notifier_from_errno(err); } case NETDEV_CHANGE: case NETDEV_UP: { /* Track state of conduit port. * DSA driver may require the conduit port (and indirectly * the tagger) to be available for some special operation. */ if (netdev_uses_dsa(dev)) { struct dsa_port *cpu_dp = dev->dsa_ptr; struct dsa_switch_tree *dst = cpu_dp->ds->dst; /* Track when the conduit port is UP */ dsa_tree_conduit_oper_state_change(dst, dev, netif_oper_up(dev)); /* Track when the conduit port is ready and can accept * packet. * NETDEV_UP event is not enough to flag a port as ready. * We also have to wait for linkwatch_do_dev to dev_activate * and emit a NETDEV_CHANGE event. * We check if a conduit port is ready by checking if the dev * have a qdisc assigned and is not noop. */ dsa_tree_conduit_admin_state_change(dst, dev, !qdisc_tx_is_noop(dev)); return NOTIFY_OK; } return NOTIFY_DONE; } case NETDEV_GOING_DOWN: { struct dsa_port *dp, *cpu_dp; struct dsa_switch_tree *dst; LIST_HEAD(close_list); if (!netdev_uses_dsa(dev)) return NOTIFY_DONE; cpu_dp = dev->dsa_ptr; dst = cpu_dp->ds->dst; dsa_tree_conduit_admin_state_change(dst, dev, false); list_for_each_entry(dp, &dst->ports, list) { if (!dsa_port_is_user(dp)) continue; if (dp->cpu_dp != cpu_dp) continue; list_add(&dp->user->close_list, &close_list); } netif_close_many(&close_list, true); return NOTIFY_OK; } default: break; } return NOTIFY_DONE; } static void dsa_fdb_offload_notify(struct dsa_switchdev_event_work *switchdev_work) { struct switchdev_notifier_fdb_info info = {}; info.addr = switchdev_work->addr; info.vid = switchdev_work->vid; info.offloaded = true; call_switchdev_notifiers(SWITCHDEV_FDB_OFFLOADED, switchdev_work->orig_dev, &info.info, NULL); } static void dsa_user_switchdev_event_work(struct work_struct *work) { struct dsa_switchdev_event_work *switchdev_work = container_of(work, struct dsa_switchdev_event_work, work); const unsigned char *addr = switchdev_work->addr; struct net_device *dev = switchdev_work->dev; u16 vid = switchdev_work->vid; struct dsa_switch *ds; struct dsa_port *dp; int err; dp = dsa_user_to_port(dev); ds = dp->ds; switch (switchdev_work->event) { case SWITCHDEV_FDB_ADD_TO_DEVICE: if (switchdev_work->host_addr) err = dsa_port_bridge_host_fdb_add(dp, addr, vid); else if (dp->lag) err = dsa_port_lag_fdb_add(dp, addr, vid); else err = dsa_port_fdb_add(dp, addr, vid); if (err) { dev_err(ds->dev, "port %d failed to add %pM vid %d to fdb: %d\n", dp->index, addr, vid, err); break; } dsa_fdb_offload_notify(switchdev_work); break; case SWITCHDEV_FDB_DEL_TO_DEVICE: if (switchdev_work->host_addr) err = dsa_port_bridge_host_fdb_del(dp, addr, vid); else if (dp->lag) err = dsa_port_lag_fdb_del(dp, addr, vid); else err = dsa_port_fdb_del(dp, addr, vid); if (err) { dev_err(ds->dev, "port %d failed to delete %pM vid %d from fdb: %d\n", dp->index, addr, vid, err); } break; } kfree(switchdev_work); } static bool dsa_foreign_dev_check(const struct net_device *dev, const struct net_device *foreign_dev) { const struct dsa_port *dp = dsa_user_to_port(dev); struct dsa_switch_tree *dst = dp->ds->dst; if (netif_is_bridge_master(foreign_dev)) return !dsa_tree_offloads_bridge_dev(dst, foreign_dev); if (netif_is_bridge_port(foreign_dev)) return !dsa_tree_offloads_bridge_port(dst, foreign_dev); /* Everything else is foreign */ return true; } static int dsa_user_fdb_event(struct net_device *dev, struct net_device *orig_dev, unsigned long event, const void *ctx, const struct switchdev_notifier_fdb_info *fdb_info) { struct dsa_switchdev_event_work *switchdev_work; struct dsa_port *dp = dsa_user_to_port(dev); bool host_addr = fdb_info->is_local; struct dsa_switch *ds = dp->ds; if (ctx && ctx != dp) return 0; if (!dp->bridge) return 0; if (switchdev_fdb_is_dynamically_learned(fdb_info)) { if (dsa_port_offloads_bridge_port(dp, orig_dev)) return 0; /* FDB entries learned by the software bridge or by foreign * bridge ports should be installed as host addresses only if * the driver requests assisted learning. */ if (!ds->assisted_learning_on_cpu_port) return 0; } /* Also treat FDB entries on foreign interfaces bridged with us as host * addresses. */ if (dsa_foreign_dev_check(dev, orig_dev)) host_addr = true; /* Check early that we're not doing work in vain. * Host addresses on LAG ports still require regular FDB ops, * since the CPU port isn't in a LAG. */ if (dp->lag && !host_addr) { if (!ds->ops->lag_fdb_add || !ds->ops->lag_fdb_del) return -EOPNOTSUPP; } else { if (!ds->ops->port_fdb_add || !ds->ops->port_fdb_del) return -EOPNOTSUPP; } switchdev_work = kzalloc(sizeof(*switchdev_work), GFP_ATOMIC); if (!switchdev_work) return -ENOMEM; netdev_dbg(dev, "%s FDB entry towards %s, addr %pM vid %d%s\n", event == SWITCHDEV_FDB_ADD_TO_DEVICE ? "Adding" : "Deleting", orig_dev->name, fdb_info->addr, fdb_info->vid, host_addr ? " as host address" : ""); INIT_WORK(&switchdev_work->work, dsa_user_switchdev_event_work); switchdev_work->event = event; switchdev_work->dev = dev; switchdev_work->orig_dev = orig_dev; ether_addr_copy(switchdev_work->addr, fdb_info->addr); switchdev_work->vid = fdb_info->vid; switchdev_work->host_addr = host_addr; dsa_schedule_work(&switchdev_work->work); return 0; } /* Called under rcu_read_lock() */ static int dsa_user_switchdev_event(struct notifier_block *unused, unsigned long event, void *ptr) { struct net_device *dev = switchdev_notifier_info_to_dev(ptr); int err; switch (event) { case SWITCHDEV_PORT_ATTR_SET: err = switchdev_handle_port_attr_set(dev, ptr, dsa_user_dev_check, dsa_user_port_attr_set); return notifier_from_errno(err); case SWITCHDEV_FDB_ADD_TO_DEVICE: case SWITCHDEV_FDB_DEL_TO_DEVICE: err = switchdev_handle_fdb_event_to_device(dev, event, ptr, dsa_user_dev_check, dsa_foreign_dev_check, dsa_user_fdb_event); return notifier_from_errno(err); default: return NOTIFY_DONE; } return NOTIFY_OK; } static int dsa_user_switchdev_blocking_event(struct notifier_block *unused, unsigned long event, void *ptr) { struct net_device *dev = switchdev_notifier_info_to_dev(ptr); int err; switch (event) { case SWITCHDEV_PORT_OBJ_ADD: err = switchdev_handle_port_obj_add_foreign(dev, ptr, dsa_user_dev_check, dsa_foreign_dev_check, dsa_user_port_obj_add); return notifier_from_errno(err); case SWITCHDEV_PORT_OBJ_DEL: err = switchdev_handle_port_obj_del_foreign(dev, ptr, dsa_user_dev_check, dsa_foreign_dev_check, dsa_user_port_obj_del); return notifier_from_errno(err); case SWITCHDEV_PORT_ATTR_SET: err = switchdev_handle_port_attr_set(dev, ptr, dsa_user_dev_check, dsa_user_port_attr_set); return notifier_from_errno(err); } return NOTIFY_DONE; } static struct notifier_block dsa_user_nb __read_mostly = { .notifier_call = dsa_user_netdevice_event, }; struct notifier_block dsa_user_switchdev_notifier = { .notifier_call = dsa_user_switchdev_event, }; struct notifier_block dsa_user_switchdev_blocking_notifier = { .notifier_call = dsa_user_switchdev_blocking_event, }; int dsa_user_register_notifier(void) { struct notifier_block *nb; int err; err = register_netdevice_notifier(&dsa_user_nb); if (err) return err; err = register_switchdev_notifier(&dsa_user_switchdev_notifier); if (err) goto err_switchdev_nb; nb = &dsa_user_switchdev_blocking_notifier; err = register_switchdev_blocking_notifier(nb); if (err) goto err_switchdev_blocking_nb; return 0; err_switchdev_blocking_nb: unregister_switchdev_notifier(&dsa_user_switchdev_notifier); err_switchdev_nb: unregister_netdevice_notifier(&dsa_user_nb); return err; } void dsa_user_unregister_notifier(void) { struct notifier_block *nb; int err; nb = &dsa_user_switchdev_blocking_notifier; err = unregister_switchdev_blocking_notifier(nb); if (err) pr_err("DSA: failed to unregister switchdev blocking notifier (%d)\n", err); err = unregister_switchdev_notifier(&dsa_user_switchdev_notifier); if (err) pr_err("DSA: failed to unregister switchdev notifier (%d)\n", err); err = unregister_netdevice_notifier(&dsa_user_nb); if (err) pr_err("DSA: failed to unregister user notifier (%d)\n", err); }
210 2 210 213 210 210 213 213 213 212 210 212 185 185 184 183 184 184 185 184 185 184 183 184 185 184 184 184 185 183 185 185 184 184 210 210 210 212 212 211 212 212 213 213 212 213 212 212 212 213 210 212 213 210 213 212 212 213 213 211 210 209 209 210 181 4 4 4 206 7 7 7 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 // SPDX-License-Identifier: GPL-2.0-only /* * Copyright (C) 2012-2015 - ARM Ltd * Author: Marc Zyngier <marc.zyngier@arm.com> */ #ifndef __ARM64_KVM_HYP_SYSREG_SR_H__ #define __ARM64_KVM_HYP_SYSREG_SR_H__ #include <linux/compiler.h> #include <linux/kvm_host.h> #include <asm/kprobes.h> #include <asm/kvm_asm.h> #include <asm/kvm_emulate.h> #include <asm/kvm_hyp.h> #include <asm/kvm_mmu.h> static inline bool ctxt_has_s1poe(struct kvm_cpu_context *ctxt); static inline struct kvm_vcpu *ctxt_to_vcpu(struct kvm_cpu_context *ctxt) { struct kvm_vcpu *vcpu = ctxt->__hyp_running_vcpu; if (!vcpu) vcpu = container_of(ctxt, struct kvm_vcpu, arch.ctxt); return vcpu; } static inline bool ctxt_is_guest(struct kvm_cpu_context *ctxt) { return host_data_ptr(host_ctxt) != ctxt; } static inline u64 *ctxt_mdscr_el1(struct kvm_cpu_context *ctxt) { struct kvm_vcpu *vcpu = ctxt_to_vcpu(ctxt); if (ctxt_is_guest(ctxt) && kvm_host_owns_debug_regs(vcpu)) return &vcpu->arch.external_mdscr_el1; return &ctxt_sys_reg(ctxt, MDSCR_EL1); } static inline u64 ctxt_midr_el1(struct kvm_cpu_context *ctxt) { struct kvm *kvm = kern_hyp_va(ctxt_to_vcpu(ctxt)->kvm); if (!(ctxt_is_guest(ctxt) && test_bit(KVM_ARCH_FLAG_WRITABLE_IMP_ID_REGS, &kvm->arch.flags))) return read_cpuid_id(); return kvm_read_vm_id_reg(kvm, SYS_MIDR_EL1); } static inline void __sysreg_save_common_state(struct kvm_cpu_context *ctxt) { *ctxt_mdscr_el1(ctxt) = read_sysreg(mdscr_el1); // POR_EL0 can affect uaccess, so must be saved/restored early. if (ctxt_has_s1poe(ctxt)) ctxt_sys_reg(ctxt, POR_EL0) = read_sysreg_s(SYS_POR_EL0); } static inline void __sysreg_save_user_state(struct kvm_cpu_context *ctxt) { ctxt_sys_reg(ctxt, TPIDR_EL0) = read_sysreg(tpidr_el0); ctxt_sys_reg(ctxt, TPIDRRO_EL0) = read_sysreg(tpidrro_el0); } static inline bool ctxt_has_mte(struct kvm_cpu_context *ctxt) { struct kvm_vcpu *vcpu = ctxt_to_vcpu(ctxt); return kvm_has_mte(kern_hyp_va(vcpu->kvm)); } static inline bool ctxt_has_s1pie(struct kvm_cpu_context *ctxt) { struct kvm_vcpu *vcpu; if (!cpus_have_final_cap(ARM64_HAS_S1PIE)) return false; vcpu = ctxt_to_vcpu(ctxt); return kvm_has_s1pie(kern_hyp_va(vcpu->kvm)); } static inline bool ctxt_has_tcrx(struct kvm_cpu_context *ctxt) { struct kvm_vcpu *vcpu; if (!cpus_have_final_cap(ARM64_HAS_TCR2)) return false; vcpu = ctxt_to_vcpu(ctxt); return kvm_has_tcr2(kern_hyp_va(vcpu->kvm)); } static inline bool ctxt_has_s1poe(struct kvm_cpu_context *ctxt) { struct kvm_vcpu *vcpu; if (!system_supports_poe()) return false; vcpu = ctxt_to_vcpu(ctxt); return kvm_has_s1poe(kern_hyp_va(vcpu->kvm)); } static inline bool ctxt_has_ras(struct kvm_cpu_context *ctxt) { struct kvm_vcpu *vcpu; if (!cpus_have_final_cap(ARM64_HAS_RAS_EXTN)) return false; vcpu = ctxt_to_vcpu(ctxt); return kvm_has_ras(kern_hyp_va(vcpu->kvm)); } static inline bool ctxt_has_sctlr2(struct kvm_cpu_context *ctxt) { struct kvm_vcpu *vcpu; if (!cpus_have_final_cap(ARM64_HAS_SCTLR2)) return false; vcpu = ctxt_to_vcpu(ctxt); return kvm_has_sctlr2(kern_hyp_va(vcpu->kvm)); } static inline void __sysreg_save_el1_state(struct kvm_cpu_context *ctxt) { ctxt_sys_reg(ctxt, SCTLR_EL1) = read_sysreg_el1(SYS_SCTLR); ctxt_sys_reg(ctxt, CPACR_EL1) = read_sysreg_el1(SYS_CPACR); ctxt_sys_reg(ctxt, TTBR0_EL1) = read_sysreg_el1(SYS_TTBR0); ctxt_sys_reg(ctxt, TTBR1_EL1) = read_sysreg_el1(SYS_TTBR1); ctxt_sys_reg(ctxt, TCR_EL1) = read_sysreg_el1(SYS_TCR); if (ctxt_has_tcrx(ctxt)) { ctxt_sys_reg(ctxt, TCR2_EL1) = read_sysreg_el1(SYS_TCR2); if (ctxt_has_s1pie(ctxt)) { ctxt_sys_reg(ctxt, PIR_EL1) = read_sysreg_el1(SYS_PIR); ctxt_sys_reg(ctxt, PIRE0_EL1) = read_sysreg_el1(SYS_PIRE0); } if (ctxt_has_s1poe(ctxt)) ctxt_sys_reg(ctxt, POR_EL1) = read_sysreg_el1(SYS_POR); } ctxt_sys_reg(ctxt, ESR_EL1) = read_sysreg_el1(SYS_ESR); ctxt_sys_reg(ctxt, AFSR0_EL1) = read_sysreg_el1(SYS_AFSR0); ctxt_sys_reg(ctxt, AFSR1_EL1) = read_sysreg_el1(SYS_AFSR1); ctxt_sys_reg(ctxt, FAR_EL1) = read_sysreg_el1(SYS_FAR); ctxt_sys_reg(ctxt, MAIR_EL1) = read_sysreg_el1(SYS_MAIR); ctxt_sys_reg(ctxt, VBAR_EL1) = read_sysreg_el1(SYS_VBAR); ctxt_sys_reg(ctxt, CONTEXTIDR_EL1) = read_sysreg_el1(SYS_CONTEXTIDR); ctxt_sys_reg(ctxt, AMAIR_EL1) = read_sysreg_el1(SYS_AMAIR); ctxt_sys_reg(ctxt, CNTKCTL_EL1) = read_sysreg_el1(SYS_CNTKCTL); ctxt_sys_reg(ctxt, PAR_EL1) = read_sysreg_par(); ctxt_sys_reg(ctxt, TPIDR_EL1) = read_sysreg(tpidr_el1); if (ctxt_has_mte(ctxt)) { ctxt_sys_reg(ctxt, TFSR_EL1) = read_sysreg_el1(SYS_TFSR); ctxt_sys_reg(ctxt, TFSRE0_EL1) = read_sysreg_s(SYS_TFSRE0_EL1); } ctxt_sys_reg(ctxt, SP_EL1) = read_sysreg(sp_el1); ctxt_sys_reg(ctxt, ELR_EL1) = read_sysreg_el1(SYS_ELR); ctxt_sys_reg(ctxt, SPSR_EL1) = read_sysreg_el1(SYS_SPSR); if (ctxt_has_sctlr2(ctxt)) ctxt_sys_reg(ctxt, SCTLR2_EL1) = read_sysreg_el1(SYS_SCTLR2); } static inline void __sysreg_save_el2_return_state(struct kvm_cpu_context *ctxt) { ctxt->regs.pc = read_sysreg_el2(SYS_ELR); /* * Guest PSTATE gets saved at guest fixup time in all * cases. We still need to handle the nVHE host side here. */ if (!has_vhe() && ctxt->__hyp_running_vcpu) ctxt->regs.pstate = read_sysreg_el2(SYS_SPSR); if (!cpus_have_final_cap(ARM64_HAS_RAS_EXTN)) return; if (!vserror_state_is_nested(ctxt_to_vcpu(ctxt))) ctxt_sys_reg(ctxt, DISR_EL1) = read_sysreg_s(SYS_VDISR_EL2); else if (ctxt_has_ras(ctxt)) ctxt_sys_reg(ctxt, VDISR_EL2) = read_sysreg_s(SYS_VDISR_EL2); } static inline void __sysreg_restore_common_state(struct kvm_cpu_context *ctxt) { write_sysreg(*ctxt_mdscr_el1(ctxt), mdscr_el1); // POR_EL0 can affect uaccess, so must be saved/restored early. if (ctxt_has_s1poe(ctxt)) write_sysreg_s(ctxt_sys_reg(ctxt, POR_EL0), SYS_POR_EL0); } static inline void __sysreg_restore_user_state(struct kvm_cpu_context *ctxt) { write_sysreg(ctxt_sys_reg(ctxt, TPIDR_EL0), tpidr_el0); write_sysreg(ctxt_sys_reg(ctxt, TPIDRRO_EL0), tpidrro_el0); } static inline void __sysreg_restore_el1_state(struct kvm_cpu_context *ctxt, u64 midr, u64 mpidr) { write_sysreg(midr, vpidr_el2); write_sysreg(mpidr, vmpidr_el2); if (has_vhe() || !cpus_have_final_cap(ARM64_WORKAROUND_SPECULATIVE_AT)) { write_sysreg_el1(ctxt_sys_reg(ctxt, SCTLR_EL1), SYS_SCTLR); write_sysreg_el1(ctxt_sys_reg(ctxt, TCR_EL1), SYS_TCR); } else if (!ctxt->__hyp_running_vcpu) { /* * Must only be done for guest registers, hence the context * test. We're coming from the host, so SCTLR.M is already * set. Pairs with nVHE's __activate_traps(). */ write_sysreg_el1((ctxt_sys_reg(ctxt, TCR_EL1) | TCR_EPD1_MASK | TCR_EPD0_MASK), SYS_TCR); isb(); } write_sysreg_el1(ctxt_sys_reg(ctxt, CPACR_EL1), SYS_CPACR); write_sysreg_el1(ctxt_sys_reg(ctxt, TTBR0_EL1), SYS_TTBR0); write_sysreg_el1(ctxt_sys_reg(ctxt, TTBR1_EL1), SYS_TTBR1); if (ctxt_has_tcrx(ctxt)) { write_sysreg_el1(ctxt_sys_reg(ctxt, TCR2_EL1), SYS_TCR2); if (ctxt_has_s1pie(ctxt)) { write_sysreg_el1(ctxt_sys_reg(ctxt, PIR_EL1), SYS_PIR); write_sysreg_el1(ctxt_sys_reg(ctxt, PIRE0_EL1), SYS_PIRE0); } if (ctxt_has_s1poe(ctxt)) write_sysreg_el1(ctxt_sys_reg(ctxt, POR_EL1), SYS_POR); } write_sysreg_el1(ctxt_sys_reg(ctxt, ESR_EL1), SYS_ESR); write_sysreg_el1(ctxt_sys_reg(ctxt, AFSR0_EL1), SYS_AFSR0); write_sysreg_el1(ctxt_sys_reg(ctxt, AFSR1_EL1), SYS_AFSR1); write_sysreg_el1(ctxt_sys_reg(ctxt, FAR_EL1), SYS_FAR); write_sysreg_el1(ctxt_sys_reg(ctxt, MAIR_EL1), SYS_MAIR); write_sysreg_el1(ctxt_sys_reg(ctxt, VBAR_EL1), SYS_VBAR); write_sysreg_el1(ctxt_sys_reg(ctxt, CONTEXTIDR_EL1), SYS_CONTEXTIDR); write_sysreg_el1(ctxt_sys_reg(ctxt, AMAIR_EL1), SYS_AMAIR); write_sysreg_el1(ctxt_sys_reg(ctxt, CNTKCTL_EL1), SYS_CNTKCTL); write_sysreg(ctxt_sys_reg(ctxt, PAR_EL1), par_el1); write_sysreg(ctxt_sys_reg(ctxt, TPIDR_EL1), tpidr_el1); if (ctxt_has_mte(ctxt)) { write_sysreg_el1(ctxt_sys_reg(ctxt, TFSR_EL1), SYS_TFSR); write_sysreg_s(ctxt_sys_reg(ctxt, TFSRE0_EL1), SYS_TFSRE0_EL1); } if (!has_vhe() && cpus_have_final_cap(ARM64_WORKAROUND_SPECULATIVE_AT) && ctxt->__hyp_running_vcpu) { /* * Must only be done for host registers, hence the context * test. Pairs with nVHE's __deactivate_traps(). */ isb(); /* * At this stage, and thanks to the above isb(), S2 is * deconfigured and disabled. We can now restore the host's * S1 configuration: SCTLR, and only then TCR. */ write_sysreg_el1(ctxt_sys_reg(ctxt, SCTLR_EL1), SYS_SCTLR); isb(); write_sysreg_el1(ctxt_sys_reg(ctxt, TCR_EL1), SYS_TCR); } write_sysreg(ctxt_sys_reg(ctxt, SP_EL1), sp_el1); write_sysreg_el1(ctxt_sys_reg(ctxt, ELR_EL1), SYS_ELR); write_sysreg_el1(ctxt_sys_reg(ctxt, SPSR_EL1), SYS_SPSR); if (ctxt_has_sctlr2(ctxt)) write_sysreg_el1(ctxt_sys_reg(ctxt, SCTLR2_EL1), SYS_SCTLR2); } /* Read the VCPU state's PSTATE, but translate (v)EL2 to EL1. */ static inline u64 to_hw_pstate(const struct kvm_cpu_context *ctxt) { u64 mode = ctxt->regs.pstate & (PSR_MODE_MASK | PSR_MODE32_BIT); switch (mode) { case PSR_MODE_EL2t: mode = PSR_MODE_EL1t; break; case PSR_MODE_EL2h: mode = PSR_MODE_EL1h; break; } return (ctxt->regs.pstate & ~(PSR_MODE_MASK | PSR_MODE32_BIT)) | mode; } static inline void __sysreg_restore_el2_return_state(struct kvm_cpu_context *ctxt) { u64 pstate = to_hw_pstate(ctxt); u64 mode = pstate & PSR_AA32_MODE_MASK; u64 vdisr; /* * Safety check to ensure we're setting the CPU up to enter the guest * in a less privileged mode. * * If we are attempting a return to EL2 or higher in AArch64 state, * program SPSR_EL2 with M=EL2h and the IL bit set which ensures that * we'll take an illegal exception state exception immediately after * the ERET to the guest. Attempts to return to AArch32 Hyp will * result in an illegal exception return because EL2's execution state * is determined by SCR_EL3.RW. */ if (!(mode & PSR_MODE32_BIT) && mode >= PSR_MODE_EL2t) pstate = PSR_MODE_EL2h | PSR_IL_BIT; write_sysreg_el2(ctxt->regs.pc, SYS_ELR); write_sysreg_el2(pstate, SYS_SPSR); if (!cpus_have_final_cap(ARM64_HAS_RAS_EXTN)) return; if (!vserror_state_is_nested(ctxt_to_vcpu(ctxt))) vdisr = ctxt_sys_reg(ctxt, DISR_EL1); else if (ctxt_has_ras(ctxt)) vdisr = ctxt_sys_reg(ctxt, VDISR_EL2); else vdisr = 0; write_sysreg_s(vdisr, SYS_VDISR_EL2); } static inline void __sysreg32_save_state(struct kvm_vcpu *vcpu) { if (!vcpu_el1_is_32bit(vcpu)) return; vcpu->arch.ctxt.spsr_abt = read_sysreg(spsr_abt); vcpu->arch.ctxt.spsr_und = read_sysreg(spsr_und); vcpu->arch.ctxt.spsr_irq = read_sysreg(spsr_irq); vcpu->arch.ctxt.spsr_fiq = read_sysreg(spsr_fiq); __vcpu_assign_sys_reg(vcpu, DACR32_EL2, read_sysreg(dacr32_el2)); __vcpu_assign_sys_reg(vcpu, IFSR32_EL2, read_sysreg(ifsr32_el2)); if (has_vhe() || kvm_debug_regs_in_use(vcpu)) __vcpu_assign_sys_reg(vcpu, DBGVCR32_EL2, read_sysreg(dbgvcr32_el2)); } static inline void __sysreg32_restore_state(struct kvm_vcpu *vcpu) { if (!vcpu_el1_is_32bit(vcpu)) return; write_sysreg(vcpu->arch.ctxt.spsr_abt, spsr_abt); write_sysreg(vcpu->arch.ctxt.spsr_und, spsr_und); write_sysreg(vcpu->arch.ctxt.spsr_irq, spsr_irq); write_sysreg(vcpu->arch.ctxt.spsr_fiq, spsr_fiq); write_sysreg(__vcpu_sys_reg(vcpu, DACR32_EL2), dacr32_el2); write_sysreg(__vcpu_sys_reg(vcpu, IFSR32_EL2), ifsr32_el2); if (has_vhe() || kvm_debug_regs_in_use(vcpu)) write_sysreg(__vcpu_sys_reg(vcpu, DBGVCR32_EL2), dbgvcr32_el2); } #endif /* __ARM64_KVM_HYP_SYSREG_SR_H__ */
4 4 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 1517 1518 1519 1520 1521 1522 1523 1524 1525 1526 1527 1528 1529 1530 1531 1532 1533 1534 1535 1536 1537 1538 1539 1540 1541 1542 1543 1544 1545 1546 1547 1548 1549 1550 1551 1552 1553 1554 1555 1556 1557 1558 1559 1560 1561 1562 1563 1564 1565 1566 1567 1568 1569 1570 1571 1572 1573 1574 1575 1576 1577 1578 1579 1580 1581 1582 1583 1584 1585 1586 1587 1588 1589 1590 1591 1592 1593 1594 1595 1596 1597 1598 1599 1600 1601 1602 1603 1604 1605 1606 1607 1608 1609 1610 1611 1612 1613 1614 1615 1616 1617 1618 1619 1620 1621 1622 1623 1624 1625 1626 1627 1628 1629 1630 1631 1632 1633 1634 1635 1636 1637 1638 1639 1640 1641 1642 1643 1644 1645 1646 1647 1648 1649 1650 1651 1652 1653 1654 1655 1656 1657 1658 1659 1660 1661 1662 1663 1664 1665 1666 1667 1668 1669 1670 1671 1672 1673 1674 1675 1676 1677 1678 1679 1680 1681 1682 1683 1684 1685 1686 1687 1688 1689 1690 1691 1692 1693 1694 1695 1696 1697 1698 1699 1700 1701 1702 1703 1704 1705 1706 1707 1708 1709 1710 1711 1712 1713 1714 1715 1716 1717 1718 1719 1720 1721 1722 1723 1724 1725 1726 1727 1728 1729 1730 1731 1732 1733 1734 1735 1736 1737 1738 1739 1740 1741 1742 1743 1744 1745 1746 1747 1748 1749 1750 1751 1752 1753 1754 1755 1756 1757 1758 1759 1760 1761 1762 1763 1764 1765 1766 1767 1768 1769 1770 1771 1772 1773 1774 1775 1776 1777 1778 1779 1780 1781 1782 1783 1784 1785 1786 1787 1788 1789 1790 1791 1792 1793 1794 1795 1796 1797 1798 1799 1800 1801 1802 1803 1804 1805 1806 1807 1808 1809 1810 1811 1812 1813 1814 1815 1816 1817 1818 1819 1820 1821 1822 1823 1824 1825 1826 1827 1828 1829 1830 1831 1832 1833 1834 1835 1836 1837 1838 1839 1840 1841 1842 1843 1844 1845 1846 1847 1848 1849 1850 1851 1852 1853 1854 1855 1856 1857 1858 1859 1860 1861 1862 1863 1864 1865 1866 1867 1868 1869 1870 1871 1872 1873 1874 1875 1876 1877 1878 1879 1880 1881 1882 1883 1884 1885 1886 1887 1888 1889 1890 1891 1892 1893 1894 1895 1896 1897 1898 1899 1900 1901 1902 1903 1904 1905 1906 1907 1908 1909 1910 1911 1912 1913 1914 1915 1916 1917 1918 1919 1920 1921 1922 1923 1924 1925 1926 1927 1928 1929 1930 1931 1932 1933 1934 1935 1936 1937 1938 1939 1940 1941 1942 1943 1944 1945 1946 1947 1948 1949 1950 1951 1952 1953 1954 1955 1956 1957 1958 1959 1960 1961 1962 1963 1964 1965 1966 1967 1968 1969 1970 1971 1972 1973 1974 1975 1976 1977 1978 1979 1980 1981 1982 1983 1984 1985 1986 1987 1988 1989 1990 1991 1992 1993 1994 1995 1996 1997 1998 1999 2000 2001 2002 2003 2004 2005 2006 2007 2008 2009 2010 2011 2012 2013 2014 // SPDX-License-Identifier: GPL-2.0-only /* * GENEVE: Generic Network Virtualization Encapsulation * * Copyright (c) 2015 Red Hat, Inc. */ #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt #include <linux/ethtool.h> #include <linux/kernel.h> #include <linux/module.h> #include <linux/etherdevice.h> #include <linux/hash.h> #include <net/ipv6_stubs.h> #include <net/dst_metadata.h> #include <net/gro_cells.h> #include <net/rtnetlink.h> #include <net/geneve.h> #include <net/gro.h> #include <net/netdev_lock.h> #include <net/protocol.h> #define GENEVE_NETDEV_VER "0.6" #define GENEVE_N_VID (1u << 24) #define GENEVE_VID_MASK (GENEVE_N_VID - 1) #define VNI_HASH_BITS 10 #define VNI_HASH_SIZE (1<<VNI_HASH_BITS) static bool log_ecn_error = true; module_param(log_ecn_error, bool, 0644); MODULE_PARM_DESC(log_ecn_error, "Log packets received with corrupted ECN"); #define GENEVE_VER 0 #define GENEVE_BASE_HLEN (sizeof(struct udphdr) + sizeof(struct genevehdr)) #define GENEVE_IPV4_HLEN (ETH_HLEN + sizeof(struct iphdr) + GENEVE_BASE_HLEN) #define GENEVE_IPV6_HLEN (ETH_HLEN + sizeof(struct ipv6hdr) + GENEVE_BASE_HLEN) /* per-network namespace private data for this module */ struct geneve_net { struct list_head geneve_list; /* sock_list is protected by rtnl lock */ struct list_head sock_list; }; static unsigned int geneve_net_id; struct geneve_dev_node { struct hlist_node hlist; struct geneve_dev *geneve; }; struct geneve_config { struct ip_tunnel_info info; bool collect_md; bool use_udp6_rx_checksums; bool ttl_inherit; enum ifla_geneve_df df; bool inner_proto_inherit; u16 port_min; u16 port_max; }; /* Pseudo network device */ struct geneve_dev { struct geneve_dev_node hlist4; /* vni hash table for IPv4 socket */ #if IS_ENABLED(CONFIG_IPV6) struct geneve_dev_node hlist6; /* vni hash table for IPv6 socket */ #endif struct net *net; /* netns for packet i/o */ struct net_device *dev; /* netdev for geneve tunnel */ struct geneve_sock __rcu *sock4; /* IPv4 socket used for geneve tunnel */ #if IS_ENABLED(CONFIG_IPV6) struct geneve_sock __rcu *sock6; /* IPv6 socket used for geneve tunnel */ #endif struct list_head next; /* geneve's per namespace list */ struct gro_cells gro_cells; struct geneve_config cfg; }; struct geneve_sock { bool collect_md; struct list_head list; struct socket *sock; struct rcu_head rcu; int refcnt; struct hlist_head vni_list[VNI_HASH_SIZE]; }; static inline __u32 geneve_net_vni_hash(u8 vni[3]) { __u32 vnid; vnid = (vni[0] << 16) | (vni[1] << 8) | vni[2]; return hash_32(vnid, VNI_HASH_BITS); } static __be64 vni_to_tunnel_id(const __u8 *vni) { #ifdef __BIG_ENDIAN return (vni[0] << 16) | (vni[1] << 8) | vni[2]; #else return (__force __be64)(((__force u64)vni[0] << 40) | ((__force u64)vni[1] << 48) | ((__force u64)vni[2] << 56)); #endif } /* Convert 64 bit tunnel ID to 24 bit VNI. */ static void tunnel_id_to_vni(__be64 tun_id, __u8 *vni) { #ifdef __BIG_ENDIAN vni[0] = (__force __u8)(tun_id >> 16); vni[1] = (__force __u8)(tun_id >> 8); vni[2] = (__force __u8)tun_id; #else vni[0] = (__force __u8)((__force u64)tun_id >> 40); vni[1] = (__force __u8)((__force u64)tun_id >> 48); vni[2] = (__force __u8)((__force u64)tun_id >> 56); #endif } static bool eq_tun_id_and_vni(u8 *tun_id, u8 *vni) { return !memcmp(vni, &tun_id[5], 3); } static sa_family_t geneve_get_sk_family(struct geneve_sock *gs) { return gs->sock->sk->sk_family; } static struct geneve_dev *geneve_lookup(struct geneve_sock *gs, __be32 addr, u8 vni[]) { struct hlist_head *vni_list_head; struct geneve_dev_node *node; __u32 hash; /* Find the device for this VNI */ hash = geneve_net_vni_hash(vni); vni_list_head = &gs->vni_list[hash]; hlist_for_each_entry_rcu(node, vni_list_head, hlist) { if (eq_tun_id_and_vni((u8 *)&node->geneve->cfg.info.key.tun_id, vni) && addr == node->geneve->cfg.info.key.u.ipv4.dst) return node->geneve; } return NULL; } #if IS_ENABLED(CONFIG_IPV6) static struct geneve_dev *geneve6_lookup(struct geneve_sock *gs, struct in6_addr addr6, u8 vni[]) { struct hlist_head *vni_list_head; struct geneve_dev_node *node; __u32 hash; /* Find the device for this VNI */ hash = geneve_net_vni_hash(vni); vni_list_head = &gs->vni_list[hash]; hlist_for_each_entry_rcu(node, vni_list_head, hlist) { if (eq_tun_id_and_vni((u8 *)&node->geneve->cfg.info.key.tun_id, vni) && ipv6_addr_equal(&addr6, &node->geneve->cfg.info.key.u.ipv6.dst)) return node->geneve; } return NULL; } #endif static inline struct genevehdr *geneve_hdr(const struct sk_buff *skb) { return (struct genevehdr *)(udp_hdr(skb) + 1); } static struct geneve_dev *geneve_lookup_skb(struct geneve_sock *gs, struct sk_buff *skb) { static u8 zero_vni[3]; u8 *vni; if (geneve_get_sk_family(gs) == AF_INET) { struct iphdr *iph; __be32 addr; iph = ip_hdr(skb); /* outer IP header... */ if (gs->collect_md) { vni = zero_vni; addr = 0; } else { vni = geneve_hdr(skb)->vni; addr = iph->saddr; } return geneve_lookup(gs, addr, vni); #if IS_ENABLED(CONFIG_IPV6) } else if (geneve_get_sk_family(gs) == AF_INET6) { static struct in6_addr zero_addr6; struct ipv6hdr *ip6h; struct in6_addr addr6; ip6h = ipv6_hdr(skb); /* outer IPv6 header... */ if (gs->collect_md) { vni = zero_vni; addr6 = zero_addr6; } else { vni = geneve_hdr(skb)->vni; addr6 = ip6h->saddr; } return geneve6_lookup(gs, addr6, vni); #endif } return NULL; } /* geneve receive/decap routine */ static void geneve_rx(struct geneve_dev *geneve, struct geneve_sock *gs, struct sk_buff *skb) { struct genevehdr *gnvh = geneve_hdr(skb); struct metadata_dst *tun_dst = NULL; unsigned int len; int nh, err = 0; void *oiph; if (ip_tunnel_collect_metadata() || gs->collect_md) { IP_TUNNEL_DECLARE_FLAGS(flags) = { }; __set_bit(IP_TUNNEL_KEY_BIT, flags); __assign_bit(IP_TUNNEL_OAM_BIT, flags, gnvh->oam); __assign_bit(IP_TUNNEL_CRIT_OPT_BIT, flags, gnvh->critical); tun_dst = udp_tun_rx_dst(skb, geneve_get_sk_family(gs), flags, vni_to_tunnel_id(gnvh->vni), gnvh->opt_len * 4); if (!tun_dst) { dev_dstats_rx_dropped(geneve->dev); goto drop; } /* Update tunnel dst according to Geneve options. */ ip_tunnel_flags_zero(flags); __set_bit(IP_TUNNEL_GENEVE_OPT_BIT, flags); ip_tunnel_info_opts_set(&tun_dst->u.tun_info, gnvh->options, gnvh->opt_len * 4, flags); } else { /* Drop packets w/ critical options, * since we don't support any... */ if (gnvh->critical) { DEV_STATS_INC(geneve->dev, rx_frame_errors); DEV_STATS_INC(geneve->dev, rx_errors); goto drop; } } if (tun_dst) skb_dst_set(skb, &tun_dst->dst); if (gnvh->proto_type == htons(ETH_P_TEB)) { skb_reset_mac_header(skb); skb->protocol = eth_type_trans(skb, geneve->dev); skb_postpull_rcsum(skb, eth_hdr(skb), ETH_HLEN); /* Ignore packet loops (and multicast echo) */ if (ether_addr_equal(eth_hdr(skb)->h_source, geneve->dev->dev_addr)) { DEV_STATS_INC(geneve->dev, rx_errors); goto drop; } } else { skb_reset_mac_header(skb); skb->dev = geneve->dev; skb->pkt_type = PACKET_HOST; } /* Save offset of outer header relative to skb->head, * because we are going to reset the network header to the inner header * and might change skb->head. */ nh = skb_network_header(skb) - skb->head; skb_reset_network_header(skb); if (!pskb_inet_may_pull(skb)) { DEV_STATS_INC(geneve->dev, rx_length_errors); DEV_STATS_INC(geneve->dev, rx_errors); goto drop; } /* Get the outer header. */ oiph = skb->head + nh; if (geneve_get_sk_family(gs) == AF_INET) err = IP_ECN_decapsulate(oiph, skb); #if IS_ENABLED(CONFIG_IPV6) else err = IP6_ECN_decapsulate(oiph, skb); #endif if (unlikely(err)) { if (log_ecn_error) { if (geneve_get_sk_family(gs) == AF_INET) net_info_ratelimited("non-ECT from %pI4 " "with TOS=%#x\n", &((struct iphdr *)oiph)->saddr, ((struct iphdr *)oiph)->tos); #if IS_ENABLED(CONFIG_IPV6) else net_info_ratelimited("non-ECT from %pI6\n", &((struct ipv6hdr *)oiph)->saddr); #endif } if (err > 1) { DEV_STATS_INC(geneve->dev, rx_frame_errors); DEV_STATS_INC(geneve->dev, rx_errors); goto drop; } } len = skb->len; err = gro_cells_receive(&geneve->gro_cells, skb); if (likely(err == NET_RX_SUCCESS)) dev_dstats_rx_add(geneve->dev, len); return; drop: /* Consume bad packet */ kfree_skb(skb); } /* Setup stats when device is created */ static int geneve_init(struct net_device *dev) { struct geneve_dev *geneve = netdev_priv(dev); int err; err = gro_cells_init(&geneve->gro_cells, dev); if (err) return err; err = dst_cache_init(&geneve->cfg.info.dst_cache, GFP_KERNEL); if (err) { gro_cells_destroy(&geneve->gro_cells); return err; } netdev_lockdep_set_classes(dev); return 0; } static void geneve_uninit(struct net_device *dev) { struct geneve_dev *geneve = netdev_priv(dev); dst_cache_destroy(&geneve->cfg.info.dst_cache); gro_cells_destroy(&geneve->gro_cells); } /* Callback from net/ipv4/udp.c to receive packets */ static int geneve_udp_encap_recv(struct sock *sk, struct sk_buff *skb) { struct genevehdr *geneveh; struct geneve_dev *geneve; struct geneve_sock *gs; __be16 inner_proto; int opts_len; /* Need UDP and Geneve header to be present */ if (unlikely(!pskb_may_pull(skb, GENEVE_BASE_HLEN))) goto drop; /* Return packets with reserved bits set */ geneveh = geneve_hdr(skb); if (unlikely(geneveh->ver != GENEVE_VER)) goto drop; gs = rcu_dereference_sk_user_data(sk); if (!gs) goto drop; geneve = geneve_lookup_skb(gs, skb); if (!geneve) goto drop; inner_proto = geneveh->proto_type; if (unlikely((!geneve->cfg.inner_proto_inherit && inner_proto != htons(ETH_P_TEB)))) { dev_dstats_rx_dropped(geneve->dev); goto drop; } opts_len = geneveh->opt_len * 4; if (iptunnel_pull_header(skb, GENEVE_BASE_HLEN + opts_len, inner_proto, !net_eq(geneve->net, dev_net(geneve->dev)))) { dev_dstats_rx_dropped(geneve->dev); goto drop; } geneve_rx(geneve, gs, skb); return 0; drop: /* Consume bad packet */ kfree_skb(skb); return 0; } /* Callback from net/ipv{4,6}/udp.c to check that we have a tunnel for errors */ static int geneve_udp_encap_err_lookup(struct sock *sk, struct sk_buff *skb) { struct genevehdr *geneveh; struct geneve_sock *gs; u8 zero_vni[3] = { 0 }; u8 *vni = zero_vni; if (!pskb_may_pull(skb, skb_transport_offset(skb) + GENEVE_BASE_HLEN)) return -EINVAL; geneveh = geneve_hdr(skb); if (geneveh->ver != GENEVE_VER) return -EINVAL; if (geneveh->proto_type != htons(ETH_P_TEB)) return -EINVAL; gs = rcu_dereference_sk_user_data(sk); if (!gs) return -ENOENT; if (geneve_get_sk_family(gs) == AF_INET) { struct iphdr *iph = ip_hdr(skb); __be32 addr4 = 0; if (!gs->collect_md) { vni = geneve_hdr(skb)->vni; addr4 = iph->daddr; } return geneve_lookup(gs, addr4, vni) ? 0 : -ENOENT; } #if IS_ENABLED(CONFIG_IPV6) if (geneve_get_sk_family(gs) == AF_INET6) { struct ipv6hdr *ip6h = ipv6_hdr(skb); struct in6_addr addr6; memset(&addr6, 0, sizeof(struct in6_addr)); if (!gs->collect_md) { vni = geneve_hdr(skb)->vni; addr6 = ip6h->daddr; } return geneve6_lookup(gs, addr6, vni) ? 0 : -ENOENT; } #endif return -EPFNOSUPPORT; } static struct socket *geneve_create_sock(struct net *net, bool ipv6, __be16 port, bool ipv6_rx_csum) { struct socket *sock; struct udp_port_cfg udp_conf; int err; memset(&udp_conf, 0, sizeof(udp_conf)); if (ipv6) { udp_conf.family = AF_INET6; udp_conf.ipv6_v6only = 1; udp_conf.use_udp6_rx_checksums = ipv6_rx_csum; } else { udp_conf.family = AF_INET; udp_conf.local_ip.s_addr = htonl(INADDR_ANY); } udp_conf.local_udp_port = port; /* Open UDP socket */ err = udp_sock_create(net, &udp_conf, &sock); if (err < 0) return ERR_PTR(err); udp_allow_gso(sock->sk); return sock; } static int geneve_hlen(struct genevehdr *gh) { return sizeof(*gh) + gh->opt_len * 4; } static struct sk_buff *geneve_gro_receive(struct sock *sk, struct list_head *head, struct sk_buff *skb) { struct sk_buff *pp = NULL; struct sk_buff *p; struct genevehdr *gh, *gh2; unsigned int hlen, gh_len, off_gnv; const struct packet_offload *ptype; __be16 type; int flush = 1; off_gnv = skb_gro_offset(skb); hlen = off_gnv + sizeof(*gh); gh = skb_gro_header(skb, hlen, off_gnv); if (unlikely(!gh)) goto out; if (gh->ver != GENEVE_VER || gh->oam) goto out; gh_len = geneve_hlen(gh); hlen = off_gnv + gh_len; if (!skb_gro_may_pull(skb, hlen)) { gh = skb_gro_header_slow(skb, hlen, off_gnv); if (unlikely(!gh)) goto out; } list_for_each_entry(p, head, list) { if (!NAPI_GRO_CB(p)->same_flow) continue; gh2 = (struct genevehdr *)(p->data + off_gnv); if (gh->opt_len != gh2->opt_len || memcmp(gh, gh2, gh_len)) { NAPI_GRO_CB(p)->same_flow = 0; continue; } } skb_gro_pull(skb, gh_len); skb_gro_postpull_rcsum(skb, gh, gh_len); type = gh->proto_type; if (likely(type == htons(ETH_P_TEB))) return call_gro_receive(eth_gro_receive, head, skb); ptype = gro_find_receive_by_type(type); if (!ptype) goto out; pp = call_gro_receive(ptype->callbacks.gro_receive, head, skb); flush = 0; out: skb_gro_flush_final(skb, pp, flush); return pp; } static int geneve_gro_complete(struct sock *sk, struct sk_buff *skb, int nhoff) { struct genevehdr *gh; struct packet_offload *ptype; __be16 type; int gh_len; int err = -ENOSYS; gh = (struct genevehdr *)(skb->data + nhoff); gh_len = geneve_hlen(gh); type = gh->proto_type; /* since skb->encapsulation is set, eth_gro_complete() sets the inner mac header */ if (likely(type == htons(ETH_P_TEB))) return eth_gro_complete(skb, nhoff + gh_len); ptype = gro_find_complete_by_type(type); if (ptype) err = ptype->callbacks.gro_complete(skb, nhoff + gh_len); skb_set_inner_mac_header(skb, nhoff + gh_len); return err; } /* Create new listen socket if needed */ static struct geneve_sock *geneve_socket_create(struct net *net, __be16 port, bool ipv6, bool ipv6_rx_csum) { struct geneve_net *gn = net_generic(net, geneve_net_id); struct geneve_sock *gs; struct socket *sock; struct udp_tunnel_sock_cfg tunnel_cfg; int h; gs = kzalloc(sizeof(*gs), GFP_KERNEL); if (!gs) return ERR_PTR(-ENOMEM); sock = geneve_create_sock(net, ipv6, port, ipv6_rx_csum); if (IS_ERR(sock)) { kfree(gs); return ERR_CAST(sock); } gs->sock = sock; gs->refcnt = 1; for (h = 0; h < VNI_HASH_SIZE; ++h) INIT_HLIST_HEAD(&gs->vni_list[h]); /* Initialize the geneve udp offloads structure */ udp_tunnel_notify_add_rx_port(gs->sock, UDP_TUNNEL_TYPE_GENEVE); /* Mark socket as an encapsulation socket */ memset(&tunnel_cfg, 0, sizeof(tunnel_cfg)); tunnel_cfg.sk_user_data = gs; tunnel_cfg.encap_type = 1; tunnel_cfg.gro_receive = geneve_gro_receive; tunnel_cfg.gro_complete = geneve_gro_complete; tunnel_cfg.encap_rcv = geneve_udp_encap_recv; tunnel_cfg.encap_err_lookup = geneve_udp_encap_err_lookup; tunnel_cfg.encap_destroy = NULL; setup_udp_tunnel_sock(net, sock, &tunnel_cfg); list_add(&gs->list, &gn->sock_list); return gs; } static void __geneve_sock_release(struct geneve_sock *gs) { if (!gs || --gs->refcnt) return; list_del(&gs->list); udp_tunnel_notify_del_rx_port(gs->sock, UDP_TUNNEL_TYPE_GENEVE); udp_tunnel_sock_release(gs->sock); kfree_rcu(gs, rcu); } static void geneve_sock_release(struct geneve_dev *geneve) { struct geneve_sock *gs4 = rtnl_dereference(geneve->sock4); #if IS_ENABLED(CONFIG_IPV6) struct geneve_sock *gs6 = rtnl_dereference(geneve->sock6); rcu_assign_pointer(geneve->sock6, NULL); #endif rcu_assign_pointer(geneve->sock4, NULL); synchronize_net(); __geneve_sock_release(gs4); #if IS_ENABLED(CONFIG_IPV6) __geneve_sock_release(gs6); #endif } static struct geneve_sock *geneve_find_sock(struct geneve_net *gn, sa_family_t family, __be16 dst_port) { struct geneve_sock *gs; list_for_each_entry(gs, &gn->sock_list, list) { if (inet_sk(gs->sock->sk)->inet_sport == dst_port && geneve_get_sk_family(gs) == family) { return gs; } } return NULL; } static int geneve_sock_add(struct geneve_dev *geneve, bool ipv6) { struct net *net = geneve->net; struct geneve_net *gn = net_generic(net, geneve_net_id); struct geneve_dev_node *node; struct geneve_sock *gs; __u8 vni[3]; __u32 hash; gs = geneve_find_sock(gn, ipv6 ? AF_INET6 : AF_INET, geneve->cfg.info.key.tp_dst); if (gs) { gs->refcnt++; goto out; } gs = geneve_socket_create(net, geneve->cfg.info.key.tp_dst, ipv6, geneve->cfg.use_udp6_rx_checksums); if (IS_ERR(gs)) return PTR_ERR(gs); out: gs->collect_md = geneve->cfg.collect_md; #if IS_ENABLED(CONFIG_IPV6) if (ipv6) { rcu_assign_pointer(geneve->sock6, gs); node = &geneve->hlist6; } else #endif { rcu_assign_pointer(geneve->sock4, gs); node = &geneve->hlist4; } node->geneve = geneve; tunnel_id_to_vni(geneve->cfg.info.key.tun_id, vni); hash = geneve_net_vni_hash(vni); hlist_add_head_rcu(&node->hlist, &gs->vni_list[hash]); return 0; } static int geneve_open(struct net_device *dev) { struct geneve_dev *geneve = netdev_priv(dev); bool metadata = geneve->cfg.collect_md; bool ipv4, ipv6; int ret = 0; ipv6 = geneve->cfg.info.mode & IP_TUNNEL_INFO_IPV6 || metadata; ipv4 = !ipv6 || metadata; #if IS_ENABLED(CONFIG_IPV6) if (ipv6) { ret = geneve_sock_add(geneve, true); if (ret < 0 && ret != -EAFNOSUPPORT) ipv4 = false; } #endif if (ipv4) ret = geneve_sock_add(geneve, false); if (ret < 0) geneve_sock_release(geneve); return ret; } static int geneve_stop(struct net_device *dev) { struct geneve_dev *geneve = netdev_priv(dev); hlist_del_init_rcu(&geneve->hlist4.hlist); #if IS_ENABLED(CONFIG_IPV6) hlist_del_init_rcu(&geneve->hlist6.hlist); #endif geneve_sock_release(geneve); return 0; } static void geneve_build_header(struct genevehdr *geneveh, const struct ip_tunnel_info *info, __be16 inner_proto) { geneveh->ver = GENEVE_VER; geneveh->opt_len = info->options_len / 4; geneveh->oam = test_bit(IP_TUNNEL_OAM_BIT, info->key.tun_flags); geneveh->critical = test_bit(IP_TUNNEL_CRIT_OPT_BIT, info->key.tun_flags); geneveh->rsvd1 = 0; tunnel_id_to_vni(info->key.tun_id, geneveh->vni); geneveh->proto_type = inner_proto; geneveh->rsvd2 = 0; if (test_bit(IP_TUNNEL_GENEVE_OPT_BIT, info->key.tun_flags)) ip_tunnel_info_opts_get(geneveh->options, info); } static int geneve_build_skb(struct dst_entry *dst, struct sk_buff *skb, const struct ip_tunnel_info *info, bool xnet, int ip_hdr_len, bool inner_proto_inherit) { bool udp_sum = test_bit(IP_TUNNEL_CSUM_BIT, info->key.tun_flags); struct genevehdr *gnvh; __be16 inner_proto; int min_headroom; int err; skb_reset_mac_header(skb); skb_scrub_packet(skb, xnet); min_headroom = LL_RESERVED_SPACE(dst->dev) + dst->header_len + GENEVE_BASE_HLEN + info->options_len + ip_hdr_len; err = skb_cow_head(skb, min_headroom); if (unlikely(err)) goto free_dst; err = udp_tunnel_handle_offloads(skb, udp_sum); if (err) goto free_dst; gnvh = __skb_push(skb, sizeof(*gnvh) + info->options_len); inner_proto = inner_proto_inherit ? skb->protocol : htons(ETH_P_TEB); geneve_build_header(gnvh, info, inner_proto); skb_set_inner_protocol(skb, inner_proto); return 0; free_dst: dst_release(dst); return err; } static u8 geneve_get_dsfield(struct sk_buff *skb, struct net_device *dev, const struct ip_tunnel_info *info, bool *use_cache) { struct geneve_dev *geneve = netdev_priv(dev); u8 dsfield; dsfield = info->key.tos; if (dsfield == 1 && !geneve->cfg.collect_md) { dsfield = ip_tunnel_get_dsfield(ip_hdr(skb), skb); *use_cache = false; } return dsfield; } static int geneve_xmit_skb(struct sk_buff *skb, struct net_device *dev, struct geneve_dev *geneve, const struct ip_tunnel_info *info) { bool inner_proto_inherit = geneve->cfg.inner_proto_inherit; bool xnet = !net_eq(geneve->net, dev_net(geneve->dev)); struct geneve_sock *gs4 = rcu_dereference(geneve->sock4); const struct ip_tunnel_key *key = &info->key; struct rtable *rt; bool use_cache; __u8 tos, ttl; __be16 df = 0; __be32 saddr; __be16 sport; int err; if (skb_vlan_inet_prepare(skb, inner_proto_inherit)) return -EINVAL; if (!gs4) return -EIO; use_cache = ip_tunnel_dst_cache_usable(skb, info); tos = geneve_get_dsfield(skb, dev, info, &use_cache); sport = udp_flow_src_port(geneve->net, skb, geneve->cfg.port_min, geneve->cfg.port_max, true); rt = udp_tunnel_dst_lookup(skb, dev, geneve->net, 0, &saddr, &info->key, sport, geneve->cfg.info.key.tp_dst, tos, use_cache ? (struct dst_cache *)&info->dst_cache : NULL); if (IS_ERR(rt)) return PTR_ERR(rt); err = skb_tunnel_check_pmtu(skb, &rt->dst, GENEVE_IPV4_HLEN + info->options_len, netif_is_any_bridge_port(dev)); if (err < 0) { dst_release(&rt->dst); return err; } else if (err) { struct ip_tunnel_info *info; info = skb_tunnel_info(skb); if (info) { struct ip_tunnel_info *unclone; unclone = skb_tunnel_info_unclone(skb); if (unlikely(!unclone)) { dst_release(&rt->dst); return -ENOMEM; } unclone->key.u.ipv4.dst = saddr; unclone->key.u.ipv4.src = info->key.u.ipv4.dst; } if (!pskb_may_pull(skb, ETH_HLEN)) { dst_release(&rt->dst); return -EINVAL; } skb->protocol = eth_type_trans(skb, geneve->dev); __netif_rx(skb); dst_release(&rt->dst); return -EMSGSIZE; } tos = ip_tunnel_ecn_encap(tos, ip_hdr(skb), skb); if (geneve->cfg.collect_md) { ttl = key->ttl; df = test_bit(IP_TUNNEL_DONT_FRAGMENT_BIT, key->tun_flags) ? htons(IP_DF) : 0; } else { if (geneve->cfg.ttl_inherit) ttl = ip_tunnel_get_ttl(ip_hdr(skb), skb); else ttl = key->ttl; ttl = ttl ? : ip4_dst_hoplimit(&rt->dst); if (geneve->cfg.df == GENEVE_DF_SET) { df = htons(IP_DF); } else if (geneve->cfg.df == GENEVE_DF_INHERIT) { struct ethhdr *eth = skb_eth_hdr(skb); if (ntohs(eth->h_proto) == ETH_P_IPV6) { df = htons(IP_DF); } else if (ntohs(eth->h_proto) == ETH_P_IP) { struct iphdr *iph = ip_hdr(skb); if (iph->frag_off & htons(IP_DF)) df = htons(IP_DF); } } } err = geneve_build_skb(&rt->dst, skb, info, xnet, sizeof(struct iphdr), inner_proto_inherit); if (unlikely(err)) return err; udp_tunnel_xmit_skb(rt, gs4->sock->sk, skb, saddr, info->key.u.ipv4.dst, tos, ttl, df, sport, geneve->cfg.info.key.tp_dst, !net_eq(geneve->net, dev_net(geneve->dev)), !test_bit(IP_TUNNEL_CSUM_BIT, info->key.tun_flags), 0); return 0; } #if IS_ENABLED(CONFIG_IPV6) static int geneve6_xmit_skb(struct sk_buff *skb, struct net_device *dev, struct geneve_dev *geneve, const struct ip_tunnel_info *info) { bool inner_proto_inherit = geneve->cfg.inner_proto_inherit; bool xnet = !net_eq(geneve->net, dev_net(geneve->dev)); struct geneve_sock *gs6 = rcu_dereference(geneve->sock6); const struct ip_tunnel_key *key = &info->key; struct dst_entry *dst = NULL; struct in6_addr saddr; bool use_cache; __u8 prio, ttl; __be16 sport; int err; if (skb_vlan_inet_prepare(skb, inner_proto_inherit)) return -EINVAL; if (!gs6) return -EIO; use_cache = ip_tunnel_dst_cache_usable(skb, info); prio = geneve_get_dsfield(skb, dev, info, &use_cache); sport = udp_flow_src_port(geneve->net, skb, geneve->cfg.port_min, geneve->cfg.port_max, true); dst = udp_tunnel6_dst_lookup(skb, dev, geneve->net, gs6->sock, 0, &saddr, key, sport, geneve->cfg.info.key.tp_dst, prio, use_cache ? (struct dst_cache *)&info->dst_cache : NULL); if (IS_ERR(dst)) return PTR_ERR(dst); err = skb_tunnel_check_pmtu(skb, dst, GENEVE_IPV6_HLEN + info->options_len, netif_is_any_bridge_port(dev)); if (err < 0) { dst_release(dst); return err; } else if (err) { struct ip_tunnel_info *info = skb_tunnel_info(skb); if (info) { struct ip_tunnel_info *unclone; unclone = skb_tunnel_info_unclone(skb); if (unlikely(!unclone)) { dst_release(dst); return -ENOMEM; } unclone->key.u.ipv6.dst = saddr; unclone->key.u.ipv6.src = info->key.u.ipv6.dst; } if (!pskb_may_pull(skb, ETH_HLEN)) { dst_release(dst); return -EINVAL; } skb->protocol = eth_type_trans(skb, geneve->dev); __netif_rx(skb); dst_release(dst); return -EMSGSIZE; } prio = ip_tunnel_ecn_encap(prio, ip_hdr(skb), skb); if (geneve->cfg.collect_md) { ttl = key->ttl; } else { if (geneve->cfg.ttl_inherit) ttl = ip_tunnel_get_ttl(ip_hdr(skb), skb); else ttl = key->ttl; ttl = ttl ? : ip6_dst_hoplimit(dst); } err = geneve_build_skb(dst, skb, info, xnet, sizeof(struct ipv6hdr), inner_proto_inherit); if (unlikely(err)) return err; udp_tunnel6_xmit_skb(dst, gs6->sock->sk, skb, dev, &saddr, &key->u.ipv6.dst, prio, ttl, info->key.label, sport, geneve->cfg.info.key.tp_dst, !test_bit(IP_TUNNEL_CSUM_BIT, info->key.tun_flags), 0); return 0; } #endif static netdev_tx_t geneve_xmit(struct sk_buff *skb, struct net_device *dev) { struct geneve_dev *geneve = netdev_priv(dev); struct ip_tunnel_info *info = NULL; int err; if (geneve->cfg.collect_md) { info = skb_tunnel_info(skb); if (unlikely(!info || !(info->mode & IP_TUNNEL_INFO_TX))) { netdev_dbg(dev, "no tunnel metadata\n"); dev_kfree_skb(skb); dev_dstats_tx_dropped(dev); return NETDEV_TX_OK; } } else { info = &geneve->cfg.info; } rcu_read_lock(); #if IS_ENABLED(CONFIG_IPV6) if (info->mode & IP_TUNNEL_INFO_IPV6) err = geneve6_xmit_skb(skb, dev, geneve, info); else #endif err = geneve_xmit_skb(skb, dev, geneve, info); rcu_read_unlock(); if (likely(!err)) return NETDEV_TX_OK; if (err != -EMSGSIZE) dev_kfree_skb(skb); if (err == -ELOOP) DEV_STATS_INC(dev, collisions); else if (err == -ENETUNREACH) DEV_STATS_INC(dev, tx_carrier_errors); DEV_STATS_INC(dev, tx_errors); return NETDEV_TX_OK; } static int geneve_change_mtu(struct net_device *dev, int new_mtu) { if (new_mtu > dev->max_mtu) new_mtu = dev->max_mtu; else if (new_mtu < dev->min_mtu) new_mtu = dev->min_mtu; WRITE_ONCE(dev->mtu, new_mtu); return 0; } static int geneve_fill_metadata_dst(struct net_device *dev, struct sk_buff *skb) { struct ip_tunnel_info *info = skb_tunnel_info(skb); struct geneve_dev *geneve = netdev_priv(dev); __be16 sport; if (ip_tunnel_info_af(info) == AF_INET) { struct rtable *rt; struct geneve_sock *gs4 = rcu_dereference(geneve->sock4); bool use_cache; __be32 saddr; u8 tos; if (!gs4) return -EIO; use_cache = ip_tunnel_dst_cache_usable(skb, info); tos = geneve_get_dsfield(skb, dev, info, &use_cache); sport = udp_flow_src_port(geneve->net, skb, geneve->cfg.port_min, geneve->cfg.port_max, true); rt = udp_tunnel_dst_lookup(skb, dev, geneve->net, 0, &saddr, &info->key, sport, geneve->cfg.info.key.tp_dst, tos, use_cache ? &info->dst_cache : NULL); if (IS_ERR(rt)) return PTR_ERR(rt); ip_rt_put(rt); info->key.u.ipv4.src = saddr; #if IS_ENABLED(CONFIG_IPV6) } else if (ip_tunnel_info_af(info) == AF_INET6) { struct dst_entry *dst; struct geneve_sock *gs6 = rcu_dereference(geneve->sock6); struct in6_addr saddr; bool use_cache; u8 prio; if (!gs6) return -EIO; use_cache = ip_tunnel_dst_cache_usable(skb, info); prio = geneve_get_dsfield(skb, dev, info, &use_cache); sport = udp_flow_src_port(geneve->net, skb, geneve->cfg.port_min, geneve->cfg.port_max, true); dst = udp_tunnel6_dst_lookup(skb, dev, geneve->net, gs6->sock, 0, &saddr, &info->key, sport, geneve->cfg.info.key.tp_dst, prio, use_cache ? &info->dst_cache : NULL); if (IS_ERR(dst)) return PTR_ERR(dst); dst_release(dst); info->key.u.ipv6.src = saddr; #endif } else { return -EINVAL; } info->key.tp_src = sport; info->key.tp_dst = geneve->cfg.info.key.tp_dst; return 0; } static const struct net_device_ops geneve_netdev_ops = { .ndo_init = geneve_init, .ndo_uninit = geneve_uninit, .ndo_open = geneve_open, .ndo_stop = geneve_stop, .ndo_start_xmit = geneve_xmit, .ndo_change_mtu = geneve_change_mtu, .ndo_validate_addr = eth_validate_addr, .ndo_set_mac_address = eth_mac_addr, .ndo_fill_metadata_dst = geneve_fill_metadata_dst, }; static void geneve_get_drvinfo(struct net_device *dev, struct ethtool_drvinfo *drvinfo) { strscpy(drvinfo->version, GENEVE_NETDEV_VER, sizeof(drvinfo->version)); strscpy(drvinfo->driver, "geneve", sizeof(drvinfo->driver)); } static const struct ethtool_ops geneve_ethtool_ops = { .get_drvinfo = geneve_get_drvinfo, .get_link = ethtool_op_get_link, }; /* Info for udev, that this is a virtual tunnel endpoint */ static const struct device_type geneve_type = { .name = "geneve", }; /* Calls the ndo_udp_tunnel_add of the caller in order to * supply the listening GENEVE udp ports. Callers are expected * to implement the ndo_udp_tunnel_add. */ static void geneve_offload_rx_ports(struct net_device *dev, bool push) { struct net *net = dev_net(dev); struct geneve_net *gn = net_generic(net, geneve_net_id); struct geneve_sock *gs; ASSERT_RTNL(); list_for_each_entry(gs, &gn->sock_list, list) { if (push) { udp_tunnel_push_rx_port(dev, gs->sock, UDP_TUNNEL_TYPE_GENEVE); } else { udp_tunnel_drop_rx_port(dev, gs->sock, UDP_TUNNEL_TYPE_GENEVE); } } } /* Initialize the device structure. */ static void geneve_setup(struct net_device *dev) { ether_setup(dev); dev->netdev_ops = &geneve_netdev_ops; dev->ethtool_ops = &geneve_ethtool_ops; dev->needs_free_netdev = true; SET_NETDEV_DEVTYPE(dev, &geneve_type); dev->features |= NETIF_F_SG | NETIF_F_HW_CSUM | NETIF_F_FRAGLIST; dev->features |= NETIF_F_RXCSUM; dev->features |= NETIF_F_GSO_SOFTWARE; dev->hw_features |= NETIF_F_SG | NETIF_F_HW_CSUM | NETIF_F_FRAGLIST; dev->hw_features |= NETIF_F_RXCSUM; dev->hw_features |= NETIF_F_GSO_SOFTWARE; dev->pcpu_stat_type = NETDEV_PCPU_STAT_DSTATS; /* MTU range: 68 - (something less than 65535) */ dev->min_mtu = ETH_MIN_MTU; /* The max_mtu calculation does not take account of GENEVE * options, to avoid excluding potentially valid * configurations. This will be further reduced by IPvX hdr size. */ dev->max_mtu = IP_MAX_MTU - GENEVE_BASE_HLEN - dev->hard_header_len; netif_keep_dst(dev); dev->priv_flags &= ~IFF_TX_SKB_SHARING; dev->priv_flags |= IFF_LIVE_ADDR_CHANGE | IFF_NO_QUEUE; dev->lltx = true; eth_hw_addr_random(dev); } static const struct nla_policy geneve_policy[IFLA_GENEVE_MAX + 1] = { [IFLA_GENEVE_UNSPEC] = { .strict_start_type = IFLA_GENEVE_INNER_PROTO_INHERIT }, [IFLA_GENEVE_ID] = { .type = NLA_U32 }, [IFLA_GENEVE_REMOTE] = { .len = sizeof_field(struct iphdr, daddr) }, [IFLA_GENEVE_REMOTE6] = { .len = sizeof(struct in6_addr) }, [IFLA_GENEVE_TTL] = { .type = NLA_U8 }, [IFLA_GENEVE_TOS] = { .type = NLA_U8 }, [IFLA_GENEVE_LABEL] = { .type = NLA_U32 }, [IFLA_GENEVE_PORT] = { .type = NLA_U16 }, [IFLA_GENEVE_COLLECT_METADATA] = { .type = NLA_FLAG }, [IFLA_GENEVE_UDP_CSUM] = { .type = NLA_U8 }, [IFLA_GENEVE_UDP_ZERO_CSUM6_TX] = { .type = NLA_U8 }, [IFLA_GENEVE_UDP_ZERO_CSUM6_RX] = { .type = NLA_U8 }, [IFLA_GENEVE_TTL_INHERIT] = { .type = NLA_U8 }, [IFLA_GENEVE_DF] = { .type = NLA_U8 }, [IFLA_GENEVE_INNER_PROTO_INHERIT] = { .type = NLA_FLAG }, [IFLA_GENEVE_PORT_RANGE] = NLA_POLICY_EXACT_LEN(sizeof(struct ifla_geneve_port_range)), }; static int geneve_validate(struct nlattr *tb[], struct nlattr *data[], struct netlink_ext_ack *extack) { if (tb[IFLA_ADDRESS]) { if (nla_len(tb[IFLA_ADDRESS]) != ETH_ALEN) { NL_SET_ERR_MSG_ATTR(extack, tb[IFLA_ADDRESS], "Provided link layer address is not Ethernet"); return -EINVAL; } if (!is_valid_ether_addr(nla_data(tb[IFLA_ADDRESS]))) { NL_SET_ERR_MSG_ATTR(extack, tb[IFLA_ADDRESS], "Provided Ethernet address is not unicast"); return -EADDRNOTAVAIL; } } if (!data) { NL_SET_ERR_MSG(extack, "Not enough attributes provided to perform the operation"); return -EINVAL; } if (data[IFLA_GENEVE_ID]) { __u32 vni = nla_get_u32(data[IFLA_GENEVE_ID]); if (vni >= GENEVE_N_VID) { NL_SET_ERR_MSG_ATTR(extack, data[IFLA_GENEVE_ID], "Geneve ID must be lower than 16777216"); return -ERANGE; } } if (data[IFLA_GENEVE_DF]) { enum ifla_geneve_df df = nla_get_u8(data[IFLA_GENEVE_DF]); if (df < 0 || df > GENEVE_DF_MAX) { NL_SET_ERR_MSG_ATTR(extack, data[IFLA_GENEVE_DF], "Invalid DF attribute"); return -EINVAL; } } if (data[IFLA_GENEVE_PORT_RANGE]) { const struct ifla_geneve_port_range *p; p = nla_data(data[IFLA_GENEVE_PORT_RANGE]); if (ntohs(p->high) < ntohs(p->low)) { NL_SET_ERR_MSG_ATTR(extack, data[IFLA_GENEVE_PORT_RANGE], "Invalid source port range"); return -EINVAL; } } return 0; } static struct geneve_dev *geneve_find_dev(struct geneve_net *gn, const struct ip_tunnel_info *info, bool *tun_on_same_port, bool *tun_collect_md) { struct geneve_dev *geneve, *t = NULL; *tun_on_same_port = false; *tun_collect_md = false; list_for_each_entry(geneve, &gn->geneve_list, next) { if (info->key.tp_dst == geneve->cfg.info.key.tp_dst) { *tun_collect_md = geneve->cfg.collect_md; *tun_on_same_port = true; } if (info->key.tun_id == geneve->cfg.info.key.tun_id && info->key.tp_dst == geneve->cfg.info.key.tp_dst && !memcmp(&info->key.u, &geneve->cfg.info.key.u, sizeof(info->key.u))) t = geneve; } return t; } static bool is_tnl_info_zero(const struct ip_tunnel_info *info) { return !(info->key.tun_id || info->key.tos || !ip_tunnel_flags_empty(info->key.tun_flags) || info->key.ttl || info->key.label || info->key.tp_src || memchr_inv(&info->key.u, 0, sizeof(info->key.u))); } static bool geneve_dst_addr_equal(struct ip_tunnel_info *a, struct ip_tunnel_info *b) { if (ip_tunnel_info_af(a) == AF_INET) return a->key.u.ipv4.dst == b->key.u.ipv4.dst; else return ipv6_addr_equal(&a->key.u.ipv6.dst, &b->key.u.ipv6.dst); } static int geneve_configure(struct net *net, struct net_device *dev, struct netlink_ext_ack *extack, const struct geneve_config *cfg) { struct geneve_net *gn = net_generic(net, geneve_net_id); struct geneve_dev *t, *geneve = netdev_priv(dev); const struct ip_tunnel_info *info = &cfg->info; bool tun_collect_md, tun_on_same_port; int err, encap_len; if (cfg->collect_md && !is_tnl_info_zero(info)) { NL_SET_ERR_MSG(extack, "Device is externally controlled, so attributes (VNI, Port, and so on) must not be specified"); return -EINVAL; } geneve->net = net; geneve->dev = dev; t = geneve_find_dev(gn, info, &tun_on_same_port, &tun_collect_md); if (t) return -EBUSY; /* make enough headroom for basic scenario */ encap_len = GENEVE_BASE_HLEN + ETH_HLEN; if (!cfg->collect_md && ip_tunnel_info_af(info) == AF_INET) { encap_len += sizeof(struct iphdr); dev->max_mtu -= sizeof(struct iphdr); } else { encap_len += sizeof(struct ipv6hdr); dev->max_mtu -= sizeof(struct ipv6hdr); } dev->needed_headroom = encap_len + ETH_HLEN; if (cfg->collect_md) { if (tun_on_same_port) { NL_SET_ERR_MSG(extack, "There can be only one externally controlled device on a destination port"); return -EPERM; } } else { if (tun_collect_md) { NL_SET_ERR_MSG(extack, "There already exists an externally controlled device on this destination port"); return -EPERM; } } dst_cache_reset(&geneve->cfg.info.dst_cache); memcpy(&geneve->cfg, cfg, sizeof(*cfg)); if (geneve->cfg.inner_proto_inherit) { dev->header_ops = NULL; dev->type = ARPHRD_NONE; dev->hard_header_len = 0; dev->addr_len = 0; dev->flags = IFF_POINTOPOINT | IFF_NOARP; } err = register_netdevice(dev); if (err) return err; list_add(&geneve->next, &gn->geneve_list); return 0; } static void init_tnl_info(struct ip_tunnel_info *info, __u16 dst_port) { memset(info, 0, sizeof(*info)); info->key.tp_dst = htons(dst_port); } static int geneve_nl2info(struct nlattr *tb[], struct nlattr *data[], struct netlink_ext_ack *extack, struct geneve_config *cfg, bool changelink) { struct ip_tunnel_info *info = &cfg->info; int attrtype; if (data[IFLA_GENEVE_REMOTE] && data[IFLA_GENEVE_REMOTE6]) { NL_SET_ERR_MSG(extack, "Cannot specify both IPv4 and IPv6 Remote addresses"); return -EINVAL; } if (data[IFLA_GENEVE_REMOTE]) { if (changelink && (ip_tunnel_info_af(info) == AF_INET6)) { attrtype = IFLA_GENEVE_REMOTE; goto change_notsup; } info->key.u.ipv4.dst = nla_get_in_addr(data[IFLA_GENEVE_REMOTE]); if (ipv4_is_multicast(info->key.u.ipv4.dst)) { NL_SET_ERR_MSG_ATTR(extack, data[IFLA_GENEVE_REMOTE], "Remote IPv4 address cannot be Multicast"); return -EINVAL; } } if (data[IFLA_GENEVE_REMOTE6]) { #if IS_ENABLED(CONFIG_IPV6) if (changelink && (ip_tunnel_info_af(info) == AF_INET)) { attrtype = IFLA_GENEVE_REMOTE6; goto change_notsup; } info->mode = IP_TUNNEL_INFO_IPV6; info->key.u.ipv6.dst = nla_get_in6_addr(data[IFLA_GENEVE_REMOTE6]); if (ipv6_addr_type(&info->key.u.ipv6.dst) & IPV6_ADDR_LINKLOCAL) { NL_SET_ERR_MSG_ATTR(extack, data[IFLA_GENEVE_REMOTE6], "Remote IPv6 address cannot be link-local"); return -EINVAL; } if (ipv6_addr_is_multicast(&info->key.u.ipv6.dst)) { NL_SET_ERR_MSG_ATTR(extack, data[IFLA_GENEVE_REMOTE6], "Remote IPv6 address cannot be Multicast"); return -EINVAL; } __set_bit(IP_TUNNEL_CSUM_BIT, info->key.tun_flags); cfg->use_udp6_rx_checksums = true; #else NL_SET_ERR_MSG_ATTR(extack, data[IFLA_GENEVE_REMOTE6], "IPv6 support not enabled in the kernel"); return -EPFNOSUPPORT; #endif } if (data[IFLA_GENEVE_ID]) { __u32 vni; __u8 tvni[3]; __be64 tunid; vni = nla_get_u32(data[IFLA_GENEVE_ID]); tvni[0] = (vni & 0x00ff0000) >> 16; tvni[1] = (vni & 0x0000ff00) >> 8; tvni[2] = vni & 0x000000ff; tunid = vni_to_tunnel_id(tvni); if (changelink && (tunid != info->key.tun_id)) { attrtype = IFLA_GENEVE_ID; goto change_notsup; } info->key.tun_id = tunid; } if (data[IFLA_GENEVE_TTL_INHERIT]) { if (nla_get_u8(data[IFLA_GENEVE_TTL_INHERIT])) cfg->ttl_inherit = true; else cfg->ttl_inherit = false; } else if (data[IFLA_GENEVE_TTL]) { info->key.ttl = nla_get_u8(data[IFLA_GENEVE_TTL]); cfg->ttl_inherit = false; } if (data[IFLA_GENEVE_TOS]) info->key.tos = nla_get_u8(data[IFLA_GENEVE_TOS]); if (data[IFLA_GENEVE_DF]) cfg->df = nla_get_u8(data[IFLA_GENEVE_DF]); if (data[IFLA_GENEVE_LABEL]) { info->key.label = nla_get_be32(data[IFLA_GENEVE_LABEL]) & IPV6_FLOWLABEL_MASK; if (info->key.label && (!(info->mode & IP_TUNNEL_INFO_IPV6))) { NL_SET_ERR_MSG_ATTR(extack, data[IFLA_GENEVE_LABEL], "Label attribute only applies for IPv6 Geneve devices"); return -EINVAL; } } if (data[IFLA_GENEVE_PORT]) { if (changelink) { attrtype = IFLA_GENEVE_PORT; goto change_notsup; } info->key.tp_dst = nla_get_be16(data[IFLA_GENEVE_PORT]); } if (data[IFLA_GENEVE_PORT_RANGE]) { const struct ifla_geneve_port_range *p; if (changelink) { attrtype = IFLA_GENEVE_PORT_RANGE; goto change_notsup; } p = nla_data(data[IFLA_GENEVE_PORT_RANGE]); cfg->port_min = ntohs(p->low); cfg->port_max = ntohs(p->high); } if (data[IFLA_GENEVE_COLLECT_METADATA]) { if (changelink) { attrtype = IFLA_GENEVE_COLLECT_METADATA; goto change_notsup; } cfg->collect_md = true; } if (data[IFLA_GENEVE_UDP_CSUM]) { if (changelink) { attrtype = IFLA_GENEVE_UDP_CSUM; goto change_notsup; } if (nla_get_u8(data[IFLA_GENEVE_UDP_CSUM])) __set_bit(IP_TUNNEL_CSUM_BIT, info->key.tun_flags); } if (data[IFLA_GENEVE_UDP_ZERO_CSUM6_TX]) { #if IS_ENABLED(CONFIG_IPV6) if (changelink) { attrtype = IFLA_GENEVE_UDP_ZERO_CSUM6_TX; goto change_notsup; } if (nla_get_u8(data[IFLA_GENEVE_UDP_ZERO_CSUM6_TX])) __clear_bit(IP_TUNNEL_CSUM_BIT, info->key.tun_flags); #else NL_SET_ERR_MSG_ATTR(extack, data[IFLA_GENEVE_UDP_ZERO_CSUM6_TX], "IPv6 support not enabled in the kernel"); return -EPFNOSUPPORT; #endif } if (data[IFLA_GENEVE_UDP_ZERO_CSUM6_RX]) { #if IS_ENABLED(CONFIG_IPV6) if (changelink) { attrtype = IFLA_GENEVE_UDP_ZERO_CSUM6_RX; goto change_notsup; } if (nla_get_u8(data[IFLA_GENEVE_UDP_ZERO_CSUM6_RX])) cfg->use_udp6_rx_checksums = false; #else NL_SET_ERR_MSG_ATTR(extack, data[IFLA_GENEVE_UDP_ZERO_CSUM6_RX], "IPv6 support not enabled in the kernel"); return -EPFNOSUPPORT; #endif } if (data[IFLA_GENEVE_INNER_PROTO_INHERIT]) { if (changelink) { attrtype = IFLA_GENEVE_INNER_PROTO_INHERIT; goto change_notsup; } cfg->inner_proto_inherit = true; } return 0; change_notsup: NL_SET_ERR_MSG_ATTR(extack, data[attrtype], "Changing VNI, Port, endpoint IP address family, external, inner_proto_inherit, and UDP checksum attributes are not supported"); return -EOPNOTSUPP; } static void geneve_link_config(struct net_device *dev, struct ip_tunnel_info *info, struct nlattr *tb[]) { struct geneve_dev *geneve = netdev_priv(dev); int ldev_mtu = 0; if (tb[IFLA_MTU]) { geneve_change_mtu(dev, nla_get_u32(tb[IFLA_MTU])); return; } switch (ip_tunnel_info_af(info)) { case AF_INET: { struct flowi4 fl4 = { .daddr = info->key.u.ipv4.dst }; struct rtable *rt = ip_route_output_key(geneve->net, &fl4); if (!IS_ERR(rt) && rt->dst.dev) { ldev_mtu = rt->dst.dev->mtu - GENEVE_IPV4_HLEN; ip_rt_put(rt); } break; } #if IS_ENABLED(CONFIG_IPV6) case AF_INET6: { struct rt6_info *rt; if (!__in6_dev_get(dev)) break; rt = rt6_lookup(geneve->net, &info->key.u.ipv6.dst, NULL, 0, NULL, 0); if (rt && rt->dst.dev) ldev_mtu = rt->dst.dev->mtu - GENEVE_IPV6_HLEN; ip6_rt_put(rt); break; } #endif } if (ldev_mtu <= 0) return; geneve_change_mtu(dev, ldev_mtu - info->options_len); } static int geneve_newlink(struct net_device *dev, struct rtnl_newlink_params *params, struct netlink_ext_ack *extack) { struct net *link_net = rtnl_newlink_link_net(params); struct nlattr **data = params->data; struct nlattr **tb = params->tb; struct geneve_config cfg = { .df = GENEVE_DF_UNSET, .use_udp6_rx_checksums = false, .ttl_inherit = false, .collect_md = false, .port_min = 1, .port_max = USHRT_MAX, }; int err; init_tnl_info(&cfg.info, GENEVE_UDP_PORT); err = geneve_nl2info(tb, data, extack, &cfg, false); if (err) return err; err = geneve_configure(link_net, dev, extack, &cfg); if (err) return err; geneve_link_config(dev, &cfg.info, tb); return 0; } /* Quiesces the geneve device data path for both TX and RX. * * On transmit geneve checks for non-NULL geneve_sock before it proceeds. * So, if we set that socket to NULL under RCU and wait for synchronize_net() * to complete for the existing set of in-flight packets to be transmitted, * then we would have quiesced the transmit data path. All the future packets * will get dropped until we unquiesce the data path. * * On receive geneve dereference the geneve_sock stashed in the socket. So, * if we set that to NULL under RCU and wait for synchronize_net() to * complete, then we would have quiesced the receive data path. */ static void geneve_quiesce(struct geneve_dev *geneve, struct geneve_sock **gs4, struct geneve_sock **gs6) { *gs4 = rtnl_dereference(geneve->sock4); rcu_assign_pointer(geneve->sock4, NULL); if (*gs4) rcu_assign_sk_user_data((*gs4)->sock->sk, NULL); #if IS_ENABLED(CONFIG_IPV6) *gs6 = rtnl_dereference(geneve->sock6); rcu_assign_pointer(geneve->sock6, NULL); if (*gs6) rcu_assign_sk_user_data((*gs6)->sock->sk, NULL); #else *gs6 = NULL; #endif synchronize_net(); } /* Resumes the geneve device data path for both TX and RX. */ static void geneve_unquiesce(struct geneve_dev *geneve, struct geneve_sock *gs4, struct geneve_sock __maybe_unused *gs6) { rcu_assign_pointer(geneve->sock4, gs4); if (gs4) rcu_assign_sk_user_data(gs4->sock->sk, gs4); #if IS_ENABLED(CONFIG_IPV6) rcu_assign_pointer(geneve->sock6, gs6); if (gs6) rcu_assign_sk_user_data(gs6->sock->sk, gs6); #endif synchronize_net(); } static int geneve_changelink(struct net_device *dev, struct nlattr *tb[], struct nlattr *data[], struct netlink_ext_ack *extack) { struct geneve_dev *geneve = netdev_priv(dev); struct geneve_sock *gs4, *gs6; struct geneve_config cfg; int err; /* If the geneve device is configured for metadata (or externally * controlled, for example, OVS), then nothing can be changed. */ if (geneve->cfg.collect_md) return -EOPNOTSUPP; /* Start with the existing info. */ memcpy(&cfg, &geneve->cfg, sizeof(cfg)); err = geneve_nl2info(tb, data, extack, &cfg, true); if (err) return err; if (!geneve_dst_addr_equal(&geneve->cfg.info, &cfg.info)) { dst_cache_reset(&cfg.info.dst_cache); geneve_link_config(dev, &cfg.info, tb); } geneve_quiesce(geneve, &gs4, &gs6); memcpy(&geneve->cfg, &cfg, sizeof(cfg)); geneve_unquiesce(geneve, gs4, gs6); return 0; } static void geneve_dellink(struct net_device *dev, struct list_head *head) { struct geneve_dev *geneve = netdev_priv(dev); list_del(&geneve->next); unregister_netdevice_queue(dev, head); } static size_t geneve_get_size(const struct net_device *dev) { return nla_total_size(sizeof(__u32)) + /* IFLA_GENEVE_ID */ nla_total_size(sizeof(struct in6_addr)) + /* IFLA_GENEVE_REMOTE{6} */ nla_total_size(sizeof(__u8)) + /* IFLA_GENEVE_TTL */ nla_total_size(sizeof(__u8)) + /* IFLA_GENEVE_TOS */ nla_total_size(sizeof(__u8)) + /* IFLA_GENEVE_DF */ nla_total_size(sizeof(__be32)) + /* IFLA_GENEVE_LABEL */ nla_total_size(sizeof(__be16)) + /* IFLA_GENEVE_PORT */ nla_total_size(0) + /* IFLA_GENEVE_COLLECT_METADATA */ nla_total_size(sizeof(__u8)) + /* IFLA_GENEVE_UDP_CSUM */ nla_total_size(sizeof(__u8)) + /* IFLA_GENEVE_UDP_ZERO_CSUM6_TX */ nla_total_size(sizeof(__u8)) + /* IFLA_GENEVE_UDP_ZERO_CSUM6_RX */ nla_total_size(sizeof(__u8)) + /* IFLA_GENEVE_TTL_INHERIT */ nla_total_size(0) + /* IFLA_GENEVE_INNER_PROTO_INHERIT */ nla_total_size(sizeof(struct ifla_geneve_port_range)) + /* IFLA_GENEVE_PORT_RANGE */ 0; } static int geneve_fill_info(struct sk_buff *skb, const struct net_device *dev) { struct geneve_dev *geneve = netdev_priv(dev); struct ip_tunnel_info *info = &geneve->cfg.info; bool ttl_inherit = geneve->cfg.ttl_inherit; bool metadata = geneve->cfg.collect_md; struct ifla_geneve_port_range ports = { .low = htons(geneve->cfg.port_min), .high = htons(geneve->cfg.port_max), }; __u8 tmp_vni[3]; __u32 vni; tunnel_id_to_vni(info->key.tun_id, tmp_vni); vni = (tmp_vni[0] << 16) | (tmp_vni[1] << 8) | tmp_vni[2]; if (nla_put_u32(skb, IFLA_GENEVE_ID, vni)) goto nla_put_failure; if (!metadata && ip_tunnel_info_af(info) == AF_INET) { if (nla_put_in_addr(skb, IFLA_GENEVE_REMOTE, info->key.u.ipv4.dst)) goto nla_put_failure; if (nla_put_u8(skb, IFLA_GENEVE_UDP_CSUM, test_bit(IP_TUNNEL_CSUM_BIT, info->key.tun_flags))) goto nla_put_failure; #if IS_ENABLED(CONFIG_IPV6) } else if (!metadata) { if (nla_put_in6_addr(skb, IFLA_GENEVE_REMOTE6, &info->key.u.ipv6.dst)) goto nla_put_failure; if (nla_put_u8(skb, IFLA_GENEVE_UDP_ZERO_CSUM6_TX, !test_bit(IP_TUNNEL_CSUM_BIT, info->key.tun_flags))) goto nla_put_failure; #endif } if (nla_put_u8(skb, IFLA_GENEVE_TTL, info->key.ttl) || nla_put_u8(skb, IFLA_GENEVE_TOS, info->key.tos) || nla_put_be32(skb, IFLA_GENEVE_LABEL, info->key.label)) goto nla_put_failure; if (nla_put_u8(skb, IFLA_GENEVE_DF, geneve->cfg.df)) goto nla_put_failure; if (nla_put_be16(skb, IFLA_GENEVE_PORT, info->key.tp_dst)) goto nla_put_failure; if (metadata && nla_put_flag(skb, IFLA_GENEVE_COLLECT_METADATA)) goto nla_put_failure; #if IS_ENABLED(CONFIG_IPV6) if (nla_put_u8(skb, IFLA_GENEVE_UDP_ZERO_CSUM6_RX, !geneve->cfg.use_udp6_rx_checksums)) goto nla_put_failure; #endif if (nla_put_u8(skb, IFLA_GENEVE_TTL_INHERIT, ttl_inherit)) goto nla_put_failure; if (geneve->cfg.inner_proto_inherit && nla_put_flag(skb, IFLA_GENEVE_INNER_PROTO_INHERIT)) goto nla_put_failure; if (nla_put(skb, IFLA_GENEVE_PORT_RANGE, sizeof(ports), &ports)) goto nla_put_failure; return 0; nla_put_failure: return -EMSGSIZE; } static struct rtnl_link_ops geneve_link_ops __read_mostly = { .kind = "geneve", .maxtype = IFLA_GENEVE_MAX, .policy = geneve_policy, .priv_size = sizeof(struct geneve_dev), .setup = geneve_setup, .validate = geneve_validate, .newlink = geneve_newlink, .changelink = geneve_changelink, .dellink = geneve_dellink, .get_size = geneve_get_size, .fill_info = geneve_fill_info, }; struct net_device *geneve_dev_create_fb(struct net *net, const char *name, u8 name_assign_type, u16 dst_port) { struct nlattr *tb[IFLA_MAX + 1]; struct net_device *dev; LIST_HEAD(list_kill); int err; struct geneve_config cfg = { .df = GENEVE_DF_UNSET, .use_udp6_rx_checksums = true, .ttl_inherit = false, .collect_md = true, .port_min = 1, .port_max = USHRT_MAX, }; memset(tb, 0, sizeof(tb)); dev = rtnl_create_link(net, name, name_assign_type, &geneve_link_ops, tb, NULL); if (IS_ERR(dev)) return dev; init_tnl_info(&cfg.info, dst_port); err = geneve_configure(net, dev, NULL, &cfg); if (err) { free_netdev(dev); return ERR_PTR(err); } /* openvswitch users expect packet sizes to be unrestricted, * so set the largest MTU we can. */ err = geneve_change_mtu(dev, IP_MAX_MTU); if (err) goto err; err = rtnl_configure_link(dev, NULL, 0, NULL); if (err < 0) goto err; return dev; err: geneve_dellink(dev, &list_kill); unregister_netdevice_many(&list_kill); return ERR_PTR(err); } EXPORT_SYMBOL_GPL(geneve_dev_create_fb); static int geneve_netdevice_event(struct notifier_block *unused, unsigned long event, void *ptr) { struct net_device *dev = netdev_notifier_info_to_dev(ptr); if (event == NETDEV_UDP_TUNNEL_PUSH_INFO) geneve_offload_rx_ports(dev, true); else if (event == NETDEV_UDP_TUNNEL_DROP_INFO) geneve_offload_rx_ports(dev, false); return NOTIFY_DONE; } static struct notifier_block geneve_notifier_block __read_mostly = { .notifier_call = geneve_netdevice_event, }; static __net_init int geneve_init_net(struct net *net) { struct geneve_net *gn = net_generic(net, geneve_net_id); INIT_LIST_HEAD(&gn->geneve_list); INIT_LIST_HEAD(&gn->sock_list); return 0; } static void __net_exit geneve_exit_rtnl_net(struct net *net, struct list_head *dev_to_kill) { struct geneve_net *gn = net_generic(net, geneve_net_id); struct geneve_dev *geneve, *next; list_for_each_entry_safe(geneve, next, &gn->geneve_list, next) geneve_dellink(geneve->dev, dev_to_kill); } static void __net_exit geneve_exit_net(struct net *net) { const struct geneve_net *gn = net_generic(net, geneve_net_id); WARN_ON_ONCE(!list_empty(&gn->sock_list)); } static struct pernet_operations geneve_net_ops = { .init = geneve_init_net, .exit_rtnl = geneve_exit_rtnl_net, .exit = geneve_exit_net, .id = &geneve_net_id, .size = sizeof(struct geneve_net), }; static int __init geneve_init_module(void) { int rc; rc = register_pernet_subsys(&geneve_net_ops); if (rc) goto out1; rc = register_netdevice_notifier(&geneve_notifier_block); if (rc) goto out2; rc = rtnl_link_register(&geneve_link_ops); if (rc) goto out3; return 0; out3: unregister_netdevice_notifier(&geneve_notifier_block); out2: unregister_pernet_subsys(&geneve_net_ops); out1: return rc; } late_initcall(geneve_init_module); static void __exit geneve_cleanup_module(void) { rtnl_link_unregister(&geneve_link_ops); unregister_netdevice_notifier(&geneve_notifier_block); unregister_pernet_subsys(&geneve_net_ops); } module_exit(geneve_cleanup_module); MODULE_LICENSE("GPL"); MODULE_VERSION(GENEVE_NETDEV_VER); MODULE_AUTHOR("John W. Linville <linville@tuxdriver.com>"); MODULE_DESCRIPTION("Interface driver for GENEVE encapsulated traffic"); MODULE_ALIAS_RTNL_LINK("geneve");
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 /* SPDX-License-Identifier: GPL-2.0-only */ /* * Copyright (C) 2012 ARM Ltd. */ #ifndef __ASM_SYSCALL_H #define __ASM_SYSCALL_H #include <uapi/linux/audit.h> #include <linux/compat.h> #include <linux/err.h> typedef long (*syscall_fn_t)(const struct pt_regs *regs); extern const syscall_fn_t sys_call_table[]; #ifdef CONFIG_COMPAT extern const syscall_fn_t compat_sys_call_table[]; #endif static inline int syscall_get_nr(struct task_struct *task, struct pt_regs *regs) { return regs->syscallno; } static inline void syscall_rollback(struct task_struct *task, struct pt_regs *regs) { regs->regs[0] = regs->orig_x0; } static inline long syscall_get_return_value(struct task_struct *task, struct pt_regs *regs) { unsigned long val = regs->regs[0]; if (is_compat_thread(task_thread_info(task))) val = sign_extend64(val, 31); return val; } static inline long syscall_get_error(struct task_struct *task, struct pt_regs *regs) { unsigned long error = syscall_get_return_value(task, regs); return IS_ERR_VALUE(error) ? error : 0; } static inline void syscall_set_return_value(struct task_struct *task, struct pt_regs *regs, int error, long val) { if (error) val = error; if (is_compat_thread(task_thread_info(task))) val = lower_32_bits(val); regs->regs[0] = val; } static inline void syscall_set_nr(struct task_struct *task, struct pt_regs *regs, int nr) { regs->syscallno = nr; if (nr == -1) { /* * When the syscall number is set to -1, the syscall will be * skipped. In this case the syscall return value has to be * set explicitly, otherwise the first syscall argument is * returned as the syscall return value. */ syscall_set_return_value(task, regs, -ENOSYS, 0); } } #define SYSCALL_MAX_ARGS 6 static inline void syscall_get_arguments(struct task_struct *task, struct pt_regs *regs, unsigned long *args) { args[0] = regs->orig_x0; args++; memcpy(args, &regs->regs[1], 5 * sizeof(args[0])); } static inline void syscall_set_arguments(struct task_struct *task, struct pt_regs *regs, const unsigned long *args) { memcpy(&regs->regs[0], args, 6 * sizeof(args[0])); /* * Also copy the first argument into orig_x0 * so that syscall_get_arguments() would return it * instead of the previous value. */ regs->orig_x0 = regs->regs[0]; } /* * We don't care about endianness (__AUDIT_ARCH_LE bit) here because * AArch64 has the same system calls both on little- and big- endian. */ static inline int syscall_get_arch(struct task_struct *task) { if (is_compat_thread(task_thread_info(task))) return AUDIT_ARCH_ARM; return AUDIT_ARCH_AARCH64; } int syscall_trace_enter(struct pt_regs *regs); void syscall_trace_exit(struct pt_regs *regs); #endif /* __ASM_SYSCALL_H */
1179 1183 1184 1183 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 /* SPDX-License-Identifier: GPL-2.0 */ #ifndef _LINUX_BH_H #define _LINUX_BH_H #include <linux/instruction_pointer.h> #include <linux/preempt.h> #if defined(CONFIG_PREEMPT_RT) || defined(CONFIG_TRACE_IRQFLAGS) extern void __local_bh_disable_ip(unsigned long ip, unsigned int cnt); #else static __always_inline void __local_bh_disable_ip(unsigned long ip, unsigned int cnt) { preempt_count_add(cnt); barrier(); } #endif static inline void local_bh_disable(void) { __local_bh_disable_ip(_THIS_IP_, SOFTIRQ_DISABLE_OFFSET); } extern void _local_bh_enable(void); extern void __local_bh_enable_ip(unsigned long ip, unsigned int cnt); static inline void local_bh_enable_ip(unsigned long ip) { __local_bh_enable_ip(ip, SOFTIRQ_DISABLE_OFFSET); } static inline void local_bh_enable(void) { __local_bh_enable_ip(_THIS_IP_, SOFTIRQ_DISABLE_OFFSET); } #ifdef CONFIG_PREEMPT_RT extern bool local_bh_blocked(void); #else static inline bool local_bh_blocked(void) { return false; } #endif #endif /* _LINUX_BH_H */
365 1 101 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 /* SPDX-License-Identifier: GPL-2.0 */ #undef TRACE_SYSTEM #define TRACE_SYSTEM timestamp #if !defined(_TRACE_TIMESTAMP_H) || defined(TRACE_HEADER_MULTI_READ) #define _TRACE_TIMESTAMP_H #include <linux/tracepoint.h> #include <linux/fs.h> #define CTIME_QUERIED_FLAGS \ { I_CTIME_QUERIED, "Q" } DECLARE_EVENT_CLASS(ctime, TP_PROTO(struct inode *inode, struct timespec64 *ctime), TP_ARGS(inode, ctime), TP_STRUCT__entry( __field(dev_t, dev) __field(ino_t, ino) __field(time64_t, ctime_s) __field(u32, ctime_ns) __field(u32, gen) ), TP_fast_assign( __entry->dev = inode->i_sb->s_dev; __entry->ino = inode->i_ino; __entry->gen = inode->i_generation; __entry->ctime_s = ctime->tv_sec; __entry->ctime_ns = ctime->tv_nsec; ), TP_printk("ino=%d:%d:%ld:%u ctime=%lld.%u", MAJOR(__entry->dev), MINOR(__entry->dev), __entry->ino, __entry->gen, __entry->ctime_s, __entry->ctime_ns ) ); DEFINE_EVENT(ctime, inode_set_ctime_to_ts, TP_PROTO(struct inode *inode, struct timespec64 *ctime), TP_ARGS(inode, ctime)); DEFINE_EVENT(ctime, ctime_xchg_skip, TP_PROTO(struct inode *inode, struct timespec64 *ctime), TP_ARGS(inode, ctime)); TRACE_EVENT(ctime_ns_xchg, TP_PROTO(struct inode *inode, u32 old, u32 new, u32 cur), TP_ARGS(inode, old, new, cur), TP_STRUCT__entry( __field(dev_t, dev) __field(ino_t, ino) __field(u32, gen) __field(u32, old) __field(u32, new) __field(u32, cur) ), TP_fast_assign( __entry->dev = inode->i_sb->s_dev; __entry->ino = inode->i_ino; __entry->gen = inode->i_generation; __entry->old = old; __entry->new = new; __entry->cur = cur; ), TP_printk("ino=%d:%d:%ld:%u old=%u:%s new=%u cur=%u:%s", MAJOR(__entry->dev), MINOR(__entry->dev), __entry->ino, __entry->gen, __entry->old & ~I_CTIME_QUERIED, __print_flags(__entry->old & I_CTIME_QUERIED, "|", CTIME_QUERIED_FLAGS), __entry->new, __entry->cur & ~I_CTIME_QUERIED, __print_flags(__entry->cur & I_CTIME_QUERIED, "|", CTIME_QUERIED_FLAGS) ) ); TRACE_EVENT(fill_mg_cmtime, TP_PROTO(struct inode *inode, struct timespec64 *ctime, struct timespec64 *mtime), TP_ARGS(inode, ctime, mtime), TP_STRUCT__entry( __field(dev_t, dev) __field(ino_t, ino) __field(time64_t, ctime_s) __field(time64_t, mtime_s) __field(u32, ctime_ns) __field(u32, mtime_ns) __field(u32, gen) ), TP_fast_assign( __entry->dev = inode->i_sb->s_dev; __entry->ino = inode->i_ino; __entry->gen = inode->i_generation; __entry->ctime_s = ctime->tv_sec; __entry->mtime_s = mtime->tv_sec; __entry->ctime_ns = ctime->tv_nsec; __entry->mtime_ns = mtime->tv_nsec; ), TP_printk("ino=%d:%d:%ld:%u ctime=%lld.%u mtime=%lld.%u", MAJOR(__entry->dev), MINOR(__entry->dev), __entry->ino, __entry->gen, __entry->ctime_s, __entry->ctime_ns, __entry->mtime_s, __entry->mtime_ns ) ); #endif /* _TRACE_TIMESTAMP_H */ /* This part must be outside protection */ #include <trace/define_trace.h>
29 29 29 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 /* SPDX-License-Identifier: GPL-2.0+ WITH Linux-syscall-note */ #ifndef _LINUX_RSEQ_H #define _LINUX_RSEQ_H #ifdef CONFIG_RSEQ #include <linux/preempt.h> #include <linux/sched.h> /* * Map the event mask on the user-space ABI enum rseq_cs_flags * for direct mask checks. */ enum rseq_event_mask_bits { RSEQ_EVENT_PREEMPT_BIT = RSEQ_CS_FLAG_NO_RESTART_ON_PREEMPT_BIT, RSEQ_EVENT_SIGNAL_BIT = RSEQ_CS_FLAG_NO_RESTART_ON_SIGNAL_BIT, RSEQ_EVENT_MIGRATE_BIT = RSEQ_CS_FLAG_NO_RESTART_ON_MIGRATE_BIT, }; enum rseq_event_mask { RSEQ_EVENT_PREEMPT = (1U << RSEQ_EVENT_PREEMPT_BIT), RSEQ_EVENT_SIGNAL = (1U << RSEQ_EVENT_SIGNAL_BIT), RSEQ_EVENT_MIGRATE = (1U << RSEQ_EVENT_MIGRATE_BIT), }; static inline void rseq_set_notify_resume(struct task_struct *t) { if (t->rseq) set_tsk_thread_flag(t, TIF_NOTIFY_RESUME); } void __rseq_handle_notify_resume(struct ksignal *sig, struct pt_regs *regs); static inline void rseq_handle_notify_resume(struct ksignal *ksig, struct pt_regs *regs) { if (current->rseq) __rseq_handle_notify_resume(ksig, regs); } static inline void rseq_signal_deliver(struct ksignal *ksig, struct pt_regs *regs) { preempt_disable(); __set_bit(RSEQ_EVENT_SIGNAL_BIT, &current->rseq_event_mask); preempt_enable(); rseq_handle_notify_resume(ksig, regs); } /* rseq_preempt() requires preemption to be disabled. */ static inline void rseq_preempt(struct task_struct *t) { __set_bit(RSEQ_EVENT_PREEMPT_BIT, &t->rseq_event_mask); rseq_set_notify_resume(t); } /* rseq_migrate() requires preemption to be disabled. */ static inline void rseq_migrate(struct task_struct *t) { __set_bit(RSEQ_EVENT_MIGRATE_BIT, &t->rseq_event_mask); rseq_set_notify_resume(t); } /* * If parent process has a registered restartable sequences area, the * child inherits. Unregister rseq for a clone with CLONE_VM set. */ static inline void rseq_fork(struct task_struct *t, unsigned long clone_flags) { if (clone_flags & CLONE_VM) { t->rseq = NULL; t->rseq_len = 0; t->rseq_sig = 0; t->rseq_event_mask = 0; } else { t->rseq = current->rseq; t->rseq_len = current->rseq_len; t->rseq_sig = current->rseq_sig; t->rseq_event_mask = current->rseq_event_mask; } } static inline void rseq_execve(struct task_struct *t) { t->rseq = NULL; t->rseq_len = 0; t->rseq_sig = 0; t->rseq_event_mask = 0; } #else static inline void rseq_set_notify_resume(struct task_struct *t) { } static inline void rseq_handle_notify_resume(struct ksignal *ksig, struct pt_regs *regs) { } static inline void rseq_signal_deliver(struct ksignal *ksig, struct pt_regs *regs) { } static inline void rseq_preempt(struct task_struct *t) { } static inline void rseq_migrate(struct task_struct *t) { } static inline void rseq_fork(struct task_struct *t, unsigned long clone_flags) { } static inline void rseq_execve(struct task_struct *t) { } #endif #ifdef CONFIG_DEBUG_RSEQ void rseq_syscall(struct pt_regs *regs); #else static inline void rseq_syscall(struct pt_regs *regs) { } #endif #endif /* _LINUX_RSEQ_H */
200 1125 200 199 198 1189 1186 1125 1174 3 200 1184 1183 1188 1190 200 7 193 173 30 200 2 2 2 2 2 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 // SPDX-License-Identifier: GPL-2.0 /* * security/tomoyo/file.c * * Copyright (C) 2005-2011 NTT DATA CORPORATION */ #include "common.h" #include <linux/slab.h> /* * Mapping table from "enum tomoyo_path_acl_index" to "enum tomoyo_mac_index". */ static const u8 tomoyo_p2mac[TOMOYO_MAX_PATH_OPERATION] = { [TOMOYO_TYPE_EXECUTE] = TOMOYO_MAC_FILE_EXECUTE, [TOMOYO_TYPE_READ] = TOMOYO_MAC_FILE_OPEN, [TOMOYO_TYPE_WRITE] = TOMOYO_MAC_FILE_OPEN, [TOMOYO_TYPE_APPEND] = TOMOYO_MAC_FILE_OPEN, [TOMOYO_TYPE_UNLINK] = TOMOYO_MAC_FILE_UNLINK, [TOMOYO_TYPE_GETATTR] = TOMOYO_MAC_FILE_GETATTR, [TOMOYO_TYPE_RMDIR] = TOMOYO_MAC_FILE_RMDIR, [TOMOYO_TYPE_TRUNCATE] = TOMOYO_MAC_FILE_TRUNCATE, [TOMOYO_TYPE_SYMLINK] = TOMOYO_MAC_FILE_SYMLINK, [TOMOYO_TYPE_CHROOT] = TOMOYO_MAC_FILE_CHROOT, [TOMOYO_TYPE_UMOUNT] = TOMOYO_MAC_FILE_UMOUNT, }; /* * Mapping table from "enum tomoyo_mkdev_acl_index" to "enum tomoyo_mac_index". */ const u8 tomoyo_pnnn2mac[TOMOYO_MAX_MKDEV_OPERATION] = { [TOMOYO_TYPE_MKBLOCK] = TOMOYO_MAC_FILE_MKBLOCK, [TOMOYO_TYPE_MKCHAR] = TOMOYO_MAC_FILE_MKCHAR, }; /* * Mapping table from "enum tomoyo_path2_acl_index" to "enum tomoyo_mac_index". */ const u8 tomoyo_pp2mac[TOMOYO_MAX_PATH2_OPERATION] = { [TOMOYO_TYPE_LINK] = TOMOYO_MAC_FILE_LINK, [TOMOYO_TYPE_RENAME] = TOMOYO_MAC_FILE_RENAME, [TOMOYO_TYPE_PIVOT_ROOT] = TOMOYO_MAC_FILE_PIVOT_ROOT, }; /* * Mapping table from "enum tomoyo_path_number_acl_index" to * "enum tomoyo_mac_index". */ const u8 tomoyo_pn2mac[TOMOYO_MAX_PATH_NUMBER_OPERATION] = { [TOMOYO_TYPE_CREATE] = TOMOYO_MAC_FILE_CREATE, [TOMOYO_TYPE_MKDIR] = TOMOYO_MAC_FILE_MKDIR, [TOMOYO_TYPE_MKFIFO] = TOMOYO_MAC_FILE_MKFIFO, [TOMOYO_TYPE_MKSOCK] = TOMOYO_MAC_FILE_MKSOCK, [TOMOYO_TYPE_IOCTL] = TOMOYO_MAC_FILE_IOCTL, [TOMOYO_TYPE_CHMOD] = TOMOYO_MAC_FILE_CHMOD, [TOMOYO_TYPE_CHOWN] = TOMOYO_MAC_FILE_CHOWN, [TOMOYO_TYPE_CHGRP] = TOMOYO_MAC_FILE_CHGRP, }; /** * tomoyo_put_name_union - Drop reference on "struct tomoyo_name_union". * * @ptr: Pointer to "struct tomoyo_name_union". * * Returns nothing. */ void tomoyo_put_name_union(struct tomoyo_name_union *ptr) { tomoyo_put_group(ptr->group); tomoyo_put_name(ptr->filename); } /** * tomoyo_compare_name_union - Check whether a name matches "struct tomoyo_name_union" or not. * * @name: Pointer to "struct tomoyo_path_info". * @ptr: Pointer to "struct tomoyo_name_union". * * Returns "struct tomoyo_path_info" if @name matches @ptr, NULL otherwise. */ const struct tomoyo_path_info * tomoyo_compare_name_union(const struct tomoyo_path_info *name, const struct tomoyo_name_union *ptr) { if (ptr->group) return tomoyo_path_matches_group(name, ptr->group); if (tomoyo_path_matches_pattern(name, ptr->filename)) return ptr->filename; return NULL; } /** * tomoyo_put_number_union - Drop reference on "struct tomoyo_number_union". * * @ptr: Pointer to "struct tomoyo_number_union". * * Returns nothing. */ void tomoyo_put_number_union(struct tomoyo_number_union *ptr) { tomoyo_put_group(ptr->group); } /** * tomoyo_compare_number_union - Check whether a value matches "struct tomoyo_number_union" or not. * * @value: Number to check. * @ptr: Pointer to "struct tomoyo_number_union". * * Returns true if @value matches @ptr, false otherwise. */ bool tomoyo_compare_number_union(const unsigned long value, const struct tomoyo_number_union *ptr) { if (ptr->group) return tomoyo_number_matches_group(value, value, ptr->group); return value >= ptr->values[0] && value <= ptr->values[1]; } /** * tomoyo_add_slash - Add trailing '/' if needed. * * @buf: Pointer to "struct tomoyo_path_info". * * Returns nothing. * * @buf must be generated by tomoyo_encode() because this function does not * allocate memory for adding '/'. */ static void tomoyo_add_slash(struct tomoyo_path_info *buf) { if (buf->is_dir) return; /* * This is OK because tomoyo_encode() reserves space for appending "/". */ strcat((char *) buf->name, "/"); tomoyo_fill_path_info(buf); } /** * tomoyo_get_realpath - Get realpath. * * @buf: Pointer to "struct tomoyo_path_info". * @path: Pointer to "struct path". * * Returns true on success, false otherwise. */ static bool tomoyo_get_realpath(struct tomoyo_path_info *buf, const struct path *path) { buf->name = tomoyo_realpath_from_path(path); if (buf->name) { tomoyo_fill_path_info(buf); return true; } return false; } /** * tomoyo_audit_path_log - Audit path request log. * * @r: Pointer to "struct tomoyo_request_info". * * Returns 0 on success, negative value otherwise. */ static int tomoyo_audit_path_log(struct tomoyo_request_info *r) { return tomoyo_supervisor(r, "file %s %s\n", tomoyo_path_keyword [r->param.path.operation], r->param.path.filename->name); } /** * tomoyo_audit_path2_log - Audit path/path request log. * * @r: Pointer to "struct tomoyo_request_info". * * Returns 0 on success, negative value otherwise. */ static int tomoyo_audit_path2_log(struct tomoyo_request_info *r) { return tomoyo_supervisor(r, "file %s %s %s\n", tomoyo_mac_keywords [tomoyo_pp2mac[r->param.path2.operation]], r->param.path2.filename1->name, r->param.path2.filename2->name); } /** * tomoyo_audit_mkdev_log - Audit path/number/number/number request log. * * @r: Pointer to "struct tomoyo_request_info". * * Returns 0 on success, negative value otherwise. */ static int tomoyo_audit_mkdev_log(struct tomoyo_request_info *r) { return tomoyo_supervisor(r, "file %s %s 0%o %u %u\n", tomoyo_mac_keywords [tomoyo_pnnn2mac[r->param.mkdev.operation]], r->param.mkdev.filename->name, r->param.mkdev.mode, r->param.mkdev.major, r->param.mkdev.minor); } /** * tomoyo_audit_path_number_log - Audit path/number request log. * * @r: Pointer to "struct tomoyo_request_info". * * Returns 0 on success, negative value otherwise. */ static int tomoyo_audit_path_number_log(struct tomoyo_request_info *r) { const u8 type = r->param.path_number.operation; u8 radix; char buffer[64]; switch (type) { case TOMOYO_TYPE_CREATE: case TOMOYO_TYPE_MKDIR: case TOMOYO_TYPE_MKFIFO: case TOMOYO_TYPE_MKSOCK: case TOMOYO_TYPE_CHMOD: radix = TOMOYO_VALUE_TYPE_OCTAL; break; case TOMOYO_TYPE_IOCTL: radix = TOMOYO_VALUE_TYPE_HEXADECIMAL; break; default: radix = TOMOYO_VALUE_TYPE_DECIMAL; break; } tomoyo_print_ulong(buffer, sizeof(buffer), r->param.path_number.number, radix); return tomoyo_supervisor(r, "file %s %s %s\n", tomoyo_mac_keywords [tomoyo_pn2mac[type]], r->param.path_number.filename->name, buffer); } /** * tomoyo_check_path_acl - Check permission for path operation. * * @r: Pointer to "struct tomoyo_request_info". * @ptr: Pointer to "struct tomoyo_acl_info". * * Returns true if granted, false otherwise. * * To be able to use wildcard for domain transition, this function sets * matching entry on success. Since the caller holds tomoyo_read_lock(), * it is safe to set matching entry. */ static bool tomoyo_check_path_acl(struct tomoyo_request_info *r, const struct tomoyo_acl_info *ptr) { const struct tomoyo_path_acl *acl = container_of(ptr, typeof(*acl), head); if (acl->perm & (1 << r->param.path.operation)) { r->param.path.matched_path = tomoyo_compare_name_union(r->param.path.filename, &acl->name); return r->param.path.matched_path != NULL; } return false; } /** * tomoyo_check_path_number_acl - Check permission for path number operation. * * @r: Pointer to "struct tomoyo_request_info". * @ptr: Pointer to "struct tomoyo_acl_info". * * Returns true if granted, false otherwise. */ static bool tomoyo_check_path_number_acl(struct tomoyo_request_info *r, const struct tomoyo_acl_info *ptr) { const struct tomoyo_path_number_acl *acl = container_of(ptr, typeof(*acl), head); return (acl->perm & (1 << r->param.path_number.operation)) && tomoyo_compare_number_union(r->param.path_number.number, &acl->number) && tomoyo_compare_name_union(r->param.path_number.filename, &acl->name); } /** * tomoyo_check_path2_acl - Check permission for path path operation. * * @r: Pointer to "struct tomoyo_request_info". * @ptr: Pointer to "struct tomoyo_acl_info". * * Returns true if granted, false otherwise. */ static bool tomoyo_check_path2_acl(struct tomoyo_request_info *r, const struct tomoyo_acl_info *ptr) { const struct tomoyo_path2_acl *acl = container_of(ptr, typeof(*acl), head); return (acl->perm & (1 << r->param.path2.operation)) && tomoyo_compare_name_union(r->param.path2.filename1, &acl->name1) && tomoyo_compare_name_union(r->param.path2.filename2, &acl->name2); } /** * tomoyo_check_mkdev_acl - Check permission for path number number number operation. * * @r: Pointer to "struct tomoyo_request_info". * @ptr: Pointer to "struct tomoyo_acl_info". * * Returns true if granted, false otherwise. */ static bool tomoyo_check_mkdev_acl(struct tomoyo_request_info *r, const struct tomoyo_acl_info *ptr) { const struct tomoyo_mkdev_acl *acl = container_of(ptr, typeof(*acl), head); return (acl->perm & (1 << r->param.mkdev.operation)) && tomoyo_compare_number_union(r->param.mkdev.mode, &acl->mode) && tomoyo_compare_number_union(r->param.mkdev.major, &acl->major) && tomoyo_compare_number_union(r->param.mkdev.minor, &acl->minor) && tomoyo_compare_name_union(r->param.mkdev.filename, &acl->name); } /** * tomoyo_same_path_acl - Check for duplicated "struct tomoyo_path_acl" entry. * * @a: Pointer to "struct tomoyo_acl_info". * @b: Pointer to "struct tomoyo_acl_info". * * Returns true if @a == @b except permission bits, false otherwise. */ static bool tomoyo_same_path_acl(const struct tomoyo_acl_info *a, const struct tomoyo_acl_info *b) { const struct tomoyo_path_acl *p1 = container_of(a, typeof(*p1), head); const struct tomoyo_path_acl *p2 = container_of(b, typeof(*p2), head); return tomoyo_same_name_union(&p1->name, &p2->name); } /** * tomoyo_merge_path_acl - Merge duplicated "struct tomoyo_path_acl" entry. * * @a: Pointer to "struct tomoyo_acl_info". * @b: Pointer to "struct tomoyo_acl_info". * @is_delete: True for @a &= ~@b, false for @a |= @b. * * Returns true if @a is empty, false otherwise. */ static bool tomoyo_merge_path_acl(struct tomoyo_acl_info *a, struct tomoyo_acl_info *b, const bool is_delete) { u16 * const a_perm = &container_of(a, struct tomoyo_path_acl, head) ->perm; u16 perm = READ_ONCE(*a_perm); const u16 b_perm = container_of(b, struct tomoyo_path_acl, head)->perm; if (is_delete) perm &= ~b_perm; else perm |= b_perm; WRITE_ONCE(*a_perm, perm); return !perm; } /** * tomoyo_update_path_acl - Update "struct tomoyo_path_acl" list. * * @perm: Permission. * @param: Pointer to "struct tomoyo_acl_param". * * Returns 0 on success, negative value otherwise. * * Caller holds tomoyo_read_lock(). */ static int tomoyo_update_path_acl(const u16 perm, struct tomoyo_acl_param *param) { struct tomoyo_path_acl e = { .head.type = TOMOYO_TYPE_PATH_ACL, .perm = perm }; int error; if (!tomoyo_parse_name_union(param, &e.name)) error = -EINVAL; else error = tomoyo_update_domain(&e.head, sizeof(e), param, tomoyo_same_path_acl, tomoyo_merge_path_acl); tomoyo_put_name_union(&e.name); return error; } /** * tomoyo_same_mkdev_acl - Check for duplicated "struct tomoyo_mkdev_acl" entry. * * @a: Pointer to "struct tomoyo_acl_info". * @b: Pointer to "struct tomoyo_acl_info". * * Returns true if @a == @b except permission bits, false otherwise. */ static bool tomoyo_same_mkdev_acl(const struct tomoyo_acl_info *a, const struct tomoyo_acl_info *b) { const struct tomoyo_mkdev_acl *p1 = container_of(a, typeof(*p1), head); const struct tomoyo_mkdev_acl *p2 = container_of(b, typeof(*p2), head); return tomoyo_same_name_union(&p1->name, &p2->name) && tomoyo_same_number_union(&p1->mode, &p2->mode) && tomoyo_same_number_union(&p1->major, &p2->major) && tomoyo_same_number_union(&p1->minor, &p2->minor); } /** * tomoyo_merge_mkdev_acl - Merge duplicated "struct tomoyo_mkdev_acl" entry. * * @a: Pointer to "struct tomoyo_acl_info". * @b: Pointer to "struct tomoyo_acl_info". * @is_delete: True for @a &= ~@b, false for @a |= @b. * * Returns true if @a is empty, false otherwise. */ static bool tomoyo_merge_mkdev_acl(struct tomoyo_acl_info *a, struct tomoyo_acl_info *b, const bool is_delete) { u8 *const a_perm = &container_of(a, struct tomoyo_mkdev_acl, head)->perm; u8 perm = READ_ONCE(*a_perm); const u8 b_perm = container_of(b, struct tomoyo_mkdev_acl, head) ->perm; if (is_delete) perm &= ~b_perm; else perm |= b_perm; WRITE_ONCE(*a_perm, perm); return !perm; } /** * tomoyo_update_mkdev_acl - Update "struct tomoyo_mkdev_acl" list. * * @perm: Permission. * @param: Pointer to "struct tomoyo_acl_param". * * Returns 0 on success, negative value otherwise. * * Caller holds tomoyo_read_lock(). */ static int tomoyo_update_mkdev_acl(const u8 perm, struct tomoyo_acl_param *param) { struct tomoyo_mkdev_acl e = { .head.type = TOMOYO_TYPE_MKDEV_ACL, .perm = perm }; int error; if (!tomoyo_parse_name_union(param, &e.name) || !tomoyo_parse_number_union(param, &e.mode) || !tomoyo_parse_number_union(param, &e.major) || !tomoyo_parse_number_union(param, &e.minor)) error = -EINVAL; else error = tomoyo_update_domain(&e.head, sizeof(e), param, tomoyo_same_mkdev_acl, tomoyo_merge_mkdev_acl); tomoyo_put_name_union(&e.name); tomoyo_put_number_union(&e.mode); tomoyo_put_number_union(&e.major); tomoyo_put_number_union(&e.minor); return error; } /** * tomoyo_same_path2_acl - Check for duplicated "struct tomoyo_path2_acl" entry. * * @a: Pointer to "struct tomoyo_acl_info". * @b: Pointer to "struct tomoyo_acl_info". * * Returns true if @a == @b except permission bits, false otherwise. */ static bool tomoyo_same_path2_acl(const struct tomoyo_acl_info *a, const struct tomoyo_acl_info *b) { const struct tomoyo_path2_acl *p1 = container_of(a, typeof(*p1), head); const struct tomoyo_path2_acl *p2 = container_of(b, typeof(*p2), head); return tomoyo_same_name_union(&p1->name1, &p2->name1) && tomoyo_same_name_union(&p1->name2, &p2->name2); } /** * tomoyo_merge_path2_acl - Merge duplicated "struct tomoyo_path2_acl" entry. * * @a: Pointer to "struct tomoyo_acl_info". * @b: Pointer to "struct tomoyo_acl_info". * @is_delete: True for @a &= ~@b, false for @a |= @b. * * Returns true if @a is empty, false otherwise. */ static bool tomoyo_merge_path2_acl(struct tomoyo_acl_info *a, struct tomoyo_acl_info *b, const bool is_delete) { u8 * const a_perm = &container_of(a, struct tomoyo_path2_acl, head) ->perm; u8 perm = READ_ONCE(*a_perm); const u8 b_perm = container_of(b, struct tomoyo_path2_acl, head)->perm; if (is_delete) perm &= ~b_perm; else perm |= b_perm; WRITE_ONCE(*a_perm, perm); return !perm; } /** * tomoyo_update_path2_acl - Update "struct tomoyo_path2_acl" list. * * @perm: Permission. * @param: Pointer to "struct tomoyo_acl_param". * * Returns 0 on success, negative value otherwise. * * Caller holds tomoyo_read_lock(). */ static int tomoyo_update_path2_acl(const u8 perm, struct tomoyo_acl_param *param) { struct tomoyo_path2_acl e = { .head.type = TOMOYO_TYPE_PATH2_ACL, .perm = perm }; int error; if (!tomoyo_parse_name_union(param, &e.name1) || !tomoyo_parse_name_union(param, &e.name2)) error = -EINVAL; else error = tomoyo_update_domain(&e.head, sizeof(e), param, tomoyo_same_path2_acl, tomoyo_merge_path2_acl); tomoyo_put_name_union(&e.name1); tomoyo_put_name_union(&e.name2); return error; } /** * tomoyo_path_permission - Check permission for single path operation. * * @r: Pointer to "struct tomoyo_request_info". * @operation: Type of operation. * @filename: Filename to check. * * Returns 0 on success, negative value otherwise. * * Caller holds tomoyo_read_lock(). */ static int tomoyo_path_permission(struct tomoyo_request_info *r, u8 operation, const struct tomoyo_path_info *filename) { int error; r->type = tomoyo_p2mac[operation]; r->mode = tomoyo_get_mode(r->domain->ns, r->profile, r->type); if (r->mode == TOMOYO_CONFIG_DISABLED) return 0; r->param_type = TOMOYO_TYPE_PATH_ACL; r->param.path.filename = filename; r->param.path.operation = operation; do { tomoyo_check_acl(r, tomoyo_check_path_acl); error = tomoyo_audit_path_log(r); } while (error == TOMOYO_RETRY_REQUEST); return error; } /** * tomoyo_execute_permission - Check permission for execute operation. * * @r: Pointer to "struct tomoyo_request_info". * @filename: Filename to check. * * Returns 0 on success, negative value otherwise. * * Caller holds tomoyo_read_lock(). */ int tomoyo_execute_permission(struct tomoyo_request_info *r, const struct tomoyo_path_info *filename) { /* * Unlike other permission checks, this check is done regardless of * profile mode settings in order to check for domain transition * preference. */ r->type = TOMOYO_MAC_FILE_EXECUTE; r->mode = tomoyo_get_mode(r->domain->ns, r->profile, r->type); r->param_type = TOMOYO_TYPE_PATH_ACL; r->param.path.filename = filename; r->param.path.operation = TOMOYO_TYPE_EXECUTE; tomoyo_check_acl(r, tomoyo_check_path_acl); r->ee->transition = r->matched_acl && r->matched_acl->cond ? r->matched_acl->cond->transit : NULL; if (r->mode != TOMOYO_CONFIG_DISABLED) return tomoyo_audit_path_log(r); return 0; } /** * tomoyo_same_path_number_acl - Check for duplicated "struct tomoyo_path_number_acl" entry. * * @a: Pointer to "struct tomoyo_acl_info". * @b: Pointer to "struct tomoyo_acl_info". * * Returns true if @a == @b except permission bits, false otherwise. */ static bool tomoyo_same_path_number_acl(const struct tomoyo_acl_info *a, const struct tomoyo_acl_info *b) { const struct tomoyo_path_number_acl *p1 = container_of(a, typeof(*p1), head); const struct tomoyo_path_number_acl *p2 = container_of(b, typeof(*p2), head); return tomoyo_same_name_union(&p1->name, &p2->name) && tomoyo_same_number_union(&p1->number, &p2->number); } /** * tomoyo_merge_path_number_acl - Merge duplicated "struct tomoyo_path_number_acl" entry. * * @a: Pointer to "struct tomoyo_acl_info". * @b: Pointer to "struct tomoyo_acl_info". * @is_delete: True for @a &= ~@b, false for @a |= @b. * * Returns true if @a is empty, false otherwise. */ static bool tomoyo_merge_path_number_acl(struct tomoyo_acl_info *a, struct tomoyo_acl_info *b, const bool is_delete) { u8 * const a_perm = &container_of(a, struct tomoyo_path_number_acl, head)->perm; u8 perm = READ_ONCE(*a_perm); const u8 b_perm = container_of(b, struct tomoyo_path_number_acl, head) ->perm; if (is_delete) perm &= ~b_perm; else perm |= b_perm; WRITE_ONCE(*a_perm, perm); return !perm; } /** * tomoyo_update_path_number_acl - Update ioctl/chmod/chown/chgrp ACL. * * @perm: Permission. * @param: Pointer to "struct tomoyo_acl_param". * * Returns 0 on success, negative value otherwise. */ static int tomoyo_update_path_number_acl(const u8 perm, struct tomoyo_acl_param *param) { struct tomoyo_path_number_acl e = { .head.type = TOMOYO_TYPE_PATH_NUMBER_ACL, .perm = perm }; int error; if (!tomoyo_parse_name_union(param, &e.name) || !tomoyo_parse_number_union(param, &e.number)) error = -EINVAL; else error = tomoyo_update_domain(&e.head, sizeof(e), param, tomoyo_same_path_number_acl, tomoyo_merge_path_number_acl); tomoyo_put_name_union(&e.name); tomoyo_put_number_union(&e.number); return error; } /** * tomoyo_path_number_perm - Check permission for "create", "mkdir", "mkfifo", "mksock", "ioctl", "chmod", "chown", "chgrp". * * @type: Type of operation. * @path: Pointer to "struct path". * @number: Number. * * Returns 0 on success, negative value otherwise. */ int tomoyo_path_number_perm(const u8 type, const struct path *path, unsigned long number) { struct tomoyo_request_info r; struct tomoyo_obj_info obj = { .path1 = { .mnt = path->mnt, .dentry = path->dentry }, }; int error = -ENOMEM; struct tomoyo_path_info buf; int idx; if (tomoyo_init_request_info(&r, NULL, tomoyo_pn2mac[type]) == TOMOYO_CONFIG_DISABLED) return 0; idx = tomoyo_read_lock(); if (!tomoyo_get_realpath(&buf, path)) goto out; r.obj = &obj; if (type == TOMOYO_TYPE_MKDIR) tomoyo_add_slash(&buf); r.param_type = TOMOYO_TYPE_PATH_NUMBER_ACL; r.param.path_number.operation = type; r.param.path_number.filename = &buf; r.param.path_number.number = number; do { tomoyo_check_acl(&r, tomoyo_check_path_number_acl); error = tomoyo_audit_path_number_log(&r); } while (error == TOMOYO_RETRY_REQUEST); kfree(buf.name); out: tomoyo_read_unlock(idx); if (r.mode != TOMOYO_CONFIG_ENFORCING) error = 0; return error; } /** * tomoyo_check_open_permission - Check permission for "read" and "write". * * @domain: Pointer to "struct tomoyo_domain_info". * @path: Pointer to "struct path". * @flag: Flags for open(). * * Returns 0 on success, negative value otherwise. */ int tomoyo_check_open_permission(struct tomoyo_domain_info *domain, const struct path *path, const int flag) { const u8 acc_mode = ACC_MODE(flag); int error = 0; struct tomoyo_path_info buf; struct tomoyo_request_info r; struct tomoyo_obj_info obj = { .path1 = { .mnt = path->mnt, .dentry = path->dentry }, }; int idx; buf.name = NULL; r.mode = TOMOYO_CONFIG_DISABLED; idx = tomoyo_read_lock(); if (acc_mode && tomoyo_init_request_info(&r, domain, TOMOYO_MAC_FILE_OPEN) != TOMOYO_CONFIG_DISABLED) { if (!tomoyo_get_realpath(&buf, path)) { error = -ENOMEM; goto out; } r.obj = &obj; if (acc_mode & MAY_READ) error = tomoyo_path_permission(&r, TOMOYO_TYPE_READ, &buf); if (!error && (acc_mode & MAY_WRITE)) error = tomoyo_path_permission(&r, (flag & O_APPEND) ? TOMOYO_TYPE_APPEND : TOMOYO_TYPE_WRITE, &buf); } out: kfree(buf.name); tomoyo_read_unlock(idx); if (r.mode != TOMOYO_CONFIG_ENFORCING) error = 0; return error; } /** * tomoyo_path_perm - Check permission for "unlink", "rmdir", "truncate", "symlink", "append", "chroot" and "unmount". * * @operation: Type of operation. * @path: Pointer to "struct path". * @target: Symlink's target if @operation is TOMOYO_TYPE_SYMLINK, * NULL otherwise. * * Returns 0 on success, negative value otherwise. */ int tomoyo_path_perm(const u8 operation, const struct path *path, const char *target) { struct tomoyo_request_info r; struct tomoyo_obj_info obj = { .path1 = { .mnt = path->mnt, .dentry = path->dentry }, }; int error; struct tomoyo_path_info buf; bool is_enforce; struct tomoyo_path_info symlink_target; int idx; if (tomoyo_init_request_info(&r, NULL, tomoyo_p2mac[operation]) == TOMOYO_CONFIG_DISABLED) return 0; is_enforce = (r.mode == TOMOYO_CONFIG_ENFORCING); error = -ENOMEM; buf.name = NULL; idx = tomoyo_read_lock(); if (!tomoyo_get_realpath(&buf, path)) goto out; r.obj = &obj; switch (operation) { case TOMOYO_TYPE_RMDIR: case TOMOYO_TYPE_CHROOT: tomoyo_add_slash(&buf); break; case TOMOYO_TYPE_SYMLINK: symlink_target.name = tomoyo_encode(target); if (!symlink_target.name) goto out; tomoyo_fill_path_info(&symlink_target); obj.symlink_target = &symlink_target; break; } error = tomoyo_path_permission(&r, operation, &buf); if (operation == TOMOYO_TYPE_SYMLINK) kfree(symlink_target.name); out: kfree(buf.name); tomoyo_read_unlock(idx); if (!is_enforce) error = 0; return error; } /** * tomoyo_mkdev_perm - Check permission for "mkblock" and "mkchar". * * @operation: Type of operation. (TOMOYO_TYPE_MKCHAR or TOMOYO_TYPE_MKBLOCK) * @path: Pointer to "struct path". * @mode: Create mode. * @dev: Device number. * * Returns 0 on success, negative value otherwise. */ int tomoyo_mkdev_perm(const u8 operation, const struct path *path, const unsigned int mode, unsigned int dev) { struct tomoyo_request_info r; struct tomoyo_obj_info obj = { .path1 = { .mnt = path->mnt, .dentry = path->dentry }, }; int error = -ENOMEM; struct tomoyo_path_info buf; int idx; if (tomoyo_init_request_info(&r, NULL, tomoyo_pnnn2mac[operation]) == TOMOYO_CONFIG_DISABLED) return 0; idx = tomoyo_read_lock(); error = -ENOMEM; if (tomoyo_get_realpath(&buf, path)) { r.obj = &obj; dev = new_decode_dev(dev); r.param_type = TOMOYO_TYPE_MKDEV_ACL; r.param.mkdev.filename = &buf; r.param.mkdev.operation = operation; r.param.mkdev.mode = mode; r.param.mkdev.major = MAJOR(dev); r.param.mkdev.minor = MINOR(dev); tomoyo_check_acl(&r, tomoyo_check_mkdev_acl); error = tomoyo_audit_mkdev_log(&r); kfree(buf.name); } tomoyo_read_unlock(idx); if (r.mode != TOMOYO_CONFIG_ENFORCING) error = 0; return error; } /** * tomoyo_path2_perm - Check permission for "rename", "link" and "pivot_root". * * @operation: Type of operation. * @path1: Pointer to "struct path". * @path2: Pointer to "struct path". * * Returns 0 on success, negative value otherwise. */ int tomoyo_path2_perm(const u8 operation, const struct path *path1, const struct path *path2) { int error = -ENOMEM; struct tomoyo_path_info buf1; struct tomoyo_path_info buf2; struct tomoyo_request_info r; struct tomoyo_obj_info obj = { .path1 = { .mnt = path1->mnt, .dentry = path1->dentry }, .path2 = { .mnt = path2->mnt, .dentry = path2->dentry } }; int idx; if (tomoyo_init_request_info(&r, NULL, tomoyo_pp2mac[operation]) == TOMOYO_CONFIG_DISABLED) return 0; buf1.name = NULL; buf2.name = NULL; idx = tomoyo_read_lock(); if (!tomoyo_get_realpath(&buf1, path1) || !tomoyo_get_realpath(&buf2, path2)) goto out; switch (operation) { case TOMOYO_TYPE_RENAME: case TOMOYO_TYPE_LINK: if (!d_is_dir(path1->dentry)) break; fallthrough; case TOMOYO_TYPE_PIVOT_ROOT: tomoyo_add_slash(&buf1); tomoyo_add_slash(&buf2); break; } r.obj = &obj; r.param_type = TOMOYO_TYPE_PATH2_ACL; r.param.path2.operation = operation; r.param.path2.filename1 = &buf1; r.param.path2.filename2 = &buf2; do { tomoyo_check_acl(&r, tomoyo_check_path2_acl); error = tomoyo_audit_path2_log(&r); } while (error == TOMOYO_RETRY_REQUEST); out: kfree(buf1.name); kfree(buf2.name); tomoyo_read_unlock(idx); if (r.mode != TOMOYO_CONFIG_ENFORCING) error = 0; return error; } /** * tomoyo_same_mount_acl - Check for duplicated "struct tomoyo_mount_acl" entry. * * @a: Pointer to "struct tomoyo_acl_info". * @b: Pointer to "struct tomoyo_acl_info". * * Returns true if @a == @b, false otherwise. */ static bool tomoyo_same_mount_acl(const struct tomoyo_acl_info *a, const struct tomoyo_acl_info *b) { const struct tomoyo_mount_acl *p1 = container_of(a, typeof(*p1), head); const struct tomoyo_mount_acl *p2 = container_of(b, typeof(*p2), head); return tomoyo_same_name_union(&p1->dev_name, &p2->dev_name) && tomoyo_same_name_union(&p1->dir_name, &p2->dir_name) && tomoyo_same_name_union(&p1->fs_type, &p2->fs_type) && tomoyo_same_number_union(&p1->flags, &p2->flags); } /** * tomoyo_update_mount_acl - Write "struct tomoyo_mount_acl" list. * * @param: Pointer to "struct tomoyo_acl_param". * * Returns 0 on success, negative value otherwise. * * Caller holds tomoyo_read_lock(). */ static int tomoyo_update_mount_acl(struct tomoyo_acl_param *param) { struct tomoyo_mount_acl e = { .head.type = TOMOYO_TYPE_MOUNT_ACL }; int error; if (!tomoyo_parse_name_union(param, &e.dev_name) || !tomoyo_parse_name_union(param, &e.dir_name) || !tomoyo_parse_name_union(param, &e.fs_type) || !tomoyo_parse_number_union(param, &e.flags)) error = -EINVAL; else error = tomoyo_update_domain(&e.head, sizeof(e), param, tomoyo_same_mount_acl, NULL); tomoyo_put_name_union(&e.dev_name); tomoyo_put_name_union(&e.dir_name); tomoyo_put_name_union(&e.fs_type); tomoyo_put_number_union(&e.flags); return error; } /** * tomoyo_write_file - Update file related list. * * @param: Pointer to "struct tomoyo_acl_param". * * Returns 0 on success, negative value otherwise. * * Caller holds tomoyo_read_lock(). */ int tomoyo_write_file(struct tomoyo_acl_param *param) { u16 perm = 0; u8 type; const char *operation = tomoyo_read_token(param); for (type = 0; type < TOMOYO_MAX_PATH_OPERATION; type++) if (tomoyo_permstr(operation, tomoyo_path_keyword[type])) perm |= 1 << type; if (perm) return tomoyo_update_path_acl(perm, param); for (type = 0; type < TOMOYO_MAX_PATH2_OPERATION; type++) if (tomoyo_permstr(operation, tomoyo_mac_keywords[tomoyo_pp2mac[type]])) perm |= 1 << type; if (perm) return tomoyo_update_path2_acl(perm, param); for (type = 0; type < TOMOYO_MAX_PATH_NUMBER_OPERATION; type++) if (tomoyo_permstr(operation, tomoyo_mac_keywords[tomoyo_pn2mac[type]])) perm |= 1 << type; if (perm) return tomoyo_update_path_number_acl(perm, param); for (type = 0; type < TOMOYO_MAX_MKDEV_OPERATION; type++) if (tomoyo_permstr(operation, tomoyo_mac_keywords[tomoyo_pnnn2mac[type]])) perm |= 1 << type; if (perm) return tomoyo_update_mkdev_acl(perm, param); if (tomoyo_permstr(operation, tomoyo_mac_keywords[TOMOYO_MAC_FILE_MOUNT])) return tomoyo_update_mount_acl(param); return -EINVAL; }
4 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 // SPDX-License-Identifier: GPL-2.0 /* Copyright 2011-2014 Autronica Fire and Security AS * * Author(s): * 2011-2014 Arvid Brodin, arvid.brodin@alten.se * This file contains device methods for creating, using and destroying * virtual HSR or PRP devices. */ #include <linux/netdevice.h> #include <linux/skbuff.h> #include <linux/etherdevice.h> #include <linux/rtnetlink.h> #include <linux/pkt_sched.h> #include "hsr_device.h" #include "hsr_slave.h" #include "hsr_framereg.h" #include "hsr_main.h" #include "hsr_forward.h" static bool is_admin_up(struct net_device *dev) { return dev && (dev->flags & IFF_UP); } static bool is_slave_up(struct net_device *dev) { return dev && is_admin_up(dev) && netif_oper_up(dev); } static void hsr_set_operstate(struct hsr_port *master, bool has_carrier) { struct net_device *dev = master->dev; if (!is_admin_up(dev)) { netif_set_operstate(dev, IF_OPER_DOWN); return; } if (has_carrier) netif_set_operstate(dev, IF_OPER_UP); else netif_set_operstate(dev, IF_OPER_LOWERLAYERDOWN); } static bool hsr_check_carrier(struct hsr_port *master) { struct hsr_port *port; ASSERT_RTNL(); hsr_for_each_port(master->hsr, port) { if (port->type != HSR_PT_MASTER && is_slave_up(port->dev)) { netif_carrier_on(master->dev); return true; } } netif_carrier_off(master->dev); return false; } static void hsr_check_announce(struct net_device *hsr_dev) { struct hsr_priv *hsr; hsr = netdev_priv(hsr_dev); if (netif_running(hsr_dev) && netif_oper_up(hsr_dev)) { /* Enable announce timer and start sending supervisory frames */ if (!timer_pending(&hsr->announce_timer)) { hsr->announce_count = 0; mod_timer(&hsr->announce_timer, jiffies + msecs_to_jiffies(HSR_ANNOUNCE_INTERVAL)); } if (hsr->redbox && !timer_pending(&hsr->announce_proxy_timer)) mod_timer(&hsr->announce_proxy_timer, jiffies + msecs_to_jiffies(HSR_ANNOUNCE_INTERVAL) / 2); } else { /* Deactivate the announce timer */ timer_delete(&hsr->announce_timer); if (hsr->redbox) timer_delete(&hsr->announce_proxy_timer); } } void hsr_check_carrier_and_operstate(struct hsr_priv *hsr) { struct hsr_port *master; bool has_carrier; master = hsr_port_get_hsr(hsr, HSR_PT_MASTER); /* netif_stacked_transfer_operstate() cannot be used here since * it doesn't set IF_OPER_LOWERLAYERDOWN (?) */ has_carrier = hsr_check_carrier(master); hsr_set_operstate(master, has_carrier); hsr_check_announce(master->dev); } int hsr_get_max_mtu(struct hsr_priv *hsr) { unsigned int mtu_max; struct hsr_port *port; mtu_max = ETH_DATA_LEN; hsr_for_each_port(hsr, port) if (port->type != HSR_PT_MASTER) mtu_max = min(port->dev->mtu, mtu_max); if (mtu_max < HSR_HLEN) return 0; return mtu_max - HSR_HLEN; } static int hsr_dev_change_mtu(struct net_device *dev, int new_mtu) { struct hsr_priv *hsr; hsr = netdev_priv(dev); if (new_mtu > hsr_get_max_mtu(hsr)) { netdev_info(dev, "A HSR master's MTU cannot be greater than the smallest MTU of its slaves minus the HSR Tag length (%d octets).\n", HSR_HLEN); return -EINVAL; } WRITE_ONCE(dev->mtu, new_mtu); return 0; } static int hsr_dev_open(struct net_device *dev) { struct hsr_priv *hsr; struct hsr_port *port; const char *designation = NULL; hsr = netdev_priv(dev); hsr_for_each_port(hsr, port) { if (port->type == HSR_PT_MASTER) continue; switch (port->type) { case HSR_PT_SLAVE_A: designation = "Slave A"; break; case HSR_PT_SLAVE_B: designation = "Slave B"; break; case HSR_PT_INTERLINK: designation = "Interlink"; break; default: designation = "Unknown"; } if (!is_slave_up(port->dev)) netdev_warn(dev, "%s (%s) is not up; please bring it up to get a fully working HSR network\n", designation, port->dev->name); } if (!designation) netdev_warn(dev, "No slave devices configured\n"); return 0; } static int hsr_dev_close(struct net_device *dev) { struct hsr_port *port; struct hsr_priv *hsr; hsr = netdev_priv(dev); hsr_for_each_port(hsr, port) { if (port->type == HSR_PT_MASTER) continue; switch (port->type) { case HSR_PT_SLAVE_A: case HSR_PT_SLAVE_B: dev_uc_unsync(port->dev, dev); dev_mc_unsync(port->dev, dev); break; default: break; } } return 0; } static netdev_features_t hsr_features_recompute(struct hsr_priv *hsr, netdev_features_t features) { netdev_features_t mask; struct hsr_port *port; mask = features; /* Mask out all features that, if supported by one device, should be * enabled for all devices (see NETIF_F_ONE_FOR_ALL). * * Anything that's off in mask will not be enabled - so only things * that were in features originally, and also is in NETIF_F_ONE_FOR_ALL, * may become enabled. */ features &= ~NETIF_F_ONE_FOR_ALL; hsr_for_each_port(hsr, port) features = netdev_increment_features(features, port->dev->features, mask); return features; } static netdev_features_t hsr_fix_features(struct net_device *dev, netdev_features_t features) { struct hsr_priv *hsr = netdev_priv(dev); return hsr_features_recompute(hsr, features); } static netdev_tx_t hsr_dev_xmit(struct sk_buff *skb, struct net_device *dev) { struct hsr_priv *hsr = netdev_priv(dev); struct hsr_port *master; master = hsr_port_get_hsr(hsr, HSR_PT_MASTER); if (master) { skb->dev = master->dev; skb_reset_mac_header(skb); skb_reset_mac_len(skb); spin_lock_bh(&hsr->seqnr_lock); hsr_forward_skb(skb, master); spin_unlock_bh(&hsr->seqnr_lock); } else { dev_core_stats_tx_dropped_inc(dev); dev_kfree_skb_any(skb); } return NETDEV_TX_OK; } static const struct header_ops hsr_header_ops = { .create = eth_header, .parse = eth_header_parse, }; static struct sk_buff *hsr_init_skb(struct hsr_port *master, int extra) { struct hsr_priv *hsr = master->hsr; struct sk_buff *skb; int hlen, tlen; int len; hlen = LL_RESERVED_SPACE(master->dev); tlen = master->dev->needed_tailroom; len = sizeof(struct hsr_sup_tag) + sizeof(struct hsr_sup_payload); /* skb size is same for PRP/HSR frames, only difference * being, for PRP it is a trailer and for HSR it is a * header. * RedBox might use @extra more bytes. */ skb = dev_alloc_skb(len + extra + hlen + tlen); if (!skb) return skb; skb_reserve(skb, hlen); skb->dev = master->dev; skb->priority = TC_PRIO_CONTROL; skb_reset_network_header(skb); skb_reset_transport_header(skb); if (dev_hard_header(skb, skb->dev, ETH_P_PRP, hsr->sup_multicast_addr, skb->dev->dev_addr, skb->len) <= 0) goto out; skb_reset_mac_header(skb); skb_reset_mac_len(skb); return skb; out: kfree_skb(skb); return NULL; } static void send_hsr_supervision_frame(struct hsr_port *port, unsigned long *interval, const unsigned char *addr) { struct hsr_priv *hsr = port->hsr; __u8 type = HSR_TLV_LIFE_CHECK; struct hsr_sup_payload *hsr_sp; struct hsr_sup_tlv *hsr_stlv; struct hsr_sup_tag *hsr_stag; struct sk_buff *skb; int extra = 0; *interval = msecs_to_jiffies(HSR_LIFE_CHECK_INTERVAL); if (hsr->announce_count < 3 && hsr->prot_version == 0) { type = HSR_TLV_ANNOUNCE; *interval = msecs_to_jiffies(HSR_ANNOUNCE_INTERVAL); hsr->announce_count++; } if (hsr->redbox) extra = sizeof(struct hsr_sup_tlv) + sizeof(struct hsr_sup_payload); skb = hsr_init_skb(port, extra); if (!skb) { netdev_warn_once(port->dev, "HSR: Could not send supervision frame\n"); return; } hsr_stag = skb_put(skb, sizeof(struct hsr_sup_tag)); set_hsr_stag_path(hsr_stag, (hsr->prot_version ? 0x0 : 0xf)); set_hsr_stag_HSR_ver(hsr_stag, hsr->prot_version); /* From HSRv1 on we have separate supervision sequence numbers. */ spin_lock_bh(&hsr->seqnr_lock); if (hsr->prot_version > 0) { hsr_stag->sequence_nr = htons(hsr->sup_sequence_nr); hsr->sup_sequence_nr++; } else { hsr_stag->sequence_nr = htons(hsr->sequence_nr); hsr->sequence_nr++; } hsr_stag->tlv.HSR_TLV_type = type; /* TODO: Why 12 in HSRv0? */ hsr_stag->tlv.HSR_TLV_length = hsr->prot_version ? sizeof(struct hsr_sup_payload) : 12; /* Payload: MacAddressA / SAN MAC from ProxyNodeTable */ hsr_sp = skb_put(skb, sizeof(struct hsr_sup_payload)); ether_addr_copy(hsr_sp->macaddress_A, addr); if (hsr->redbox && hsr_is_node_in_db(&hsr->proxy_node_db, addr)) { hsr_stlv = skb_put(skb, sizeof(struct hsr_sup_tlv)); hsr_stlv->HSR_TLV_type = PRP_TLV_REDBOX_MAC; hsr_stlv->HSR_TLV_length = sizeof(struct hsr_sup_payload); /* Payload: MacAddressRedBox */ hsr_sp = skb_put(skb, sizeof(struct hsr_sup_payload)); ether_addr_copy(hsr_sp->macaddress_A, hsr->macaddress_redbox); } if (skb_put_padto(skb, ETH_ZLEN)) { spin_unlock_bh(&hsr->seqnr_lock); return; } hsr_forward_skb(skb, port); spin_unlock_bh(&hsr->seqnr_lock); return; } static void send_prp_supervision_frame(struct hsr_port *master, unsigned long *interval, const unsigned char *addr) { struct hsr_priv *hsr = master->hsr; struct hsr_sup_payload *hsr_sp; struct hsr_sup_tag *hsr_stag; struct sk_buff *skb; skb = hsr_init_skb(master, 0); if (!skb) { netdev_warn_once(master->dev, "PRP: Could not send supervision frame\n"); return; } *interval = msecs_to_jiffies(HSR_LIFE_CHECK_INTERVAL); hsr_stag = skb_put(skb, sizeof(struct hsr_sup_tag)); set_hsr_stag_path(hsr_stag, (hsr->prot_version ? 0x0 : 0xf)); set_hsr_stag_HSR_ver(hsr_stag, (hsr->prot_version ? 1 : 0)); /* From HSRv1 on we have separate supervision sequence numbers. */ spin_lock_bh(&hsr->seqnr_lock); hsr_stag->sequence_nr = htons(hsr->sup_sequence_nr); hsr->sup_sequence_nr++; hsr_stag->tlv.HSR_TLV_type = PRP_TLV_LIFE_CHECK_DD; hsr_stag->tlv.HSR_TLV_length = sizeof(struct hsr_sup_payload); /* Payload: MacAddressA */ hsr_sp = skb_put(skb, sizeof(struct hsr_sup_payload)); ether_addr_copy(hsr_sp->macaddress_A, master->dev->dev_addr); if (skb_put_padto(skb, ETH_ZLEN)) { spin_unlock_bh(&hsr->seqnr_lock); return; } hsr_forward_skb(skb, master); spin_unlock_bh(&hsr->seqnr_lock); } /* Announce (supervision frame) timer function */ static void hsr_announce(struct timer_list *t) { struct hsr_priv *hsr; struct hsr_port *master; unsigned long interval; hsr = timer_container_of(hsr, t, announce_timer); rcu_read_lock(); master = hsr_port_get_hsr(hsr, HSR_PT_MASTER); hsr->proto_ops->send_sv_frame(master, &interval, master->dev->dev_addr); if (is_admin_up(master->dev)) mod_timer(&hsr->announce_timer, jiffies + interval); rcu_read_unlock(); } /* Announce (supervision frame) timer function for RedBox */ static void hsr_proxy_announce(struct timer_list *t) { struct hsr_priv *hsr = timer_container_of(hsr, t, announce_proxy_timer); struct hsr_port *interlink; unsigned long interval = 0; struct hsr_node *node; rcu_read_lock(); /* RedBOX sends supervisory frames to HSR network with MAC addresses * of SAN nodes stored in ProxyNodeTable. */ interlink = hsr_port_get_hsr(hsr, HSR_PT_INTERLINK); if (!interlink) goto done; list_for_each_entry_rcu(node, &hsr->proxy_node_db, mac_list) { if (hsr_addr_is_redbox(hsr, node->macaddress_A)) continue; hsr->proto_ops->send_sv_frame(interlink, &interval, node->macaddress_A); } if (is_admin_up(interlink->dev)) { if (!interval) interval = msecs_to_jiffies(HSR_ANNOUNCE_INTERVAL); mod_timer(&hsr->announce_proxy_timer, jiffies + interval); } done: rcu_read_unlock(); } void hsr_del_ports(struct hsr_priv *hsr) { struct hsr_port *port; port = hsr_port_get_hsr(hsr, HSR_PT_SLAVE_A); if (port) hsr_del_port(port); port = hsr_port_get_hsr(hsr, HSR_PT_SLAVE_B); if (port) hsr_del_port(port); port = hsr_port_get_hsr(hsr, HSR_PT_INTERLINK); if (port) hsr_del_port(port); port = hsr_port_get_hsr(hsr, HSR_PT_MASTER); if (port) hsr_del_port(port); } static void hsr_set_rx_mode(struct net_device *dev) { struct hsr_port *port; struct hsr_priv *hsr; hsr = netdev_priv(dev); hsr_for_each_port(hsr, port) { if (port->type == HSR_PT_MASTER) continue; switch (port->type) { case HSR_PT_SLAVE_A: case HSR_PT_SLAVE_B: dev_mc_sync_multiple(port->dev, dev); dev_uc_sync_multiple(port->dev, dev); break; default: break; } } } static void hsr_change_rx_flags(struct net_device *dev, int change) { struct hsr_port *port; struct hsr_priv *hsr; hsr = netdev_priv(dev); hsr_for_each_port(hsr, port) { if (port->type == HSR_PT_MASTER) continue; switch (port->type) { case HSR_PT_SLAVE_A: case HSR_PT_SLAVE_B: if (change & IFF_ALLMULTI) dev_set_allmulti(port->dev, dev->flags & IFF_ALLMULTI ? 1 : -1); break; default: break; } } } static int hsr_ndo_vlan_rx_add_vid(struct net_device *dev, __be16 proto, u16 vid) { bool is_slave_a_added = false; bool is_slave_b_added = false; struct hsr_port *port; struct hsr_priv *hsr; int ret = 0; hsr = netdev_priv(dev); hsr_for_each_port(hsr, port) { if (port->type == HSR_PT_MASTER || port->type == HSR_PT_INTERLINK) continue; ret = vlan_vid_add(port->dev, proto, vid); switch (port->type) { case HSR_PT_SLAVE_A: if (ret) { /* clean up Slave-B */ netdev_err(dev, "add vid failed for Slave-A\n"); if (is_slave_b_added) vlan_vid_del(port->dev, proto, vid); return ret; } is_slave_a_added = true; break; case HSR_PT_SLAVE_B: if (ret) { /* clean up Slave-A */ netdev_err(dev, "add vid failed for Slave-B\n"); if (is_slave_a_added) vlan_vid_del(port->dev, proto, vid); return ret; } is_slave_b_added = true; break; default: break; } } return 0; } static int hsr_ndo_vlan_rx_kill_vid(struct net_device *dev, __be16 proto, u16 vid) { struct hsr_port *port; struct hsr_priv *hsr; hsr = netdev_priv(dev); hsr_for_each_port(hsr, port) { switch (port->type) { case HSR_PT_SLAVE_A: case HSR_PT_SLAVE_B: vlan_vid_del(port->dev, proto, vid); break; default: break; } } return 0; } static const struct net_device_ops hsr_device_ops = { .ndo_change_mtu = hsr_dev_change_mtu, .ndo_open = hsr_dev_open, .ndo_stop = hsr_dev_close, .ndo_start_xmit = hsr_dev_xmit, .ndo_change_rx_flags = hsr_change_rx_flags, .ndo_fix_features = hsr_fix_features, .ndo_set_rx_mode = hsr_set_rx_mode, .ndo_vlan_rx_add_vid = hsr_ndo_vlan_rx_add_vid, .ndo_vlan_rx_kill_vid = hsr_ndo_vlan_rx_kill_vid, }; static const struct device_type hsr_type = { .name = "hsr", }; static struct hsr_proto_ops hsr_ops = { .send_sv_frame = send_hsr_supervision_frame, .create_tagged_frame = hsr_create_tagged_frame, .get_untagged_frame = hsr_get_untagged_frame, .drop_frame = hsr_drop_frame, .fill_frame_info = hsr_fill_frame_info, .invalid_dan_ingress_frame = hsr_invalid_dan_ingress_frame, .register_frame_out = hsr_register_frame_out, }; static struct hsr_proto_ops prp_ops = { .send_sv_frame = send_prp_supervision_frame, .create_tagged_frame = prp_create_tagged_frame, .get_untagged_frame = prp_get_untagged_frame, .drop_frame = prp_drop_frame, .fill_frame_info = prp_fill_frame_info, .handle_san_frame = prp_handle_san_frame, .update_san_info = prp_update_san_info, .register_frame_out = prp_register_frame_out, }; void hsr_dev_setup(struct net_device *dev) { eth_hw_addr_random(dev); ether_setup(dev); dev->min_mtu = 0; dev->header_ops = &hsr_header_ops; dev->netdev_ops = &hsr_device_ops; SET_NETDEV_DEVTYPE(dev, &hsr_type); dev->priv_flags |= IFF_NO_QUEUE | IFF_DISABLE_NETPOLL; /* Prevent recursive tx locking */ dev->lltx = true; /* Not sure about this. Taken from bridge code. netdevice.h says * it means "Does not change network namespaces". */ dev->netns_immutable = true; dev->needs_free_netdev = true; dev->hw_features = NETIF_F_SG | NETIF_F_FRAGLIST | NETIF_F_HIGHDMA | NETIF_F_GSO_MASK | NETIF_F_HW_CSUM | NETIF_F_HW_VLAN_CTAG_TX | NETIF_F_HW_VLAN_CTAG_FILTER; dev->features = dev->hw_features; } /* Return true if dev is a HSR master; return false otherwise. */ bool is_hsr_master(struct net_device *dev) { return (dev->netdev_ops->ndo_start_xmit == hsr_dev_xmit); } EXPORT_SYMBOL(is_hsr_master); struct net_device *hsr_get_port_ndev(struct net_device *ndev, enum hsr_port_type pt) { struct hsr_priv *hsr = netdev_priv(ndev); struct hsr_port *port; hsr_for_each_port(hsr, port) if (port->type == pt) return port->dev; return NULL; } EXPORT_SYMBOL(hsr_get_port_ndev); /* Default multicast address for HSR Supervision frames */ static const unsigned char def_multicast_addr[ETH_ALEN] __aligned(2) = { 0x01, 0x15, 0x4e, 0x00, 0x01, 0x00 }; int hsr_dev_finalize(struct net_device *hsr_dev, struct net_device *slave[2], struct net_device *interlink, unsigned char multicast_spec, u8 protocol_version, struct netlink_ext_ack *extack) { bool unregister = false; struct hsr_priv *hsr; int res; hsr = netdev_priv(hsr_dev); INIT_LIST_HEAD(&hsr->ports); INIT_LIST_HEAD(&hsr->node_db); INIT_LIST_HEAD(&hsr->proxy_node_db); spin_lock_init(&hsr->list_lock); eth_hw_addr_set(hsr_dev, slave[0]->dev_addr); /* initialize protocol specific functions */ if (protocol_version == PRP_V1) { /* For PRP, lan_id has most significant 3 bits holding * the net_id of PRP_LAN_ID */ hsr->net_id = PRP_LAN_ID << 1; hsr->proto_ops = &prp_ops; } else { hsr->proto_ops = &hsr_ops; } /* Make sure we recognize frames from ourselves in hsr_rcv() */ res = hsr_create_self_node(hsr, hsr_dev->dev_addr, slave[1]->dev_addr); if (res < 0) return res; spin_lock_init(&hsr->seqnr_lock); /* Overflow soon to find bugs easier: */ hsr->sequence_nr = HSR_SEQNR_START; hsr->sup_sequence_nr = HSR_SUP_SEQNR_START; timer_setup(&hsr->announce_timer, hsr_announce, 0); timer_setup(&hsr->prune_timer, hsr_prune_nodes, 0); timer_setup(&hsr->prune_proxy_timer, hsr_prune_proxy_nodes, 0); timer_setup(&hsr->announce_proxy_timer, hsr_proxy_announce, 0); ether_addr_copy(hsr->sup_multicast_addr, def_multicast_addr); hsr->sup_multicast_addr[ETH_ALEN - 1] = multicast_spec; hsr->prot_version = protocol_version; /* Make sure the 1st call to netif_carrier_on() gets through */ netif_carrier_off(hsr_dev); res = hsr_add_port(hsr, hsr_dev, HSR_PT_MASTER, extack); if (res) goto err_add_master; /* HSR forwarding offload supported in lower device? */ if ((slave[0]->features & NETIF_F_HW_HSR_FWD) && (slave[1]->features & NETIF_F_HW_HSR_FWD)) hsr->fwd_offloaded = true; if ((slave[0]->features & NETIF_F_HW_VLAN_CTAG_FILTER) && (slave[1]->features & NETIF_F_HW_VLAN_CTAG_FILTER)) hsr_dev->features |= NETIF_F_HW_VLAN_CTAG_FILTER; res = register_netdevice(hsr_dev); if (res) goto err_unregister; unregister = true; res = hsr_add_port(hsr, slave[0], HSR_PT_SLAVE_A, extack); if (res) goto err_unregister; res = hsr_add_port(hsr, slave[1], HSR_PT_SLAVE_B, extack); if (res) goto err_unregister; if (protocol_version == PRP_V1) { eth_hw_addr_set(slave[1], slave[0]->dev_addr); call_netdevice_notifiers(NETDEV_CHANGEADDR, slave[1]); } if (interlink) { res = hsr_add_port(hsr, interlink, HSR_PT_INTERLINK, extack); if (res) goto err_unregister; hsr->redbox = true; ether_addr_copy(hsr->macaddress_redbox, interlink->dev_addr); mod_timer(&hsr->prune_proxy_timer, jiffies + msecs_to_jiffies(PRUNE_PROXY_PERIOD)); } hsr_debugfs_init(hsr, hsr_dev); mod_timer(&hsr->prune_timer, jiffies + msecs_to_jiffies(PRUNE_PERIOD)); return 0; err_unregister: hsr_del_ports(hsr); err_add_master: hsr_del_self_node(hsr); if (unregister) unregister_netdevice(hsr_dev); return res; }
210 209 210 209 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 // SPDX-License-Identifier: GPL-2.0-only /* * Copyright (C) 2015 - ARM Ltd * Author: Marc Zyngier <marc.zyngier@arm.com> */ #include <hyp/debug-sr.h> #include <linux/kvm_host.h> #include <asm/kvm_hyp.h> void __debug_switch_to_guest(struct kvm_vcpu *vcpu) { __debug_switch_to_guest_common(vcpu); } void __debug_switch_to_host(struct kvm_vcpu *vcpu) { __debug_switch_to_host_common(vcpu); }
54 54 54 31 31 31 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 // SPDX-License-Identifier: GPL-2.0 /* * linux/fs/stat.c * * Copyright (C) 1991, 1992 Linus Torvalds */ #include <linux/blkdev.h> #include <linux/export.h> #include <linux/mm.h> #include <linux/errno.h> #include <linux/file.h> #include <linux/highuid.h> #include <linux/fs.h> #include <linux/namei.h> #include <linux/security.h> #include <linux/cred.h> #include <linux/syscalls.h> #include <linux/pagemap.h> #include <linux/compat.h> #include <linux/iversion.h> #include <linux/uaccess.h> #include <asm/unistd.h> #include <trace/events/timestamp.h> #include "internal.h" #include "mount.h" /** * fill_mg_cmtime - Fill in the mtime and ctime and flag ctime as QUERIED * @stat: where to store the resulting values * @request_mask: STATX_* values requested * @inode: inode from which to grab the c/mtime * * Given @inode, grab the ctime and mtime out if it and store the result * in @stat. When fetching the value, flag it as QUERIED (if not already) * so the next write will record a distinct timestamp. * * NB: The QUERIED flag is tracked in the ctime, but we set it there even * if only the mtime was requested, as that ensures that the next mtime * change will be distinct. */ void fill_mg_cmtime(struct kstat *stat, u32 request_mask, struct inode *inode) { atomic_t *pcn = (atomic_t *)&inode->i_ctime_nsec; /* If neither time was requested, then don't report them */ if (!(request_mask & (STATX_CTIME|STATX_MTIME))) { stat->result_mask &= ~(STATX_CTIME|STATX_MTIME); return; } stat->mtime = inode_get_mtime(inode); stat->ctime.tv_sec = inode->i_ctime_sec; stat->ctime.tv_nsec = (u32)atomic_read(pcn); if (!(stat->ctime.tv_nsec & I_CTIME_QUERIED)) stat->ctime.tv_nsec = ((u32)atomic_fetch_or(I_CTIME_QUERIED, pcn)); stat->ctime.tv_nsec &= ~I_CTIME_QUERIED; trace_fill_mg_cmtime(inode, &stat->ctime, &stat->mtime); } EXPORT_SYMBOL(fill_mg_cmtime); /** * generic_fillattr - Fill in the basic attributes from the inode struct * @idmap: idmap of the mount the inode was found from * @request_mask: statx request_mask * @inode: Inode to use as the source * @stat: Where to fill in the attributes * * Fill in the basic attributes in the kstat structure from data that's to be * found on the VFS inode structure. This is the default if no getattr inode * operation is supplied. * * If the inode has been found through an idmapped mount the idmap of * the vfsmount must be passed through @idmap. This function will then * take care to map the inode according to @idmap before filling in the * uid and gid filds. On non-idmapped mounts or if permission checking is to be * performed on the raw inode simply pass @nop_mnt_idmap. */ void generic_fillattr(struct mnt_idmap *idmap, u32 request_mask, struct inode *inode, struct kstat *stat) { vfsuid_t vfsuid = i_uid_into_vfsuid(idmap, inode); vfsgid_t vfsgid = i_gid_into_vfsgid(idmap, inode); stat->dev = inode->i_sb->s_dev; stat->ino = inode->i_ino; stat->mode = inode->i_mode; stat->nlink = inode->i_nlink; stat->uid = vfsuid_into_kuid(vfsuid); stat->gid = vfsgid_into_kgid(vfsgid); stat->rdev = inode->i_rdev; stat->size = i_size_read(inode); stat->atime = inode_get_atime(inode); if (is_mgtime(inode)) { fill_mg_cmtime(stat, request_mask, inode); } else { stat->ctime = inode_get_ctime(inode); stat->mtime = inode_get_mtime(inode); } stat->blksize = i_blocksize(inode); stat->blocks = inode->i_blocks; if ((request_mask & STATX_CHANGE_COOKIE) && IS_I_VERSION(inode)) { stat->result_mask |= STATX_CHANGE_COOKIE; stat->change_cookie = inode_query_iversion(inode); } } EXPORT_SYMBOL(generic_fillattr); /** * generic_fill_statx_attr - Fill in the statx attributes from the inode flags * @inode: Inode to use as the source * @stat: Where to fill in the attribute flags * * Fill in the STATX_ATTR_* flags in the kstat structure for properties of the * inode that are published on i_flags and enforced by the VFS. */ void generic_fill_statx_attr(struct inode *inode, struct kstat *stat) { if (inode->i_flags & S_IMMUTABLE) stat->attributes |= STATX_ATTR_IMMUTABLE; if (inode->i_flags & S_APPEND) stat->attributes |= STATX_ATTR_APPEND; stat->attributes_mask |= KSTAT_ATTR_VFS_FLAGS; } EXPORT_SYMBOL(generic_fill_statx_attr); /** * generic_fill_statx_atomic_writes - Fill in atomic writes statx attributes * @stat: Where to fill in the attribute flags * @unit_min: Minimum supported atomic write length in bytes * @unit_max: Maximum supported atomic write length in bytes * @unit_max_opt: Optimised maximum supported atomic write length in bytes * * Fill in the STATX{_ATTR}_WRITE_ATOMIC flags in the kstat structure from * atomic write unit_min and unit_max values. */ void generic_fill_statx_atomic_writes(struct kstat *stat, unsigned int unit_min, unsigned int unit_max, unsigned int unit_max_opt) { /* Confirm that the request type is known */ stat->result_mask |= STATX_WRITE_ATOMIC; /* Confirm that the file attribute type is known */ stat->attributes_mask |= STATX_ATTR_WRITE_ATOMIC; if (unit_min) { stat->atomic_write_unit_min = unit_min; stat->atomic_write_unit_max = unit_max; stat->atomic_write_unit_max_opt = unit_max_opt; /* Initially only allow 1x segment */ stat->atomic_write_segments_max = 1; /* Confirm atomic writes are actually supported */ stat->attributes |= STATX_ATTR_WRITE_ATOMIC; } } EXPORT_SYMBOL_GPL(generic_fill_statx_atomic_writes); /** * vfs_getattr_nosec - getattr without security checks * @path: file to get attributes from * @stat: structure to return attributes in * @request_mask: STATX_xxx flags indicating what the caller wants * @query_flags: Query mode (AT_STATX_SYNC_TYPE) * * Get attributes without calling security_inode_getattr. * * Currently the only caller other than vfs_getattr is internal to the * filehandle lookup code, which uses only the inode number and returns no * attributes to any user. Any other code probably wants vfs_getattr. */ int vfs_getattr_nosec(const struct path *path, struct kstat *stat, u32 request_mask, unsigned int query_flags) { struct mnt_idmap *idmap; struct inode *inode = d_backing_inode(path->dentry); memset(stat, 0, sizeof(*stat)); stat->result_mask |= STATX_BASIC_STATS; query_flags &= AT_STATX_SYNC_TYPE; /* allow the fs to override these if it really wants to */ /* SB_NOATIME means filesystem supplies dummy atime value */ if (inode->i_sb->s_flags & SB_NOATIME) stat->result_mask &= ~STATX_ATIME; /* * Note: If you add another clause to set an attribute flag, please * update attributes_mask below. */ if (IS_AUTOMOUNT(inode)) stat->attributes |= STATX_ATTR_AUTOMOUNT; if (IS_DAX(inode)) stat->attributes |= STATX_ATTR_DAX; stat->attributes_mask |= (STATX_ATTR_AUTOMOUNT | STATX_ATTR_DAX); idmap = mnt_idmap(path->mnt); if (inode->i_op->getattr) { int ret; ret = inode->i_op->getattr(idmap, path, stat, request_mask, query_flags); if (ret) return ret; } else { generic_fillattr(idmap, request_mask, inode, stat); } /* * If this is a block device inode, override the filesystem attributes * with the block device specific parameters that need to be obtained * from the bdev backing inode. */ if (S_ISBLK(stat->mode)) bdev_statx(path, stat, request_mask); return 0; } EXPORT_SYMBOL(vfs_getattr_nosec); /* * vfs_getattr - Get the enhanced basic attributes of a file * @path: The file of interest * @stat: Where to return the statistics * @request_mask: STATX_xxx flags indicating what the caller wants * @query_flags: Query mode (AT_STATX_SYNC_TYPE) * * Ask the filesystem for a file's attributes. The caller must indicate in * request_mask and query_flags to indicate what they want. * * If the file is remote, the filesystem can be forced to update the attributes * from the backing store by passing AT_STATX_FORCE_SYNC in query_flags or can * suppress the update by passing AT_STATX_DONT_SYNC. * * Bits must have been set in request_mask to indicate which attributes the * caller wants retrieving. Any such attribute not requested may be returned * anyway, but the value may be approximate, and, if remote, may not have been * synchronised with the server. * * 0 will be returned on success, and a -ve error code if unsuccessful. */ int vfs_getattr(const struct path *path, struct kstat *stat, u32 request_mask, unsigned int query_flags) { int retval; retval = security_inode_getattr(path); if (unlikely(retval)) return retval; return vfs_getattr_nosec(path, stat, request_mask, query_flags); } EXPORT_SYMBOL(vfs_getattr); /** * vfs_fstat - Get the basic attributes by file descriptor * @fd: The file descriptor referring to the file of interest * @stat: The result structure to fill in. * * This function is a wrapper around vfs_getattr(). The main difference is * that it uses a file descriptor to determine the file location. * * 0 will be returned on success, and a -ve error code if unsuccessful. */ int vfs_fstat(int fd, struct kstat *stat) { CLASS(fd_raw, f)(fd); if (fd_empty(f)) return -EBADF; return vfs_getattr(&fd_file(f)->f_path, stat, STATX_BASIC_STATS, 0); } static int statx_lookup_flags(int flags) { int lookup_flags = 0; if (!(flags & AT_SYMLINK_NOFOLLOW)) lookup_flags |= LOOKUP_FOLLOW; if (!(flags & AT_NO_AUTOMOUNT)) lookup_flags |= LOOKUP_AUTOMOUNT; return lookup_flags; } static int vfs_statx_path(struct path *path, int flags, struct kstat *stat, u32 request_mask) { int error = vfs_getattr(path, stat, request_mask, flags); if (error) return error; if (request_mask & STATX_MNT_ID_UNIQUE) { stat->mnt_id = real_mount(path->mnt)->mnt_id_unique; stat->result_mask |= STATX_MNT_ID_UNIQUE; } else { stat->mnt_id = real_mount(path->mnt)->mnt_id; stat->result_mask |= STATX_MNT_ID; } if (path_mounted(path)) stat->attributes |= STATX_ATTR_MOUNT_ROOT; stat->attributes_mask |= STATX_ATTR_MOUNT_ROOT; return 0; } static int vfs_statx_fd(int fd, int flags, struct kstat *stat, u32 request_mask) { CLASS(fd_raw, f)(fd); if (fd_empty(f)) return -EBADF; return vfs_statx_path(&fd_file(f)->f_path, flags, stat, request_mask); } /** * vfs_statx - Get basic and extra attributes by filename * @dfd: A file descriptor representing the base dir for a relative filename * @filename: The name of the file of interest * @flags: Flags to control the query * @stat: The result structure to fill in. * @request_mask: STATX_xxx flags indicating what the caller wants * * This function is a wrapper around vfs_getattr(). The main difference is * that it uses a filename and base directory to determine the file location. * Additionally, the use of AT_SYMLINK_NOFOLLOW in flags will prevent a symlink * at the given name from being referenced. * * 0 will be returned on success, and a -ve error code if unsuccessful. */ static int vfs_statx(int dfd, struct filename *filename, int flags, struct kstat *stat, u32 request_mask) { struct path path; unsigned int lookup_flags = statx_lookup_flags(flags); int error; if (flags & ~(AT_SYMLINK_NOFOLLOW | AT_NO_AUTOMOUNT | AT_EMPTY_PATH | AT_STATX_SYNC_TYPE)) return -EINVAL; retry: error = filename_lookup(dfd, filename, lookup_flags, &path, NULL); if (error) return error; error = vfs_statx_path(&path, flags, stat, request_mask); path_put(&path); if (retry_estale(error, lookup_flags)) { lookup_flags |= LOOKUP_REVAL; goto retry; } return error; } int vfs_fstatat(int dfd, const char __user *filename, struct kstat *stat, int flags) { int ret; int statx_flags = flags | AT_NO_AUTOMOUNT; struct filename *name = getname_maybe_null(filename, flags); if (!name && dfd >= 0) return vfs_fstat(dfd, stat); ret = vfs_statx(dfd, name, statx_flags, stat, STATX_BASIC_STATS); putname(name); return ret; } #ifdef __ARCH_WANT_OLD_STAT /* * For backward compatibility? Maybe this should be moved * into arch/i386 instead? */ static int cp_old_stat(struct kstat *stat, struct __old_kernel_stat __user * statbuf) { static int warncount = 5; struct __old_kernel_stat tmp; if (warncount > 0) { warncount--; printk(KERN_WARNING "VFS: Warning: %s using old stat() call. Recompile your binary.\n", current->comm); } else if (warncount < 0) { /* it's laughable, but... */ warncount = 0; } memset(&tmp, 0, sizeof(struct __old_kernel_stat)); tmp.st_dev = old_encode_dev(stat->dev); tmp.st_ino = stat->ino; if (sizeof(tmp.st_ino) < sizeof(stat->ino) && tmp.st_ino != stat->ino) return -EOVERFLOW; tmp.st_mode = stat->mode; tmp.st_nlink = stat->nlink; if (tmp.st_nlink != stat->nlink) return -EOVERFLOW; SET_UID(tmp.st_uid, from_kuid_munged(current_user_ns(), stat->uid)); SET_GID(tmp.st_gid, from_kgid_munged(current_user_ns(), stat->gid)); tmp.st_rdev = old_encode_dev(stat->rdev); #if BITS_PER_LONG == 32 if (stat->size > MAX_NON_LFS) return -EOVERFLOW; #endif tmp.st_size = stat->size; tmp.st_atime = stat->atime.tv_sec; tmp.st_mtime = stat->mtime.tv_sec; tmp.st_ctime = stat->ctime.tv_sec; return copy_to_user(statbuf,&tmp,sizeof(tmp)) ? -EFAULT : 0; } SYSCALL_DEFINE2(stat, const char __user *, filename, struct __old_kernel_stat __user *, statbuf) { struct kstat stat; int error; error = vfs_stat(filename, &stat); if (unlikely(error)) return error; return cp_old_stat(&stat, statbuf); } SYSCALL_DEFINE2(lstat, const char __user *, filename, struct __old_kernel_stat __user *, statbuf) { struct kstat stat; int error; error = vfs_lstat(filename, &stat); if (unlikely(error)) return error; return cp_old_stat(&stat, statbuf); } SYSCALL_DEFINE2(fstat, unsigned int, fd, struct __old_kernel_stat __user *, statbuf) { struct kstat stat; int error; error = vfs_fstat(fd, &stat); if (unlikely(error)) return error; return cp_old_stat(&stat, statbuf); } #endif /* __ARCH_WANT_OLD_STAT */ #ifdef __ARCH_WANT_NEW_STAT #ifndef INIT_STRUCT_STAT_PADDING # define INIT_STRUCT_STAT_PADDING(st) memset(&st, 0, sizeof(st)) #endif static int cp_new_stat(struct kstat *stat, struct stat __user *statbuf) { struct stat tmp; if (sizeof(tmp.st_dev) < 4 && !old_valid_dev(stat->dev)) return -EOVERFLOW; if (sizeof(tmp.st_rdev) < 4 && !old_valid_dev(stat->rdev)) return -EOVERFLOW; #if BITS_PER_LONG == 32 if (stat->size > MAX_NON_LFS) return -EOVERFLOW; #endif INIT_STRUCT_STAT_PADDING(tmp); tmp.st_dev = new_encode_dev(stat->dev); tmp.st_ino = stat->ino; if (sizeof(tmp.st_ino) < sizeof(stat->ino) && tmp.st_ino != stat->ino) return -EOVERFLOW; tmp.st_mode = stat->mode; tmp.st_nlink = stat->nlink; if (tmp.st_nlink != stat->nlink) return -EOVERFLOW; SET_UID(tmp.st_uid, from_kuid_munged(current_user_ns(), stat->uid)); SET_GID(tmp.st_gid, from_kgid_munged(current_user_ns(), stat->gid)); tmp.st_rdev = new_encode_dev(stat->rdev); tmp.st_size = stat->size; tmp.st_atime = stat->atime.tv_sec; tmp.st_mtime = stat->mtime.tv_sec; tmp.st_ctime = stat->ctime.tv_sec; #ifdef STAT_HAVE_NSEC tmp.st_atime_nsec = stat->atime.tv_nsec; tmp.st_mtime_nsec = stat->mtime.tv_nsec; tmp.st_ctime_nsec = stat->ctime.tv_nsec; #endif tmp.st_blocks = stat->blocks; tmp.st_blksize = stat->blksize; return copy_to_user(statbuf,&tmp,sizeof(tmp)) ? -EFAULT : 0; } SYSCALL_DEFINE2(newstat, const char __user *, filename, struct stat __user *, statbuf) { struct kstat stat; int error; error = vfs_stat(filename, &stat); if (unlikely(error)) return error; return cp_new_stat(&stat, statbuf); } SYSCALL_DEFINE2(newlstat, const char __user *, filename, struct stat __user *, statbuf) { struct kstat stat; int error; error = vfs_lstat(filename, &stat); if (unlikely(error)) return error; return cp_new_stat(&stat, statbuf); } #if !defined(__ARCH_WANT_STAT64) || defined(__ARCH_WANT_SYS_NEWFSTATAT) SYSCALL_DEFINE4(newfstatat, int, dfd, const char __user *, filename, struct stat __user *, statbuf, int, flag) { struct kstat stat; int error; error = vfs_fstatat(dfd, filename, &stat, flag); if (unlikely(error)) return error; return cp_new_stat(&stat, statbuf); } #endif SYSCALL_DEFINE2(newfstat, unsigned int, fd, struct stat __user *, statbuf) { struct kstat stat; int error; error = vfs_fstat(fd, &stat); if (unlikely(error)) return error; return cp_new_stat(&stat, statbuf); } #endif static int do_readlinkat(int dfd, const char __user *pathname, char __user *buf, int bufsiz) { struct path path; struct filename *name; int error; unsigned int lookup_flags = LOOKUP_EMPTY; if (bufsiz <= 0) return -EINVAL; retry: name = getname_flags(pathname, lookup_flags); error = filename_lookup(dfd, name, lookup_flags, &path, NULL); if (unlikely(error)) { putname(name); return error; } /* * AFS mountpoints allow readlink(2) but are not symlinks */ if (d_is_symlink(path.dentry) || d_backing_inode(path.dentry)->i_op->readlink) { error = security_inode_readlink(path.dentry); if (!error) { touch_atime(&path); error = vfs_readlink(path.dentry, buf, bufsiz); } } else { error = (name->name[0] == '\0') ? -ENOENT : -EINVAL; } path_put(&path); putname(name); if (retry_estale(error, lookup_flags)) { lookup_flags |= LOOKUP_REVAL; goto retry; } return error; } SYSCALL_DEFINE4(readlinkat, int, dfd, const char __user *, pathname, char __user *, buf, int, bufsiz) { return do_readlinkat(dfd, pathname, buf, bufsiz); } SYSCALL_DEFINE3(readlink, const char __user *, path, char __user *, buf, int, bufsiz) { return do_readlinkat(AT_FDCWD, path, buf, bufsiz); } /* ---------- LFS-64 ----------- */ #if defined(__ARCH_WANT_STAT64) || defined(__ARCH_WANT_COMPAT_STAT64) #ifndef INIT_STRUCT_STAT64_PADDING # define INIT_STRUCT_STAT64_PADDING(st) memset(&st, 0, sizeof(st)) #endif static long cp_new_stat64(struct kstat *stat, struct stat64 __user *statbuf) { struct stat64 tmp; INIT_STRUCT_STAT64_PADDING(tmp); #ifdef CONFIG_MIPS /* mips has weird padding, so we don't get 64 bits there */ tmp.st_dev = new_encode_dev(stat->dev); tmp.st_rdev = new_encode_dev(stat->rdev); #else tmp.st_dev = huge_encode_dev(stat->dev); tmp.st_rdev = huge_encode_dev(stat->rdev); #endif tmp.st_ino = stat->ino; if (sizeof(tmp.st_ino) < sizeof(stat->ino) && tmp.st_ino != stat->ino) return -EOVERFLOW; #ifdef STAT64_HAS_BROKEN_ST_INO tmp.__st_ino = stat->ino; #endif tmp.st_mode = stat->mode; tmp.st_nlink = stat->nlink; tmp.st_uid = from_kuid_munged(current_user_ns(), stat->uid); tmp.st_gid = from_kgid_munged(current_user_ns(), stat->gid); tmp.st_atime = stat->atime.tv_sec; tmp.st_atime_nsec = stat->atime.tv_nsec; tmp.st_mtime = stat->mtime.tv_sec; tmp.st_mtime_nsec = stat->mtime.tv_nsec; tmp.st_ctime = stat->ctime.tv_sec; tmp.st_ctime_nsec = stat->ctime.tv_nsec; tmp.st_size = stat->size; tmp.st_blocks = stat->blocks; tmp.st_blksize = stat->blksize; return copy_to_user(statbuf,&tmp,sizeof(tmp)) ? -EFAULT : 0; } SYSCALL_DEFINE2(stat64, const char __user *, filename, struct stat64 __user *, statbuf) { struct kstat stat; int error = vfs_stat(filename, &stat); if (!error) error = cp_new_stat64(&stat, statbuf); return error; } SYSCALL_DEFINE2(lstat64, const char __user *, filename, struct stat64 __user *, statbuf) { struct kstat stat; int error = vfs_lstat(filename, &stat); if (!error) error = cp_new_stat64(&stat, statbuf); return error; } SYSCALL_DEFINE2(fstat64, unsigned long, fd, struct stat64 __user *, statbuf) { struct kstat stat; int error = vfs_fstat(fd, &stat); if (!error) error = cp_new_stat64(&stat, statbuf); return error; } SYSCALL_DEFINE4(fstatat64, int, dfd, const char __user *, filename, struct stat64 __user *, statbuf, int, flag) { struct kstat stat; int error; error = vfs_fstatat(dfd, filename, &stat, flag); if (error) return error; return cp_new_stat64(&stat, statbuf); } #endif /* __ARCH_WANT_STAT64 || __ARCH_WANT_COMPAT_STAT64 */ static noinline_for_stack int cp_statx(const struct kstat *stat, struct statx __user *buffer) { struct statx tmp; memset(&tmp, 0, sizeof(tmp)); /* STATX_CHANGE_COOKIE is kernel-only for now */ tmp.stx_mask = stat->result_mask & ~STATX_CHANGE_COOKIE; tmp.stx_blksize = stat->blksize; /* STATX_ATTR_CHANGE_MONOTONIC is kernel-only for now */ tmp.stx_attributes = stat->attributes & ~STATX_ATTR_CHANGE_MONOTONIC; tmp.stx_nlink = stat->nlink; tmp.stx_uid = from_kuid_munged(current_user_ns(), stat->uid); tmp.stx_gid = from_kgid_munged(current_user_ns(), stat->gid); tmp.stx_mode = stat->mode; tmp.stx_ino = stat->ino; tmp.stx_size = stat->size; tmp.stx_blocks = stat->blocks; tmp.stx_attributes_mask = stat->attributes_mask; tmp.stx_atime.tv_sec = stat->atime.tv_sec; tmp.stx_atime.tv_nsec = stat->atime.tv_nsec; tmp.stx_btime.tv_sec = stat->btime.tv_sec; tmp.stx_btime.tv_nsec = stat->btime.tv_nsec; tmp.stx_ctime.tv_sec = stat->ctime.tv_sec; tmp.stx_ctime.tv_nsec = stat->ctime.tv_nsec; tmp.stx_mtime.tv_sec = stat->mtime.tv_sec; tmp.stx_mtime.tv_nsec = stat->mtime.tv_nsec; tmp.stx_rdev_major = MAJOR(stat->rdev); tmp.stx_rdev_minor = MINOR(stat->rdev); tmp.stx_dev_major = MAJOR(stat->dev); tmp.stx_dev_minor = MINOR(stat->dev); tmp.stx_mnt_id = stat->mnt_id; tmp.stx_dio_mem_align = stat->dio_mem_align; tmp.stx_dio_offset_align = stat->dio_offset_align; tmp.stx_dio_read_offset_align = stat->dio_read_offset_align; tmp.stx_subvol = stat->subvol; tmp.stx_atomic_write_unit_min = stat->atomic_write_unit_min; tmp.stx_atomic_write_unit_max = stat->atomic_write_unit_max; tmp.stx_atomic_write_segments_max = stat->atomic_write_segments_max; tmp.stx_atomic_write_unit_max_opt = stat->atomic_write_unit_max_opt; return copy_to_user(buffer, &tmp, sizeof(tmp)) ? -EFAULT : 0; } int do_statx(int dfd, struct filename *filename, unsigned int flags, unsigned int mask, struct statx __user *buffer) { struct kstat stat; int error; if (mask & STATX__RESERVED) return -EINVAL; if ((flags & AT_STATX_SYNC_TYPE) == AT_STATX_SYNC_TYPE) return -EINVAL; /* * STATX_CHANGE_COOKIE is kernel-only for now. Ignore requests * from userland. */ mask &= ~STATX_CHANGE_COOKIE; error = vfs_statx(dfd, filename, flags, &stat, mask); if (error) return error; return cp_statx(&stat, buffer); } int do_statx_fd(int fd, unsigned int flags, unsigned int mask, struct statx __user *buffer) { struct kstat stat; int error; if (mask & STATX__RESERVED) return -EINVAL; if ((flags & AT_STATX_SYNC_TYPE) == AT_STATX_SYNC_TYPE) return -EINVAL; /* * STATX_CHANGE_COOKIE is kernel-only for now. Ignore requests * from userland. */ mask &= ~STATX_CHANGE_COOKIE; error = vfs_statx_fd(fd, flags, &stat, mask); if (error) return error; return cp_statx(&stat, buffer); } /** * sys_statx - System call to get enhanced stats * @dfd: Base directory to pathwalk from *or* fd to stat. * @filename: File to stat or either NULL or "" with AT_EMPTY_PATH * @flags: AT_* flags to control pathwalk. * @mask: Parts of statx struct actually required. * @buffer: Result buffer. * * Note that fstat() can be emulated by setting dfd to the fd of interest, * supplying "" (or preferably NULL) as the filename and setting AT_EMPTY_PATH * in the flags. */ SYSCALL_DEFINE5(statx, int, dfd, const char __user *, filename, unsigned, flags, unsigned int, mask, struct statx __user *, buffer) { int ret; struct filename *name = getname_maybe_null(filename, flags); if (!name && dfd >= 0) return do_statx_fd(dfd, flags & ~AT_NO_AUTOMOUNT, mask, buffer); ret = do_statx(dfd, name, flags, mask, buffer); putname(name); return ret; } #if defined(CONFIG_COMPAT) && defined(__ARCH_WANT_COMPAT_STAT) static int cp_compat_stat(struct kstat *stat, struct compat_stat __user *ubuf) { struct compat_stat tmp; if (sizeof(tmp.st_dev) < 4 && !old_valid_dev(stat->dev)) return -EOVERFLOW; if (sizeof(tmp.st_rdev) < 4 && !old_valid_dev(stat->rdev)) return -EOVERFLOW; memset(&tmp, 0, sizeof(tmp)); tmp.st_dev = new_encode_dev(stat->dev); tmp.st_ino = stat->ino; if (sizeof(tmp.st_ino) < sizeof(stat->ino) && tmp.st_ino != stat->ino) return -EOVERFLOW; tmp.st_mode = stat->mode; tmp.st_nlink = stat->nlink; if (tmp.st_nlink != stat->nlink) return -EOVERFLOW; SET_UID(tmp.st_uid, from_kuid_munged(current_user_ns(), stat->uid)); SET_GID(tmp.st_gid, from_kgid_munged(current_user_ns(), stat->gid)); tmp.st_rdev = new_encode_dev(stat->rdev); if ((u64) stat->size > MAX_NON_LFS) return -EOVERFLOW; tmp.st_size = stat->size; tmp.st_atime = stat->atime.tv_sec; tmp.st_atime_nsec = stat->atime.tv_nsec; tmp.st_mtime = stat->mtime.tv_sec; tmp.st_mtime_nsec = stat->mtime.tv_nsec; tmp.st_ctime = stat->ctime.tv_sec; tmp.st_ctime_nsec = stat->ctime.tv_nsec; tmp.st_blocks = stat->blocks; tmp.st_blksize = stat->blksize; return copy_to_user(ubuf, &tmp, sizeof(tmp)) ? -EFAULT : 0; } COMPAT_SYSCALL_DEFINE2(newstat, const char __user *, filename, struct compat_stat __user *, statbuf) { struct kstat stat; int error; error = vfs_stat(filename, &stat); if (error) return error; return cp_compat_stat(&stat, statbuf); } COMPAT_SYSCALL_DEFINE2(newlstat, const char __user *, filename, struct compat_stat __user *, statbuf) { struct kstat stat; int error; error = vfs_lstat(filename, &stat); if (error) return error; return cp_compat_stat(&stat, statbuf); } #ifndef __ARCH_WANT_STAT64 COMPAT_SYSCALL_DEFINE4(newfstatat, unsigned int, dfd, const char __user *, filename, struct compat_stat __user *, statbuf, int, flag) { struct kstat stat; int error; error = vfs_fstatat(dfd, filename, &stat, flag); if (error) return error; return cp_compat_stat(&stat, statbuf); } #endif COMPAT_SYSCALL_DEFINE2(newfstat, unsigned int, fd, struct compat_stat __user *, statbuf) { struct kstat stat; int error = vfs_fstat(fd, &stat); if (!error) error = cp_compat_stat(&stat, statbuf); return error; } #endif /* Caller is here responsible for sufficient locking (ie. inode->i_lock) */ void __inode_add_bytes(struct inode *inode, loff_t bytes) { inode->i_blocks += bytes >> 9; bytes &= 511; inode->i_bytes += bytes; if (inode->i_bytes >= 512) { inode->i_blocks++; inode->i_bytes -= 512; } } EXPORT_SYMBOL(__inode_add_bytes); void inode_add_bytes(struct inode *inode, loff_t bytes) { spin_lock(&inode->i_lock); __inode_add_bytes(inode, bytes); spin_unlock(&inode->i_lock); } EXPORT_SYMBOL(inode_add_bytes); void __inode_sub_bytes(struct inode *inode, loff_t bytes) { inode->i_blocks -= bytes >> 9; bytes &= 511; if (inode->i_bytes < bytes) { inode->i_blocks--; inode->i_bytes += 512; } inode->i_bytes -= bytes; } EXPORT_SYMBOL(__inode_sub_bytes); void inode_sub_bytes(struct inode *inode, loff_t bytes) { spin_lock(&inode->i_lock); __inode_sub_bytes(inode, bytes); spin_unlock(&inode->i_lock); } EXPORT_SYMBOL(inode_sub_bytes); loff_t inode_get_bytes(struct inode *inode) { loff_t ret; spin_lock(&inode->i_lock); ret = __inode_get_bytes(inode); spin_unlock(&inode->i_lock); return ret; } EXPORT_SYMBOL(inode_get_bytes); void inode_set_bytes(struct inode *inode, loff_t bytes) { /* Caller is here responsible for sufficient locking * (ie. inode->i_lock) */ inode->i_blocks = bytes >> 9; inode->i_bytes = bytes & 511; } EXPORT_SYMBOL(inode_set_bytes);
241 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 /* SPDX-License-Identifier: GPL-2.0 */ #undef TRACE_SYSTEM #define TRACE_SYSTEM sock #if !defined(_TRACE_SOCK_H) || defined(TRACE_HEADER_MULTI_READ) #define _TRACE_SOCK_H #include <net/sock.h> #include <net/ipv6.h> #include <linux/tracepoint.h> #include <linux/ipv6.h> #include <linux/tcp.h> #include <trace/events/net_probe_common.h> #define family_names \ EM(AF_INET) \ EMe(AF_INET6) /* The protocol traced by inet_sock_set_state */ #define inet_protocol_names \ EM(IPPROTO_TCP) \ EM(IPPROTO_SCTP) \ EMe(IPPROTO_MPTCP) #define tcp_state_names \ EM(TCP_ESTABLISHED) \ EM(TCP_SYN_SENT) \ EM(TCP_SYN_RECV) \ EM(TCP_FIN_WAIT1) \ EM(TCP_FIN_WAIT2) \ EM(TCP_TIME_WAIT) \ EM(TCP_CLOSE) \ EM(TCP_CLOSE_WAIT) \ EM(TCP_LAST_ACK) \ EM(TCP_LISTEN) \ EM(TCP_CLOSING) \ EMe(TCP_NEW_SYN_RECV) #define skmem_kind_names \ EM(SK_MEM_SEND) \ EMe(SK_MEM_RECV) /* enums need to be exported to user space */ #undef EM #undef EMe #define EM(a) TRACE_DEFINE_ENUM(a); #define EMe(a) TRACE_DEFINE_ENUM(a); family_names inet_protocol_names tcp_state_names skmem_kind_names #undef EM #undef EMe #define EM(a) { a, #a }, #define EMe(a) { a, #a } #define show_family_name(val) \ __print_symbolic(val, family_names) #define show_inet_protocol_name(val) \ __print_symbolic(val, inet_protocol_names) #define show_tcp_state_name(val) \ __print_symbolic(val, tcp_state_names) #define show_skmem_kind_names(val) \ __print_symbolic(val, skmem_kind_names) TRACE_EVENT(sock_rcvqueue_full, TP_PROTO(struct sock *sk, struct sk_buff *skb), TP_ARGS(sk, skb), TP_STRUCT__entry( __field(int, rmem_alloc) __field(unsigned int, truesize) __field(int, sk_rcvbuf) ), TP_fast_assign( __entry->rmem_alloc = atomic_read(&sk->sk_rmem_alloc); __entry->truesize = skb->truesize; __entry->sk_rcvbuf = READ_ONCE(sk->sk_rcvbuf); ), TP_printk("rmem_alloc=%d truesize=%u sk_rcvbuf=%d", __entry->rmem_alloc, __entry->truesize, __entry->sk_rcvbuf) ); TRACE_EVENT(sock_exceed_buf_limit, TP_PROTO(struct sock *sk, struct proto *prot, long allocated, int kind), TP_ARGS(sk, prot, allocated, kind), TP_STRUCT__entry( __array(char, name, 32) __array(long, sysctl_mem, 3) __field(long, allocated) __field(int, sysctl_rmem) __field(int, rmem_alloc) __field(int, sysctl_wmem) __field(int, wmem_alloc) __field(int, wmem_queued) __field(int, kind) ), TP_fast_assign( strscpy(__entry->name, prot->name, 32); __entry->sysctl_mem[0] = READ_ONCE(prot->sysctl_mem[0]); __entry->sysctl_mem[1] = READ_ONCE(prot->sysctl_mem[1]); __entry->sysctl_mem[2] = READ_ONCE(prot->sysctl_mem[2]); __entry->allocated = allocated; __entry->sysctl_rmem = sk_get_rmem0(sk, prot); __entry->rmem_alloc = atomic_read(&sk->sk_rmem_alloc); __entry->sysctl_wmem = sk_get_wmem0(sk, prot); __entry->wmem_alloc = refcount_read(&sk->sk_wmem_alloc); __entry->wmem_queued = READ_ONCE(sk->sk_wmem_queued); __entry->kind = kind; ), TP_printk("proto:%s sysctl_mem=%ld,%ld,%ld allocated=%ld sysctl_rmem=%d rmem_alloc=%d sysctl_wmem=%d wmem_alloc=%d wmem_queued=%d kind=%s", __entry->name, __entry->sysctl_mem[0], __entry->sysctl_mem[1], __entry->sysctl_mem[2], __entry->allocated, __entry->sysctl_rmem, __entry->rmem_alloc, __entry->sysctl_wmem, __entry->wmem_alloc, __entry->wmem_queued, show_skmem_kind_names(__entry->kind) ) ); TRACE_EVENT(inet_sock_set_state, TP_PROTO(const struct sock *sk, const int oldstate, const int newstate), TP_ARGS(sk, oldstate, newstate), TP_STRUCT__entry( __field(const void *, skaddr) __field(int, oldstate) __field(int, newstate) __field(__u16, sport) __field(__u16, dport) __field(__u16, family) __field(__u16, protocol) __array(__u8, saddr, 4) __array(__u8, daddr, 4) __array(__u8, saddr_v6, 16) __array(__u8, daddr_v6, 16) ), TP_fast_assign( const struct inet_sock *inet = inet_sk(sk); __be32 *p32; __entry->skaddr = sk; __entry->oldstate = oldstate; __entry->newstate = newstate; __entry->family = sk->sk_family; __entry->protocol = sk->sk_protocol; __entry->sport = ntohs(inet->inet_sport); __entry->dport = ntohs(inet->inet_dport); p32 = (__be32 *) __entry->saddr; *p32 = inet->inet_saddr; p32 = (__be32 *) __entry->daddr; *p32 = inet->inet_daddr; TP_STORE_ADDRS(__entry, inet->inet_saddr, inet->inet_daddr, sk->sk_v6_rcv_saddr, sk->sk_v6_daddr); ), TP_printk("family=%s protocol=%s sport=%hu dport=%hu saddr=%pI4 daddr=%pI4 saddrv6=%pI6c daddrv6=%pI6c oldstate=%s newstate=%s", show_family_name(__entry->family), show_inet_protocol_name(__entry->protocol), __entry->sport, __entry->dport, __entry->saddr, __entry->daddr, __entry->saddr_v6, __entry->daddr_v6, show_tcp_state_name(__entry->oldstate), show_tcp_state_name(__entry->newstate)) ); TRACE_EVENT(inet_sk_error_report, TP_PROTO(const struct sock *sk), TP_ARGS(sk), TP_STRUCT__entry( __field(int, error) __field(__u16, sport) __field(__u16, dport) __field(__u16, family) __field(__u16, protocol) __array(__u8, saddr, 4) __array(__u8, daddr, 4) __array(__u8, saddr_v6, 16) __array(__u8, daddr_v6, 16) ), TP_fast_assign( const struct inet_sock *inet = inet_sk(sk); __be32 *p32; __entry->error = sk->sk_err; __entry->family = sk->sk_family; __entry->protocol = sk->sk_protocol; __entry->sport = ntohs(inet->inet_sport); __entry->dport = ntohs(inet->inet_dport); p32 = (__be32 *) __entry->saddr; *p32 = inet->inet_saddr; p32 = (__be32 *) __entry->daddr; *p32 = inet->inet_daddr; TP_STORE_ADDRS(__entry, inet->inet_saddr, inet->inet_daddr, sk->sk_v6_rcv_saddr, sk->sk_v6_daddr); ), TP_printk("family=%s protocol=%s sport=%hu dport=%hu saddr=%pI4 daddr=%pI4 saddrv6=%pI6c daddrv6=%pI6c error=%d", show_family_name(__entry->family), show_inet_protocol_name(__entry->protocol), __entry->sport, __entry->dport, __entry->saddr, __entry->daddr, __entry->saddr_v6, __entry->daddr_v6, __entry->error) ); TRACE_EVENT(sk_data_ready, TP_PROTO(const struct sock *sk), TP_ARGS(sk), TP_STRUCT__entry( __field(const void *, skaddr) __field(__u16, family) __field(__u16, protocol) __field(unsigned long, ip) ), TP_fast_assign( __entry->skaddr = sk; __entry->family = sk->sk_family; __entry->protocol = sk->sk_protocol; __entry->ip = _RET_IP_; ), TP_printk("family=%u protocol=%u func=%ps", __entry->family, __entry->protocol, (void *)__entry->ip) ); /* * sock send/recv msg length */ DECLARE_EVENT_CLASS(sock_msg_length, TP_PROTO(struct sock *sk, int ret, int flags), TP_ARGS(sk, ret, flags), TP_STRUCT__entry( __field(void *, sk) __field(__u16, family) __field(__u16, protocol) __field(int, ret) __field(int, flags) ), TP_fast_assign( __entry->sk = sk; __entry->family = sk->sk_family; __entry->protocol = sk->sk_protocol; __entry->ret = ret; __entry->flags = flags; ), TP_printk("sk address = %p, family = %s protocol = %s, length = %d, error = %d, flags = 0x%x", __entry->sk, show_family_name(__entry->family), show_inet_protocol_name(__entry->protocol), !(__entry->flags & MSG_PEEK) ? (__entry->ret > 0 ? __entry->ret : 0) : 0, __entry->ret < 0 ? __entry->ret : 0, __entry->flags) ); DEFINE_EVENT(sock_msg_length, sock_send_length, TP_PROTO(struct sock *sk, int ret, int flags), TP_ARGS(sk, ret, flags) ); DEFINE_EVENT(sock_msg_length, sock_recv_length, TP_PROTO(struct sock *sk, int ret, int flags), TP_ARGS(sk, ret, flags) ); #endif /* _TRACE_SOCK_H */ /* This part must be outside protection */ #include <trace/define_trace.h>
1 623 9 3 602 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 /* SPDX-License-Identifier: GPL-2.0-only */ #ifndef __ASM_GENERIC_BITOPS_GENERIC_NON_ATOMIC_H #define __ASM_GENERIC_BITOPS_GENERIC_NON_ATOMIC_H #include <linux/bits.h> #include <asm/barrier.h> #ifndef _LINUX_BITOPS_H #error only <linux/bitops.h> can be included directly #endif /* * Generic definitions for bit operations, should not be used in regular code * directly. */ /** * generic___set_bit - Set a bit in memory * @nr: the bit to set * @addr: the address to start counting from * * Unlike set_bit(), this function is non-atomic and may be reordered. * If it's called on the same region of memory simultaneously, the effect * may be that only one operation succeeds. */ static __always_inline void generic___set_bit(unsigned long nr, volatile unsigned long *addr) { unsigned long mask = BIT_MASK(nr); unsigned long *p = ((unsigned long *)addr) + BIT_WORD(nr); *p |= mask; } static __always_inline void generic___clear_bit(unsigned long nr, volatile unsigned long *addr) { unsigned long mask = BIT_MASK(nr); unsigned long *p = ((unsigned long *)addr) + BIT_WORD(nr); *p &= ~mask; } /** * generic___change_bit - Toggle a bit in memory * @nr: the bit to change * @addr: the address to start counting from * * Unlike change_bit(), this function is non-atomic and may be reordered. * If it's called on the same region of memory simultaneously, the effect * may be that only one operation succeeds. */ static __always_inline void generic___change_bit(unsigned long nr, volatile unsigned long *addr) { unsigned long mask = BIT_MASK(nr); unsigned long *p = ((unsigned long *)addr) + BIT_WORD(nr); *p ^= mask; } /** * generic___test_and_set_bit - Set a bit and return its old value * @nr: Bit to set * @addr: Address to count from * * This operation is non-atomic and can be reordered. * If two examples of this operation race, one can appear to succeed * but actually fail. You must protect multiple accesses with a lock. */ static __always_inline bool generic___test_and_set_bit(unsigned long nr, volatile unsigned long *addr) { unsigned long mask = BIT_MASK(nr); unsigned long *p = ((unsigned long *)addr) + BIT_WORD(nr); unsigned long old = *p; *p = old | mask; return (old & mask) != 0; } /** * generic___test_and_clear_bit - Clear a bit and return its old value * @nr: Bit to clear * @addr: Address to count from * * This operation is non-atomic and can be reordered. * If two examples of this operation race, one can appear to succeed * but actually fail. You must protect multiple accesses with a lock. */ static __always_inline bool generic___test_and_clear_bit(unsigned long nr, volatile unsigned long *addr) { unsigned long mask = BIT_MASK(nr); unsigned long *p = ((unsigned long *)addr) + BIT_WORD(nr); unsigned long old = *p; *p = old & ~mask; return (old & mask) != 0; } /* WARNING: non atomic and it can be reordered! */ static __always_inline bool generic___test_and_change_bit(unsigned long nr, volatile unsigned long *addr) { unsigned long mask = BIT_MASK(nr); unsigned long *p = ((unsigned long *)addr) + BIT_WORD(nr); unsigned long old = *p; *p = old ^ mask; return (old & mask) != 0; } /** * generic_test_bit - Determine whether a bit is set * @nr: bit number to test * @addr: Address to start counting from */ static __always_inline bool generic_test_bit(unsigned long nr, const volatile unsigned long *addr) { /* * Unlike the bitops with the '__' prefix above, this one *is* atomic, * so `volatile` must always stay here with no cast-aways. See * `Documentation/atomic_bitops.txt` for the details. */ return 1UL & (addr[BIT_WORD(nr)] >> (nr & (BITS_PER_LONG-1))); } /** * generic_test_bit_acquire - Determine, with acquire semantics, whether a bit is set * @nr: bit number to test * @addr: Address to start counting from */ static __always_inline bool generic_test_bit_acquire(unsigned long nr, const volatile unsigned long *addr) { unsigned long *p = ((unsigned long *)addr) + BIT_WORD(nr); return 1UL & (smp_load_acquire(p) >> (nr & (BITS_PER_LONG-1))); } /* * const_*() definitions provide good compile-time optimizations when * the passed arguments can be resolved at compile time. */ #define const___set_bit generic___set_bit #define const___clear_bit generic___clear_bit #define const___change_bit generic___change_bit #define const___test_and_set_bit generic___test_and_set_bit #define const___test_and_clear_bit generic___test_and_clear_bit #define const___test_and_change_bit generic___test_and_change_bit #define const_test_bit_acquire generic_test_bit_acquire /** * const_test_bit - Determine whether a bit is set * @nr: bit number to test * @addr: Address to start counting from * * A version of generic_test_bit() which discards the `volatile` qualifier to * allow a compiler to optimize code harder. Non-atomic and to be called only * for testing compile-time constants, e.g. by the corresponding macros, not * directly from "regular" code. */ static __always_inline bool const_test_bit(unsigned long nr, const volatile unsigned long *addr) { const unsigned long *p = (const unsigned long *)addr + BIT_WORD(nr); unsigned long mask = BIT_MASK(nr); unsigned long val = *p; return !!(val & mask); } #endif /* __ASM_GENERIC_BITOPS_GENERIC_NON_ATOMIC_H */
69 69 69 69 930 927 926 933 929 58 930 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 // SPDX-License-Identifier: GPL-2.0 #include <linux/irq_work.h> #include <linux/spinlock.h> #include <linux/task_work.h> #include <linux/resume_user_mode.h> static struct callback_head work_exited; /* all we need is ->next == NULL */ #ifdef CONFIG_IRQ_WORK static void task_work_set_notify_irq(struct irq_work *entry) { test_and_set_tsk_thread_flag(current, TIF_NOTIFY_RESUME); } static DEFINE_PER_CPU(struct irq_work, irq_work_NMI_resume) = IRQ_WORK_INIT_HARD(task_work_set_notify_irq); #endif /** * task_work_add - ask the @task to execute @work->func() * @task: the task which should run the callback * @work: the callback to run * @notify: how to notify the targeted task * * Queue @work for task_work_run() below and notify the @task if @notify * is @TWA_RESUME, @TWA_SIGNAL, @TWA_SIGNAL_NO_IPI or @TWA_NMI_CURRENT. * * @TWA_SIGNAL works like signals, in that the it will interrupt the targeted * task and run the task_work, regardless of whether the task is currently * running in the kernel or userspace. * @TWA_SIGNAL_NO_IPI works like @TWA_SIGNAL, except it doesn't send a * reschedule IPI to force the targeted task to reschedule and run task_work. * This can be advantageous if there's no strict requirement that the * task_work be run as soon as possible, just whenever the task enters the * kernel anyway. * @TWA_RESUME work is run only when the task exits the kernel and returns to * user mode, or before entering guest mode. * @TWA_NMI_CURRENT works like @TWA_RESUME, except it can only be used for the * current @task and if the current context is NMI. * * Fails if the @task is exiting/exited and thus it can't process this @work. * Otherwise @work->func() will be called when the @task goes through one of * the aforementioned transitions, or exits. * * If the targeted task is exiting, then an error is returned and the work item * is not queued. It's up to the caller to arrange for an alternative mechanism * in that case. * * Note: there is no ordering guarantee on works queued here. The task_work * list is LIFO. * * RETURNS: * 0 if succeeds or -ESRCH. */ int task_work_add(struct task_struct *task, struct callback_head *work, enum task_work_notify_mode notify) { struct callback_head *head; if (notify == TWA_NMI_CURRENT) { if (WARN_ON_ONCE(task != current)) return -EINVAL; if (!IS_ENABLED(CONFIG_IRQ_WORK)) return -EINVAL; } else { kasan_record_aux_stack(work); } head = READ_ONCE(task->task_works); do { if (unlikely(head == &work_exited)) return -ESRCH; work->next = head; } while (!try_cmpxchg(&task->task_works, &head, work)); switch (notify) { case TWA_NONE: break; case TWA_RESUME: set_notify_resume(task); break; case TWA_SIGNAL: set_notify_signal(task); break; case TWA_SIGNAL_NO_IPI: __set_notify_signal(task); break; #ifdef CONFIG_IRQ_WORK case TWA_NMI_CURRENT: irq_work_queue(this_cpu_ptr(&irq_work_NMI_resume)); break; #endif default: WARN_ON_ONCE(1); break; } return 0; } /** * task_work_cancel_match - cancel a pending work added by task_work_add() * @task: the task which should execute the work * @match: match function to call * @data: data to be passed in to match function * * RETURNS: * The found work or NULL if not found. */ struct callback_head * task_work_cancel_match(struct task_struct *task, bool (*match)(struct callback_head *, void *data), void *data) { struct callback_head **pprev = &task->task_works; struct callback_head *work; unsigned long flags; if (likely(!task_work_pending(task))) return NULL; /* * If cmpxchg() fails we continue without updating pprev. * Either we raced with task_work_add() which added the * new entry before this work, we will find it again. Or * we raced with task_work_run(), *pprev == NULL/exited. */ raw_spin_lock_irqsave(&task->pi_lock, flags); work = READ_ONCE(*pprev); while (work) { if (!match(work, data)) { pprev = &work->next; work = READ_ONCE(*pprev); } else if (try_cmpxchg(pprev, &work, work->next)) break; } raw_spin_unlock_irqrestore(&task->pi_lock, flags); return work; } static bool task_work_func_match(struct callback_head *cb, void *data) { return cb->func == data; } /** * task_work_cancel_func - cancel a pending work matching a function added by task_work_add() * @task: the task which should execute the func's work * @func: identifies the func to match with a work to remove * * Find the last queued pending work with ->func == @func and remove * it from queue. * * RETURNS: * The found work or NULL if not found. */ struct callback_head * task_work_cancel_func(struct task_struct *task, task_work_func_t func) { return task_work_cancel_match(task, task_work_func_match, func); } static bool task_work_match(struct callback_head *cb, void *data) { return cb == data; } /** * task_work_cancel - cancel a pending work added by task_work_add() * @task: the task which should execute the work * @cb: the callback to remove if queued * * Remove a callback from a task's queue if queued. * * RETURNS: * True if the callback was queued and got cancelled, false otherwise. */ bool task_work_cancel(struct task_struct *task, struct callback_head *cb) { struct callback_head *ret; ret = task_work_cancel_match(task, task_work_match, cb); return ret == cb; } /** * task_work_run - execute the works added by task_work_add() * * Flush the pending works. Should be used by the core kernel code. * Called before the task returns to the user-mode or stops, or when * it exits. In the latter case task_work_add() can no longer add the * new work after task_work_run() returns. */ void task_work_run(void) { struct task_struct *task = current; struct callback_head *work, *head, *next; for (;;) { /* * work->func() can do task_work_add(), do not set * work_exited unless the list is empty. */ work = READ_ONCE(task->task_works); do { head = NULL; if (!work) { if (task->flags & PF_EXITING) head = &work_exited; else break; } } while (!try_cmpxchg(&task->task_works, &work, head)); if (!work) break; /* * Synchronize with task_work_cancel_match(). It can not remove * the first entry == work, cmpxchg(task_works) must fail. * But it can remove another entry from the ->next list. */ raw_spin_lock_irq(&task->pi_lock); raw_spin_unlock_irq(&task->pi_lock); do { next = work->next; work->func(work); work = next; cond_resched(); } while (work); } }
203 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 /* SPDX-License-Identifier: GPL-2.0 */ /* * Prevent the compiler from merging or refetching reads or writes. The * compiler is also forbidden from reordering successive instances of * READ_ONCE and WRITE_ONCE, but only when the compiler is aware of some * particular ordering. One way to make the compiler aware of ordering is to * put the two invocations of READ_ONCE or WRITE_ONCE in different C * statements. * * These two macros will also work on aggregate data types like structs or * unions. * * Their two major use cases are: (1) Mediating communication between * process-level code and irq/NMI handlers, all running on the same CPU, * and (2) Ensuring that the compiler does not fold, spindle, or otherwise * mutilate accesses that either do not require ordering or that interact * with an explicit memory barrier or atomic instruction that provides the * required ordering. */ #ifndef __ASM_GENERIC_RWONCE_H #define __ASM_GENERIC_RWONCE_H #ifndef __ASSEMBLY__ #include <linux/compiler_types.h> #include <linux/kasan-checks.h> #include <linux/kcsan-checks.h> /* * Yes, this permits 64-bit accesses on 32-bit architectures. These will * actually be atomic in some cases (namely Armv7 + LPAE), but for others we * rely on the access being split into 2x32-bit accesses for a 32-bit quantity * (e.g. a virtual address) and a strong prevailing wind. */ #define compiletime_assert_rwonce_type(t) \ compiletime_assert(__native_word(t) || sizeof(t) == sizeof(long long), \ "Unsupported access size for {READ,WRITE}_ONCE().") /* * Use __READ_ONCE() instead of READ_ONCE() if you do not require any * atomicity. Note that this may result in tears! */ #ifndef __READ_ONCE #define __READ_ONCE(x) (*(const volatile __unqual_scalar_typeof(x) *)&(x)) #endif #define READ_ONCE(x) \ ({ \ compiletime_assert_rwonce_type(x); \ __READ_ONCE(x); \ }) #define __WRITE_ONCE(x, val) \ do { \ *(volatile typeof(x) *)&(x) = (val); \ } while (0) #define WRITE_ONCE(x, val) \ do { \ compiletime_assert_rwonce_type(x); \ __WRITE_ONCE(x, val); \ } while (0) static __no_sanitize_or_inline unsigned long __read_once_word_nocheck(const void *addr) { return __READ_ONCE(*(unsigned long *)addr); } /* * Use READ_ONCE_NOCHECK() instead of READ_ONCE() if you need to load a * word from memory atomically but without telling KASAN/KCSAN. This is * usually used by unwinding code when walking the stack of a running process. */ #define READ_ONCE_NOCHECK(x) \ ({ \ compiletime_assert(sizeof(x) == sizeof(unsigned long), \ "Unsupported access size for READ_ONCE_NOCHECK()."); \ (typeof(x))__read_once_word_nocheck(&(x)); \ }) static __no_sanitize_or_inline unsigned long read_word_at_a_time(const void *addr) { /* open-coded instrument_read(addr, 1) */ kasan_check_read(addr, 1); kcsan_check_read(addr, 1); /* * This load can race with concurrent stores to out-of-bounds memory, * but READ_ONCE() can't be used because it requires higher alignment * than plain loads in arm64 builds with LTO. */ return *(unsigned long *)addr; } #endif /* __ASSEMBLY__ */ #endif /* __ASM_GENERIC_RWONCE_H */
962 335 1067 1072 1071 1067 336 335 336 961 960 961 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 // SPDX-License-Identifier: GPL-2.0-only #include "cgroup-internal.h" #include <linux/sched/cputime.h> #include <linux/bpf.h> #include <linux/btf.h> #include <linux/btf_ids.h> #include <trace/events/cgroup.h> static DEFINE_SPINLOCK(rstat_base_lock); static DEFINE_PER_CPU(struct llist_head, rstat_backlog_list); static void cgroup_base_stat_flush(struct cgroup *cgrp, int cpu); /* * Determines whether a given css can participate in rstat. * css's that are cgroup::self use rstat for base stats. * Other css's associated with a subsystem use rstat only when * they define the ss->css_rstat_flush callback. */ static inline bool css_uses_rstat(struct cgroup_subsys_state *css) { return css_is_self(css) || css->ss->css_rstat_flush != NULL; } static struct css_rstat_cpu *css_rstat_cpu( struct cgroup_subsys_state *css, int cpu) { return per_cpu_ptr(css->rstat_cpu, cpu); } static struct cgroup_rstat_base_cpu *cgroup_rstat_base_cpu( struct cgroup *cgrp, int cpu) { return per_cpu_ptr(cgrp->rstat_base_cpu, cpu); } static spinlock_t *ss_rstat_lock(struct cgroup_subsys *ss) { if (ss) return &ss->rstat_ss_lock; return &rstat_base_lock; } static inline struct llist_head *ss_lhead_cpu(struct cgroup_subsys *ss, int cpu) { if (ss) return per_cpu_ptr(ss->lhead, cpu); return per_cpu_ptr(&rstat_backlog_list, cpu); } /** * css_rstat_updated - keep track of updated rstat_cpu * @css: target cgroup subsystem state * @cpu: cpu on which rstat_cpu was updated * * Atomically inserts the css in the ss's llist for the given cpu. This is * reentrant safe i.e. safe against softirq, hardirq and nmi. The ss's llist * will be processed at the flush time to create the update tree. * * NOTE: if the user needs the guarantee that the updater either add itself in * the lockless list or the concurrent flusher flushes its updated stats, a * memory barrier is needed before the call to css_rstat_updated() i.e. a * barrier after updating the per-cpu stats and before calling * css_rstat_updated(). */ __bpf_kfunc void css_rstat_updated(struct cgroup_subsys_state *css, int cpu) { struct llist_head *lhead; struct css_rstat_cpu *rstatc; struct css_rstat_cpu __percpu *rstatc_pcpu; struct llist_node *self; /* * Since bpf programs can call this function, prevent access to * uninitialized rstat pointers. */ if (!css_uses_rstat(css)) return; lockdep_assert_preemption_disabled(); /* * For archs withnot nmi safe cmpxchg or percpu ops support, ignore * the requests from nmi context. */ if ((!IS_ENABLED(CONFIG_ARCH_HAVE_NMI_SAFE_CMPXCHG) || !IS_ENABLED(CONFIG_ARCH_HAS_NMI_SAFE_THIS_CPU_OPS)) && in_nmi()) return; rstatc = css_rstat_cpu(css, cpu); /* * If already on list return. This check is racy and smp_mb() is needed * to pair it with the smp_mb() in css_process_update_tree() if the * guarantee that the updated stats are visible to concurrent flusher is * needed. */ if (llist_on_list(&rstatc->lnode)) return; /* * This function can be renentered by irqs and nmis for the same cgroup * and may try to insert the same per-cpu lnode into the llist. Note * that llist_add() does not protect against such scenarios. * * To protect against such stacked contexts of irqs/nmis, we use the * fact that lnode points to itself when not on a list and then use * this_cpu_cmpxchg() to atomically set to NULL to select the winner * which will call llist_add(). The losers can assume the insertion is * successful and the winner will eventually add the per-cpu lnode to * the llist. */ self = &rstatc->lnode; rstatc_pcpu = css->rstat_cpu; if (this_cpu_cmpxchg(rstatc_pcpu->lnode.next, self, NULL) != self) return; lhead = ss_lhead_cpu(css->ss, cpu); llist_add(&rstatc->lnode, lhead); } static void __css_process_update_tree(struct cgroup_subsys_state *css, int cpu) { /* put @css and all ancestors on the corresponding updated lists */ while (true) { struct css_rstat_cpu *rstatc = css_rstat_cpu(css, cpu); struct cgroup_subsys_state *parent = css->parent; struct css_rstat_cpu *prstatc; /* * Both additions and removals are bottom-up. If a cgroup * is already in the tree, all ancestors are. */ if (rstatc->updated_next) break; /* Root has no parent to link it to, but mark it busy */ if (!parent) { rstatc->updated_next = css; break; } prstatc = css_rstat_cpu(parent, cpu); rstatc->updated_next = prstatc->updated_children; prstatc->updated_children = css; css = parent; } } static void css_process_update_tree(struct cgroup_subsys *ss, int cpu) { struct llist_head *lhead = ss_lhead_cpu(ss, cpu); struct llist_node *lnode; while ((lnode = llist_del_first_init(lhead))) { struct css_rstat_cpu *rstatc; /* * smp_mb() is needed here (more specifically in between * init_llist_node() and per-cpu stats flushing) if the * guarantee is required by a rstat user where etiher the * updater should add itself on the lockless list or the * flusher flush the stats updated by the updater who have * observed that they are already on the list. The * corresponding barrier pair for this one should be before * css_rstat_updated() by the user. * * For now, there aren't any such user, so not adding the * barrier here but if such a use-case arise, please add * smp_mb() here. */ rstatc = container_of(lnode, struct css_rstat_cpu, lnode); __css_process_update_tree(rstatc->owner, cpu); } } /** * css_rstat_push_children - push children css's into the given list * @head: current head of the list (= subtree root) * @child: first child of the root * @cpu: target cpu * Return: A new singly linked list of css's to be flushed * * Iteratively traverse down the css_rstat_cpu updated tree level by * level and push all the parents first before their next level children * into a singly linked list via the rstat_flush_next pointer built from the * tail backward like "pushing" css's into a stack. The root is pushed by * the caller. */ static struct cgroup_subsys_state *css_rstat_push_children( struct cgroup_subsys_state *head, struct cgroup_subsys_state *child, int cpu) { struct cgroup_subsys_state *cnext = child; /* Next head of child css level */ struct cgroup_subsys_state *ghead = NULL; /* Head of grandchild css level */ struct cgroup_subsys_state *parent, *grandchild; struct css_rstat_cpu *crstatc; child->rstat_flush_next = NULL; /* * The subsystem rstat lock must be held for the whole duration from * here as the rstat_flush_next list is being constructed to when * it is consumed later in css_rstat_flush(). */ lockdep_assert_held(ss_rstat_lock(head->ss)); /* * Notation: -> updated_next pointer * => rstat_flush_next pointer * * Assuming the following sample updated_children lists: * P: C1 -> C2 -> P * C1: G11 -> G12 -> C1 * C2: G21 -> G22 -> C2 * * After 1st iteration: * head => C2 => C1 => NULL * ghead => G21 => G11 => NULL * * After 2nd iteration: * head => G12 => G11 => G22 => G21 => C2 => C1 => NULL */ next_level: while (cnext) { child = cnext; cnext = child->rstat_flush_next; parent = child->parent; /* updated_next is parent cgroup terminated if !NULL */ while (child != parent) { child->rstat_flush_next = head; head = child; crstatc = css_rstat_cpu(child, cpu); grandchild = crstatc->updated_children; if (grandchild != child) { /* Push the grand child to the next level */ crstatc->updated_children = child; grandchild->rstat_flush_next = ghead; ghead = grandchild; } child = crstatc->updated_next; crstatc->updated_next = NULL; } } if (ghead) { cnext = ghead; ghead = NULL; goto next_level; } return head; } /** * css_rstat_updated_list - build a list of updated css's to be flushed * @root: root of the css subtree to traverse * @cpu: target cpu * Return: A singly linked list of css's to be flushed * * Walks the updated rstat_cpu tree on @cpu from @root. During traversal, * each returned css is unlinked from the updated tree. * * The only ordering guarantee is that, for a parent and a child pair * covered by a given traversal, the child is before its parent in * the list. * * Note that updated_children is self terminated and points to a list of * child css's if not empty. Whereas updated_next is like a sibling link * within the children list and terminated by the parent css. An exception * here is the css root whose updated_next can be self terminated. */ static struct cgroup_subsys_state *css_rstat_updated_list( struct cgroup_subsys_state *root, int cpu) { struct css_rstat_cpu *rstatc = css_rstat_cpu(root, cpu); struct cgroup_subsys_state *head = NULL, *parent, *child; css_process_update_tree(root->ss, cpu); /* Return NULL if this subtree is not on-list */ if (!rstatc->updated_next) return NULL; /* * Unlink @root from its parent. As the updated_children list is * singly linked, we have to walk it to find the removal point. */ parent = root->parent; if (parent) { struct css_rstat_cpu *prstatc; struct cgroup_subsys_state **nextp; prstatc = css_rstat_cpu(parent, cpu); nextp = &prstatc->updated_children; while (*nextp != root) { struct css_rstat_cpu *nrstatc; nrstatc = css_rstat_cpu(*nextp, cpu); WARN_ON_ONCE(*nextp == parent); nextp = &nrstatc->updated_next; } *nextp = rstatc->updated_next; } rstatc->updated_next = NULL; /* Push @root to the list first before pushing the children */ head = root; root->rstat_flush_next = NULL; child = rstatc->updated_children; rstatc->updated_children = root; if (child != root) head = css_rstat_push_children(head, child, cpu); return head; } /* * A hook for bpf stat collectors to attach to and flush their stats. * Together with providing bpf kfuncs for css_rstat_updated() and * css_rstat_flush(), this enables a complete workflow where bpf progs that * collect cgroup stats can integrate with rstat for efficient flushing. * * A static noinline declaration here could cause the compiler to optimize away * the function. A global noinline declaration will keep the definition, but may * optimize away the callsite. Therefore, __weak is needed to ensure that the * call is still emitted, by telling the compiler that we don't know what the * function might eventually be. */ __bpf_hook_start(); __weak noinline void bpf_rstat_flush(struct cgroup *cgrp, struct cgroup *parent, int cpu) { } __bpf_hook_end(); /* * Helper functions for locking. * * This makes it easier to diagnose locking issues and contention in * production environments. The parameter @cpu_in_loop indicate lock * was released and re-taken when collection data from the CPUs. The * value -1 is used when obtaining the main lock else this is the CPU * number processed last. */ static inline void __css_rstat_lock(struct cgroup_subsys_state *css, int cpu_in_loop) __acquires(ss_rstat_lock(css->ss)) { struct cgroup *cgrp = css->cgroup; spinlock_t *lock; bool contended; lock = ss_rstat_lock(css->ss); contended = !spin_trylock_irq(lock); if (contended) { trace_cgroup_rstat_lock_contended(cgrp, cpu_in_loop, contended); spin_lock_irq(lock); } trace_cgroup_rstat_locked(cgrp, cpu_in_loop, contended); } static inline void __css_rstat_unlock(struct cgroup_subsys_state *css, int cpu_in_loop) __releases(ss_rstat_lock(css->ss)) { struct cgroup *cgrp = css->cgroup; spinlock_t *lock; lock = ss_rstat_lock(css->ss); trace_cgroup_rstat_unlock(cgrp, cpu_in_loop, false); spin_unlock_irq(lock); } /** * css_rstat_flush - flush stats in @css's rstat subtree * @css: target cgroup subsystem state * * Collect all per-cpu stats in @css's subtree into the global counters * and propagate them upwards. After this function returns, all rstat * nodes in the subtree have up-to-date ->stat. * * This also gets all rstat nodes in the subtree including @css off the * ->updated_children lists. * * This function may block. */ __bpf_kfunc void css_rstat_flush(struct cgroup_subsys_state *css) { int cpu; bool is_self = css_is_self(css); /* * Since bpf programs can call this function, prevent access to * uninitialized rstat pointers. */ if (!css_uses_rstat(css)) return; might_sleep(); for_each_possible_cpu(cpu) { struct cgroup_subsys_state *pos; /* Reacquire for each CPU to avoid disabling IRQs too long */ __css_rstat_lock(css, cpu); pos = css_rstat_updated_list(css, cpu); for (; pos; pos = pos->rstat_flush_next) { if (is_self) { cgroup_base_stat_flush(pos->cgroup, cpu); bpf_rstat_flush(pos->cgroup, cgroup_parent(pos->cgroup), cpu); } else pos->ss->css_rstat_flush(pos, cpu); } __css_rstat_unlock(css, cpu); if (!cond_resched()) cpu_relax(); } } int css_rstat_init(struct cgroup_subsys_state *css) { struct cgroup *cgrp = css->cgroup; int cpu; bool is_self = css_is_self(css); if (is_self) { /* the root cgrp has rstat_base_cpu preallocated */ if (!cgrp->rstat_base_cpu) { cgrp->rstat_base_cpu = alloc_percpu(struct cgroup_rstat_base_cpu); if (!cgrp->rstat_base_cpu) return -ENOMEM; } } else if (css->ss->css_rstat_flush == NULL) return 0; /* the root cgrp's self css has rstat_cpu preallocated */ if (!css->rstat_cpu) { css->rstat_cpu = alloc_percpu(struct css_rstat_cpu); if (!css->rstat_cpu) { if (is_self) free_percpu(cgrp->rstat_base_cpu); return -ENOMEM; } } /* ->updated_children list is self terminated */ for_each_possible_cpu(cpu) { struct css_rstat_cpu *rstatc = css_rstat_cpu(css, cpu); rstatc->owner = rstatc->updated_children = css; init_llist_node(&rstatc->lnode); if (is_self) { struct cgroup_rstat_base_cpu *rstatbc; rstatbc = cgroup_rstat_base_cpu(cgrp, cpu); u64_stats_init(&rstatbc->bsync); } } return 0; } void css_rstat_exit(struct cgroup_subsys_state *css) { int cpu; if (!css_uses_rstat(css)) return; if (!css->rstat_cpu) return; css_rstat_flush(css); /* sanity check */ for_each_possible_cpu(cpu) { struct css_rstat_cpu *rstatc = css_rstat_cpu(css, cpu); if (WARN_ON_ONCE(rstatc->updated_children != css) || WARN_ON_ONCE(rstatc->updated_next)) return; } if (css_is_self(css)) { struct cgroup *cgrp = css->cgroup; free_percpu(cgrp->rstat_base_cpu); cgrp->rstat_base_cpu = NULL; } free_percpu(css->rstat_cpu); css->rstat_cpu = NULL; } /** * ss_rstat_init - subsystem-specific rstat initialization * @ss: target subsystem * * If @ss is NULL, the static locks associated with the base stats * are initialized. If @ss is non-NULL, the subsystem-specific locks * are initialized. */ int __init ss_rstat_init(struct cgroup_subsys *ss) { int cpu; if (ss) { ss->lhead = alloc_percpu(struct llist_head); if (!ss->lhead) return -ENOMEM; } spin_lock_init(ss_rstat_lock(ss)); for_each_possible_cpu(cpu) init_llist_head(ss_lhead_cpu(ss, cpu)); return 0; } /* * Functions for cgroup basic resource statistics implemented on top of * rstat. */ static void cgroup_base_stat_add(struct cgroup_base_stat *dst_bstat, struct cgroup_base_stat *src_bstat) { dst_bstat->cputime.utime += src_bstat->cputime.utime; dst_bstat->cputime.stime += src_bstat->cputime.stime; dst_bstat->cputime.sum_exec_runtime += src_bstat->cputime.sum_exec_runtime; #ifdef CONFIG_SCHED_CORE dst_bstat->forceidle_sum += src_bstat->forceidle_sum; #endif dst_bstat->ntime += src_bstat->ntime; } static void cgroup_base_stat_sub(struct cgroup_base_stat *dst_bstat, struct cgroup_base_stat *src_bstat) { dst_bstat->cputime.utime -= src_bstat->cputime.utime; dst_bstat->cputime.stime -= src_bstat->cputime.stime; dst_bstat->cputime.sum_exec_runtime -= src_bstat->cputime.sum_exec_runtime; #ifdef CONFIG_SCHED_CORE dst_bstat->forceidle_sum -= src_bstat->forceidle_sum; #endif dst_bstat->ntime -= src_bstat->ntime; } static void cgroup_base_stat_flush(struct cgroup *cgrp, int cpu) { struct cgroup_rstat_base_cpu *rstatbc = cgroup_rstat_base_cpu(cgrp, cpu); struct cgroup *parent = cgroup_parent(cgrp); struct cgroup_rstat_base_cpu *prstatbc; struct cgroup_base_stat delta; unsigned seq; /* Root-level stats are sourced from system-wide CPU stats */ if (!parent) return; /* fetch the current per-cpu values */ do { seq = __u64_stats_fetch_begin(&rstatbc->bsync); delta = rstatbc->bstat; } while (__u64_stats_fetch_retry(&rstatbc->bsync, seq)); /* propagate per-cpu delta to cgroup and per-cpu global statistics */ cgroup_base_stat_sub(&delta, &rstatbc->last_bstat); cgroup_base_stat_add(&cgrp->bstat, &delta); cgroup_base_stat_add(&rstatbc->last_bstat, &delta); cgroup_base_stat_add(&rstatbc->subtree_bstat, &delta); /* propagate cgroup and per-cpu global delta to parent (unless that's root) */ if (cgroup_parent(parent)) { delta = cgrp->bstat; cgroup_base_stat_sub(&delta, &cgrp->last_bstat); cgroup_base_stat_add(&parent->bstat, &delta); cgroup_base_stat_add(&cgrp->last_bstat, &delta); delta = rstatbc->subtree_bstat; prstatbc = cgroup_rstat_base_cpu(parent, cpu); cgroup_base_stat_sub(&delta, &rstatbc->last_subtree_bstat); cgroup_base_stat_add(&prstatbc->subtree_bstat, &delta); cgroup_base_stat_add(&rstatbc->last_subtree_bstat, &delta); } } static struct cgroup_rstat_base_cpu * cgroup_base_stat_cputime_account_begin(struct cgroup *cgrp, unsigned long *flags) { struct cgroup_rstat_base_cpu *rstatbc; rstatbc = get_cpu_ptr(cgrp->rstat_base_cpu); *flags = u64_stats_update_begin_irqsave(&rstatbc->bsync); return rstatbc; } static void cgroup_base_stat_cputime_account_end(struct cgroup *cgrp, struct cgroup_rstat_base_cpu *rstatbc, unsigned long flags) { u64_stats_update_end_irqrestore(&rstatbc->bsync, flags); css_rstat_updated(&cgrp->self, smp_processor_id()); put_cpu_ptr(rstatbc); } void __cgroup_account_cputime(struct cgroup *cgrp, u64 delta_exec) { struct cgroup_rstat_base_cpu *rstatbc; unsigned long flags; rstatbc = cgroup_base_stat_cputime_account_begin(cgrp, &flags); rstatbc->bstat.cputime.sum_exec_runtime += delta_exec; cgroup_base_stat_cputime_account_end(cgrp, rstatbc, flags); } void __cgroup_account_cputime_field(struct cgroup *cgrp, enum cpu_usage_stat index, u64 delta_exec) { struct cgroup_rstat_base_cpu *rstatbc; unsigned long flags; rstatbc = cgroup_base_stat_cputime_account_begin(cgrp, &flags); switch (index) { case CPUTIME_NICE: rstatbc->bstat.ntime += delta_exec; fallthrough; case CPUTIME_USER: rstatbc->bstat.cputime.utime += delta_exec; break; case CPUTIME_SYSTEM: case CPUTIME_IRQ: case CPUTIME_SOFTIRQ: rstatbc->bstat.cputime.stime += delta_exec; break; #ifdef CONFIG_SCHED_CORE case CPUTIME_FORCEIDLE: rstatbc->bstat.forceidle_sum += delta_exec; break; #endif default: break; } cgroup_base_stat_cputime_account_end(cgrp, rstatbc, flags); } /* * compute the cputime for the root cgroup by getting the per cpu data * at a global level, then categorizing the fields in a manner consistent * with how it is done by __cgroup_account_cputime_field for each bit of * cpu time attributed to a cgroup. */ static void root_cgroup_cputime(struct cgroup_base_stat *bstat) { struct task_cputime *cputime = &bstat->cputime; int i; memset(bstat, 0, sizeof(*bstat)); for_each_possible_cpu(i) { struct kernel_cpustat kcpustat; u64 *cpustat = kcpustat.cpustat; u64 user = 0; u64 sys = 0; kcpustat_cpu_fetch(&kcpustat, i); user += cpustat[CPUTIME_USER]; user += cpustat[CPUTIME_NICE]; cputime->utime += user; sys += cpustat[CPUTIME_SYSTEM]; sys += cpustat[CPUTIME_IRQ]; sys += cpustat[CPUTIME_SOFTIRQ]; cputime->stime += sys; cputime->sum_exec_runtime += user; cputime->sum_exec_runtime += sys; #ifdef CONFIG_SCHED_CORE bstat->forceidle_sum += cpustat[CPUTIME_FORCEIDLE]; #endif bstat->ntime += cpustat[CPUTIME_NICE]; } } static void cgroup_force_idle_show(struct seq_file *seq, struct cgroup_base_stat *bstat) { #ifdef CONFIG_SCHED_CORE u64 forceidle_time = bstat->forceidle_sum; do_div(forceidle_time, NSEC_PER_USEC); seq_printf(seq, "core_sched.force_idle_usec %llu\n", forceidle_time); #endif } void cgroup_base_stat_cputime_show(struct seq_file *seq) { struct cgroup *cgrp = seq_css(seq)->cgroup; struct cgroup_base_stat bstat; if (cgroup_parent(cgrp)) { css_rstat_flush(&cgrp->self); __css_rstat_lock(&cgrp->self, -1); bstat = cgrp->bstat; cputime_adjust(&cgrp->bstat.cputime, &cgrp->prev_cputime, &bstat.cputime.utime, &bstat.cputime.stime); __css_rstat_unlock(&cgrp->self, -1); } else { root_cgroup_cputime(&bstat); } do_div(bstat.cputime.sum_exec_runtime, NSEC_PER_USEC); do_div(bstat.cputime.utime, NSEC_PER_USEC); do_div(bstat.cputime.stime, NSEC_PER_USEC); do_div(bstat.ntime, NSEC_PER_USEC); seq_printf(seq, "usage_usec %llu\n" "user_usec %llu\n" "system_usec %llu\n" "nice_usec %llu\n", bstat.cputime.sum_exec_runtime, bstat.cputime.utime, bstat.cputime.stime, bstat.ntime); cgroup_force_idle_show(seq, &bstat); } /* Add bpf kfuncs for css_rstat_updated() and css_rstat_flush() */ BTF_KFUNCS_START(bpf_rstat_kfunc_ids) BTF_ID_FLAGS(func, css_rstat_updated) BTF_ID_FLAGS(func, css_rstat_flush, KF_SLEEPABLE) BTF_KFUNCS_END(bpf_rstat_kfunc_ids) static const struct btf_kfunc_id_set bpf_rstat_kfunc_set = { .owner = THIS_MODULE, .set = &bpf_rstat_kfunc_ids, }; static int __init bpf_rstat_kfunc_init(void) { return register_btf_kfunc_id_set(BPF_PROG_TYPE_TRACING, &bpf_rstat_kfunc_set); } late_initcall(bpf_rstat_kfunc_init);
328 332 897 897 898 899 327 927 927 926 925 924 79 435 439 439 437 438 48 899 901 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 // SPDX-License-Identifier: GPL-2.0 /* * Lockless hierarchical page accounting & limiting * * Copyright (C) 2014 Red Hat, Inc., Johannes Weiner */ #include <linux/page_counter.h> #include <linux/atomic.h> #include <linux/kernel.h> #include <linux/string.h> #include <linux/sched.h> #include <linux/bug.h> #include <asm/page.h> static bool track_protection(struct page_counter *c) { return c->protection_support; } static void propagate_protected_usage(struct page_counter *c, unsigned long usage) { unsigned long protected, old_protected; long delta; if (!c->parent) return; protected = min(usage, READ_ONCE(c->min)); old_protected = atomic_long_read(&c->min_usage); if (protected != old_protected) { old_protected = atomic_long_xchg(&c->min_usage, protected); delta = protected - old_protected; if (delta) atomic_long_add(delta, &c->parent->children_min_usage); } protected = min(usage, READ_ONCE(c->low)); old_protected = atomic_long_read(&c->low_usage); if (protected != old_protected) { old_protected = atomic_long_xchg(&c->low_usage, protected); delta = protected - old_protected; if (delta) atomic_long_add(delta, &c->parent->children_low_usage); } } /** * page_counter_cancel - take pages out of the local counter * @counter: counter * @nr_pages: number of pages to cancel */ void page_counter_cancel(struct page_counter *counter, unsigned long nr_pages) { long new; new = atomic_long_sub_return(nr_pages, &counter->usage); /* More uncharges than charges? */ if (WARN_ONCE(new < 0, "page_counter underflow: %ld nr_pages=%lu\n", new, nr_pages)) { new = 0; atomic_long_set(&counter->usage, new); } if (track_protection(counter)) propagate_protected_usage(counter, new); } /** * page_counter_charge - hierarchically charge pages * @counter: counter * @nr_pages: number of pages to charge * * NOTE: This does not consider any configured counter limits. */ void page_counter_charge(struct page_counter *counter, unsigned long nr_pages) { struct page_counter *c; bool protection = track_protection(counter); for (c = counter; c; c = c->parent) { long new; new = atomic_long_add_return(nr_pages, &c->usage); if (protection) propagate_protected_usage(c, new); /* * This is indeed racy, but we can live with some * inaccuracy in the watermark. * * Notably, we have two watermarks to allow for both a globally * visible peak and one that can be reset at a smaller scope. * * Since we reset both watermarks when the global reset occurs, * we can guarantee that watermark >= local_watermark, so we * don't need to do both comparisons every time. * * On systems with branch predictors, the inner condition should * be almost free. */ if (new > READ_ONCE(c->local_watermark)) { WRITE_ONCE(c->local_watermark, new); if (new > READ_ONCE(c->watermark)) WRITE_ONCE(c->watermark, new); } } } /** * page_counter_try_charge - try to hierarchically charge pages * @counter: counter * @nr_pages: number of pages to charge * @fail: points first counter to hit its limit, if any * * Returns %true on success, or %false and @fail if the counter or one * of its ancestors has hit its configured limit. */ bool page_counter_try_charge(struct page_counter *counter, unsigned long nr_pages, struct page_counter **fail) { struct page_counter *c; bool protection = track_protection(counter); bool track_failcnt = counter->track_failcnt; for (c = counter; c; c = c->parent) { long new; /* * Charge speculatively to avoid an expensive CAS. If * a bigger charge fails, it might falsely lock out a * racing smaller charge and send it into reclaim * early, but the error is limited to the difference * between the two sizes, which is less than 2M/4M in * case of a THP locking out a regular page charge. * * The atomic_long_add_return() implies a full memory * barrier between incrementing the count and reading * the limit. When racing with page_counter_set_max(), * we either see the new limit or the setter sees the * counter has changed and retries. */ new = atomic_long_add_return(nr_pages, &c->usage); if (new > c->max) { atomic_long_sub(nr_pages, &c->usage); /* * This is racy, but we can live with some * inaccuracy in the failcnt which is only used * to report stats. */ if (track_failcnt) data_race(c->failcnt++); *fail = c; goto failed; } if (protection) propagate_protected_usage(c, new); /* see comment on page_counter_charge */ if (new > READ_ONCE(c->local_watermark)) { WRITE_ONCE(c->local_watermark, new); if (new > READ_ONCE(c->watermark)) WRITE_ONCE(c->watermark, new); } } return true; failed: for (c = counter; c != *fail; c = c->parent) page_counter_cancel(c, nr_pages); return false; } /** * page_counter_uncharge - hierarchically uncharge pages * @counter: counter * @nr_pages: number of pages to uncharge */ void page_counter_uncharge(struct page_counter *counter, unsigned long nr_pages) { struct page_counter *c; for (c = counter; c; c = c->parent) page_counter_cancel(c, nr_pages); } /** * page_counter_set_max - set the maximum number of pages allowed * @counter: counter * @nr_pages: limit to set * * Returns 0 on success, -EBUSY if the current number of pages on the * counter already exceeds the specified limit. * * The caller must serialize invocations on the same counter. */ int page_counter_set_max(struct page_counter *counter, unsigned long nr_pages) { for (;;) { unsigned long old; long usage; /* * Update the limit while making sure that it's not * below the concurrently-changing counter value. * * The xchg implies two full memory barriers before * and after, so the read-swap-read is ordered and * ensures coherency with page_counter_try_charge(): * that function modifies the count before checking * the limit, so if it sees the old limit, we see the * modified counter and retry. */ usage = page_counter_read(counter); if (usage > nr_pages) return -EBUSY; old = xchg(&counter->max, nr_pages); if (page_counter_read(counter) <= usage || nr_pages >= old) return 0; counter->max = old; cond_resched(); } } /** * page_counter_set_min - set the amount of protected memory * @counter: counter * @nr_pages: value to set * * The caller must serialize invocations on the same counter. */ void page_counter_set_min(struct page_counter *counter, unsigned long nr_pages) { struct page_counter *c; WRITE_ONCE(counter->min, nr_pages); for (c = counter; c; c = c->parent) propagate_protected_usage(c, atomic_long_read(&c->usage)); } /** * page_counter_set_low - set the amount of protected memory * @counter: counter * @nr_pages: value to set * * The caller must serialize invocations on the same counter. */ void page_counter_set_low(struct page_counter *counter, unsigned long nr_pages) { struct page_counter *c; WRITE_ONCE(counter->low, nr_pages); for (c = counter; c; c = c->parent) propagate_protected_usage(c, atomic_long_read(&c->usage)); } /** * page_counter_memparse - memparse() for page counter limits * @buf: string to parse * @max: string meaning maximum possible value * @nr_pages: returns the result in number of pages * * Returns -EINVAL, or 0 and @nr_pages on success. @nr_pages will be * limited to %PAGE_COUNTER_MAX. */ int page_counter_memparse(const char *buf, const char *max, unsigned long *nr_pages) { char *end; u64 bytes; if (!strcmp(buf, max)) { *nr_pages = PAGE_COUNTER_MAX; return 0; } bytes = memparse(buf, &end); if (*end != '\0') return -EINVAL; *nr_pages = min(bytes / PAGE_SIZE, (u64)PAGE_COUNTER_MAX); return 0; } #if IS_ENABLED(CONFIG_MEMCG) || IS_ENABLED(CONFIG_CGROUP_DMEM) /* * This function calculates an individual page counter's effective * protection which is derived from its own memory.min/low, its * parent's and siblings' settings, as well as the actual memory * distribution in the tree. * * The following rules apply to the effective protection values: * * 1. At the first level of reclaim, effective protection is equal to * the declared protection in memory.min and memory.low. * * 2. To enable safe delegation of the protection configuration, at * subsequent levels the effective protection is capped to the * parent's effective protection. * * 3. To make complex and dynamic subtrees easier to configure, the * user is allowed to overcommit the declared protection at a given * level. If that is the case, the parent's effective protection is * distributed to the children in proportion to how much protection * they have declared and how much of it they are utilizing. * * This makes distribution proportional, but also work-conserving: * if one counter claims much more protection than it uses memory, * the unused remainder is available to its siblings. * * 4. Conversely, when the declared protection is undercommitted at a * given level, the distribution of the larger parental protection * budget is NOT proportional. A counter's protection from a sibling * is capped to its own memory.min/low setting. * * 5. However, to allow protecting recursive subtrees from each other * without having to declare each individual counter's fixed share * of the ancestor's claim to protection, any unutilized - * "floating" - protection from up the tree is distributed in * proportion to each counter's *usage*. This makes the protection * neutral wrt sibling cgroups and lets them compete freely over * the shared parental protection budget, but it protects the * subtree as a whole from neighboring subtrees. * * Note that 4. and 5. are not in conflict: 4. is about protecting * against immediate siblings whereas 5. is about protecting against * neighboring subtrees. */ static unsigned long effective_protection(unsigned long usage, unsigned long parent_usage, unsigned long setting, unsigned long parent_effective, unsigned long siblings_protected, bool recursive_protection) { unsigned long protected; unsigned long ep; protected = min(usage, setting); /* * If all cgroups at this level combined claim and use more * protection than what the parent affords them, distribute * shares in proportion to utilization. * * We are using actual utilization rather than the statically * claimed protection in order to be work-conserving: claimed * but unused protection is available to siblings that would * otherwise get a smaller chunk than what they claimed. */ if (siblings_protected > parent_effective) return protected * parent_effective / siblings_protected; /* * Ok, utilized protection of all children is within what the * parent affords them, so we know whatever this child claims * and utilizes is effectively protected. * * If there is unprotected usage beyond this value, reclaim * will apply pressure in proportion to that amount. * * If there is unutilized protection, the cgroup will be fully * shielded from reclaim, but we do return a smaller value for * protection than what the group could enjoy in theory. This * is okay. With the overcommit distribution above, effective * protection is always dependent on how memory is actually * consumed among the siblings anyway. */ ep = protected; /* * If the children aren't claiming (all of) the protection * afforded to them by the parent, distribute the remainder in * proportion to the (unprotected) memory of each cgroup. That * way, cgroups that aren't explicitly prioritized wrt each * other compete freely over the allowance, but they are * collectively protected from neighboring trees. * * We're using unprotected memory for the weight so that if * some cgroups DO claim explicit protection, we don't protect * the same bytes twice. * * Check both usage and parent_usage against the respective * protected values. One should imply the other, but they * aren't read atomically - make sure the division is sane. */ if (!recursive_protection) return ep; if (parent_effective > siblings_protected && parent_usage > siblings_protected && usage > protected) { unsigned long unclaimed; unclaimed = parent_effective - siblings_protected; unclaimed *= usage - protected; unclaimed /= parent_usage - siblings_protected; ep += unclaimed; } return ep; } /** * page_counter_calculate_protection - check if memory consumption is in the normal range * @root: the top ancestor of the sub-tree being checked * @counter: the page_counter the counter to update * @recursive_protection: Whether to use memory_recursiveprot behavior. * * Calculates elow/emin thresholds for given page_counter. * * WARNING: This function is not stateless! It can only be used as part * of a top-down tree iteration, not for isolated queries. */ void page_counter_calculate_protection(struct page_counter *root, struct page_counter *counter, bool recursive_protection) { unsigned long usage, parent_usage; struct page_counter *parent = counter->parent; /* * Effective values of the reclaim targets are ignored so they * can be stale. Have a look at mem_cgroup_protection for more * details. * TODO: calculation should be more robust so that we do not need * that special casing. */ if (root == counter) return; usage = page_counter_read(counter); if (!usage) return; if (parent == root) { counter->emin = READ_ONCE(counter->min); counter->elow = READ_ONCE(counter->low); return; } parent_usage = page_counter_read(parent); WRITE_ONCE(counter->emin, effective_protection(usage, parent_usage, READ_ONCE(counter->min), READ_ONCE(parent->emin), atomic_long_read(&parent->children_min_usage), recursive_protection)); WRITE_ONCE(counter->elow, effective_protection(usage, parent_usage, READ_ONCE(counter->low), READ_ONCE(parent->elow), atomic_long_read(&parent->children_low_usage), recursive_protection)); } #endif /* CONFIG_MEMCG || CONFIG_CGROUP_DMEM */
25 24 25 25 25 25 25 25 25 25 24 25 25 25 25 25 23 24 25 3 3 3 24 25 25 25 3 24 25 19 19 20 20 25 25 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 1517 1518 1519 1520 1521 1522 1523 1524 1525 1526 1527 1528 1529 1530 1531 1532 1533 1534 1535 1536 1537 1538 1539 1540 1541 1542 1543 1544 1545 1546 1547 1548 1549 1550 1551 1552 1553 1554 1555 1556 1557 1558 1559 1560 1561 1562 1563 1564 1565 1566 1567 1568 1569 1570 1571 1572 1573 1574 1575 1576 1577 1578 1579 1580 1581 1582 1583 1584 1585 1586 1587 1588 1589 1590 1591 1592 1593 1594 1595 1596 1597 1598 1599 1600 1601 1602 1603 1604 1605 1606 1607 1608 1609 1610 1611 1612 1613 1614 1615 1616 1617 1618 1619 1620 1621 1622 1623 1624 1625 1626 1627 1628 1629 1630 1631 1632 1633 1634 1635 1636 1637 1638 1639 1640 1641 1642 1643 1644 1645 1646 1647 1648 1649 1650 1651 1652 1653 1654 1655 1656 1657 1658 1659 1660 1661 1662 1663 1664 1665 1666 1667 1668 1669 1670 1671 1672 1673 1674 1675 1676 1677 1678 1679 1680 1681 1682 1683 1684 1685 1686 1687 1688 1689 1690 1691 1692 1693 1694 1695 1696 1697 1698 1699 1700 1701 1702 1703 1704 1705 1706 1707 1708 1709 1710 1711 1712 1713 1714 1715 1716 1717 1718 1719 1720 1721 1722 1723 1724 1725 1726 1727 1728 1729 1730 1731 1732 1733 1734 1735 1736 1737 1738 1739 1740 1741 1742 1743 1744 1745 1746 1747 1748 1749 1750 1751 1752 1753 1754 1755 1756 1757 1758 1759 1760 1761 1762 1763 1764 1765 1766 1767 1768 1769 1770 1771 1772 1773 1774 1775 1776 1777 1778 1779 1780 1781 1782 1783 1784 1785 1786 1787 1788 1789 1790 1791 1792 1793 1794 1795 1796 1797 1798 1799 1800 1801 1802 1803 1804 1805 1806 1807 1808 1809 1810 1811 1812 1813 1814 1815 1816 1817 1818 1819 1820 1821 1822 1823 1824 1825 1826 1827 1828 1829 1830 1831 1832 1833 1834 1835 1836 1837 1838 1839 1840 1841 1842 1843 1844 1845 1846 1847 1848 1849 1850 1851 1852 1853 1854 1855 1856 1857 1858 1859 1860 1861 1862 1863 1864 1865 1866 1867 1868 1869 1870 1871 1872 1873 1874 1875 1876 1877 1878 1879 1880 1881 1882 1883 1884 1885 1886 1887 1888 1889 1890 1891 1892 1893 1894 1895 1896 1897 1898 1899 1900 1901 1902 1903 1904 1905 1906 1907 1908 1909 1910 1911 1912 1913 1914 1915 1916 1917 1918 1919 1920 1921 1922 1923 1924 1925 1926 1927 1928 1929 1930 1931 1932 1933 1934 1935 1936 1937 1938 1939 1940 1941 1942 1943 1944 1945 1946 1947 1948 1949 1950 1951 1952 1953 1954 1955 1956 1957 1958 1959 1960 1961 1962 1963 1964 1965 1966 1967 1968 1969 1970 1971 1972 1973 1974 1975 1976 1977 1978 1979 1980 1981 1982 1983 1984 1985 1986 1987 1988 1989 1990 1991 1992 1993 1994 1995 1996 1997 1998 1999 2000 2001 2002 2003 2004 2005 2006 2007 2008 2009 2010 2011 2012 2013 2014 2015 2016 2017 2018 2019 2020 2021 2022 2023 2024 2025 2026 2027 2028 2029 2030 2031 2032 2033 2034 2035 2036 2037 2038 2039 2040 2041 2042 2043 2044 2045 2046 2047 2048 2049 2050 2051 2052 2053 2054 2055 2056 2057 2058 2059 2060 2061 2062 2063 2064 2065 2066 2067 2068 2069 2070 2071 2072 2073 2074 2075 2076 2077 2078 2079 2080 2081 2082 2083 2084 2085 2086 2087 2088 2089 2090 2091 2092 2093 2094 2095 2096 2097 2098 2099 2100 2101 2102 2103 2104 2105 2106 2107 2108 2109 2110 2111 2112 2113 2114 2115 2116 2117 2118 2119 2120 2121 2122 2123 2124 2125 2126 2127 2128 2129 2130 2131 2132 2133 2134 2135 2136 2137 2138 2139 2140 2141 2142 2143 2144 2145 2146 2147 2148 2149 2150 2151 2152 2153 2154 2155 2156 2157 2158 2159 2160 2161 2162 2163 2164 2165 2166 2167 2168 2169 2170 2171 2172 2173 2174 2175 2176 2177 2178 2179 2180 2181 2182 2183 2184 2185 2186 2187 2188 2189 2190 2191 2192 2193 2194 2195 2196 2197 2198 2199 2200 2201 2202 2203 2204 2205 2206 2207 2208 2209 2210 2211 2212 2213 2214 2215 2216 2217 2218 2219 2220 2221 2222 2223 2224 2225 2226 2227 2228 2229 2230 2231 2232 2233 2234 2235 2236 2237 2238 2239 2240 2241 2242 2243 2244 2245 2246 2247 2248 2249 2250 2251 2252 2253 2254 2255 2256 2257 2258 2259 2260 2261 2262 2263 2264 2265 2266 2267 2268 2269 2270 2271 2272 2273 2274 2275 2276 2277 2278 2279 2280 2281 2282 2283 2284 2285 2286 2287 2288 2289 2290 2291 2292 2293 2294 2295 2296 2297 2298 2299 2300 2301 2302 2303 2304 2305 2306 2307 2308 2309 2310 2311 2312 2313 2314 2315 2316 2317 2318 2319 2320 2321 2322 2323 2324 2325 2326 2327 2328 2329 2330 2331 2332 2333 2334 2335 2336 2337 2338 2339 2340 2341 2342 2343 2344 2345 2346 2347 2348 2349 2350 2351 2352 2353 2354 2355 2356 2357 2358 2359 2360 2361 2362 2363 2364 2365 2366 2367 2368 2369 2370 2371 2372 2373 2374 2375 2376 2377 2378 2379 2380 2381 2382 2383 2384 2385 2386 2387 2388 2389 2390 2391 2392 2393 2394 2395 2396 2397 2398 2399 2400 2401 2402 2403 2404 2405 2406 2407 2408 2409 2410 2411 2412 2413 2414 2415 2416 2417 2418 2419 2420 2421 2422 2423 2424 2425 2426 2427 2428 2429 2430 2431 2432 2433 2434 2435 2436 2437 2438 2439 2440 2441 2442 2443 2444 2445 2446 2447 2448 2449 2450 2451 2452 2453 2454 2455 2456 2457 2458 2459 2460 2461 2462 2463 2464 2465 2466 2467 2468 2469 2470 2471 2472 2473 2474 2475 2476 2477 2478 // SPDX-License-Identifier: GPL-2.0-or-later /* audit.c -- Auditing support * Gateway between the kernel (e.g., selinux) and the user-space audit daemon. * System-call specific features have moved to auditsc.c * * Copyright 2003-2007 Red Hat Inc., Durham, North Carolina. * All Rights Reserved. * * Written by Rickard E. (Rik) Faith <faith@redhat.com> * * Goals: 1) Integrate fully with Security Modules. * 2) Minimal run-time overhead: * a) Minimal when syscall auditing is disabled (audit_enable=0). * b) Small when syscall auditing is enabled and no audit record * is generated (defer as much work as possible to record * generation time): * i) context is allocated, * ii) names from getname are stored without a copy, and * iii) inode information stored from path_lookup. * 3) Ability to disable syscall auditing at boot time (audit=0). * 4) Usable by other parts of the kernel (if audit_log* is called, * then a syscall record will be generated automatically for the * current syscall). * 5) Netlink interface to user-space. * 6) Support low-overhead kernel-based filtering to minimize the * information that must be passed to user-space. * * Audit userspace, documentation, tests, and bug/issue trackers: * https://github.com/linux-audit */ #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt #include <linux/file.h> #include <linux/init.h> #include <linux/types.h> #include <linux/atomic.h> #include <linux/mm.h> #include <linux/export.h> #include <linux/slab.h> #include <linux/err.h> #include <linux/kthread.h> #include <linux/kernel.h> #include <linux/syscalls.h> #include <linux/spinlock.h> #include <linux/rcupdate.h> #include <linux/mutex.h> #include <linux/gfp.h> #include <linux/pid.h> #include <linux/audit.h> #include <net/sock.h> #include <net/netlink.h> #include <linux/skbuff.h> #include <linux/security.h> #include <linux/freezer.h> #include <linux/pid_namespace.h> #include <net/netns/generic.h> #include "audit.h" /* No auditing will take place until audit_initialized == AUDIT_INITIALIZED. * (Initialization happens after skb_init is called.) */ #define AUDIT_DISABLED -1 #define AUDIT_UNINITIALIZED 0 #define AUDIT_INITIALIZED 1 static int audit_initialized = AUDIT_UNINITIALIZED; u32 audit_enabled = AUDIT_OFF; bool audit_ever_enabled = !!AUDIT_OFF; EXPORT_SYMBOL_GPL(audit_enabled); /* Default state when kernel boots without any parameters. */ static u32 audit_default = AUDIT_OFF; /* If auditing cannot proceed, audit_failure selects what happens. */ static u32 audit_failure = AUDIT_FAIL_PRINTK; /* private audit network namespace index */ static unsigned int audit_net_id; /** * struct audit_net - audit private network namespace data * @sk: communication socket */ struct audit_net { struct sock *sk; }; /** * struct auditd_connection - kernel/auditd connection state * @pid: auditd PID * @portid: netlink portid * @net: the associated network namespace * @rcu: RCU head * * Description: * This struct is RCU protected; you must either hold the RCU lock for reading * or the associated spinlock for writing. */ struct auditd_connection { struct pid *pid; u32 portid; struct net *net; struct rcu_head rcu; }; static struct auditd_connection __rcu *auditd_conn; static DEFINE_SPINLOCK(auditd_conn_lock); /* If audit_rate_limit is non-zero, limit the rate of sending audit records * to that number per second. This prevents DoS attacks, but results in * audit records being dropped. */ static u32 audit_rate_limit; /* Number of outstanding audit_buffers allowed. * When set to zero, this means unlimited. */ static u32 audit_backlog_limit = 64; #define AUDIT_BACKLOG_WAIT_TIME (60 * HZ) static u32 audit_backlog_wait_time = AUDIT_BACKLOG_WAIT_TIME; /* The identity of the user shutting down the audit system. */ static kuid_t audit_sig_uid = INVALID_UID; static pid_t audit_sig_pid = -1; static struct lsm_prop audit_sig_lsm; /* Records can be lost in several ways: 0) [suppressed in audit_alloc] 1) out of memory in audit_log_start [kmalloc of struct audit_buffer] 2) out of memory in audit_log_move [alloc_skb] 3) suppressed due to audit_rate_limit 4) suppressed due to audit_backlog_limit */ static atomic_t audit_lost = ATOMIC_INIT(0); /* Monotonically increasing sum of time the kernel has spent * waiting while the backlog limit is exceeded. */ static atomic_t audit_backlog_wait_time_actual = ATOMIC_INIT(0); /* Hash for inode-based rules */ struct list_head audit_inode_hash[AUDIT_INODE_BUCKETS]; static struct kmem_cache *audit_buffer_cache; /* queue msgs to send via kauditd_task */ static struct sk_buff_head audit_queue; /* queue msgs due to temporary unicast send problems */ static struct sk_buff_head audit_retry_queue; /* queue msgs waiting for new auditd connection */ static struct sk_buff_head audit_hold_queue; /* queue servicing thread */ static struct task_struct *kauditd_task; static DECLARE_WAIT_QUEUE_HEAD(kauditd_wait); /* waitqueue for callers who are blocked on the audit backlog */ static DECLARE_WAIT_QUEUE_HEAD(audit_backlog_wait); static struct audit_features af = {.vers = AUDIT_FEATURE_VERSION, .mask = -1, .features = 0, .lock = 0,}; static char *audit_feature_names[2] = { "only_unset_loginuid", "loginuid_immutable", }; /** * struct audit_ctl_mutex - serialize requests from userspace * @lock: the mutex used for locking * @owner: the task which owns the lock * * Description: * This is the lock struct used to ensure we only process userspace requests * in an orderly fashion. We can't simply use a mutex/lock here because we * need to track lock ownership so we don't end up blocking the lock owner in * audit_log_start() or similar. */ static struct audit_ctl_mutex { struct mutex lock; void *owner; } audit_cmd_mutex; /* AUDIT_BUFSIZ is the size of the temporary buffer used for formatting * audit records. Since printk uses a 1024 byte buffer, this buffer * should be at least that large. */ #define AUDIT_BUFSIZ 1024 /* The audit_buffer is used when formatting an audit record. The caller * locks briefly to get the record off the freelist or to allocate the * buffer, and locks briefly to send the buffer to the netlink layer or * to place it on a transmit queue. Multiple audit_buffers can be in * use simultaneously. */ struct audit_buffer { struct sk_buff *skb; /* formatted skb ready to send */ struct audit_context *ctx; /* NULL or associated context */ gfp_t gfp_mask; }; struct audit_reply { __u32 portid; struct net *net; struct sk_buff *skb; }; /** * auditd_test_task - Check to see if a given task is an audit daemon * @task: the task to check * * Description: * Return 1 if the task is a registered audit daemon, 0 otherwise. */ int auditd_test_task(struct task_struct *task) { int rc; struct auditd_connection *ac; rcu_read_lock(); ac = rcu_dereference(auditd_conn); rc = (ac && ac->pid == task_tgid(task) ? 1 : 0); rcu_read_unlock(); return rc; } /** * audit_ctl_lock - Take the audit control lock */ void audit_ctl_lock(void) { mutex_lock(&audit_cmd_mutex.lock); audit_cmd_mutex.owner = current; } /** * audit_ctl_unlock - Drop the audit control lock */ void audit_ctl_unlock(void) { audit_cmd_mutex.owner = NULL; mutex_unlock(&audit_cmd_mutex.lock); } /** * audit_ctl_owner_current - Test to see if the current task owns the lock * * Description: * Return true if the current task owns the audit control lock, false if it * doesn't own the lock. */ static bool audit_ctl_owner_current(void) { return (current == audit_cmd_mutex.owner); } /** * auditd_pid_vnr - Return the auditd PID relative to the namespace * * Description: * Returns the PID in relation to the namespace, 0 on failure. */ static pid_t auditd_pid_vnr(void) { pid_t pid; const struct auditd_connection *ac; rcu_read_lock(); ac = rcu_dereference(auditd_conn); if (!ac || !ac->pid) pid = 0; else pid = pid_vnr(ac->pid); rcu_read_unlock(); return pid; } /** * audit_get_sk - Return the audit socket for the given network namespace * @net: the destination network namespace * * Description: * Returns the sock pointer if valid, NULL otherwise. The caller must ensure * that a reference is held for the network namespace while the sock is in use. */ static struct sock *audit_get_sk(const struct net *net) { struct audit_net *aunet; if (!net) return NULL; aunet = net_generic(net, audit_net_id); return aunet->sk; } void audit_panic(const char *message) { switch (audit_failure) { case AUDIT_FAIL_SILENT: break; case AUDIT_FAIL_PRINTK: if (printk_ratelimit()) pr_err("%s\n", message); break; case AUDIT_FAIL_PANIC: panic("audit: %s\n", message); break; } } static inline int audit_rate_check(void) { static unsigned long last_check = 0; static int messages = 0; static DEFINE_SPINLOCK(lock); unsigned long flags; unsigned long now; int retval = 0; if (!audit_rate_limit) return 1; spin_lock_irqsave(&lock, flags); if (++messages < audit_rate_limit) { retval = 1; } else { now = jiffies; if (time_after(now, last_check + HZ)) { last_check = now; messages = 0; retval = 1; } } spin_unlock_irqrestore(&lock, flags); return retval; } /** * audit_log_lost - conditionally log lost audit message event * @message: the message stating reason for lost audit message * * Emit at least 1 message per second, even if audit_rate_check is * throttling. * Always increment the lost messages counter. */ void audit_log_lost(const char *message) { static unsigned long last_msg = 0; static DEFINE_SPINLOCK(lock); unsigned long flags; unsigned long now; int print; atomic_inc(&audit_lost); print = (audit_failure == AUDIT_FAIL_PANIC || !audit_rate_limit); if (!print) { spin_lock_irqsave(&lock, flags); now = jiffies; if (time_after(now, last_msg + HZ)) { print = 1; last_msg = now; } spin_unlock_irqrestore(&lock, flags); } if (print) { if (printk_ratelimit()) pr_warn("audit_lost=%u audit_rate_limit=%u audit_backlog_limit=%u\n", atomic_read(&audit_lost), audit_rate_limit, audit_backlog_limit); audit_panic(message); } } static int audit_log_config_change(char *function_name, u32 new, u32 old, int allow_changes) { struct audit_buffer *ab; int rc = 0; ab = audit_log_start(audit_context(), GFP_KERNEL, AUDIT_CONFIG_CHANGE); if (unlikely(!ab)) return rc; audit_log_format(ab, "op=set %s=%u old=%u ", function_name, new, old); audit_log_session_info(ab); rc = audit_log_task_context(ab); if (rc) allow_changes = 0; /* Something weird, deny request */ audit_log_format(ab, " res=%d", allow_changes); audit_log_end(ab); return rc; } static int audit_do_config_change(char *function_name, u32 *to_change, u32 new) { int allow_changes, rc = 0; u32 old = *to_change; /* check if we are locked */ if (audit_enabled == AUDIT_LOCKED) allow_changes = 0; else allow_changes = 1; if (audit_enabled != AUDIT_OFF) { rc = audit_log_config_change(function_name, new, old, allow_changes); if (rc) allow_changes = 0; } /* If we are allowed, make the change */ if (allow_changes == 1) *to_change = new; /* Not allowed, update reason */ else if (rc == 0) rc = -EPERM; return rc; } static int audit_set_rate_limit(u32 limit) { return audit_do_config_change("audit_rate_limit", &audit_rate_limit, limit); } static int audit_set_backlog_limit(u32 limit) { return audit_do_config_change("audit_backlog_limit", &audit_backlog_limit, limit); } static int audit_set_backlog_wait_time(u32 timeout) { return audit_do_config_change("audit_backlog_wait_time", &audit_backlog_wait_time, timeout); } static int audit_set_enabled(u32 state) { int rc; if (state > AUDIT_LOCKED) return -EINVAL; rc = audit_do_config_change("audit_enabled", &audit_enabled, state); if (!rc) audit_ever_enabled |= !!state; return rc; } static int audit_set_failure(u32 state) { if (state != AUDIT_FAIL_SILENT && state != AUDIT_FAIL_PRINTK && state != AUDIT_FAIL_PANIC) return -EINVAL; return audit_do_config_change("audit_failure", &audit_failure, state); } /** * auditd_conn_free - RCU helper to release an auditd connection struct * @rcu: RCU head * * Description: * Drop any references inside the auditd connection tracking struct and free * the memory. */ static void auditd_conn_free(struct rcu_head *rcu) { struct auditd_connection *ac; ac = container_of(rcu, struct auditd_connection, rcu); put_pid(ac->pid); put_net(ac->net); kfree(ac); } /** * auditd_set - Set/Reset the auditd connection state * @pid: auditd PID * @portid: auditd netlink portid * @net: auditd network namespace pointer * @skb: the netlink command from the audit daemon * @ack: netlink ack flag, cleared if ack'd here * * Description: * This function will obtain and drop network namespace references as * necessary. Returns zero on success, negative values on failure. */ static int auditd_set(struct pid *pid, u32 portid, struct net *net, struct sk_buff *skb, bool *ack) { unsigned long flags; struct auditd_connection *ac_old, *ac_new; struct nlmsghdr *nlh; if (!pid || !net) return -EINVAL; ac_new = kzalloc(sizeof(*ac_new), GFP_KERNEL); if (!ac_new) return -ENOMEM; ac_new->pid = get_pid(pid); ac_new->portid = portid; ac_new->net = get_net(net); /* send the ack now to avoid a race with the queue backlog */ if (*ack) { nlh = nlmsg_hdr(skb); netlink_ack(skb, nlh, 0, NULL); *ack = false; } spin_lock_irqsave(&auditd_conn_lock, flags); ac_old = rcu_dereference_protected(auditd_conn, lockdep_is_held(&auditd_conn_lock)); rcu_assign_pointer(auditd_conn, ac_new); spin_unlock_irqrestore(&auditd_conn_lock, flags); if (ac_old) call_rcu(&ac_old->rcu, auditd_conn_free); return 0; } /** * kauditd_printk_skb - Print the audit record to the ring buffer * @skb: audit record * * Whatever the reason, this packet may not make it to the auditd connection * so write it via printk so the information isn't completely lost. */ static void kauditd_printk_skb(struct sk_buff *skb) { struct nlmsghdr *nlh = nlmsg_hdr(skb); char *data = nlmsg_data(nlh); if (nlh->nlmsg_type != AUDIT_EOE && printk_ratelimit()) pr_notice("type=%d %s\n", nlh->nlmsg_type, data); } /** * kauditd_rehold_skb - Handle a audit record send failure in the hold queue * @skb: audit record * @error: error code (unused) * * Description: * This should only be used by the kauditd_thread when it fails to flush the * hold queue. */ static void kauditd_rehold_skb(struct sk_buff *skb, __always_unused int error) { /* put the record back in the queue */ skb_queue_tail(&audit_hold_queue, skb); } /** * kauditd_hold_skb - Queue an audit record, waiting for auditd * @skb: audit record * @error: error code * * Description: * Queue the audit record, waiting for an instance of auditd. When this * function is called we haven't given up yet on sending the record, but things * are not looking good. The first thing we want to do is try to write the * record via printk and then see if we want to try and hold on to the record * and queue it, if we have room. If we want to hold on to the record, but we * don't have room, record a record lost message. */ static void kauditd_hold_skb(struct sk_buff *skb, int error) { /* at this point it is uncertain if we will ever send this to auditd so * try to send the message via printk before we go any further */ kauditd_printk_skb(skb); /* can we just silently drop the message? */ if (!audit_default) goto drop; /* the hold queue is only for when the daemon goes away completely, * not -EAGAIN failures; if we are in a -EAGAIN state requeue the * record on the retry queue unless it's full, in which case drop it */ if (error == -EAGAIN) { if (!audit_backlog_limit || skb_queue_len(&audit_retry_queue) < audit_backlog_limit) { skb_queue_tail(&audit_retry_queue, skb); return; } audit_log_lost("kauditd retry queue overflow"); goto drop; } /* if we have room in the hold queue, queue the message */ if (!audit_backlog_limit || skb_queue_len(&audit_hold_queue) < audit_backlog_limit) { skb_queue_tail(&audit_hold_queue, skb); return; } /* we have no other options - drop the message */ audit_log_lost("kauditd hold queue overflow"); drop: kfree_skb(skb); } /** * kauditd_retry_skb - Queue an audit record, attempt to send again to auditd * @skb: audit record * @error: error code (unused) * * Description: * Not as serious as kauditd_hold_skb() as we still have a connected auditd, * but for some reason we are having problems sending it audit records so * queue the given record and attempt to resend. */ static void kauditd_retry_skb(struct sk_buff *skb, __always_unused int error) { if (!audit_backlog_limit || skb_queue_len(&audit_retry_queue) < audit_backlog_limit) { skb_queue_tail(&audit_retry_queue, skb); return; } /* we have to drop the record, send it via printk as a last effort */ kauditd_printk_skb(skb); audit_log_lost("kauditd retry queue overflow"); kfree_skb(skb); } /** * auditd_reset - Disconnect the auditd connection * @ac: auditd connection state * * Description: * Break the auditd/kauditd connection and move all the queued records into the * hold queue in case auditd reconnects. It is important to note that the @ac * pointer should never be dereferenced inside this function as it may be NULL * or invalid, you can only compare the memory address! If @ac is NULL then * the connection will always be reset. */ static void auditd_reset(const struct auditd_connection *ac) { unsigned long flags; struct sk_buff *skb; struct auditd_connection *ac_old; /* if it isn't already broken, break the connection */ spin_lock_irqsave(&auditd_conn_lock, flags); ac_old = rcu_dereference_protected(auditd_conn, lockdep_is_held(&auditd_conn_lock)); if (ac && ac != ac_old) { /* someone already registered a new auditd connection */ spin_unlock_irqrestore(&auditd_conn_lock, flags); return; } rcu_assign_pointer(auditd_conn, NULL); spin_unlock_irqrestore(&auditd_conn_lock, flags); if (ac_old) call_rcu(&ac_old->rcu, auditd_conn_free); /* flush the retry queue to the hold queue, but don't touch the main * queue since we need to process that normally for multicast */ while ((skb = skb_dequeue(&audit_retry_queue))) kauditd_hold_skb(skb, -ECONNREFUSED); } /** * auditd_send_unicast_skb - Send a record via unicast to auditd * @skb: audit record * * Description: * Send a skb to the audit daemon, returns positive/zero values on success and * negative values on failure; in all cases the skb will be consumed by this * function. If the send results in -ECONNREFUSED the connection with auditd * will be reset. This function may sleep so callers should not hold any locks * where this would cause a problem. */ static int auditd_send_unicast_skb(struct sk_buff *skb) { int rc; u32 portid; struct net *net; struct sock *sk; struct auditd_connection *ac; /* NOTE: we can't call netlink_unicast while in the RCU section so * take a reference to the network namespace and grab local * copies of the namespace, the sock, and the portid; the * namespace and sock aren't going to go away while we hold a * reference and if the portid does become invalid after the RCU * section netlink_unicast() should safely return an error */ rcu_read_lock(); ac = rcu_dereference(auditd_conn); if (!ac) { rcu_read_unlock(); kfree_skb(skb); rc = -ECONNREFUSED; goto err; } net = get_net(ac->net); sk = audit_get_sk(net); portid = ac->portid; rcu_read_unlock(); rc = netlink_unicast(sk, skb, portid, 0); put_net(net); if (rc < 0) goto err; return rc; err: if (ac && rc == -ECONNREFUSED) auditd_reset(ac); return rc; } /** * kauditd_send_queue - Helper for kauditd_thread to flush skb queues * @sk: the sending sock * @portid: the netlink destination * @queue: the skb queue to process * @retry_limit: limit on number of netlink unicast failures * @skb_hook: per-skb hook for additional processing * @err_hook: hook called if the skb fails the netlink unicast send * * Description: * Run through the given queue and attempt to send the audit records to auditd, * returns zero on success, negative values on failure. It is up to the caller * to ensure that the @sk is valid for the duration of this function. * */ static int kauditd_send_queue(struct sock *sk, u32 portid, struct sk_buff_head *queue, unsigned int retry_limit, void (*skb_hook)(struct sk_buff *skb), void (*err_hook)(struct sk_buff *skb, int error)) { int rc = 0; struct sk_buff *skb = NULL; struct sk_buff *skb_tail; unsigned int failed = 0; /* NOTE: kauditd_thread takes care of all our locking, we just use * the netlink info passed to us (e.g. sk and portid) */ skb_tail = skb_peek_tail(queue); while ((skb != skb_tail) && (skb = skb_dequeue(queue))) { /* call the skb_hook for each skb we touch */ if (skb_hook) (*skb_hook)(skb); /* can we send to anyone via unicast? */ if (!sk) { if (err_hook) (*err_hook)(skb, -ECONNREFUSED); continue; } retry: /* grab an extra skb reference in case of error */ skb_get(skb); rc = netlink_unicast(sk, skb, portid, 0); if (rc < 0) { /* send failed - try a few times unless fatal error */ if (++failed >= retry_limit || rc == -ECONNREFUSED || rc == -EPERM) { sk = NULL; if (err_hook) (*err_hook)(skb, rc); if (rc == -EAGAIN) rc = 0; /* continue to drain the queue */ continue; } else goto retry; } else { /* skb sent - drop the extra reference and continue */ consume_skb(skb); failed = 0; } } return (rc >= 0 ? 0 : rc); } /* * kauditd_send_multicast_skb - Send a record to any multicast listeners * @skb: audit record * * Description: * Write a multicast message to anyone listening in the initial network * namespace. This function doesn't consume an skb as might be expected since * it has to copy it anyways. */ static void kauditd_send_multicast_skb(struct sk_buff *skb) { struct sk_buff *copy; struct sock *sock = audit_get_sk(&init_net); struct nlmsghdr *nlh; /* NOTE: we are not taking an additional reference for init_net since * we don't have to worry about it going away */ if (!netlink_has_listeners(sock, AUDIT_NLGRP_READLOG)) return; /* * The seemingly wasteful skb_copy() rather than bumping the refcount * using skb_get() is necessary because non-standard mods are made to * the skb by the original kaudit unicast socket send routine. The * existing auditd daemon assumes this breakage. Fixing this would * require co-ordinating a change in the established protocol between * the kaudit kernel subsystem and the auditd userspace code. There is * no reason for new multicast clients to continue with this * non-compliance. */ copy = skb_copy(skb, GFP_KERNEL); if (!copy) return; nlh = nlmsg_hdr(copy); nlh->nlmsg_len = skb->len; nlmsg_multicast(sock, copy, 0, AUDIT_NLGRP_READLOG, GFP_KERNEL); } /** * kauditd_thread - Worker thread to send audit records to userspace * @dummy: unused */ static int kauditd_thread(void *dummy) { int rc; u32 portid = 0; struct net *net = NULL; struct sock *sk = NULL; struct auditd_connection *ac; #define UNICAST_RETRIES 5 set_freezable(); while (!kthread_should_stop()) { /* NOTE: see the lock comments in auditd_send_unicast_skb() */ rcu_read_lock(); ac = rcu_dereference(auditd_conn); if (!ac) { rcu_read_unlock(); goto main_queue; } net = get_net(ac->net); sk = audit_get_sk(net); portid = ac->portid; rcu_read_unlock(); /* attempt to flush the hold queue */ rc = kauditd_send_queue(sk, portid, &audit_hold_queue, UNICAST_RETRIES, NULL, kauditd_rehold_skb); if (rc < 0) { sk = NULL; auditd_reset(ac); goto main_queue; } /* attempt to flush the retry queue */ rc = kauditd_send_queue(sk, portid, &audit_retry_queue, UNICAST_RETRIES, NULL, kauditd_hold_skb); if (rc < 0) { sk = NULL; auditd_reset(ac); goto main_queue; } main_queue: /* process the main queue - do the multicast send and attempt * unicast, dump failed record sends to the retry queue; if * sk == NULL due to previous failures we will just do the * multicast send and move the record to the hold queue */ rc = kauditd_send_queue(sk, portid, &audit_queue, 1, kauditd_send_multicast_skb, (sk ? kauditd_retry_skb : kauditd_hold_skb)); if (ac && rc < 0) auditd_reset(ac); sk = NULL; /* drop our netns reference, no auditd sends past this line */ if (net) { put_net(net); net = NULL; } /* we have processed all the queues so wake everyone */ wake_up(&audit_backlog_wait); /* NOTE: we want to wake up if there is anything on the queue, * regardless of if an auditd is connected, as we need to * do the multicast send and rotate records from the * main queue to the retry/hold queues */ wait_event_freezable(kauditd_wait, (skb_queue_len(&audit_queue) ? 1 : 0)); } return 0; } int audit_send_list_thread(void *_dest) { struct audit_netlink_list *dest = _dest; struct sk_buff *skb; struct sock *sk = audit_get_sk(dest->net); /* wait for parent to finish and send an ACK */ audit_ctl_lock(); audit_ctl_unlock(); while ((skb = __skb_dequeue(&dest->q)) != NULL) netlink_unicast(sk, skb, dest->portid, 0); put_net(dest->net); kfree(dest); return 0; } struct sk_buff *audit_make_reply(int seq, int type, int done, int multi, const void *payload, int size) { struct sk_buff *skb; struct nlmsghdr *nlh; void *data; int flags = multi ? NLM_F_MULTI : 0; int t = done ? NLMSG_DONE : type; skb = nlmsg_new(size, GFP_KERNEL); if (!skb) return NULL; nlh = nlmsg_put(skb, 0, seq, t, size, flags); if (!nlh) goto out_kfree_skb; data = nlmsg_data(nlh); memcpy(data, payload, size); return skb; out_kfree_skb: kfree_skb(skb); return NULL; } static void audit_free_reply(struct audit_reply *reply) { if (!reply) return; kfree_skb(reply->skb); if (reply->net) put_net(reply->net); kfree(reply); } static int audit_send_reply_thread(void *arg) { struct audit_reply *reply = (struct audit_reply *)arg; audit_ctl_lock(); audit_ctl_unlock(); /* Ignore failure. It'll only happen if the sender goes away, because our timeout is set to infinite. */ netlink_unicast(audit_get_sk(reply->net), reply->skb, reply->portid, 0); reply->skb = NULL; audit_free_reply(reply); return 0; } /** * audit_send_reply - send an audit reply message via netlink * @request_skb: skb of request we are replying to (used to target the reply) * @seq: sequence number * @type: audit message type * @done: done (last) flag * @multi: multi-part message flag * @payload: payload data * @size: payload size * * Allocates a skb, builds the netlink message, and sends it to the port id. */ static void audit_send_reply(struct sk_buff *request_skb, int seq, int type, int done, int multi, const void *payload, int size) { struct task_struct *tsk; struct audit_reply *reply; reply = kzalloc(sizeof(*reply), GFP_KERNEL); if (!reply) return; reply->skb = audit_make_reply(seq, type, done, multi, payload, size); if (!reply->skb) goto err; reply->net = get_net(sock_net(NETLINK_CB(request_skb).sk)); reply->portid = NETLINK_CB(request_skb).portid; tsk = kthread_run(audit_send_reply_thread, reply, "audit_send_reply"); if (IS_ERR(tsk)) goto err; return; err: audit_free_reply(reply); } /* * Check for appropriate CAP_AUDIT_ capabilities on incoming audit * control messages. */ static int audit_netlink_ok(struct sk_buff *skb, u16 msg_type) { int err = 0; /* Only support initial user namespace for now. */ /* * We return ECONNREFUSED because it tricks userspace into thinking * that audit was not configured into the kernel. Lots of users * configure their PAM stack (because that's what the distro does) * to reject login if unable to send messages to audit. If we return * ECONNREFUSED the PAM stack thinks the kernel does not have audit * configured in and will let login proceed. If we return EPERM * userspace will reject all logins. This should be removed when we * support non init namespaces!! */ if (current_user_ns() != &init_user_ns) return -ECONNREFUSED; switch (msg_type) { case AUDIT_LIST: case AUDIT_ADD: case AUDIT_DEL: return -EOPNOTSUPP; case AUDIT_GET: case AUDIT_SET: case AUDIT_GET_FEATURE: case AUDIT_SET_FEATURE: case AUDIT_LIST_RULES: case AUDIT_ADD_RULE: case AUDIT_DEL_RULE: case AUDIT_SIGNAL_INFO: case AUDIT_TTY_GET: case AUDIT_TTY_SET: case AUDIT_TRIM: case AUDIT_MAKE_EQUIV: /* Only support auditd and auditctl in initial pid namespace * for now. */ if (task_active_pid_ns(current) != &init_pid_ns) return -EPERM; if (!netlink_capable(skb, CAP_AUDIT_CONTROL)) err = -EPERM; break; case AUDIT_USER: case AUDIT_FIRST_USER_MSG ... AUDIT_LAST_USER_MSG: case AUDIT_FIRST_USER_MSG2 ... AUDIT_LAST_USER_MSG2: if (!netlink_capable(skb, CAP_AUDIT_WRITE)) err = -EPERM; break; default: /* bad msg */ err = -EINVAL; } return err; } static void audit_log_common_recv_msg(struct audit_context *context, struct audit_buffer **ab, u16 msg_type) { uid_t uid = from_kuid(&init_user_ns, current_uid()); pid_t pid = task_tgid_nr(current); if (!audit_enabled && msg_type != AUDIT_USER_AVC) { *ab = NULL; return; } *ab = audit_log_start(context, GFP_KERNEL, msg_type); if (unlikely(!*ab)) return; audit_log_format(*ab, "pid=%d uid=%u ", pid, uid); audit_log_session_info(*ab); audit_log_task_context(*ab); } static inline void audit_log_user_recv_msg(struct audit_buffer **ab, u16 msg_type) { audit_log_common_recv_msg(NULL, ab, msg_type); } static int is_audit_feature_set(int i) { return af.features & AUDIT_FEATURE_TO_MASK(i); } static int audit_get_feature(struct sk_buff *skb) { u32 seq; seq = nlmsg_hdr(skb)->nlmsg_seq; audit_send_reply(skb, seq, AUDIT_GET_FEATURE, 0, 0, &af, sizeof(af)); return 0; } static void audit_log_feature_change(int which, u32 old_feature, u32 new_feature, u32 old_lock, u32 new_lock, int res) { struct audit_buffer *ab; if (audit_enabled == AUDIT_OFF) return; ab = audit_log_start(audit_context(), GFP_KERNEL, AUDIT_FEATURE_CHANGE); if (!ab) return; audit_log_task_info(ab); audit_log_format(ab, " feature=%s old=%u new=%u old_lock=%u new_lock=%u res=%d", audit_feature_names[which], !!old_feature, !!new_feature, !!old_lock, !!new_lock, res); audit_log_end(ab); } static int audit_set_feature(struct audit_features *uaf) { int i; BUILD_BUG_ON(AUDIT_LAST_FEATURE + 1 > ARRAY_SIZE(audit_feature_names)); /* if there is ever a version 2 we should handle that here */ for (i = 0; i <= AUDIT_LAST_FEATURE; i++) { u32 feature = AUDIT_FEATURE_TO_MASK(i); u32 old_feature, new_feature, old_lock, new_lock; /* if we are not changing this feature, move along */ if (!(feature & uaf->mask)) continue; old_feature = af.features & feature; new_feature = uaf->features & feature; new_lock = (uaf->lock | af.lock) & feature; old_lock = af.lock & feature; /* are we changing a locked feature? */ if (old_lock && (new_feature != old_feature)) { audit_log_feature_change(i, old_feature, new_feature, old_lock, new_lock, 0); return -EPERM; } } /* nothing invalid, do the changes */ for (i = 0; i <= AUDIT_LAST_FEATURE; i++) { u32 feature = AUDIT_FEATURE_TO_MASK(i); u32 old_feature, new_feature, old_lock, new_lock; /* if we are not changing this feature, move along */ if (!(feature & uaf->mask)) continue; old_feature = af.features & feature; new_feature = uaf->features & feature; old_lock = af.lock & feature; new_lock = (uaf->lock | af.lock) & feature; if (new_feature != old_feature) audit_log_feature_change(i, old_feature, new_feature, old_lock, new_lock, 1); if (new_feature) af.features |= feature; else af.features &= ~feature; af.lock |= new_lock; } return 0; } static int audit_replace(struct pid *pid) { pid_t pvnr; struct sk_buff *skb; pvnr = pid_vnr(pid); skb = audit_make_reply(0, AUDIT_REPLACE, 0, 0, &pvnr, sizeof(pvnr)); if (!skb) return -ENOMEM; return auditd_send_unicast_skb(skb); } static int audit_receive_msg(struct sk_buff *skb, struct nlmsghdr *nlh, bool *ack) { u32 seq; void *data; int data_len; int err; struct audit_buffer *ab; u16 msg_type = nlh->nlmsg_type; struct audit_sig_info *sig_data; struct lsm_context lsmctx = { NULL, 0, 0 }; err = audit_netlink_ok(skb, msg_type); if (err) return err; seq = nlh->nlmsg_seq; data = nlmsg_data(nlh); data_len = nlmsg_len(nlh); switch (msg_type) { case AUDIT_GET: { struct audit_status s; memset(&s, 0, sizeof(s)); s.enabled = audit_enabled; s.failure = audit_failure; /* NOTE: use pid_vnr() so the PID is relative to the current * namespace */ s.pid = auditd_pid_vnr(); s.rate_limit = audit_rate_limit; s.backlog_limit = audit_backlog_limit; s.lost = atomic_read(&audit_lost); s.backlog = skb_queue_len(&audit_queue); s.feature_bitmap = AUDIT_FEATURE_BITMAP_ALL; s.backlog_wait_time = audit_backlog_wait_time; s.backlog_wait_time_actual = atomic_read(&audit_backlog_wait_time_actual); audit_send_reply(skb, seq, AUDIT_GET, 0, 0, &s, sizeof(s)); break; } case AUDIT_SET: { struct audit_status s; memset(&s, 0, sizeof(s)); /* guard against past and future API changes */ memcpy(&s, data, min_t(size_t, sizeof(s), data_len)); if (s.mask & AUDIT_STATUS_ENABLED) { err = audit_set_enabled(s.enabled); if (err < 0) return err; } if (s.mask & AUDIT_STATUS_FAILURE) { err = audit_set_failure(s.failure); if (err < 0) return err; } if (s.mask & AUDIT_STATUS_PID) { /* NOTE: we are using the vnr PID functions below * because the s.pid value is relative to the * namespace of the caller; at present this * doesn't matter much since you can really only * run auditd from the initial pid namespace, but * something to keep in mind if this changes */ pid_t new_pid = s.pid; pid_t auditd_pid; struct pid *req_pid = task_tgid(current); /* Sanity check - PID values must match. Setting * pid to 0 is how auditd ends auditing. */ if (new_pid && (new_pid != pid_vnr(req_pid))) return -EINVAL; /* test the auditd connection */ audit_replace(req_pid); auditd_pid = auditd_pid_vnr(); if (auditd_pid) { /* replacing a healthy auditd is not allowed */ if (new_pid) { audit_log_config_change("audit_pid", new_pid, auditd_pid, 0); return -EEXIST; } /* only current auditd can unregister itself */ if (pid_vnr(req_pid) != auditd_pid) { audit_log_config_change("audit_pid", new_pid, auditd_pid, 0); return -EACCES; } } if (new_pid) { /* register a new auditd connection */ err = auditd_set(req_pid, NETLINK_CB(skb).portid, sock_net(NETLINK_CB(skb).sk), skb, ack); if (audit_enabled != AUDIT_OFF) audit_log_config_change("audit_pid", new_pid, auditd_pid, err ? 0 : 1); if (err) return err; /* try to process any backlog */ wake_up_interruptible(&kauditd_wait); } else { if (audit_enabled != AUDIT_OFF) audit_log_config_change("audit_pid", new_pid, auditd_pid, 1); /* unregister the auditd connection */ auditd_reset(NULL); } } if (s.mask & AUDIT_STATUS_RATE_LIMIT) { err = audit_set_rate_limit(s.rate_limit); if (err < 0) return err; } if (s.mask & AUDIT_STATUS_BACKLOG_LIMIT) { err = audit_set_backlog_limit(s.backlog_limit); if (err < 0) return err; } if (s.mask & AUDIT_STATUS_BACKLOG_WAIT_TIME) { if (sizeof(s) > (size_t)nlh->nlmsg_len) return -EINVAL; if (s.backlog_wait_time > 10*AUDIT_BACKLOG_WAIT_TIME) return -EINVAL; err = audit_set_backlog_wait_time(s.backlog_wait_time); if (err < 0) return err; } if (s.mask == AUDIT_STATUS_LOST) { u32 lost = atomic_xchg(&audit_lost, 0); audit_log_config_change("lost", 0, lost, 1); return lost; } if (s.mask == AUDIT_STATUS_BACKLOG_WAIT_TIME_ACTUAL) { u32 actual = atomic_xchg(&audit_backlog_wait_time_actual, 0); audit_log_config_change("backlog_wait_time_actual", 0, actual, 1); return actual; } break; } case AUDIT_GET_FEATURE: err = audit_get_feature(skb); if (err) return err; break; case AUDIT_SET_FEATURE: if (data_len < sizeof(struct audit_features)) return -EINVAL; err = audit_set_feature(data); if (err) return err; break; case AUDIT_USER: case AUDIT_FIRST_USER_MSG ... AUDIT_LAST_USER_MSG: case AUDIT_FIRST_USER_MSG2 ... AUDIT_LAST_USER_MSG2: if (!audit_enabled && msg_type != AUDIT_USER_AVC) return 0; /* exit early if there isn't at least one character to print */ if (data_len < 2) return -EINVAL; err = audit_filter(msg_type, AUDIT_FILTER_USER); if (err == 1) { /* match or error */ char *str = data; err = 0; if (msg_type == AUDIT_USER_TTY) { err = tty_audit_push(); if (err) break; } audit_log_user_recv_msg(&ab, msg_type); if (msg_type != AUDIT_USER_TTY) { /* ensure NULL termination */ str[data_len - 1] = '\0'; audit_log_format(ab, " msg='%.*s'", AUDIT_MESSAGE_TEXT_MAX, str); } else { audit_log_format(ab, " data="); if (str[data_len - 1] == '\0') data_len--; audit_log_n_untrustedstring(ab, str, data_len); } audit_log_end(ab); } break; case AUDIT_ADD_RULE: case AUDIT_DEL_RULE: if (data_len < sizeof(struct audit_rule_data)) return -EINVAL; if (audit_enabled == AUDIT_LOCKED) { audit_log_common_recv_msg(audit_context(), &ab, AUDIT_CONFIG_CHANGE); audit_log_format(ab, " op=%s audit_enabled=%d res=0", msg_type == AUDIT_ADD_RULE ? "add_rule" : "remove_rule", audit_enabled); audit_log_end(ab); return -EPERM; } err = audit_rule_change(msg_type, seq, data, data_len); break; case AUDIT_LIST_RULES: err = audit_list_rules_send(skb, seq); break; case AUDIT_TRIM: audit_trim_trees(); audit_log_common_recv_msg(audit_context(), &ab, AUDIT_CONFIG_CHANGE); audit_log_format(ab, " op=trim res=1"); audit_log_end(ab); break; case AUDIT_MAKE_EQUIV: { void *bufp = data; u32 sizes[2]; size_t msglen = data_len; char *old, *new; err = -EINVAL; if (msglen < 2 * sizeof(u32)) break; memcpy(sizes, bufp, 2 * sizeof(u32)); bufp += 2 * sizeof(u32); msglen -= 2 * sizeof(u32); old = audit_unpack_string(&bufp, &msglen, sizes[0]); if (IS_ERR(old)) { err = PTR_ERR(old); break; } new = audit_unpack_string(&bufp, &msglen, sizes[1]); if (IS_ERR(new)) { err = PTR_ERR(new); kfree(old); break; } /* OK, here comes... */ err = audit_tag_tree(old, new); audit_log_common_recv_msg(audit_context(), &ab, AUDIT_CONFIG_CHANGE); audit_log_format(ab, " op=make_equiv old="); audit_log_untrustedstring(ab, old); audit_log_format(ab, " new="); audit_log_untrustedstring(ab, new); audit_log_format(ab, " res=%d", !err); audit_log_end(ab); kfree(old); kfree(new); break; } case AUDIT_SIGNAL_INFO: if (lsmprop_is_set(&audit_sig_lsm)) { err = security_lsmprop_to_secctx(&audit_sig_lsm, &lsmctx); if (err < 0) return err; } sig_data = kmalloc(struct_size(sig_data, ctx, lsmctx.len), GFP_KERNEL); if (!sig_data) { if (lsmprop_is_set(&audit_sig_lsm)) security_release_secctx(&lsmctx); return -ENOMEM; } sig_data->uid = from_kuid(&init_user_ns, audit_sig_uid); sig_data->pid = audit_sig_pid; if (lsmprop_is_set(&audit_sig_lsm)) { memcpy(sig_data->ctx, lsmctx.context, lsmctx.len); security_release_secctx(&lsmctx); } audit_send_reply(skb, seq, AUDIT_SIGNAL_INFO, 0, 0, sig_data, struct_size(sig_data, ctx, lsmctx.len)); kfree(sig_data); break; case AUDIT_TTY_GET: { struct audit_tty_status s; unsigned int t; t = READ_ONCE(current->signal->audit_tty); s.enabled = t & AUDIT_TTY_ENABLE; s.log_passwd = !!(t & AUDIT_TTY_LOG_PASSWD); audit_send_reply(skb, seq, AUDIT_TTY_GET, 0, 0, &s, sizeof(s)); break; } case AUDIT_TTY_SET: { struct audit_tty_status s, old; struct audit_buffer *ab; unsigned int t; memset(&s, 0, sizeof(s)); /* guard against past and future API changes */ memcpy(&s, data, min_t(size_t, sizeof(s), data_len)); /* check if new data is valid */ if ((s.enabled != 0 && s.enabled != 1) || (s.log_passwd != 0 && s.log_passwd != 1)) err = -EINVAL; if (err) t = READ_ONCE(current->signal->audit_tty); else { t = s.enabled | (-s.log_passwd & AUDIT_TTY_LOG_PASSWD); t = xchg(&current->signal->audit_tty, t); } old.enabled = t & AUDIT_TTY_ENABLE; old.log_passwd = !!(t & AUDIT_TTY_LOG_PASSWD); audit_log_common_recv_msg(audit_context(), &ab, AUDIT_CONFIG_CHANGE); audit_log_format(ab, " op=tty_set old-enabled=%d new-enabled=%d" " old-log_passwd=%d new-log_passwd=%d res=%d", old.enabled, s.enabled, old.log_passwd, s.log_passwd, !err); audit_log_end(ab); break; } default: err = -EINVAL; break; } return err < 0 ? err : 0; } /** * audit_receive - receive messages from a netlink control socket * @skb: the message buffer * * Parse the provided skb and deal with any messages that may be present, * malformed skbs are discarded. */ static void audit_receive(struct sk_buff *skb) { struct nlmsghdr *nlh; bool ack; /* * len MUST be signed for nlmsg_next to be able to dec it below 0 * if the nlmsg_len was not aligned */ int len; int err; nlh = nlmsg_hdr(skb); len = skb->len; audit_ctl_lock(); while (nlmsg_ok(nlh, len)) { ack = nlh->nlmsg_flags & NLM_F_ACK; err = audit_receive_msg(skb, nlh, &ack); /* send an ack if the user asked for one and audit_receive_msg * didn't already do it, or if there was an error. */ if (ack || err) netlink_ack(skb, nlh, err, NULL); nlh = nlmsg_next(nlh, &len); } audit_ctl_unlock(); /* can't block with the ctrl lock, so penalize the sender now */ if (audit_backlog_limit && (skb_queue_len(&audit_queue) > audit_backlog_limit)) { DECLARE_WAITQUEUE(wait, current); /* wake kauditd to try and flush the queue */ wake_up_interruptible(&kauditd_wait); add_wait_queue_exclusive(&audit_backlog_wait, &wait); set_current_state(TASK_UNINTERRUPTIBLE); schedule_timeout(audit_backlog_wait_time); remove_wait_queue(&audit_backlog_wait, &wait); } } /* Log information about who is connecting to the audit multicast socket */ static void audit_log_multicast(int group, const char *op, int err) { const struct cred *cred; struct tty_struct *tty; char comm[sizeof(current->comm)]; struct audit_buffer *ab; if (!audit_enabled) return; ab = audit_log_start(audit_context(), GFP_KERNEL, AUDIT_EVENT_LISTENER); if (!ab) return; cred = current_cred(); tty = audit_get_tty(); audit_log_format(ab, "pid=%u uid=%u auid=%u tty=%s ses=%u", task_tgid_nr(current), from_kuid(&init_user_ns, cred->uid), from_kuid(&init_user_ns, audit_get_loginuid(current)), tty ? tty_name(tty) : "(none)", audit_get_sessionid(current)); audit_put_tty(tty); audit_log_task_context(ab); /* subj= */ audit_log_format(ab, " comm="); audit_log_untrustedstring(ab, get_task_comm(comm, current)); audit_log_d_path_exe(ab, current->mm); /* exe= */ audit_log_format(ab, " nl-mcgrp=%d op=%s res=%d", group, op, !err); audit_log_end(ab); } /* Run custom bind function on netlink socket group connect or bind requests. */ static int audit_multicast_bind(struct net *net, int group) { int err = 0; if (!capable(CAP_AUDIT_READ)) err = -EPERM; audit_log_multicast(group, "connect", err); return err; } static void audit_multicast_unbind(struct net *net, int group) { audit_log_multicast(group, "disconnect", 0); } static int __net_init audit_net_init(struct net *net) { struct netlink_kernel_cfg cfg = { .input = audit_receive, .bind = audit_multicast_bind, .unbind = audit_multicast_unbind, .flags = NL_CFG_F_NONROOT_RECV, .groups = AUDIT_NLGRP_MAX, }; struct audit_net *aunet = net_generic(net, audit_net_id); aunet->sk = netlink_kernel_create(net, NETLINK_AUDIT, &cfg); if (aunet->sk == NULL) { audit_panic("cannot initialize netlink socket in namespace"); return -ENOMEM; } /* limit the timeout in case auditd is blocked/stopped */ aunet->sk->sk_sndtimeo = HZ / 10; return 0; } static void __net_exit audit_net_exit(struct net *net) { struct audit_net *aunet = net_generic(net, audit_net_id); /* NOTE: you would think that we would want to check the auditd * connection and potentially reset it here if it lives in this * namespace, but since the auditd connection tracking struct holds a * reference to this namespace (see auditd_set()) we are only ever * going to get here after that connection has been released */ netlink_kernel_release(aunet->sk); } static struct pernet_operations audit_net_ops __net_initdata = { .init = audit_net_init, .exit = audit_net_exit, .id = &audit_net_id, .size = sizeof(struct audit_net), }; /* Initialize audit support at boot time. */ static int __init audit_init(void) { int i; if (audit_initialized == AUDIT_DISABLED) return 0; audit_buffer_cache = KMEM_CACHE(audit_buffer, SLAB_PANIC); skb_queue_head_init(&audit_queue); skb_queue_head_init(&audit_retry_queue); skb_queue_head_init(&audit_hold_queue); for (i = 0; i < AUDIT_INODE_BUCKETS; i++) INIT_LIST_HEAD(&audit_inode_hash[i]); mutex_init(&audit_cmd_mutex.lock); audit_cmd_mutex.owner = NULL; pr_info("initializing netlink subsys (%s)\n", str_enabled_disabled(audit_default)); register_pernet_subsys(&audit_net_ops); audit_initialized = AUDIT_INITIALIZED; kauditd_task = kthread_run(kauditd_thread, NULL, "kauditd"); if (IS_ERR(kauditd_task)) { int err = PTR_ERR(kauditd_task); panic("audit: failed to start the kauditd thread (%d)\n", err); } audit_log(NULL, GFP_KERNEL, AUDIT_KERNEL, "state=initialized audit_enabled=%u res=1", audit_enabled); return 0; } postcore_initcall(audit_init); /* * Process kernel command-line parameter at boot time. * audit={0|off} or audit={1|on}. */ static int __init audit_enable(char *str) { if (!strcasecmp(str, "off") || !strcmp(str, "0")) audit_default = AUDIT_OFF; else if (!strcasecmp(str, "on") || !strcmp(str, "1")) audit_default = AUDIT_ON; else { pr_err("audit: invalid 'audit' parameter value (%s)\n", str); audit_default = AUDIT_ON; } if (audit_default == AUDIT_OFF) audit_initialized = AUDIT_DISABLED; if (audit_set_enabled(audit_default)) pr_err("audit: error setting audit state (%d)\n", audit_default); pr_info("%s\n", audit_default ? "enabled (after initialization)" : "disabled (until reboot)"); return 1; } __setup("audit=", audit_enable); /* Process kernel command-line parameter at boot time. * audit_backlog_limit=<n> */ static int __init audit_backlog_limit_set(char *str) { u32 audit_backlog_limit_arg; pr_info("audit_backlog_limit: "); if (kstrtouint(str, 0, &audit_backlog_limit_arg)) { pr_cont("using default of %u, unable to parse %s\n", audit_backlog_limit, str); return 1; } audit_backlog_limit = audit_backlog_limit_arg; pr_cont("%d\n", audit_backlog_limit); return 1; } __setup("audit_backlog_limit=", audit_backlog_limit_set); static void audit_buffer_free(struct audit_buffer *ab) { if (!ab) return; kfree_skb(ab->skb); kmem_cache_free(audit_buffer_cache, ab); } static struct audit_buffer *audit_buffer_alloc(struct audit_context *ctx, gfp_t gfp_mask, int type) { struct audit_buffer *ab; ab = kmem_cache_alloc(audit_buffer_cache, gfp_mask); if (!ab) return NULL; ab->skb = nlmsg_new(AUDIT_BUFSIZ, gfp_mask); if (!ab->skb) goto err; if (!nlmsg_put(ab->skb, 0, 0, type, 0, 0)) goto err; ab->ctx = ctx; ab->gfp_mask = gfp_mask; return ab; err: audit_buffer_free(ab); return NULL; } /** * audit_serial - compute a serial number for the audit record * * Compute a serial number for the audit record. Audit records are * written to user-space as soon as they are generated, so a complete * audit record may be written in several pieces. The timestamp of the * record and this serial number are used by the user-space tools to * determine which pieces belong to the same audit record. The * (timestamp,serial) tuple is unique for each syscall and is live from * syscall entry to syscall exit. * * NOTE: Another possibility is to store the formatted records off the * audit context (for those records that have a context), and emit them * all at syscall exit. However, this could delay the reporting of * significant errors until syscall exit (or never, if the system * halts). */ unsigned int audit_serial(void) { static atomic_t serial = ATOMIC_INIT(0); return atomic_inc_return(&serial); } static inline void audit_get_stamp(struct audit_context *ctx, struct timespec64 *t, unsigned int *serial) { if (!ctx || !auditsc_get_stamp(ctx, t, serial)) { ktime_get_coarse_real_ts64(t); *serial = audit_serial(); } } /** * audit_log_start - obtain an audit buffer * @ctx: audit_context (may be NULL) * @gfp_mask: type of allocation * @type: audit message type * * Returns audit_buffer pointer on success or NULL on error. * * Obtain an audit buffer. This routine does locking to obtain the * audit buffer, but then no locking is required for calls to * audit_log_*format. If the task (ctx) is a task that is currently in a * syscall, then the syscall is marked as auditable and an audit record * will be written at syscall exit. If there is no associated task, then * task context (ctx) should be NULL. */ struct audit_buffer *audit_log_start(struct audit_context *ctx, gfp_t gfp_mask, int type) { struct audit_buffer *ab; struct timespec64 t; unsigned int serial; if (audit_initialized != AUDIT_INITIALIZED) return NULL; if (unlikely(!audit_filter(type, AUDIT_FILTER_EXCLUDE))) return NULL; /* NOTE: don't ever fail/sleep on these two conditions: * 1. auditd generated record - since we need auditd to drain the * queue; also, when we are checking for auditd, compare PIDs using * task_tgid_vnr() since auditd_pid is set in audit_receive_msg() * using a PID anchored in the caller's namespace * 2. generator holding the audit_cmd_mutex - we don't want to block * while holding the mutex, although we do penalize the sender * later in audit_receive() when it is safe to block */ if (!(auditd_test_task(current) || audit_ctl_owner_current())) { long stime = audit_backlog_wait_time; while (audit_backlog_limit && (skb_queue_len(&audit_queue) > audit_backlog_limit)) { /* wake kauditd to try and flush the queue */ wake_up_interruptible(&kauditd_wait); /* sleep if we are allowed and we haven't exhausted our * backlog wait limit */ if (gfpflags_allow_blocking(gfp_mask) && (stime > 0)) { long rtime = stime; DECLARE_WAITQUEUE(wait, current); add_wait_queue_exclusive(&audit_backlog_wait, &wait); set_current_state(TASK_UNINTERRUPTIBLE); stime = schedule_timeout(rtime); atomic_add(rtime - stime, &audit_backlog_wait_time_actual); remove_wait_queue(&audit_backlog_wait, &wait); } else { if (audit_rate_check() && printk_ratelimit()) pr_warn("audit_backlog=%d > audit_backlog_limit=%d\n", skb_queue_len(&audit_queue), audit_backlog_limit); audit_log_lost("backlog limit exceeded"); return NULL; } } } ab = audit_buffer_alloc(ctx, gfp_mask, type); if (!ab) { audit_log_lost("out of memory in audit_log_start"); return NULL; } audit_get_stamp(ab->ctx, &t, &serial); /* cancel dummy context to enable supporting records */ if (ctx) ctx->dummy = 0; audit_log_format(ab, "audit(%llu.%03lu:%u): ", (unsigned long long)t.tv_sec, t.tv_nsec/1000000, serial); return ab; } /** * audit_expand - expand skb in the audit buffer * @ab: audit_buffer * @extra: space to add at tail of the skb * * Returns 0 (no space) on failed expansion, or available space if * successful. */ static inline int audit_expand(struct audit_buffer *ab, int extra) { struct sk_buff *skb = ab->skb; int oldtail = skb_tailroom(skb); int ret = pskb_expand_head(skb, 0, extra, ab->gfp_mask); int newtail = skb_tailroom(skb); if (ret < 0) { audit_log_lost("out of memory in audit_expand"); return 0; } skb->truesize += newtail - oldtail; return newtail; } /* * Format an audit message into the audit buffer. If there isn't enough * room in the audit buffer, more room will be allocated and vsnprint * will be called a second time. Currently, we assume that a printk * can't format message larger than 1024 bytes, so we don't either. */ static __printf(2, 0) void audit_log_vformat(struct audit_buffer *ab, const char *fmt, va_list args) { int len, avail; struct sk_buff *skb; va_list args2; if (!ab) return; BUG_ON(!ab->skb); skb = ab->skb; avail = skb_tailroom(skb); if (avail == 0) { avail = audit_expand(ab, AUDIT_BUFSIZ); if (!avail) goto out; } va_copy(args2, args); len = vsnprintf(skb_tail_pointer(skb), avail, fmt, args); if (len >= avail) { /* The printk buffer is 1024 bytes long, so if we get * here and AUDIT_BUFSIZ is at least 1024, then we can * log everything that printk could have logged. */ avail = audit_expand(ab, max_t(unsigned, AUDIT_BUFSIZ, 1+len-avail)); if (!avail) goto out_va_end; len = vsnprintf(skb_tail_pointer(skb), avail, fmt, args2); } if (len > 0) skb_put(skb, len); out_va_end: va_end(args2); out: return; } /** * audit_log_format - format a message into the audit buffer. * @ab: audit_buffer * @fmt: format string * @...: optional parameters matching @fmt string * * All the work is done in audit_log_vformat. */ void audit_log_format(struct audit_buffer *ab, const char *fmt, ...) { va_list args; if (!ab) return; va_start(args, fmt); audit_log_vformat(ab, fmt, args); va_end(args); } /** * audit_log_n_hex - convert a buffer to hex and append it to the audit skb * @ab: the audit_buffer * @buf: buffer to convert to hex * @len: length of @buf to be converted * * No return value; failure to expand is silently ignored. * * This function will take the passed buf and convert it into a string of * ascii hex digits. The new string is placed onto the skb. */ void audit_log_n_hex(struct audit_buffer *ab, const unsigned char *buf, size_t len) { int i, avail, new_len; unsigned char *ptr; struct sk_buff *skb; if (!ab) return; BUG_ON(!ab->skb); skb = ab->skb; avail = skb_tailroom(skb); new_len = len<<1; if (new_len >= avail) { /* Round the buffer request up to the next multiple */ new_len = AUDIT_BUFSIZ*(((new_len-avail)/AUDIT_BUFSIZ) + 1); avail = audit_expand(ab, new_len); if (!avail) return; } ptr = skb_tail_pointer(skb); for (i = 0; i < len; i++) ptr = hex_byte_pack_upper(ptr, buf[i]); *ptr = 0; skb_put(skb, len << 1); /* new string is twice the old string */ } /* * Format a string of no more than slen characters into the audit buffer, * enclosed in quote marks. */ void audit_log_n_string(struct audit_buffer *ab, const char *string, size_t slen) { int avail, new_len; unsigned char *ptr; struct sk_buff *skb; if (!ab) return; BUG_ON(!ab->skb); skb = ab->skb; avail = skb_tailroom(skb); new_len = slen + 3; /* enclosing quotes + null terminator */ if (new_len > avail) { avail = audit_expand(ab, new_len); if (!avail) return; } ptr = skb_tail_pointer(skb); *ptr++ = '"'; memcpy(ptr, string, slen); ptr += slen; *ptr++ = '"'; *ptr = 0; skb_put(skb, slen + 2); /* don't include null terminator */ } /** * audit_string_contains_control - does a string need to be logged in hex * @string: string to be checked * @len: max length of the string to check */ bool audit_string_contains_control(const char *string, size_t len) { const unsigned char *p; for (p = string; p < (const unsigned char *)string + len; p++) { if (*p == '"' || *p < 0x21 || *p > 0x7e) return true; } return false; } /** * audit_log_n_untrustedstring - log a string that may contain random characters * @ab: audit_buffer * @string: string to be logged * @len: length of string (not including trailing null) * * This code will escape a string that is passed to it if the string * contains a control character, unprintable character, double quote mark, * or a space. Unescaped strings will start and end with a double quote mark. * Strings that are escaped are printed in hex (2 digits per char). * * The caller specifies the number of characters in the string to log, which may * or may not be the entire string. */ void audit_log_n_untrustedstring(struct audit_buffer *ab, const char *string, size_t len) { if (audit_string_contains_control(string, len)) audit_log_n_hex(ab, string, len); else audit_log_n_string(ab, string, len); } /** * audit_log_untrustedstring - log a string that may contain random characters * @ab: audit_buffer * @string: string to be logged * * Same as audit_log_n_untrustedstring(), except that strlen is used to * determine string length. */ void audit_log_untrustedstring(struct audit_buffer *ab, const char *string) { audit_log_n_untrustedstring(ab, string, strlen(string)); } /* This is a helper-function to print the escaped d_path */ void audit_log_d_path(struct audit_buffer *ab, const char *prefix, const struct path *path) { char *p, *pathname; if (prefix) audit_log_format(ab, "%s", prefix); /* We will allow 11 spaces for ' (deleted)' to be appended */ pathname = kmalloc(PATH_MAX+11, ab->gfp_mask); if (!pathname) { audit_log_format(ab, "\"<no_memory>\""); return; } p = d_path(path, pathname, PATH_MAX+11); if (IS_ERR(p)) { /* Should never happen since we send PATH_MAX */ /* FIXME: can we save some information here? */ audit_log_format(ab, "\"<too_long>\""); } else audit_log_untrustedstring(ab, p); kfree(pathname); } void audit_log_session_info(struct audit_buffer *ab) { unsigned int sessionid = audit_get_sessionid(current); uid_t auid = from_kuid(&init_user_ns, audit_get_loginuid(current)); audit_log_format(ab, "auid=%u ses=%u", auid, sessionid); } void audit_log_key(struct audit_buffer *ab, char *key) { audit_log_format(ab, " key="); if (key) audit_log_untrustedstring(ab, key); else audit_log_format(ab, "(null)"); } int audit_log_task_context(struct audit_buffer *ab) { struct lsm_prop prop; struct lsm_context ctx; int error; security_current_getlsmprop_subj(&prop); if (!lsmprop_is_set(&prop)) return 0; error = security_lsmprop_to_secctx(&prop, &ctx); if (error < 0) { if (error != -EINVAL) goto error_path; return 0; } audit_log_format(ab, " subj=%s", ctx.context); security_release_secctx(&ctx); return 0; error_path: audit_panic("error in audit_log_task_context"); return error; } EXPORT_SYMBOL(audit_log_task_context); void audit_log_d_path_exe(struct audit_buffer *ab, struct mm_struct *mm) { struct file *exe_file; if (!mm) goto out_null; exe_file = get_mm_exe_file(mm); if (!exe_file) goto out_null; audit_log_d_path(ab, " exe=", &exe_file->f_path); fput(exe_file); return; out_null: audit_log_format(ab, " exe=(null)"); } struct tty_struct *audit_get_tty(void) { struct tty_struct *tty = NULL; unsigned long flags; spin_lock_irqsave(&current->sighand->siglock, flags); if (current->signal) tty = tty_kref_get(current->signal->tty); spin_unlock_irqrestore(&current->sighand->siglock, flags); return tty; } void audit_put_tty(struct tty_struct *tty) { tty_kref_put(tty); } void audit_log_task_info(struct audit_buffer *ab) { const struct cred *cred; char comm[sizeof(current->comm)]; struct tty_struct *tty; if (!ab) return; cred = current_cred(); tty = audit_get_tty(); audit_log_format(ab, " ppid=%d pid=%d auid=%u uid=%u gid=%u" " euid=%u suid=%u fsuid=%u" " egid=%u sgid=%u fsgid=%u tty=%s ses=%u", task_ppid_nr(current), task_tgid_nr(current), from_kuid(&init_user_ns, audit_get_loginuid(current)), from_kuid(&init_user_ns, cred->uid), from_kgid(&init_user_ns, cred->gid), from_kuid(&init_user_ns, cred->euid), from_kuid(&init_user_ns, cred->suid), from_kuid(&init_user_ns, cred->fsuid), from_kgid(&init_user_ns, cred->egid), from_kgid(&init_user_ns, cred->sgid), from_kgid(&init_user_ns, cred->fsgid), tty ? tty_name(tty) : "(none)", audit_get_sessionid(current)); audit_put_tty(tty); audit_log_format(ab, " comm="); audit_log_untrustedstring(ab, get_task_comm(comm, current)); audit_log_d_path_exe(ab, current->mm); audit_log_task_context(ab); } EXPORT_SYMBOL(audit_log_task_info); /** * audit_log_path_denied - report a path restriction denial * @type: audit message type (AUDIT_ANOM_LINK, AUDIT_ANOM_CREAT, etc) * @operation: specific operation name */ void audit_log_path_denied(int type, const char *operation) { struct audit_buffer *ab; if (!audit_enabled) return; /* Generate log with subject, operation, outcome. */ ab = audit_log_start(audit_context(), GFP_KERNEL, type); if (!ab) return; audit_log_format(ab, "op=%s", operation); audit_log_task_info(ab); audit_log_format(ab, " res=0"); audit_log_end(ab); } /* global counter which is incremented every time something logs in */ static atomic_t session_id = ATOMIC_INIT(0); static int audit_set_loginuid_perm(kuid_t loginuid) { /* if we are unset, we don't need privs */ if (!audit_loginuid_set(current)) return 0; /* if AUDIT_FEATURE_LOGINUID_IMMUTABLE means never ever allow a change*/ if (is_audit_feature_set(AUDIT_FEATURE_LOGINUID_IMMUTABLE)) return -EPERM; /* it is set, you need permission */ if (!capable(CAP_AUDIT_CONTROL)) return -EPERM; /* reject if this is not an unset and we don't allow that */ if (is_audit_feature_set(AUDIT_FEATURE_ONLY_UNSET_LOGINUID) && uid_valid(loginuid)) return -EPERM; return 0; } static void audit_log_set_loginuid(kuid_t koldloginuid, kuid_t kloginuid, unsigned int oldsessionid, unsigned int sessionid, int rc) { struct audit_buffer *ab; uid_t uid, oldloginuid, loginuid; struct tty_struct *tty; if (!audit_enabled) return; ab = audit_log_start(audit_context(), GFP_KERNEL, AUDIT_LOGIN); if (!ab) return; uid = from_kuid(&init_user_ns, task_uid(current)); oldloginuid = from_kuid(&init_user_ns, koldloginuid); loginuid = from_kuid(&init_user_ns, kloginuid); tty = audit_get_tty(); audit_log_format(ab, "pid=%d uid=%u", task_tgid_nr(current), uid); audit_log_task_context(ab); audit_log_format(ab, " old-auid=%u auid=%u tty=%s old-ses=%u ses=%u res=%d", oldloginuid, loginuid, tty ? tty_name(tty) : "(none)", oldsessionid, sessionid, !rc); audit_put_tty(tty); audit_log_end(ab); } /** * audit_set_loginuid - set current task's loginuid * @loginuid: loginuid value * * Returns 0. * * Called (set) from fs/proc/base.c::proc_loginuid_write(). */ int audit_set_loginuid(kuid_t loginuid) { unsigned int oldsessionid, sessionid = AUDIT_SID_UNSET; kuid_t oldloginuid; int rc; oldloginuid = audit_get_loginuid(current); oldsessionid = audit_get_sessionid(current); rc = audit_set_loginuid_perm(loginuid); if (rc) goto out; /* are we setting or clearing? */ if (uid_valid(loginuid)) { sessionid = (unsigned int)atomic_inc_return(&session_id); if (unlikely(sessionid == AUDIT_SID_UNSET)) sessionid = (unsigned int)atomic_inc_return(&session_id); } current->sessionid = sessionid; current->loginuid = loginuid; out: audit_log_set_loginuid(oldloginuid, loginuid, oldsessionid, sessionid, rc); return rc; } /** * audit_signal_info - record signal info for shutting down audit subsystem * @sig: signal value * @t: task being signaled * * If the audit subsystem is being terminated, record the task (pid) * and uid that is doing that. */ int audit_signal_info(int sig, struct task_struct *t) { kuid_t uid = current_uid(), auid; if (auditd_test_task(t) && (sig == SIGTERM || sig == SIGHUP || sig == SIGUSR1 || sig == SIGUSR2)) { audit_sig_pid = task_tgid_nr(current); auid = audit_get_loginuid(current); if (uid_valid(auid)) audit_sig_uid = auid; else audit_sig_uid = uid; security_current_getlsmprop_subj(&audit_sig_lsm); } return audit_signal_info_syscall(t); } /** * audit_log_end - end one audit record * @ab: the audit_buffer * * We can not do a netlink send inside an irq context because it blocks (last * arg, flags, is not set to MSG_DONTWAIT), so the audit buffer is placed on a * queue and a kthread is scheduled to remove them from the queue outside the * irq context. May be called in any context. */ void audit_log_end(struct audit_buffer *ab) { struct sk_buff *skb; struct nlmsghdr *nlh; if (!ab) return; if (audit_rate_check()) { skb = ab->skb; ab->skb = NULL; /* setup the netlink header, see the comments in * kauditd_send_multicast_skb() for length quirks */ nlh = nlmsg_hdr(skb); nlh->nlmsg_len = skb->len - NLMSG_HDRLEN; /* queue the netlink packet and poke the kauditd thread */ skb_queue_tail(&audit_queue, skb); wake_up_interruptible(&kauditd_wait); } else audit_log_lost("rate limit exceeded"); audit_buffer_free(ab); } /** * audit_log - Log an audit record * @ctx: audit context * @gfp_mask: type of allocation * @type: audit message type * @fmt: format string to use * @...: variable parameters matching the format string * * This is a convenience function that calls audit_log_start, * audit_log_vformat, and audit_log_end. It may be called * in any context. */ void audit_log(struct audit_context *ctx, gfp_t gfp_mask, int type, const char *fmt, ...) { struct audit_buffer *ab; va_list args; ab = audit_log_start(ctx, gfp_mask, type); if (ab) { va_start(args, fmt); audit_log_vformat(ab, fmt, args); va_end(args); audit_log_end(ab); } } EXPORT_SYMBOL(audit_log_start); EXPORT_SYMBOL(audit_log_end); EXPORT_SYMBOL(audit_log_format); EXPORT_SYMBOL(audit_log);
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 /* SPDX-License-Identifier: GPL-2.0 */ /* * Copyright (C) 2023 Arm Ltd. * * Based on arch/x86/include/asm/pkeys.h */ #ifndef _ASM_ARM64_PKEYS_H #define _ASM_ARM64_PKEYS_H #define ARCH_VM_PKEY_FLAGS (VM_PKEY_BIT0 | VM_PKEY_BIT1 | VM_PKEY_BIT2) #define arch_max_pkey() 8 int arch_set_user_pkey_access(struct task_struct *tsk, int pkey, unsigned long init_val); static inline bool arch_pkeys_enabled(void) { return system_supports_poe(); } static inline int vma_pkey(struct vm_area_struct *vma) { return (vma->vm_flags & ARCH_VM_PKEY_FLAGS) >> VM_PKEY_SHIFT; } static inline int arch_override_mprotect_pkey(struct vm_area_struct *vma, int prot, int pkey) { if (pkey != -1) return pkey; return vma_pkey(vma); } static inline int execute_only_pkey(struct mm_struct *mm) { // Execute-only mappings are handled by EPAN/FEAT_PAN3. return -1; } #define mm_pkey_allocation_map(mm) (mm)->context.pkey_allocation_map #define mm_set_pkey_allocated(mm, pkey) do { \ mm_pkey_allocation_map(mm) |= (1U << pkey); \ } while (0) #define mm_set_pkey_free(mm, pkey) do { \ mm_pkey_allocation_map(mm) &= ~(1U << pkey); \ } while (0) static inline bool mm_pkey_is_allocated(struct mm_struct *mm, int pkey) { /* * "Allocated" pkeys are those that have been returned * from pkey_alloc() or pkey 0 which is allocated * implicitly when the mm is created. */ if (pkey < 0 || pkey >= arch_max_pkey()) return false; return mm_pkey_allocation_map(mm) & (1U << pkey); } /* * Returns a positive, 3-bit key on success, or -1 on failure. */ static inline int mm_pkey_alloc(struct mm_struct *mm) { /* * Note: this is the one and only place we make sure * that the pkey is valid as far as the hardware is * concerned. The rest of the kernel trusts that * only good, valid pkeys come out of here. */ u8 all_pkeys_mask = GENMASK(arch_max_pkey() - 1, 0); int ret; if (!arch_pkeys_enabled()) return -1; /* * Are we out of pkeys? We must handle this specially * because ffz() behavior is undefined if there are no * zeros. */ if (mm_pkey_allocation_map(mm) == all_pkeys_mask) return -1; ret = ffz(mm_pkey_allocation_map(mm)); mm_set_pkey_allocated(mm, ret); return ret; } static inline int mm_pkey_free(struct mm_struct *mm, int pkey) { if (!mm_pkey_is_allocated(mm, pkey)) return -EINVAL; mm_set_pkey_free(mm, pkey); return 0; } #endif /* _ASM_ARM64_PKEYS_H */
12 12 12 12 12 12 12 12 12 10 11 11 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 // SPDX-License-Identifier: GPL-2.0-only /* * Copyright (C) 2010 Red Hat, Inc., Peter Zijlstra * * Provides a framework for enqueueing and running callbacks from hardirq * context. The enqueueing is NMI-safe. */ #include <linux/bug.h> #include <linux/kernel.h> #include <linux/export.h> #include <linux/irq_work.h> #include <linux/percpu.h> #include <linux/hardirq.h> #include <linux/irqflags.h> #include <linux/sched.h> #include <linux/tick.h> #include <linux/cpu.h> #include <linux/notifier.h> #include <linux/smp.h> #include <linux/smpboot.h> #include <asm/processor.h> #include <linux/kasan.h> #include <trace/events/ipi.h> static DEFINE_PER_CPU(struct llist_head, raised_list); static DEFINE_PER_CPU(struct llist_head, lazy_list); static DEFINE_PER_CPU(struct task_struct *, irq_workd); static void wake_irq_workd(void) { struct task_struct *tsk = __this_cpu_read(irq_workd); if (!llist_empty(this_cpu_ptr(&lazy_list)) && tsk) wake_up_process(tsk); } #ifdef CONFIG_SMP static void irq_work_wake(struct irq_work *entry) { wake_irq_workd(); } static DEFINE_PER_CPU(struct irq_work, irq_work_wakeup) = IRQ_WORK_INIT_HARD(irq_work_wake); #endif static int irq_workd_should_run(unsigned int cpu) { return !llist_empty(this_cpu_ptr(&lazy_list)); } /* * Claim the entry so that no one else will poke at it. */ static bool irq_work_claim(struct irq_work *work) { int oflags; oflags = atomic_fetch_or(IRQ_WORK_CLAIMED | CSD_TYPE_IRQ_WORK, &work->node.a_flags); /* * If the work is already pending, no need to raise the IPI. * The pairing smp_mb() in irq_work_single() makes sure * everything we did before is visible. */ if (oflags & IRQ_WORK_PENDING) return false; return true; } void __weak arch_irq_work_raise(void) { /* * Lame architectures will get the timer tick callback */ } static __always_inline void irq_work_raise(struct irq_work *work) { if (trace_ipi_send_cpu_enabled() && arch_irq_work_has_interrupt()) trace_ipi_send_cpu(smp_processor_id(), _RET_IP_, work->func); arch_irq_work_raise(); } /* Enqueue on current CPU, work must already be claimed and preempt disabled */ static void __irq_work_queue_local(struct irq_work *work) { struct llist_head *list; bool rt_lazy_work = false; bool lazy_work = false; int work_flags; work_flags = atomic_read(&work->node.a_flags); if (work_flags & IRQ_WORK_LAZY) lazy_work = true; else if (IS_ENABLED(CONFIG_PREEMPT_RT) && !(work_flags & IRQ_WORK_HARD_IRQ)) rt_lazy_work = true; if (lazy_work || rt_lazy_work) list = this_cpu_ptr(&lazy_list); else list = this_cpu_ptr(&raised_list); if (!llist_add(&work->node.llist, list)) return; /* If the work is "lazy", handle it from next tick if any */ if (!lazy_work || tick_nohz_tick_stopped()) irq_work_raise(work); } /* Enqueue the irq work @work on the current CPU */ bool irq_work_queue(struct irq_work *work) { /* Only queue if not already pending */ if (!irq_work_claim(work)) return false; /* Queue the entry and raise the IPI if needed. */ preempt_disable(); __irq_work_queue_local(work); preempt_enable(); return true; } EXPORT_SYMBOL_GPL(irq_work_queue); /* * Enqueue the irq_work @work on @cpu unless it's already pending * somewhere. * * Can be re-enqueued while the callback is still in progress. */ bool irq_work_queue_on(struct irq_work *work, int cpu) { #ifndef CONFIG_SMP return irq_work_queue(work); #else /* CONFIG_SMP: */ /* All work should have been flushed before going offline */ WARN_ON_ONCE(cpu_is_offline(cpu)); /* Only queue if not already pending */ if (!irq_work_claim(work)) return false; kasan_record_aux_stack(work); preempt_disable(); if (cpu != smp_processor_id()) { /* Arch remote IPI send/receive backend aren't NMI safe */ WARN_ON_ONCE(in_nmi()); /* * On PREEMPT_RT the items which are not marked as * IRQ_WORK_HARD_IRQ are added to the lazy list and a HARD work * item is used on the remote CPU to wake the thread. */ if (IS_ENABLED(CONFIG_PREEMPT_RT) && !(atomic_read(&work->node.a_flags) & IRQ_WORK_HARD_IRQ)) { if (!llist_add(&work->node.llist, &per_cpu(lazy_list, cpu))) goto out; work = &per_cpu(irq_work_wakeup, cpu); if (!irq_work_claim(work)) goto out; } __smp_call_single_queue(cpu, &work->node.llist); } else { __irq_work_queue_local(work); } out: preempt_enable(); return true; #endif /* CONFIG_SMP */ } bool irq_work_needs_cpu(void) { struct llist_head *raised, *lazy; raised = this_cpu_ptr(&raised_list); lazy = this_cpu_ptr(&lazy_list); if (llist_empty(raised) || arch_irq_work_has_interrupt()) if (llist_empty(lazy)) return false; /* All work should have been flushed before going offline */ WARN_ON_ONCE(cpu_is_offline(smp_processor_id())); return true; } void irq_work_single(void *arg) { struct irq_work *work = arg; int flags; /* * Clear the PENDING bit, after this point the @work can be re-used. * The PENDING bit acts as a lock, and we own it, so we can clear it * without atomic ops. */ flags = atomic_read(&work->node.a_flags); flags &= ~IRQ_WORK_PENDING; atomic_set(&work->node.a_flags, flags); /* * See irq_work_claim(). */ smp_mb(); lockdep_irq_work_enter(flags); work->func(work); lockdep_irq_work_exit(flags); /* * Clear the BUSY bit, if set, and return to the free state if no-one * else claimed it meanwhile. */ (void)atomic_cmpxchg(&work->node.a_flags, flags, flags & ~IRQ_WORK_BUSY); if ((IS_ENABLED(CONFIG_PREEMPT_RT) && !irq_work_is_hard(work)) || !arch_irq_work_has_interrupt()) rcuwait_wake_up(&work->irqwait); } static void irq_work_run_list(struct llist_head *list) { struct irq_work *work, *tmp; struct llist_node *llnode; /* * On PREEMPT_RT IRQ-work which is not marked as HARD will be processed * in a per-CPU thread in preemptible context. Only the items which are * marked as IRQ_WORK_HARD_IRQ will be processed in hardirq context. */ BUG_ON(!irqs_disabled() && !IS_ENABLED(CONFIG_PREEMPT_RT)); if (llist_empty(list)) return; llnode = llist_del_all(list); llist_for_each_entry_safe(work, tmp, llnode, node.llist) irq_work_single(work); } /* * hotplug calls this through: * hotplug_cfd() -> flush_smp_call_function_queue() */ void irq_work_run(void) { irq_work_run_list(this_cpu_ptr(&raised_list)); if (!IS_ENABLED(CONFIG_PREEMPT_RT)) irq_work_run_list(this_cpu_ptr(&lazy_list)); else wake_irq_workd(); } EXPORT_SYMBOL_GPL(irq_work_run); void irq_work_tick(void) { struct llist_head *raised = this_cpu_ptr(&raised_list); if (!llist_empty(raised) && !arch_irq_work_has_interrupt()) irq_work_run_list(raised); if (!IS_ENABLED(CONFIG_PREEMPT_RT)) irq_work_run_list(this_cpu_ptr(&lazy_list)); else wake_irq_workd(); } /* * Synchronize against the irq_work @entry, ensures the entry is not * currently in use. */ void irq_work_sync(struct irq_work *work) { lockdep_assert_irqs_enabled(); might_sleep(); if ((IS_ENABLED(CONFIG_PREEMPT_RT) && !irq_work_is_hard(work)) || !arch_irq_work_has_interrupt()) { rcuwait_wait_event(&work->irqwait, !irq_work_is_busy(work), TASK_UNINTERRUPTIBLE); return; } while (irq_work_is_busy(work)) cpu_relax(); } EXPORT_SYMBOL_GPL(irq_work_sync); static void run_irq_workd(unsigned int cpu) { irq_work_run_list(this_cpu_ptr(&lazy_list)); } static void irq_workd_setup(unsigned int cpu) { sched_set_fifo_low(current); } static struct smp_hotplug_thread irqwork_threads = { .store = &irq_workd, .setup = irq_workd_setup, .thread_should_run = irq_workd_should_run, .thread_fn = run_irq_workd, .thread_comm = "irq_work/%u", }; static __init int irq_work_init_threads(void) { if (IS_ENABLED(CONFIG_PREEMPT_RT)) BUG_ON(smpboot_register_percpu_thread(&irqwork_threads)); return 0; } early_initcall(irq_work_init_threads);
107 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 /* SPDX-License-Identifier: GPL-2.0 */ /* * A hash table (hashtab) maintains associations between * key values and datum values. The type of the key values * and the type of the datum values is arbitrary. The * functions for hash computation and key comparison are * provided by the creator of the table. * * Author : Stephen Smalley, <stephen.smalley.work@gmail.com> */ #ifndef _SS_HASHTAB_H_ #define _SS_HASHTAB_H_ #include <linux/types.h> #include <linux/errno.h> #include <linux/sched.h> #define HASHTAB_MAX_NODES U32_MAX struct hashtab_key_params { u32 (*hash)(const void *key); /* hash func */ int (*cmp)(const void *key1, const void *key2); /* comparison func */ }; struct hashtab_node { void *key; void *datum; struct hashtab_node *next; }; struct hashtab { struct hashtab_node **htable; /* hash table */ u32 size; /* number of slots in hash table */ u32 nel; /* number of elements in hash table */ }; struct hashtab_info { u32 slots_used; u32 max_chain_len; u64 chain2_len_sum; }; /* * Initializes a new hash table with the specified characteristics. * * Returns -ENOMEM if insufficient space is available or 0 otherwise. */ int hashtab_init(struct hashtab *h, u32 nel_hint); int __hashtab_insert(struct hashtab *h, struct hashtab_node **dst, void *key, void *datum); /* * Inserts the specified (key, datum) pair into the specified hash table. * * Returns -ENOMEM on memory allocation error, * -EEXIST if there is already an entry with the same key, * -EINVAL for general errors or 0 otherwise. */ static inline int hashtab_insert(struct hashtab *h, void *key, void *datum, struct hashtab_key_params key_params) { u32 hvalue; struct hashtab_node *prev, *cur; cond_resched(); if (!h->size || h->nel == HASHTAB_MAX_NODES) return -EINVAL; hvalue = key_params.hash(key) & (h->size - 1); prev = NULL; cur = h->htable[hvalue]; while (cur) { int cmp = key_params.cmp(key, cur->key); if (cmp == 0) return -EEXIST; if (cmp < 0) break; prev = cur; cur = cur->next; } return __hashtab_insert(h, prev ? &prev->next : &h->htable[hvalue], key, datum); } /* * Searches for the entry with the specified key in the hash table. * * Returns NULL if no entry has the specified key or * the datum of the entry otherwise. */ static inline void *hashtab_search(struct hashtab *h, const void *key, struct hashtab_key_params key_params) { u32 hvalue; struct hashtab_node *cur; if (!h->size) return NULL; hvalue = key_params.hash(key) & (h->size - 1); cur = h->htable[hvalue]; while (cur) { int cmp = key_params.cmp(key, cur->key); if (cmp == 0) return cur->datum; if (cmp < 0) break; cur = cur->next; } return NULL; } /* * Destroys the specified hash table. */ void hashtab_destroy(struct hashtab *h); /* * Applies the specified apply function to (key,datum,args) * for each entry in the specified hash table. * * The order in which the function is applied to the entries * is dependent upon the internal structure of the hash table. * * If apply returns a non-zero status, then hashtab_map will cease * iterating through the hash table and will propagate the error * return to its caller. */ int hashtab_map(struct hashtab *h, int (*apply)(void *k, void *d, void *args), void *args); int hashtab_duplicate(struct hashtab *new, const struct hashtab *orig, int (*copy)(struct hashtab_node *new, const struct hashtab_node *orig, void *args), int (*destroy)(void *k, void *d, void *args), void *args); #ifdef CONFIG_SECURITY_SELINUX_DEBUG /* Fill info with some hash table statistics */ void hashtab_stat(struct hashtab *h, struct hashtab_info *info); #else static inline void hashtab_stat(struct hashtab *h, struct hashtab_info *info) { return; } #endif #endif /* _SS_HASHTAB_H */
4 4 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 1517 1518 1519 1520 1521 1522 1523 1524 1525 1526 1527 1528 1529 1530 1531 1532 1533 1534 1535 1536 1537 1538 1539 1540 1541 1542 1543 1544 1545 1546 1547 1548 1549 1550 1551 1552 1553 1554 1555 1556 1557 1558 1559 1560 1561 1562 1563 1564 1565 1566 1567 1568 1569 1570 1571 1572 1573 1574 1575 1576 1577 1578 1579 1580 1581 1582 1583 1584 1585 1586 1587 1588 1589 1590 1591 1592 1593 1594 1595 1596 1597 1598 1599 1600 1601 1602 1603 1604 1605 1606 1607 1608 1609 1610 1611 1612 1613 1614 1615 1616 1617 1618 1619 1620 1621 1622 1623 1624 1625 1626 1627 1628 1629 1630 1631 1632 1633 1634 1635 1636 1637 1638 1639 1640 1641 1642 1643 1644 1645 1646 1647 1648 1649 1650 1651 1652 1653 1654 1655 1656 1657 1658 1659 1660 1661 1662 1663 1664 1665 1666 1667 1668 1669 1670 1671 1672 1673 1674 1675 1676 1677 1678 1679 1680 1681 1682 1683 1684 1685 1686 1687 1688 1689 1690 1691 1692 1693 1694 1695 1696 1697 1698 1699 1700 1701 1702 1703 1704 1705 1706 1707 1708 1709 1710 1711 1712 1713 1714 1715 1716 1717 1718 1719 1720 1721 1722 1723 1724 1725 1726 1727 1728 1729 1730 1731 1732 1733 1734 1735 1736 1737 1738 1739 1740 1741 1742 1743 1744 1745 1746 1747 1748 1749 1750 1751 1752 1753 1754 1755 1756 1757 1758 1759 1760 1761 1762 1763 1764 1765 1766 1767 1768 1769 1770 1771 1772 1773 1774 1775 1776 1777 1778 1779 1780 1781 1782 1783 1784 1785 1786 1787 1788 1789 1790 1791 1792 1793 1794 1795 1796 1797 1798 1799 1800 1801 1802 1803 1804 1805 1806 1807 1808 1809 1810 1811 1812 1813 1814 1815 1816 1817 1818 1819 1820 1821 1822 1823 1824 1825 1826 1827 1828 1829 1830 1831 1832 1833 1834 1835 1836 1837 1838 1839 1840 1841 1842 1843 1844 1845 1846 1847 1848 1849 1850 1851 1852 1853 1854 1855 1856 1857 1858 1859 1860 1861 1862 1863 1864 1865 1866 1867 1868 1869 1870 1871 1872 1873 1874 1875 1876 1877 1878 1879 1880 1881 1882 1883 1884 1885 1886 1887 1888 1889 1890 1891 1892 1893 1894 1895 1896 1897 1898 1899 1900 1901 1902 1903 1904 1905 1906 1907 1908 1909 1910 1911 1912 1913 1914 1915 1916 1917 1918 1919 1920 1921 1922 1923 1924 1925 1926 1927 1928 1929 1930 1931 1932 1933 1934 1935 1936 1937 1938 1939 1940 1941 1942 1943 1944 1945 1946 1947 1948 1949 1950 1951 1952 1953 1954 1955 1956 1957 1958 1959 1960 1961 1962 1963 1964 1965 1966 1967 1968 1969 1970 1971 1972 1973 1974 1975 1976 1977 1978 1979 1980 1981 1982 1983 1984 1985 1986 1987 1988 1989 1990 1991 1992 1993 1994 1995 1996 1997 1998 1999 2000 2001 2002 2003 2004 2005 2006 2007 2008 2009 2010 2011 2012 2013 2014 2015 2016 2017 2018 2019 2020 2021 2022 2023 2024 2025 2026 2027 2028 2029 2030 2031 2032 2033 2034 2035 2036 2037 2038 2039 2040 2041 2042 2043 2044 2045 2046 2047 2048 2049 2050 2051 2052 2053 2054 2055 2056 2057 2058 2059 2060 2061 2062 2063 2064 2065 2066 2067 2068 2069 2070 2071 2072 2073 2074 2075 2076 2077 2078 2079 2080 2081 2082 2083 2084 2085 2086 2087 2088 2089 2090 2091 2092 2093 2094 2095 2096 2097 2098 2099 2100 2101 2102 2103 2104 2105 2106 2107 2108 2109 2110 2111 2112 2113 2114 2115 2116 2117 2118 2119 2120 2121 2122 2123 2124 2125 2126 2127 2128 2129 2130 2131 2132 2133 2134 2135 2136 2137 2138 2139 2140 2141 2142 2143 2144 2145 2146 2147 2148 2149 2150 2151 2152 2153 2154 2155 2156 2157 2158 2159 2160 2161 2162 2163 2164 2165 2166 2167 2168 2169 2170 2171 2172 2173 2174 2175 2176 2177 2178 2179 2180 2181 2182 2183 2184 2185 2186 2187 2188 2189 2190 2191 2192 2193 2194 2195 2196 2197 2198 2199 2200 2201 2202 2203 2204 2205 2206 2207 2208 2209 2210 2211 2212 2213 2214 2215 2216 2217 2218 2219 2220 2221 2222 2223 2224 2225 2226 2227 2228 2229 2230 2231 2232 2233 2234 2235 2236 2237 2238 2239 2240 2241 2242 2243 2244 2245 2246 2247 2248 2249 2250 2251 2252 2253 2254 2255 2256 2257 2258 2259 2260 2261 2262 2263 2264 2265 2266 2267 2268 2269 2270 2271 2272 2273 2274 2275 2276 2277 2278 2279 2280 2281 2282 2283 2284 2285 2286 2287 2288 2289 2290 2291 2292 2293 2294 2295 2296 2297 2298 2299 2300 2301 2302 2303 2304 2305 2306 2307 2308 2309 2310 2311 2312 2313 2314 2315 2316 2317 2318 2319 2320 2321 2322 2323 2324 2325 2326 2327 2328 2329 2330 2331 2332 2333 2334 2335 2336 2337 2338 2339 2340 2341 2342 2343 2344 2345 2346 2347 2348 2349 2350 2351 2352 2353 2354 2355 2356 2357 2358 2359 2360 2361 2362 2363 2364 2365 2366 2367 2368 2369 2370 2371 2372 2373 2374 2375 2376 2377 2378 2379 2380 2381 2382 2383 2384 2385 2386 2387 2388 2389 2390 2391 2392 2393 2394 2395 2396 2397 2398 2399 2400 2401 2402 2403 2404 2405 2406 2407 2408 2409 2410 2411 2412 2413 2414 2415 2416 2417 2418 2419 2420 2421 2422 2423 2424 2425 2426 2427 2428 // SPDX-License-Identifier: GPL-2.0-only /* * Copyright (c) 2008-2011, Intel Corporation. * * Description: Data Center Bridging netlink interface * Author: Lucy Liu <lucy.liu@intel.com> */ #include <linux/netdevice.h> #include <linux/netlink.h> #include <linux/slab.h> #include <net/netlink.h> #include <net/rtnetlink.h> #include <linux/dcbnl.h> #include <net/dcbevent.h> #include <linux/rtnetlink.h> #include <linux/init.h> #include <net/sock.h> /* Data Center Bridging (DCB) is a collection of Ethernet enhancements * intended to allow network traffic with differing requirements * (highly reliable, no drops vs. best effort vs. low latency) to operate * and co-exist on Ethernet. Current DCB features are: * * Enhanced Transmission Selection (aka Priority Grouping [PG]) - provides a * framework for assigning bandwidth guarantees to traffic classes. * * Priority-based Flow Control (PFC) - provides a flow control mechanism which * can work independently for each 802.1p priority. * * Congestion Notification - provides a mechanism for end-to-end congestion * control for protocols which do not have built-in congestion management. * * More information about the emerging standards for these Ethernet features * can be found at: http://www.ieee802.org/1/pages/dcbridges.html * * This file implements an rtnetlink interface to allow configuration of DCB * features for capable devices. */ /**************** DCB attribute policies *************************************/ /* DCB netlink attributes policy */ static const struct nla_policy dcbnl_rtnl_policy[DCB_ATTR_MAX + 1] = { [DCB_ATTR_IFNAME] = {.type = NLA_NUL_STRING, .len = IFNAMSIZ - 1}, [DCB_ATTR_STATE] = {.type = NLA_U8}, [DCB_ATTR_PFC_CFG] = {.type = NLA_NESTED}, [DCB_ATTR_PG_CFG] = {.type = NLA_NESTED}, [DCB_ATTR_SET_ALL] = {.type = NLA_U8}, [DCB_ATTR_PERM_HWADDR] = {.type = NLA_FLAG}, [DCB_ATTR_CAP] = {.type = NLA_NESTED}, [DCB_ATTR_PFC_STATE] = {.type = NLA_U8}, [DCB_ATTR_BCN] = {.type = NLA_NESTED}, [DCB_ATTR_APP] = {.type = NLA_NESTED}, [DCB_ATTR_IEEE] = {.type = NLA_NESTED}, [DCB_ATTR_DCBX] = {.type = NLA_U8}, [DCB_ATTR_FEATCFG] = {.type = NLA_NESTED}, }; /* DCB priority flow control to User Priority nested attributes */ static const struct nla_policy dcbnl_pfc_up_nest[DCB_PFC_UP_ATTR_MAX + 1] = { [DCB_PFC_UP_ATTR_0] = {.type = NLA_U8}, [DCB_PFC_UP_ATTR_1] = {.type = NLA_U8}, [DCB_PFC_UP_ATTR_2] = {.type = NLA_U8}, [DCB_PFC_UP_ATTR_3] = {.type = NLA_U8}, [DCB_PFC_UP_ATTR_4] = {.type = NLA_U8}, [DCB_PFC_UP_ATTR_5] = {.type = NLA_U8}, [DCB_PFC_UP_ATTR_6] = {.type = NLA_U8}, [DCB_PFC_UP_ATTR_7] = {.type = NLA_U8}, [DCB_PFC_UP_ATTR_ALL] = {.type = NLA_FLAG}, }; /* DCB priority grouping nested attributes */ static const struct nla_policy dcbnl_pg_nest[DCB_PG_ATTR_MAX + 1] = { [DCB_PG_ATTR_TC_0] = {.type = NLA_NESTED}, [DCB_PG_ATTR_TC_1] = {.type = NLA_NESTED}, [DCB_PG_ATTR_TC_2] = {.type = NLA_NESTED}, [DCB_PG_ATTR_TC_3] = {.type = NLA_NESTED}, [DCB_PG_ATTR_TC_4] = {.type = NLA_NESTED}, [DCB_PG_ATTR_TC_5] = {.type = NLA_NESTED}, [DCB_PG_ATTR_TC_6] = {.type = NLA_NESTED}, [DCB_PG_ATTR_TC_7] = {.type = NLA_NESTED}, [DCB_PG_ATTR_TC_ALL] = {.type = NLA_NESTED}, [DCB_PG_ATTR_BW_ID_0] = {.type = NLA_U8}, [DCB_PG_ATTR_BW_ID_1] = {.type = NLA_U8}, [DCB_PG_ATTR_BW_ID_2] = {.type = NLA_U8}, [DCB_PG_ATTR_BW_ID_3] = {.type = NLA_U8}, [DCB_PG_ATTR_BW_ID_4] = {.type = NLA_U8}, [DCB_PG_ATTR_BW_ID_5] = {.type = NLA_U8}, [DCB_PG_ATTR_BW_ID_6] = {.type = NLA_U8}, [DCB_PG_ATTR_BW_ID_7] = {.type = NLA_U8}, [DCB_PG_ATTR_BW_ID_ALL] = {.type = NLA_FLAG}, }; /* DCB traffic class nested attributes. */ static const struct nla_policy dcbnl_tc_param_nest[DCB_TC_ATTR_PARAM_MAX + 1] = { [DCB_TC_ATTR_PARAM_PGID] = {.type = NLA_U8}, [DCB_TC_ATTR_PARAM_UP_MAPPING] = {.type = NLA_U8}, [DCB_TC_ATTR_PARAM_STRICT_PRIO] = {.type = NLA_U8}, [DCB_TC_ATTR_PARAM_BW_PCT] = {.type = NLA_U8}, [DCB_TC_ATTR_PARAM_ALL] = {.type = NLA_FLAG}, }; /* DCB capabilities nested attributes. */ static const struct nla_policy dcbnl_cap_nest[DCB_CAP_ATTR_MAX + 1] = { [DCB_CAP_ATTR_ALL] = {.type = NLA_FLAG}, [DCB_CAP_ATTR_PG] = {.type = NLA_U8}, [DCB_CAP_ATTR_PFC] = {.type = NLA_U8}, [DCB_CAP_ATTR_UP2TC] = {.type = NLA_U8}, [DCB_CAP_ATTR_PG_TCS] = {.type = NLA_U8}, [DCB_CAP_ATTR_PFC_TCS] = {.type = NLA_U8}, [DCB_CAP_ATTR_GSP] = {.type = NLA_U8}, [DCB_CAP_ATTR_BCN] = {.type = NLA_U8}, [DCB_CAP_ATTR_DCBX] = {.type = NLA_U8}, }; /* DCB capabilities nested attributes. */ static const struct nla_policy dcbnl_numtcs_nest[DCB_NUMTCS_ATTR_MAX + 1] = { [DCB_NUMTCS_ATTR_ALL] = {.type = NLA_FLAG}, [DCB_NUMTCS_ATTR_PG] = {.type = NLA_U8}, [DCB_NUMTCS_ATTR_PFC] = {.type = NLA_U8}, }; /* DCB BCN nested attributes. */ static const struct nla_policy dcbnl_bcn_nest[DCB_BCN_ATTR_MAX + 1] = { [DCB_BCN_ATTR_RP_0] = {.type = NLA_U8}, [DCB_BCN_ATTR_RP_1] = {.type = NLA_U8}, [DCB_BCN_ATTR_RP_2] = {.type = NLA_U8}, [DCB_BCN_ATTR_RP_3] = {.type = NLA_U8}, [DCB_BCN_ATTR_RP_4] = {.type = NLA_U8}, [DCB_BCN_ATTR_RP_5] = {.type = NLA_U8}, [DCB_BCN_ATTR_RP_6] = {.type = NLA_U8}, [DCB_BCN_ATTR_RP_7] = {.type = NLA_U8}, [DCB_BCN_ATTR_RP_ALL] = {.type = NLA_FLAG}, [DCB_BCN_ATTR_BCNA_0] = {.type = NLA_U32}, [DCB_BCN_ATTR_BCNA_1] = {.type = NLA_U32}, [DCB_BCN_ATTR_ALPHA] = {.type = NLA_U32}, [DCB_BCN_ATTR_BETA] = {.type = NLA_U32}, [DCB_BCN_ATTR_GD] = {.type = NLA_U32}, [DCB_BCN_ATTR_GI] = {.type = NLA_U32}, [DCB_BCN_ATTR_TMAX] = {.type = NLA_U32}, [DCB_BCN_ATTR_TD] = {.type = NLA_U32}, [DCB_BCN_ATTR_RMIN] = {.type = NLA_U32}, [DCB_BCN_ATTR_W] = {.type = NLA_U32}, [DCB_BCN_ATTR_RD] = {.type = NLA_U32}, [DCB_BCN_ATTR_RU] = {.type = NLA_U32}, [DCB_BCN_ATTR_WRTT] = {.type = NLA_U32}, [DCB_BCN_ATTR_RI] = {.type = NLA_U32}, [DCB_BCN_ATTR_C] = {.type = NLA_U32}, [DCB_BCN_ATTR_ALL] = {.type = NLA_FLAG}, }; /* DCB APP nested attributes. */ static const struct nla_policy dcbnl_app_nest[DCB_APP_ATTR_MAX + 1] = { [DCB_APP_ATTR_IDTYPE] = {.type = NLA_U8}, [DCB_APP_ATTR_ID] = {.type = NLA_U16}, [DCB_APP_ATTR_PRIORITY] = {.type = NLA_U8}, }; /* IEEE 802.1Qaz nested attributes. */ static const struct nla_policy dcbnl_ieee_policy[DCB_ATTR_IEEE_MAX + 1] = { [DCB_ATTR_IEEE_ETS] = {.len = sizeof(struct ieee_ets)}, [DCB_ATTR_IEEE_PFC] = {.len = sizeof(struct ieee_pfc)}, [DCB_ATTR_IEEE_APP_TABLE] = {.type = NLA_NESTED}, [DCB_ATTR_IEEE_MAXRATE] = {.len = sizeof(struct ieee_maxrate)}, [DCB_ATTR_IEEE_QCN] = {.len = sizeof(struct ieee_qcn)}, [DCB_ATTR_IEEE_QCN_STATS] = {.len = sizeof(struct ieee_qcn_stats)}, [DCB_ATTR_DCB_BUFFER] = {.len = sizeof(struct dcbnl_buffer)}, [DCB_ATTR_DCB_APP_TRUST_TABLE] = {.type = NLA_NESTED}, }; /* DCB number of traffic classes nested attributes. */ static const struct nla_policy dcbnl_featcfg_nest[DCB_FEATCFG_ATTR_MAX + 1] = { [DCB_FEATCFG_ATTR_ALL] = {.type = NLA_FLAG}, [DCB_FEATCFG_ATTR_PG] = {.type = NLA_U8}, [DCB_FEATCFG_ATTR_PFC] = {.type = NLA_U8}, [DCB_FEATCFG_ATTR_APP] = {.type = NLA_U8}, }; static LIST_HEAD(dcb_app_list); static LIST_HEAD(dcb_rewr_list); static DEFINE_SPINLOCK(dcb_lock); static enum ieee_attrs_app dcbnl_app_attr_type_get(u8 selector) { switch (selector) { case IEEE_8021QAZ_APP_SEL_ETHERTYPE: case IEEE_8021QAZ_APP_SEL_STREAM: case IEEE_8021QAZ_APP_SEL_DGRAM: case IEEE_8021QAZ_APP_SEL_ANY: case IEEE_8021QAZ_APP_SEL_DSCP: return DCB_ATTR_IEEE_APP; case DCB_APP_SEL_PCP: return DCB_ATTR_DCB_APP; default: return DCB_ATTR_IEEE_APP_UNSPEC; } } static bool dcbnl_app_attr_type_validate(enum ieee_attrs_app type) { switch (type) { case DCB_ATTR_IEEE_APP: case DCB_ATTR_DCB_APP: return true; default: return false; } } static bool dcbnl_app_selector_validate(enum ieee_attrs_app type, u8 selector) { return dcbnl_app_attr_type_get(selector) == type; } static struct sk_buff *dcbnl_newmsg(int type, u8 cmd, u32 port, u32 seq, u32 flags, struct nlmsghdr **nlhp) { struct sk_buff *skb; struct dcbmsg *dcb; struct nlmsghdr *nlh; skb = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL); if (!skb) return NULL; nlh = nlmsg_put(skb, port, seq, type, sizeof(*dcb), flags); BUG_ON(!nlh); dcb = nlmsg_data(nlh); dcb->dcb_family = AF_UNSPEC; dcb->cmd = cmd; dcb->dcb_pad = 0; if (nlhp) *nlhp = nlh; return skb; } static int dcbnl_getstate(struct net_device *netdev, struct nlmsghdr *nlh, u32 seq, struct nlattr **tb, struct sk_buff *skb) { /* if (!tb[DCB_ATTR_STATE] || !netdev->dcbnl_ops->getstate) */ if (!netdev->dcbnl_ops->getstate) return -EOPNOTSUPP; return nla_put_u8(skb, DCB_ATTR_STATE, netdev->dcbnl_ops->getstate(netdev)); } static int dcbnl_getpfccfg(struct net_device *netdev, struct nlmsghdr *nlh, u32 seq, struct nlattr **tb, struct sk_buff *skb) { struct nlattr *data[DCB_PFC_UP_ATTR_MAX + 1], *nest; u8 value; int ret; int i; int getall = 0; if (!tb[DCB_ATTR_PFC_CFG]) return -EINVAL; if (!netdev->dcbnl_ops->getpfccfg) return -EOPNOTSUPP; ret = nla_parse_nested_deprecated(data, DCB_PFC_UP_ATTR_MAX, tb[DCB_ATTR_PFC_CFG], dcbnl_pfc_up_nest, NULL); if (ret) return ret; nest = nla_nest_start_noflag(skb, DCB_ATTR_PFC_CFG); if (!nest) return -EMSGSIZE; if (data[DCB_PFC_UP_ATTR_ALL]) getall = 1; for (i = DCB_PFC_UP_ATTR_0; i <= DCB_PFC_UP_ATTR_7; i++) { if (!getall && !data[i]) continue; netdev->dcbnl_ops->getpfccfg(netdev, i - DCB_PFC_UP_ATTR_0, &value); ret = nla_put_u8(skb, i, value); if (ret) { nla_nest_cancel(skb, nest); return ret; } } nla_nest_end(skb, nest); return 0; } static int dcbnl_getperm_hwaddr(struct net_device *netdev, struct nlmsghdr *nlh, u32 seq, struct nlattr **tb, struct sk_buff *skb) { u8 perm_addr[MAX_ADDR_LEN]; if (!netdev->dcbnl_ops->getpermhwaddr) return -EOPNOTSUPP; memset(perm_addr, 0, sizeof(perm_addr)); netdev->dcbnl_ops->getpermhwaddr(netdev, perm_addr); return nla_put(skb, DCB_ATTR_PERM_HWADDR, sizeof(perm_addr), perm_addr); } static int dcbnl_getcap(struct net_device *netdev, struct nlmsghdr *nlh, u32 seq, struct nlattr **tb, struct sk_buff *skb) { struct nlattr *data[DCB_CAP_ATTR_MAX + 1], *nest; u8 value; int ret; int i; int getall = 0; if (!tb[DCB_ATTR_CAP]) return -EINVAL; if (!netdev->dcbnl_ops->getcap) return -EOPNOTSUPP; ret = nla_parse_nested_deprecated(data, DCB_CAP_ATTR_MAX, tb[DCB_ATTR_CAP], dcbnl_cap_nest, NULL); if (ret) return ret; nest = nla_nest_start_noflag(skb, DCB_ATTR_CAP); if (!nest) return -EMSGSIZE; if (data[DCB_CAP_ATTR_ALL]) getall = 1; for (i = DCB_CAP_ATTR_ALL+1; i <= DCB_CAP_ATTR_MAX; i++) { if (!getall && !data[i]) continue; if (!netdev->dcbnl_ops->getcap(netdev, i, &value)) { ret = nla_put_u8(skb, i, value); if (ret) { nla_nest_cancel(skb, nest); return ret; } } } nla_nest_end(skb, nest); return 0; } static int dcbnl_getnumtcs(struct net_device *netdev, struct nlmsghdr *nlh, u32 seq, struct nlattr **tb, struct sk_buff *skb) { struct nlattr *data[DCB_NUMTCS_ATTR_MAX + 1], *nest; u8 value; int ret; int i; int getall = 0; if (!tb[DCB_ATTR_NUMTCS]) return -EINVAL; if (!netdev->dcbnl_ops->getnumtcs) return -EOPNOTSUPP; ret = nla_parse_nested_deprecated(data, DCB_NUMTCS_ATTR_MAX, tb[DCB_ATTR_NUMTCS], dcbnl_numtcs_nest, NULL); if (ret) return ret; nest = nla_nest_start_noflag(skb, DCB_ATTR_NUMTCS); if (!nest) return -EMSGSIZE; if (data[DCB_NUMTCS_ATTR_ALL]) getall = 1; for (i = DCB_NUMTCS_ATTR_ALL+1; i <= DCB_NUMTCS_ATTR_MAX; i++) { if (!getall && !data[i]) continue; ret = netdev->dcbnl_ops->getnumtcs(netdev, i, &value); if (!ret) { ret = nla_put_u8(skb, i, value); if (ret) { nla_nest_cancel(skb, nest); return ret; } } else return -EINVAL; } nla_nest_end(skb, nest); return 0; } static int dcbnl_setnumtcs(struct net_device *netdev, struct nlmsghdr *nlh, u32 seq, struct nlattr **tb, struct sk_buff *skb) { struct nlattr *data[DCB_NUMTCS_ATTR_MAX + 1]; int ret; u8 value; int i; if (!tb[DCB_ATTR_NUMTCS]) return -EINVAL; if (!netdev->dcbnl_ops->setnumtcs) return -EOPNOTSUPP; ret = nla_parse_nested_deprecated(data, DCB_NUMTCS_ATTR_MAX, tb[DCB_ATTR_NUMTCS], dcbnl_numtcs_nest, NULL); if (ret) return ret; for (i = DCB_NUMTCS_ATTR_ALL+1; i <= DCB_NUMTCS_ATTR_MAX; i++) { if (data[i] == NULL) continue; value = nla_get_u8(data[i]); ret = netdev->dcbnl_ops->setnumtcs(netdev, i, value); if (ret) break; } return nla_put_u8(skb, DCB_ATTR_NUMTCS, !!ret); } static int dcbnl_getpfcstate(struct net_device *netdev, struct nlmsghdr *nlh, u32 seq, struct nlattr **tb, struct sk_buff *skb) { if (!netdev->dcbnl_ops->getpfcstate) return -EOPNOTSUPP; return nla_put_u8(skb, DCB_ATTR_PFC_STATE, netdev->dcbnl_ops->getpfcstate(netdev)); } static int dcbnl_setpfcstate(struct net_device *netdev, struct nlmsghdr *nlh, u32 seq, struct nlattr **tb, struct sk_buff *skb) { u8 value; if (!tb[DCB_ATTR_PFC_STATE]) return -EINVAL; if (!netdev->dcbnl_ops->setpfcstate) return -EOPNOTSUPP; value = nla_get_u8(tb[DCB_ATTR_PFC_STATE]); netdev->dcbnl_ops->setpfcstate(netdev, value); return nla_put_u8(skb, DCB_ATTR_PFC_STATE, 0); } static int dcbnl_getapp(struct net_device *netdev, struct nlmsghdr *nlh, u32 seq, struct nlattr **tb, struct sk_buff *skb) { struct nlattr *app_nest; struct nlattr *app_tb[DCB_APP_ATTR_MAX + 1]; u16 id; u8 up, idtype; int ret; if (!tb[DCB_ATTR_APP]) return -EINVAL; ret = nla_parse_nested_deprecated(app_tb, DCB_APP_ATTR_MAX, tb[DCB_ATTR_APP], dcbnl_app_nest, NULL); if (ret) return ret; /* all must be non-null */ if ((!app_tb[DCB_APP_ATTR_IDTYPE]) || (!app_tb[DCB_APP_ATTR_ID])) return -EINVAL; /* either by eth type or by socket number */ idtype = nla_get_u8(app_tb[DCB_APP_ATTR_IDTYPE]); if ((idtype != DCB_APP_IDTYPE_ETHTYPE) && (idtype != DCB_APP_IDTYPE_PORTNUM)) return -EINVAL; id = nla_get_u16(app_tb[DCB_APP_ATTR_ID]); if (netdev->dcbnl_ops->getapp) { ret = netdev->dcbnl_ops->getapp(netdev, idtype, id); if (ret < 0) return ret; else up = ret; } else { struct dcb_app app = { .selector = idtype, .protocol = id, }; up = dcb_getapp(netdev, &app); } app_nest = nla_nest_start_noflag(skb, DCB_ATTR_APP); if (!app_nest) return -EMSGSIZE; ret = nla_put_u8(skb, DCB_APP_ATTR_IDTYPE, idtype); if (ret) goto out_cancel; ret = nla_put_u16(skb, DCB_APP_ATTR_ID, id); if (ret) goto out_cancel; ret = nla_put_u8(skb, DCB_APP_ATTR_PRIORITY, up); if (ret) goto out_cancel; nla_nest_end(skb, app_nest); return 0; out_cancel: nla_nest_cancel(skb, app_nest); return ret; } static int dcbnl_setapp(struct net_device *netdev, struct nlmsghdr *nlh, u32 seq, struct nlattr **tb, struct sk_buff *skb) { int ret; u16 id; u8 up, idtype; struct nlattr *app_tb[DCB_APP_ATTR_MAX + 1]; if (!tb[DCB_ATTR_APP]) return -EINVAL; ret = nla_parse_nested_deprecated(app_tb, DCB_APP_ATTR_MAX, tb[DCB_ATTR_APP], dcbnl_app_nest, NULL); if (ret) return ret; /* all must be non-null */ if ((!app_tb[DCB_APP_ATTR_IDTYPE]) || (!app_tb[DCB_APP_ATTR_ID]) || (!app_tb[DCB_APP_ATTR_PRIORITY])) return -EINVAL; /* either by eth type or by socket number */ idtype = nla_get_u8(app_tb[DCB_APP_ATTR_IDTYPE]); if ((idtype != DCB_APP_IDTYPE_ETHTYPE) && (idtype != DCB_APP_IDTYPE_PORTNUM)) return -EINVAL; id = nla_get_u16(app_tb[DCB_APP_ATTR_ID]); up = nla_get_u8(app_tb[DCB_APP_ATTR_PRIORITY]); if (netdev->dcbnl_ops->setapp) { ret = netdev->dcbnl_ops->setapp(netdev, idtype, id, up); if (ret < 0) return ret; } else { struct dcb_app app; app.selector = idtype; app.protocol = id; app.priority = up; ret = dcb_setapp(netdev, &app); } ret = nla_put_u8(skb, DCB_ATTR_APP, ret); dcbnl_cee_notify(netdev, RTM_SETDCB, DCB_CMD_SAPP, seq, 0); return ret; } static int __dcbnl_pg_getcfg(struct net_device *netdev, struct nlmsghdr *nlh, struct nlattr **tb, struct sk_buff *skb, int dir) { struct nlattr *pg_nest, *param_nest, *data; struct nlattr *pg_tb[DCB_PG_ATTR_MAX + 1]; struct nlattr *param_tb[DCB_TC_ATTR_PARAM_MAX + 1]; u8 prio, pgid, tc_pct, up_map; int ret; int getall = 0; int i; if (!tb[DCB_ATTR_PG_CFG]) return -EINVAL; if (!netdev->dcbnl_ops->getpgtccfgtx || !netdev->dcbnl_ops->getpgtccfgrx || !netdev->dcbnl_ops->getpgbwgcfgtx || !netdev->dcbnl_ops->getpgbwgcfgrx) return -EOPNOTSUPP; ret = nla_parse_nested_deprecated(pg_tb, DCB_PG_ATTR_MAX, tb[DCB_ATTR_PG_CFG], dcbnl_pg_nest, NULL); if (ret) return ret; pg_nest = nla_nest_start_noflag(skb, DCB_ATTR_PG_CFG); if (!pg_nest) return -EMSGSIZE; if (pg_tb[DCB_PG_ATTR_TC_ALL]) getall = 1; for (i = DCB_PG_ATTR_TC_0; i <= DCB_PG_ATTR_TC_7; i++) { if (!getall && !pg_tb[i]) continue; if (pg_tb[DCB_PG_ATTR_TC_ALL]) data = pg_tb[DCB_PG_ATTR_TC_ALL]; else data = pg_tb[i]; ret = nla_parse_nested_deprecated(param_tb, DCB_TC_ATTR_PARAM_MAX, data, dcbnl_tc_param_nest, NULL); if (ret) goto err_pg; param_nest = nla_nest_start_noflag(skb, i); if (!param_nest) goto err_pg; pgid = DCB_ATTR_VALUE_UNDEFINED; prio = DCB_ATTR_VALUE_UNDEFINED; tc_pct = DCB_ATTR_VALUE_UNDEFINED; up_map = DCB_ATTR_VALUE_UNDEFINED; if (dir) { /* Rx */ netdev->dcbnl_ops->getpgtccfgrx(netdev, i - DCB_PG_ATTR_TC_0, &prio, &pgid, &tc_pct, &up_map); } else { /* Tx */ netdev->dcbnl_ops->getpgtccfgtx(netdev, i - DCB_PG_ATTR_TC_0, &prio, &pgid, &tc_pct, &up_map); } if (param_tb[DCB_TC_ATTR_PARAM_PGID] || param_tb[DCB_TC_ATTR_PARAM_ALL]) { ret = nla_put_u8(skb, DCB_TC_ATTR_PARAM_PGID, pgid); if (ret) goto err_param; } if (param_tb[DCB_TC_ATTR_PARAM_UP_MAPPING] || param_tb[DCB_TC_ATTR_PARAM_ALL]) { ret = nla_put_u8(skb, DCB_TC_ATTR_PARAM_UP_MAPPING, up_map); if (ret) goto err_param; } if (param_tb[DCB_TC_ATTR_PARAM_STRICT_PRIO] || param_tb[DCB_TC_ATTR_PARAM_ALL]) { ret = nla_put_u8(skb, DCB_TC_ATTR_PARAM_STRICT_PRIO, prio); if (ret) goto err_param; } if (param_tb[DCB_TC_ATTR_PARAM_BW_PCT] || param_tb[DCB_TC_ATTR_PARAM_ALL]) { ret = nla_put_u8(skb, DCB_TC_ATTR_PARAM_BW_PCT, tc_pct); if (ret) goto err_param; } nla_nest_end(skb, param_nest); } if (pg_tb[DCB_PG_ATTR_BW_ID_ALL]) getall = 1; else getall = 0; for (i = DCB_PG_ATTR_BW_ID_0; i <= DCB_PG_ATTR_BW_ID_7; i++) { if (!getall && !pg_tb[i]) continue; tc_pct = DCB_ATTR_VALUE_UNDEFINED; if (dir) { /* Rx */ netdev->dcbnl_ops->getpgbwgcfgrx(netdev, i - DCB_PG_ATTR_BW_ID_0, &tc_pct); } else { /* Tx */ netdev->dcbnl_ops->getpgbwgcfgtx(netdev, i - DCB_PG_ATTR_BW_ID_0, &tc_pct); } ret = nla_put_u8(skb, i, tc_pct); if (ret) goto err_pg; } nla_nest_end(skb, pg_nest); return 0; err_param: nla_nest_cancel(skb, param_nest); err_pg: nla_nest_cancel(skb, pg_nest); return -EMSGSIZE; } static int dcbnl_pgtx_getcfg(struct net_device *netdev, struct nlmsghdr *nlh, u32 seq, struct nlattr **tb, struct sk_buff *skb) { return __dcbnl_pg_getcfg(netdev, nlh, tb, skb, 0); } static int dcbnl_pgrx_getcfg(struct net_device *netdev, struct nlmsghdr *nlh, u32 seq, struct nlattr **tb, struct sk_buff *skb) { return __dcbnl_pg_getcfg(netdev, nlh, tb, skb, 1); } static int dcbnl_setstate(struct net_device *netdev, struct nlmsghdr *nlh, u32 seq, struct nlattr **tb, struct sk_buff *skb) { u8 value; if (!tb[DCB_ATTR_STATE]) return -EINVAL; if (!netdev->dcbnl_ops->setstate) return -EOPNOTSUPP; value = nla_get_u8(tb[DCB_ATTR_STATE]); return nla_put_u8(skb, DCB_ATTR_STATE, netdev->dcbnl_ops->setstate(netdev, value)); } static int dcbnl_setpfccfg(struct net_device *netdev, struct nlmsghdr *nlh, u32 seq, struct nlattr **tb, struct sk_buff *skb) { struct nlattr *data[DCB_PFC_UP_ATTR_MAX + 1]; int i; int ret; u8 value; if (!tb[DCB_ATTR_PFC_CFG]) return -EINVAL; if (!netdev->dcbnl_ops->setpfccfg) return -EOPNOTSUPP; ret = nla_parse_nested_deprecated(data, DCB_PFC_UP_ATTR_MAX, tb[DCB_ATTR_PFC_CFG], dcbnl_pfc_up_nest, NULL); if (ret) return ret; for (i = DCB_PFC_UP_ATTR_0; i <= DCB_PFC_UP_ATTR_7; i++) { if (data[i] == NULL) continue; value = nla_get_u8(data[i]); netdev->dcbnl_ops->setpfccfg(netdev, data[i]->nla_type - DCB_PFC_UP_ATTR_0, value); } return nla_put_u8(skb, DCB_ATTR_PFC_CFG, 0); } static int dcbnl_setall(struct net_device *netdev, struct nlmsghdr *nlh, u32 seq, struct nlattr **tb, struct sk_buff *skb) { int ret; if (!tb[DCB_ATTR_SET_ALL]) return -EINVAL; if (!netdev->dcbnl_ops->setall) return -EOPNOTSUPP; ret = nla_put_u8(skb, DCB_ATTR_SET_ALL, netdev->dcbnl_ops->setall(netdev)); dcbnl_cee_notify(netdev, RTM_SETDCB, DCB_CMD_SET_ALL, seq, 0); return ret; } static int __dcbnl_pg_setcfg(struct net_device *netdev, struct nlmsghdr *nlh, u32 seq, struct nlattr **tb, struct sk_buff *skb, int dir) { struct nlattr *pg_tb[DCB_PG_ATTR_MAX + 1]; struct nlattr *param_tb[DCB_TC_ATTR_PARAM_MAX + 1]; int ret; int i; u8 pgid; u8 up_map; u8 prio; u8 tc_pct; if (!tb[DCB_ATTR_PG_CFG]) return -EINVAL; if (!netdev->dcbnl_ops->setpgtccfgtx || !netdev->dcbnl_ops->setpgtccfgrx || !netdev->dcbnl_ops->setpgbwgcfgtx || !netdev->dcbnl_ops->setpgbwgcfgrx) return -EOPNOTSUPP; ret = nla_parse_nested_deprecated(pg_tb, DCB_PG_ATTR_MAX, tb[DCB_ATTR_PG_CFG], dcbnl_pg_nest, NULL); if (ret) return ret; for (i = DCB_PG_ATTR_TC_0; i <= DCB_PG_ATTR_TC_7; i++) { if (!pg_tb[i]) continue; ret = nla_parse_nested_deprecated(param_tb, DCB_TC_ATTR_PARAM_MAX, pg_tb[i], dcbnl_tc_param_nest, NULL); if (ret) return ret; pgid = DCB_ATTR_VALUE_UNDEFINED; prio = DCB_ATTR_VALUE_UNDEFINED; tc_pct = DCB_ATTR_VALUE_UNDEFINED; up_map = DCB_ATTR_VALUE_UNDEFINED; if (param_tb[DCB_TC_ATTR_PARAM_STRICT_PRIO]) prio = nla_get_u8(param_tb[DCB_TC_ATTR_PARAM_STRICT_PRIO]); if (param_tb[DCB_TC_ATTR_PARAM_PGID]) pgid = nla_get_u8(param_tb[DCB_TC_ATTR_PARAM_PGID]); if (param_tb[DCB_TC_ATTR_PARAM_BW_PCT]) tc_pct = nla_get_u8(param_tb[DCB_TC_ATTR_PARAM_BW_PCT]); if (param_tb[DCB_TC_ATTR_PARAM_UP_MAPPING]) up_map = nla_get_u8(param_tb[DCB_TC_ATTR_PARAM_UP_MAPPING]); /* dir: Tx = 0, Rx = 1 */ if (dir) { /* Rx */ netdev->dcbnl_ops->setpgtccfgrx(netdev, i - DCB_PG_ATTR_TC_0, prio, pgid, tc_pct, up_map); } else { /* Tx */ netdev->dcbnl_ops->setpgtccfgtx(netdev, i - DCB_PG_ATTR_TC_0, prio, pgid, tc_pct, up_map); } } for (i = DCB_PG_ATTR_BW_ID_0; i <= DCB_PG_ATTR_BW_ID_7; i++) { if (!pg_tb[i]) continue; tc_pct = nla_get_u8(pg_tb[i]); /* dir: Tx = 0, Rx = 1 */ if (dir) { /* Rx */ netdev->dcbnl_ops->setpgbwgcfgrx(netdev, i - DCB_PG_ATTR_BW_ID_0, tc_pct); } else { /* Tx */ netdev->dcbnl_ops->setpgbwgcfgtx(netdev, i - DCB_PG_ATTR_BW_ID_0, tc_pct); } } return nla_put_u8(skb, DCB_ATTR_PG_CFG, 0); } static int dcbnl_pgtx_setcfg(struct net_device *netdev, struct nlmsghdr *nlh, u32 seq, struct nlattr **tb, struct sk_buff *skb) { return __dcbnl_pg_setcfg(netdev, nlh, seq, tb, skb, 0); } static int dcbnl_pgrx_setcfg(struct net_device *netdev, struct nlmsghdr *nlh, u32 seq, struct nlattr **tb, struct sk_buff *skb) { return __dcbnl_pg_setcfg(netdev, nlh, seq, tb, skb, 1); } static int dcbnl_bcn_getcfg(struct net_device *netdev, struct nlmsghdr *nlh, u32 seq, struct nlattr **tb, struct sk_buff *skb) { struct nlattr *bcn_nest; struct nlattr *bcn_tb[DCB_BCN_ATTR_MAX + 1]; u8 value_byte; u32 value_integer; int ret; bool getall = false; int i; if (!tb[DCB_ATTR_BCN]) return -EINVAL; if (!netdev->dcbnl_ops->getbcnrp || !netdev->dcbnl_ops->getbcncfg) return -EOPNOTSUPP; ret = nla_parse_nested_deprecated(bcn_tb, DCB_BCN_ATTR_MAX, tb[DCB_ATTR_BCN], dcbnl_bcn_nest, NULL); if (ret) return ret; bcn_nest = nla_nest_start_noflag(skb, DCB_ATTR_BCN); if (!bcn_nest) return -EMSGSIZE; if (bcn_tb[DCB_BCN_ATTR_ALL]) getall = true; for (i = DCB_BCN_ATTR_RP_0; i <= DCB_BCN_ATTR_RP_7; i++) { if (!getall && !bcn_tb[i]) continue; netdev->dcbnl_ops->getbcnrp(netdev, i - DCB_BCN_ATTR_RP_0, &value_byte); ret = nla_put_u8(skb, i, value_byte); if (ret) goto err_bcn; } for (i = DCB_BCN_ATTR_BCNA_0; i <= DCB_BCN_ATTR_RI; i++) { if (!getall && !bcn_tb[i]) continue; netdev->dcbnl_ops->getbcncfg(netdev, i, &value_integer); ret = nla_put_u32(skb, i, value_integer); if (ret) goto err_bcn; } nla_nest_end(skb, bcn_nest); return 0; err_bcn: nla_nest_cancel(skb, bcn_nest); return ret; } static int dcbnl_bcn_setcfg(struct net_device *netdev, struct nlmsghdr *nlh, u32 seq, struct nlattr **tb, struct sk_buff *skb) { struct nlattr *data[DCB_BCN_ATTR_MAX + 1]; int i; int ret; u8 value_byte; u32 value_int; if (!tb[DCB_ATTR_BCN]) return -EINVAL; if (!netdev->dcbnl_ops->setbcncfg || !netdev->dcbnl_ops->setbcnrp) return -EOPNOTSUPP; ret = nla_parse_nested_deprecated(data, DCB_BCN_ATTR_MAX, tb[DCB_ATTR_BCN], dcbnl_bcn_nest, NULL); if (ret) return ret; for (i = DCB_BCN_ATTR_RP_0; i <= DCB_BCN_ATTR_RP_7; i++) { if (data[i] == NULL) continue; value_byte = nla_get_u8(data[i]); netdev->dcbnl_ops->setbcnrp(netdev, data[i]->nla_type - DCB_BCN_ATTR_RP_0, value_byte); } for (i = DCB_BCN_ATTR_BCNA_0; i <= DCB_BCN_ATTR_RI; i++) { if (data[i] == NULL) continue; value_int = nla_get_u32(data[i]); netdev->dcbnl_ops->setbcncfg(netdev, i, value_int); } return nla_put_u8(skb, DCB_ATTR_BCN, 0); } static int dcbnl_build_peer_app(struct net_device *netdev, struct sk_buff* skb, int app_nested_type, int app_info_type, int app_entry_type) { struct dcb_peer_app_info info; struct dcb_app *table = NULL; const struct dcbnl_rtnl_ops *ops = netdev->dcbnl_ops; u16 app_count; int err; /** * retrieve the peer app configuration form the driver. If the driver * handlers fail exit without doing anything */ err = ops->peer_getappinfo(netdev, &info, &app_count); if (!err && app_count) { table = kmalloc_array(app_count, sizeof(struct dcb_app), GFP_KERNEL); if (!table) return -ENOMEM; err = ops->peer_getapptable(netdev, table); } if (!err) { u16 i; struct nlattr *app; /** * build the message, from here on the only possible failure * is due to the skb size */ err = -EMSGSIZE; app = nla_nest_start_noflag(skb, app_nested_type); if (!app) goto nla_put_failure; if (app_info_type && nla_put(skb, app_info_type, sizeof(info), &info)) goto nla_put_failure; for (i = 0; i < app_count; i++) { if (nla_put(skb, app_entry_type, sizeof(struct dcb_app), &table[i])) goto nla_put_failure; } nla_nest_end(skb, app); } err = 0; nla_put_failure: kfree(table); return err; } static int dcbnl_getapptrust(struct net_device *netdev, struct sk_buff *skb) { const struct dcbnl_rtnl_ops *ops = netdev->dcbnl_ops; enum ieee_attrs_app type; struct nlattr *apptrust; int nselectors, err, i; u8 *selectors; selectors = kzalloc(IEEE_8021QAZ_APP_SEL_MAX + 1, GFP_KERNEL); if (!selectors) return -ENOMEM; err = ops->dcbnl_getapptrust(netdev, selectors, &nselectors); if (err) { err = 0; goto out; } apptrust = nla_nest_start(skb, DCB_ATTR_DCB_APP_TRUST_TABLE); if (!apptrust) { err = -EMSGSIZE; goto out; } for (i = 0; i < nselectors; i++) { type = dcbnl_app_attr_type_get(selectors[i]); err = nla_put_u8(skb, type, selectors[i]); if (err) { nla_nest_cancel(skb, apptrust); goto out; } } nla_nest_end(skb, apptrust); out: kfree(selectors); return err; } /* Set or delete APP table or rewrite table entries. The APP struct is validated * and the appropriate callback function is called. */ static int dcbnl_app_table_setdel(struct nlattr *attr, struct net_device *netdev, int (*setdel)(struct net_device *dev, struct dcb_app *app)) { struct dcb_app *app_data; enum ieee_attrs_app type; struct nlattr *attr_itr; int rem, err; nla_for_each_nested(attr_itr, attr, rem) { type = nla_type(attr_itr); if (!dcbnl_app_attr_type_validate(type)) continue; if (nla_len(attr_itr) < sizeof(struct dcb_app)) return -ERANGE; app_data = nla_data(attr_itr); if (!dcbnl_app_selector_validate(type, app_data->selector)) return -EINVAL; err = setdel(netdev, app_data); if (err) return err; } return 0; } /* Handle IEEE 802.1Qaz/802.1Qau/802.1Qbb GET commands. */ static int dcbnl_ieee_fill(struct sk_buff *skb, struct net_device *netdev) { const struct dcbnl_rtnl_ops *ops = netdev->dcbnl_ops; struct nlattr *ieee, *app, *rewr; struct dcb_app_type *itr; int dcbx; int err; if (nla_put_string(skb, DCB_ATTR_IFNAME, netdev->name)) return -EMSGSIZE; ieee = nla_nest_start_noflag(skb, DCB_ATTR_IEEE); if (!ieee) return -EMSGSIZE; if (ops->ieee_getets) { struct ieee_ets ets; memset(&ets, 0, sizeof(ets)); err = ops->ieee_getets(netdev, &ets); if (!err && nla_put(skb, DCB_ATTR_IEEE_ETS, sizeof(ets), &ets)) return -EMSGSIZE; } if (ops->ieee_getmaxrate) { struct ieee_maxrate maxrate; memset(&maxrate, 0, sizeof(maxrate)); err = ops->ieee_getmaxrate(netdev, &maxrate); if (!err) { err = nla_put(skb, DCB_ATTR_IEEE_MAXRATE, sizeof(maxrate), &maxrate); if (err) return -EMSGSIZE; } } if (ops->ieee_getqcn) { struct ieee_qcn qcn; memset(&qcn, 0, sizeof(qcn)); err = ops->ieee_getqcn(netdev, &qcn); if (!err) { err = nla_put(skb, DCB_ATTR_IEEE_QCN, sizeof(qcn), &qcn); if (err) return -EMSGSIZE; } } if (ops->ieee_getqcnstats) { struct ieee_qcn_stats qcn_stats; memset(&qcn_stats, 0, sizeof(qcn_stats)); err = ops->ieee_getqcnstats(netdev, &qcn_stats); if (!err) { err = nla_put(skb, DCB_ATTR_IEEE_QCN_STATS, sizeof(qcn_stats), &qcn_stats); if (err) return -EMSGSIZE; } } if (ops->ieee_getpfc) { struct ieee_pfc pfc; memset(&pfc, 0, sizeof(pfc)); err = ops->ieee_getpfc(netdev, &pfc); if (!err && nla_put(skb, DCB_ATTR_IEEE_PFC, sizeof(pfc), &pfc)) return -EMSGSIZE; } if (ops->dcbnl_getbuffer) { struct dcbnl_buffer buffer; memset(&buffer, 0, sizeof(buffer)); err = ops->dcbnl_getbuffer(netdev, &buffer); if (!err && nla_put(skb, DCB_ATTR_DCB_BUFFER, sizeof(buffer), &buffer)) return -EMSGSIZE; } app = nla_nest_start_noflag(skb, DCB_ATTR_IEEE_APP_TABLE); if (!app) return -EMSGSIZE; spin_lock_bh(&dcb_lock); list_for_each_entry(itr, &dcb_app_list, list) { if (itr->ifindex == netdev->ifindex) { enum ieee_attrs_app type = dcbnl_app_attr_type_get(itr->app.selector); err = nla_put(skb, type, sizeof(itr->app), &itr->app); if (err) { spin_unlock_bh(&dcb_lock); return -EMSGSIZE; } } } if (netdev->dcbnl_ops->getdcbx) dcbx = netdev->dcbnl_ops->getdcbx(netdev); else dcbx = -EOPNOTSUPP; spin_unlock_bh(&dcb_lock); nla_nest_end(skb, app); rewr = nla_nest_start(skb, DCB_ATTR_DCB_REWR_TABLE); if (!rewr) return -EMSGSIZE; spin_lock_bh(&dcb_lock); list_for_each_entry(itr, &dcb_rewr_list, list) { if (itr->ifindex == netdev->ifindex) { enum ieee_attrs_app type = dcbnl_app_attr_type_get(itr->app.selector); err = nla_put(skb, type, sizeof(itr->app), &itr->app); if (err) { spin_unlock_bh(&dcb_lock); nla_nest_cancel(skb, rewr); return -EMSGSIZE; } } } spin_unlock_bh(&dcb_lock); nla_nest_end(skb, rewr); if (ops->dcbnl_getapptrust) { err = dcbnl_getapptrust(netdev, skb); if (err) return err; } /* get peer info if available */ if (ops->ieee_peer_getets) { struct ieee_ets ets; memset(&ets, 0, sizeof(ets)); err = ops->ieee_peer_getets(netdev, &ets); if (!err && nla_put(skb, DCB_ATTR_IEEE_PEER_ETS, sizeof(ets), &ets)) return -EMSGSIZE; } if (ops->ieee_peer_getpfc) { struct ieee_pfc pfc; memset(&pfc, 0, sizeof(pfc)); err = ops->ieee_peer_getpfc(netdev, &pfc); if (!err && nla_put(skb, DCB_ATTR_IEEE_PEER_PFC, sizeof(pfc), &pfc)) return -EMSGSIZE; } if (ops->peer_getappinfo && ops->peer_getapptable) { err = dcbnl_build_peer_app(netdev, skb, DCB_ATTR_IEEE_PEER_APP, DCB_ATTR_IEEE_APP_UNSPEC, DCB_ATTR_IEEE_APP); if (err) return -EMSGSIZE; } nla_nest_end(skb, ieee); if (dcbx >= 0) { err = nla_put_u8(skb, DCB_ATTR_DCBX, dcbx); if (err) return -EMSGSIZE; } return 0; } static int dcbnl_cee_pg_fill(struct sk_buff *skb, struct net_device *dev, int dir) { u8 pgid, up_map, prio, tc_pct; const struct dcbnl_rtnl_ops *ops = dev->dcbnl_ops; int i = dir ? DCB_ATTR_CEE_TX_PG : DCB_ATTR_CEE_RX_PG; struct nlattr *pg = nla_nest_start_noflag(skb, i); if (!pg) return -EMSGSIZE; for (i = DCB_PG_ATTR_TC_0; i <= DCB_PG_ATTR_TC_7; i++) { struct nlattr *tc_nest = nla_nest_start_noflag(skb, i); if (!tc_nest) return -EMSGSIZE; pgid = DCB_ATTR_VALUE_UNDEFINED; prio = DCB_ATTR_VALUE_UNDEFINED; tc_pct = DCB_ATTR_VALUE_UNDEFINED; up_map = DCB_ATTR_VALUE_UNDEFINED; if (!dir) ops->getpgtccfgrx(dev, i - DCB_PG_ATTR_TC_0, &prio, &pgid, &tc_pct, &up_map); else ops->getpgtccfgtx(dev, i - DCB_PG_ATTR_TC_0, &prio, &pgid, &tc_pct, &up_map); if (nla_put_u8(skb, DCB_TC_ATTR_PARAM_PGID, pgid) || nla_put_u8(skb, DCB_TC_ATTR_PARAM_UP_MAPPING, up_map) || nla_put_u8(skb, DCB_TC_ATTR_PARAM_STRICT_PRIO, prio) || nla_put_u8(skb, DCB_TC_ATTR_PARAM_BW_PCT, tc_pct)) return -EMSGSIZE; nla_nest_end(skb, tc_nest); } for (i = DCB_PG_ATTR_BW_ID_0; i <= DCB_PG_ATTR_BW_ID_7; i++) { tc_pct = DCB_ATTR_VALUE_UNDEFINED; if (!dir) ops->getpgbwgcfgrx(dev, i - DCB_PG_ATTR_BW_ID_0, &tc_pct); else ops->getpgbwgcfgtx(dev, i - DCB_PG_ATTR_BW_ID_0, &tc_pct); if (nla_put_u8(skb, i, tc_pct)) return -EMSGSIZE; } nla_nest_end(skb, pg); return 0; } static int dcbnl_cee_fill(struct sk_buff *skb, struct net_device *netdev) { struct nlattr *cee, *app; struct dcb_app_type *itr; const struct dcbnl_rtnl_ops *ops = netdev->dcbnl_ops; int dcbx, i, err = -EMSGSIZE; u8 value; if (nla_put_string(skb, DCB_ATTR_IFNAME, netdev->name)) goto nla_put_failure; cee = nla_nest_start_noflag(skb, DCB_ATTR_CEE); if (!cee) goto nla_put_failure; /* local pg */ if (ops->getpgtccfgtx && ops->getpgbwgcfgtx) { err = dcbnl_cee_pg_fill(skb, netdev, 1); if (err) goto nla_put_failure; } if (ops->getpgtccfgrx && ops->getpgbwgcfgrx) { err = dcbnl_cee_pg_fill(skb, netdev, 0); if (err) goto nla_put_failure; } /* local pfc */ if (ops->getpfccfg) { struct nlattr *pfc_nest = nla_nest_start_noflag(skb, DCB_ATTR_CEE_PFC); if (!pfc_nest) goto nla_put_failure; for (i = DCB_PFC_UP_ATTR_0; i <= DCB_PFC_UP_ATTR_7; i++) { ops->getpfccfg(netdev, i - DCB_PFC_UP_ATTR_0, &value); if (nla_put_u8(skb, i, value)) goto nla_put_failure; } nla_nest_end(skb, pfc_nest); } /* local app */ spin_lock_bh(&dcb_lock); app = nla_nest_start_noflag(skb, DCB_ATTR_CEE_APP_TABLE); if (!app) goto dcb_unlock; list_for_each_entry(itr, &dcb_app_list, list) { if (itr->ifindex == netdev->ifindex) { struct nlattr *app_nest = nla_nest_start_noflag(skb, DCB_ATTR_APP); if (!app_nest) goto dcb_unlock; err = nla_put_u8(skb, DCB_APP_ATTR_IDTYPE, itr->app.selector); if (err) goto dcb_unlock; err = nla_put_u16(skb, DCB_APP_ATTR_ID, itr->app.protocol); if (err) goto dcb_unlock; err = nla_put_u8(skb, DCB_APP_ATTR_PRIORITY, itr->app.priority); if (err) goto dcb_unlock; nla_nest_end(skb, app_nest); } } nla_nest_end(skb, app); if (netdev->dcbnl_ops->getdcbx) dcbx = netdev->dcbnl_ops->getdcbx(netdev); else dcbx = -EOPNOTSUPP; spin_unlock_bh(&dcb_lock); /* features flags */ if (ops->getfeatcfg) { struct nlattr *feat = nla_nest_start_noflag(skb, DCB_ATTR_CEE_FEAT); if (!feat) goto nla_put_failure; for (i = DCB_FEATCFG_ATTR_ALL + 1; i <= DCB_FEATCFG_ATTR_MAX; i++) if (!ops->getfeatcfg(netdev, i, &value) && nla_put_u8(skb, i, value)) goto nla_put_failure; nla_nest_end(skb, feat); } /* peer info if available */ if (ops->cee_peer_getpg) { struct cee_pg pg; memset(&pg, 0, sizeof(pg)); err = ops->cee_peer_getpg(netdev, &pg); if (!err && nla_put(skb, DCB_ATTR_CEE_PEER_PG, sizeof(pg), &pg)) goto nla_put_failure; } if (ops->cee_peer_getpfc) { struct cee_pfc pfc; memset(&pfc, 0, sizeof(pfc)); err = ops->cee_peer_getpfc(netdev, &pfc); if (!err && nla_put(skb, DCB_ATTR_CEE_PEER_PFC, sizeof(pfc), &pfc)) goto nla_put_failure; } if (ops->peer_getappinfo && ops->peer_getapptable) { err = dcbnl_build_peer_app(netdev, skb, DCB_ATTR_CEE_PEER_APP_TABLE, DCB_ATTR_CEE_PEER_APP_INFO, DCB_ATTR_CEE_PEER_APP); if (err) goto nla_put_failure; } nla_nest_end(skb, cee); /* DCBX state */ if (dcbx >= 0) { err = nla_put_u8(skb, DCB_ATTR_DCBX, dcbx); if (err) goto nla_put_failure; } return 0; dcb_unlock: spin_unlock_bh(&dcb_lock); nla_put_failure: err = -EMSGSIZE; return err; } static int dcbnl_notify(struct net_device *dev, int event, int cmd, u32 seq, u32 portid, int dcbx_ver) { struct net *net = dev_net(dev); struct sk_buff *skb; struct nlmsghdr *nlh; const struct dcbnl_rtnl_ops *ops = dev->dcbnl_ops; int err; if (!ops) return -EOPNOTSUPP; skb = dcbnl_newmsg(event, cmd, portid, seq, 0, &nlh); if (!skb) return -ENOMEM; if (dcbx_ver == DCB_CAP_DCBX_VER_IEEE) err = dcbnl_ieee_fill(skb, dev); else err = dcbnl_cee_fill(skb, dev); if (err < 0) { /* Report error to broadcast listeners */ nlmsg_free(skb); rtnl_set_sk_err(net, RTNLGRP_DCB, err); } else { /* End nlmsg and notify broadcast listeners */ nlmsg_end(skb, nlh); rtnl_notify(skb, net, 0, RTNLGRP_DCB, NULL, GFP_KERNEL); } return err; } int dcbnl_ieee_notify(struct net_device *dev, int event, int cmd, u32 seq, u32 portid) { return dcbnl_notify(dev, event, cmd, seq, portid, DCB_CAP_DCBX_VER_IEEE); } EXPORT_SYMBOL(dcbnl_ieee_notify); int dcbnl_cee_notify(struct net_device *dev, int event, int cmd, u32 seq, u32 portid) { return dcbnl_notify(dev, event, cmd, seq, portid, DCB_CAP_DCBX_VER_CEE); } EXPORT_SYMBOL(dcbnl_cee_notify); /* Handle IEEE 802.1Qaz/802.1Qau/802.1Qbb SET commands. * If any requested operation can not be completed * the entire msg is aborted and error value is returned. * No attempt is made to reconcile the case where only part of the * cmd can be completed. */ static int dcbnl_ieee_set(struct net_device *netdev, struct nlmsghdr *nlh, u32 seq, struct nlattr **tb, struct sk_buff *skb) { const struct dcbnl_rtnl_ops *ops = netdev->dcbnl_ops; struct nlattr *ieee[DCB_ATTR_IEEE_MAX + 1]; int prio; int err; if (!ops) return -EOPNOTSUPP; if (!tb[DCB_ATTR_IEEE]) return -EINVAL; err = nla_parse_nested_deprecated(ieee, DCB_ATTR_IEEE_MAX, tb[DCB_ATTR_IEEE], dcbnl_ieee_policy, NULL); if (err) return err; if (ieee[DCB_ATTR_IEEE_ETS] && ops->ieee_setets) { struct ieee_ets *ets = nla_data(ieee[DCB_ATTR_IEEE_ETS]); err = ops->ieee_setets(netdev, ets); if (err) goto err; } if (ieee[DCB_ATTR_IEEE_MAXRATE] && ops->ieee_setmaxrate) { struct ieee_maxrate *maxrate = nla_data(ieee[DCB_ATTR_IEEE_MAXRATE]); err = ops->ieee_setmaxrate(netdev, maxrate); if (err) goto err; } if (ieee[DCB_ATTR_IEEE_QCN] && ops->ieee_setqcn) { struct ieee_qcn *qcn = nla_data(ieee[DCB_ATTR_IEEE_QCN]); err = ops->ieee_setqcn(netdev, qcn); if (err) goto err; } if (ieee[DCB_ATTR_IEEE_PFC] && ops->ieee_setpfc) { struct ieee_pfc *pfc = nla_data(ieee[DCB_ATTR_IEEE_PFC]); err = ops->ieee_setpfc(netdev, pfc); if (err) goto err; } if (ieee[DCB_ATTR_DCB_BUFFER] && ops->dcbnl_setbuffer) { struct dcbnl_buffer *buffer = nla_data(ieee[DCB_ATTR_DCB_BUFFER]); for (prio = 0; prio < ARRAY_SIZE(buffer->prio2buffer); prio++) { if (buffer->prio2buffer[prio] >= DCBX_MAX_BUFFERS) { err = -EINVAL; goto err; } } err = ops->dcbnl_setbuffer(netdev, buffer); if (err) goto err; } if (ieee[DCB_ATTR_DCB_REWR_TABLE]) { err = dcbnl_app_table_setdel(ieee[DCB_ATTR_DCB_REWR_TABLE], netdev, ops->dcbnl_setrewr ?: dcb_setrewr); if (err) goto err; } if (ieee[DCB_ATTR_IEEE_APP_TABLE]) { err = dcbnl_app_table_setdel(ieee[DCB_ATTR_IEEE_APP_TABLE], netdev, ops->ieee_setapp ?: dcb_ieee_setapp); if (err) goto err; } if (ieee[DCB_ATTR_DCB_APP_TRUST_TABLE]) { u8 selectors[IEEE_8021QAZ_APP_SEL_MAX + 1] = {0}; struct nlattr *attr; int nselectors = 0; int rem; if (!ops->dcbnl_setapptrust) { err = -EOPNOTSUPP; goto err; } nla_for_each_nested(attr, ieee[DCB_ATTR_DCB_APP_TRUST_TABLE], rem) { enum ieee_attrs_app type = nla_type(attr); u8 selector; int i; if (!dcbnl_app_attr_type_validate(type) || nla_len(attr) != 1 || nselectors >= sizeof(selectors)) { err = -EINVAL; goto err; } selector = nla_get_u8(attr); if (!dcbnl_app_selector_validate(type, selector)) { err = -EINVAL; goto err; } /* Duplicate selector ? */ for (i = 0; i < nselectors; i++) { if (selectors[i] == selector) { err = -EINVAL; goto err; } } selectors[nselectors++] = selector; } err = ops->dcbnl_setapptrust(netdev, selectors, nselectors); if (err) goto err; } err: err = nla_put_u8(skb, DCB_ATTR_IEEE, err); dcbnl_ieee_notify(netdev, RTM_SETDCB, DCB_CMD_IEEE_SET, seq, 0); return err; } static int dcbnl_ieee_get(struct net_device *netdev, struct nlmsghdr *nlh, u32 seq, struct nlattr **tb, struct sk_buff *skb) { const struct dcbnl_rtnl_ops *ops = netdev->dcbnl_ops; if (!ops) return -EOPNOTSUPP; return dcbnl_ieee_fill(skb, netdev); } static int dcbnl_ieee_del(struct net_device *netdev, struct nlmsghdr *nlh, u32 seq, struct nlattr **tb, struct sk_buff *skb) { const struct dcbnl_rtnl_ops *ops = netdev->dcbnl_ops; struct nlattr *ieee[DCB_ATTR_IEEE_MAX + 1]; int err; if (!ops) return -EOPNOTSUPP; if (!tb[DCB_ATTR_IEEE]) return -EINVAL; err = nla_parse_nested_deprecated(ieee, DCB_ATTR_IEEE_MAX, tb[DCB_ATTR_IEEE], dcbnl_ieee_policy, NULL); if (err) return err; if (ieee[DCB_ATTR_IEEE_APP_TABLE]) { err = dcbnl_app_table_setdel(ieee[DCB_ATTR_IEEE_APP_TABLE], netdev, ops->ieee_delapp ?: dcb_ieee_delapp); if (err) goto err; } if (ieee[DCB_ATTR_DCB_REWR_TABLE]) { err = dcbnl_app_table_setdel(ieee[DCB_ATTR_DCB_REWR_TABLE], netdev, ops->dcbnl_delrewr ?: dcb_delrewr); if (err) goto err; } err: err = nla_put_u8(skb, DCB_ATTR_IEEE, err); dcbnl_ieee_notify(netdev, RTM_SETDCB, DCB_CMD_IEEE_DEL, seq, 0); return err; } /* DCBX configuration */ static int dcbnl_getdcbx(struct net_device *netdev, struct nlmsghdr *nlh, u32 seq, struct nlattr **tb, struct sk_buff *skb) { if (!netdev->dcbnl_ops->getdcbx) return -EOPNOTSUPP; return nla_put_u8(skb, DCB_ATTR_DCBX, netdev->dcbnl_ops->getdcbx(netdev)); } static int dcbnl_setdcbx(struct net_device *netdev, struct nlmsghdr *nlh, u32 seq, struct nlattr **tb, struct sk_buff *skb) { u8 value; if (!netdev->dcbnl_ops->setdcbx) return -EOPNOTSUPP; if (!tb[DCB_ATTR_DCBX]) return -EINVAL; value = nla_get_u8(tb[DCB_ATTR_DCBX]); return nla_put_u8(skb, DCB_ATTR_DCBX, netdev->dcbnl_ops->setdcbx(netdev, value)); } static int dcbnl_getfeatcfg(struct net_device *netdev, struct nlmsghdr *nlh, u32 seq, struct nlattr **tb, struct sk_buff *skb) { struct nlattr *data[DCB_FEATCFG_ATTR_MAX + 1], *nest; u8 value; int ret, i; int getall = 0; if (!netdev->dcbnl_ops->getfeatcfg) return -EOPNOTSUPP; if (!tb[DCB_ATTR_FEATCFG]) return -EINVAL; ret = nla_parse_nested_deprecated(data, DCB_FEATCFG_ATTR_MAX, tb[DCB_ATTR_FEATCFG], dcbnl_featcfg_nest, NULL); if (ret) return ret; nest = nla_nest_start_noflag(skb, DCB_ATTR_FEATCFG); if (!nest) return -EMSGSIZE; if (data[DCB_FEATCFG_ATTR_ALL]) getall = 1; for (i = DCB_FEATCFG_ATTR_ALL+1; i <= DCB_FEATCFG_ATTR_MAX; i++) { if (!getall && !data[i]) continue; ret = netdev->dcbnl_ops->getfeatcfg(netdev, i, &value); if (!ret) ret = nla_put_u8(skb, i, value); if (ret) { nla_nest_cancel(skb, nest); goto nla_put_failure; } } nla_nest_end(skb, nest); nla_put_failure: return ret; } static int dcbnl_setfeatcfg(struct net_device *netdev, struct nlmsghdr *nlh, u32 seq, struct nlattr **tb, struct sk_buff *skb) { struct nlattr *data[DCB_FEATCFG_ATTR_MAX + 1]; int ret, i; u8 value; if (!netdev->dcbnl_ops->setfeatcfg) return -ENOTSUPP; if (!tb[DCB_ATTR_FEATCFG]) return -EINVAL; ret = nla_parse_nested_deprecated(data, DCB_FEATCFG_ATTR_MAX, tb[DCB_ATTR_FEATCFG], dcbnl_featcfg_nest, NULL); if (ret) goto err; for (i = DCB_FEATCFG_ATTR_ALL+1; i <= DCB_FEATCFG_ATTR_MAX; i++) { if (data[i] == NULL) continue; value = nla_get_u8(data[i]); ret = netdev->dcbnl_ops->setfeatcfg(netdev, i, value); if (ret) goto err; } err: ret = nla_put_u8(skb, DCB_ATTR_FEATCFG, ret); return ret; } /* Handle CEE DCBX GET commands. */ static int dcbnl_cee_get(struct net_device *netdev, struct nlmsghdr *nlh, u32 seq, struct nlattr **tb, struct sk_buff *skb) { const struct dcbnl_rtnl_ops *ops = netdev->dcbnl_ops; if (!ops) return -EOPNOTSUPP; return dcbnl_cee_fill(skb, netdev); } struct reply_func { /* reply netlink message type */ int type; /* function to fill message contents */ int (*cb)(struct net_device *, struct nlmsghdr *, u32, struct nlattr **, struct sk_buff *); }; static const struct reply_func reply_funcs[DCB_CMD_MAX+1] = { [DCB_CMD_GSTATE] = { RTM_GETDCB, dcbnl_getstate }, [DCB_CMD_SSTATE] = { RTM_SETDCB, dcbnl_setstate }, [DCB_CMD_PFC_GCFG] = { RTM_GETDCB, dcbnl_getpfccfg }, [DCB_CMD_PFC_SCFG] = { RTM_SETDCB, dcbnl_setpfccfg }, [DCB_CMD_GPERM_HWADDR] = { RTM_GETDCB, dcbnl_getperm_hwaddr }, [DCB_CMD_GCAP] = { RTM_GETDCB, dcbnl_getcap }, [DCB_CMD_GNUMTCS] = { RTM_GETDCB, dcbnl_getnumtcs }, [DCB_CMD_SNUMTCS] = { RTM_SETDCB, dcbnl_setnumtcs }, [DCB_CMD_PFC_GSTATE] = { RTM_GETDCB, dcbnl_getpfcstate }, [DCB_CMD_PFC_SSTATE] = { RTM_SETDCB, dcbnl_setpfcstate }, [DCB_CMD_GAPP] = { RTM_GETDCB, dcbnl_getapp }, [DCB_CMD_SAPP] = { RTM_SETDCB, dcbnl_setapp }, [DCB_CMD_PGTX_GCFG] = { RTM_GETDCB, dcbnl_pgtx_getcfg }, [DCB_CMD_PGTX_SCFG] = { RTM_SETDCB, dcbnl_pgtx_setcfg }, [DCB_CMD_PGRX_GCFG] = { RTM_GETDCB, dcbnl_pgrx_getcfg }, [DCB_CMD_PGRX_SCFG] = { RTM_SETDCB, dcbnl_pgrx_setcfg }, [DCB_CMD_SET_ALL] = { RTM_SETDCB, dcbnl_setall }, [DCB_CMD_BCN_GCFG] = { RTM_GETDCB, dcbnl_bcn_getcfg }, [DCB_CMD_BCN_SCFG] = { RTM_SETDCB, dcbnl_bcn_setcfg }, [DCB_CMD_IEEE_GET] = { RTM_GETDCB, dcbnl_ieee_get }, [DCB_CMD_IEEE_SET] = { RTM_SETDCB, dcbnl_ieee_set }, [DCB_CMD_IEEE_DEL] = { RTM_SETDCB, dcbnl_ieee_del }, [DCB_CMD_GDCBX] = { RTM_GETDCB, dcbnl_getdcbx }, [DCB_CMD_SDCBX] = { RTM_SETDCB, dcbnl_setdcbx }, [DCB_CMD_GFEATCFG] = { RTM_GETDCB, dcbnl_getfeatcfg }, [DCB_CMD_SFEATCFG] = { RTM_SETDCB, dcbnl_setfeatcfg }, [DCB_CMD_CEE_GET] = { RTM_GETDCB, dcbnl_cee_get }, }; static int dcb_doit(struct sk_buff *skb, struct nlmsghdr *nlh, struct netlink_ext_ack *extack) { struct net *net = sock_net(skb->sk); struct net_device *netdev; struct dcbmsg *dcb = nlmsg_data(nlh); struct nlattr *tb[DCB_ATTR_MAX + 1]; u32 portid = NETLINK_CB(skb).portid; int ret = -EINVAL; struct sk_buff *reply_skb; struct nlmsghdr *reply_nlh = NULL; const struct reply_func *fn; if ((nlh->nlmsg_type == RTM_SETDCB) && !netlink_capable(skb, CAP_NET_ADMIN)) return -EPERM; ret = nlmsg_parse_deprecated(nlh, sizeof(*dcb), tb, DCB_ATTR_MAX, dcbnl_rtnl_policy, extack); if (ret < 0) return ret; if (dcb->cmd > DCB_CMD_MAX) return -EINVAL; /* check if a reply function has been defined for the command */ fn = &reply_funcs[dcb->cmd]; if (!fn->cb) return -EOPNOTSUPP; if (fn->type == RTM_SETDCB && !netlink_capable(skb, CAP_NET_ADMIN)) return -EPERM; if (!tb[DCB_ATTR_IFNAME]) return -EINVAL; netdev = __dev_get_by_name(net, nla_data(tb[DCB_ATTR_IFNAME])); if (!netdev) return -ENODEV; if (!netdev->dcbnl_ops) return -EOPNOTSUPP; reply_skb = dcbnl_newmsg(fn->type, dcb->cmd, portid, nlh->nlmsg_seq, nlh->nlmsg_flags, &reply_nlh); if (!reply_skb) return -ENOMEM; ret = fn->cb(netdev, nlh, nlh->nlmsg_seq, tb, reply_skb); if (ret < 0) { nlmsg_free(reply_skb); goto out; } nlmsg_end(reply_skb, reply_nlh); ret = rtnl_unicast(reply_skb, net, portid); out: return ret; } static struct dcb_app_type *dcb_rewr_lookup(const struct dcb_app *app, int ifindex, int proto) { struct dcb_app_type *itr; list_for_each_entry(itr, &dcb_rewr_list, list) { if (itr->app.selector == app->selector && itr->app.priority == app->priority && itr->ifindex == ifindex && ((proto == -1) || itr->app.protocol == proto)) return itr; } return NULL; } static struct dcb_app_type *dcb_app_lookup(const struct dcb_app *app, int ifindex, int prio) { struct dcb_app_type *itr; list_for_each_entry(itr, &dcb_app_list, list) { if (itr->app.selector == app->selector && itr->app.protocol == app->protocol && itr->ifindex == ifindex && ((prio == -1) || itr->app.priority == prio)) return itr; } return NULL; } static int dcb_app_add(struct list_head *list, const struct dcb_app *app, int ifindex) { struct dcb_app_type *entry; entry = kmalloc(sizeof(*entry), GFP_ATOMIC); if (!entry) return -ENOMEM; memcpy(&entry->app, app, sizeof(*app)); entry->ifindex = ifindex; list_add(&entry->list, list); return 0; } /** * dcb_getapp - retrieve the DCBX application user priority * @dev: network interface * @app: application to get user priority of * * On success returns a non-zero 802.1p user priority bitmap * otherwise returns 0 as the invalid user priority bitmap to * indicate an error. */ u8 dcb_getapp(struct net_device *dev, struct dcb_app *app) { struct dcb_app_type *itr; u8 prio = 0; spin_lock_bh(&dcb_lock); itr = dcb_app_lookup(app, dev->ifindex, -1); if (itr) prio = itr->app.priority; spin_unlock_bh(&dcb_lock); return prio; } EXPORT_SYMBOL(dcb_getapp); /** * dcb_setapp - add CEE dcb application data to app list * @dev: network interface * @new: application data to add * * Priority 0 is an invalid priority in CEE spec. This routine * removes applications from the app list if the priority is * set to zero. Priority is expected to be 8-bit 802.1p user priority bitmap */ int dcb_setapp(struct net_device *dev, struct dcb_app *new) { struct dcb_app_type *itr; struct dcb_app_type event; int err = 0; event.ifindex = dev->ifindex; memcpy(&event.app, new, sizeof(event.app)); if (dev->dcbnl_ops->getdcbx) event.dcbx = dev->dcbnl_ops->getdcbx(dev); spin_lock_bh(&dcb_lock); /* Search for existing match and replace */ itr = dcb_app_lookup(new, dev->ifindex, -1); if (itr) { if (new->priority) itr->app.priority = new->priority; else { list_del(&itr->list); kfree(itr); } goto out; } /* App type does not exist add new application type */ if (new->priority) err = dcb_app_add(&dcb_app_list, new, dev->ifindex); out: spin_unlock_bh(&dcb_lock); if (!err) call_dcbevent_notifiers(DCB_APP_EVENT, &event); return err; } EXPORT_SYMBOL(dcb_setapp); /** * dcb_ieee_getapp_mask - retrieve the IEEE DCB application priority * @dev: network interface * @app: where to store the retrieve application data * * Helper routine which on success returns a non-zero 802.1Qaz user * priority bitmap otherwise returns 0 to indicate the dcb_app was * not found in APP list. */ u8 dcb_ieee_getapp_mask(struct net_device *dev, struct dcb_app *app) { struct dcb_app_type *itr; u8 prio = 0; spin_lock_bh(&dcb_lock); itr = dcb_app_lookup(app, dev->ifindex, -1); if (itr) prio |= 1 << itr->app.priority; spin_unlock_bh(&dcb_lock); return prio; } EXPORT_SYMBOL(dcb_ieee_getapp_mask); /* Get protocol value from rewrite entry. */ u16 dcb_getrewr(struct net_device *dev, struct dcb_app *app) { struct dcb_app_type *itr; u16 proto = 0; spin_lock_bh(&dcb_lock); itr = dcb_rewr_lookup(app, dev->ifindex, -1); if (itr) proto = itr->app.protocol; spin_unlock_bh(&dcb_lock); return proto; } EXPORT_SYMBOL(dcb_getrewr); /* Add rewrite entry to the rewrite list. */ int dcb_setrewr(struct net_device *dev, struct dcb_app *new) { int err; spin_lock_bh(&dcb_lock); /* Search for existing match and abort if found. */ if (dcb_rewr_lookup(new, dev->ifindex, new->protocol)) { err = -EEXIST; goto out; } err = dcb_app_add(&dcb_rewr_list, new, dev->ifindex); out: spin_unlock_bh(&dcb_lock); return err; } EXPORT_SYMBOL(dcb_setrewr); /* Delete rewrite entry from the rewrite list. */ int dcb_delrewr(struct net_device *dev, struct dcb_app *del) { struct dcb_app_type *itr; int err = -ENOENT; spin_lock_bh(&dcb_lock); /* Search for existing match and remove it. */ itr = dcb_rewr_lookup(del, dev->ifindex, del->protocol); if (itr) { list_del(&itr->list); kfree(itr); err = 0; } spin_unlock_bh(&dcb_lock); return err; } EXPORT_SYMBOL(dcb_delrewr); /** * dcb_ieee_setapp - add IEEE dcb application data to app list * @dev: network interface * @new: application data to add * * This adds Application data to the list. Multiple application * entries may exists for the same selector and protocol as long * as the priorities are different. Priority is expected to be a * 3-bit unsigned integer */ int dcb_ieee_setapp(struct net_device *dev, struct dcb_app *new) { struct dcb_app_type event; int err = 0; event.ifindex = dev->ifindex; memcpy(&event.app, new, sizeof(event.app)); if (dev->dcbnl_ops->getdcbx) event.dcbx = dev->dcbnl_ops->getdcbx(dev); spin_lock_bh(&dcb_lock); /* Search for existing match and abort if found */ if (dcb_app_lookup(new, dev->ifindex, new->priority)) { err = -EEXIST; goto out; } err = dcb_app_add(&dcb_app_list, new, dev->ifindex); out: spin_unlock_bh(&dcb_lock); if (!err) call_dcbevent_notifiers(DCB_APP_EVENT, &event); return err; } EXPORT_SYMBOL(dcb_ieee_setapp); /** * dcb_ieee_delapp - delete IEEE dcb application data from list * @dev: network interface * @del: application data to delete * * This removes a matching APP data from the APP list */ int dcb_ieee_delapp(struct net_device *dev, struct dcb_app *del) { struct dcb_app_type *itr; struct dcb_app_type event; int err = -ENOENT; event.ifindex = dev->ifindex; memcpy(&event.app, del, sizeof(event.app)); if (dev->dcbnl_ops->getdcbx) event.dcbx = dev->dcbnl_ops->getdcbx(dev); spin_lock_bh(&dcb_lock); /* Search for existing match and remove it. */ if ((itr = dcb_app_lookup(del, dev->ifindex, del->priority))) { list_del(&itr->list); kfree(itr); err = 0; } spin_unlock_bh(&dcb_lock); if (!err) call_dcbevent_notifiers(DCB_APP_EVENT, &event); return err; } EXPORT_SYMBOL(dcb_ieee_delapp); /* dcb_getrewr_prio_pcp_mask_map - For a given device, find mapping from * priorities to the PCP and DEI values assigned to that priority. */ void dcb_getrewr_prio_pcp_mask_map(const struct net_device *dev, struct dcb_rewr_prio_pcp_map *p_map) { int ifindex = dev->ifindex; struct dcb_app_type *itr; u8 prio; memset(p_map->map, 0, sizeof(p_map->map)); spin_lock_bh(&dcb_lock); list_for_each_entry(itr, &dcb_rewr_list, list) { if (itr->ifindex == ifindex && itr->app.selector == DCB_APP_SEL_PCP && itr->app.protocol < 16 && itr->app.priority < IEEE_8021QAZ_MAX_TCS) { prio = itr->app.priority; p_map->map[prio] |= 1 << itr->app.protocol; } } spin_unlock_bh(&dcb_lock); } EXPORT_SYMBOL(dcb_getrewr_prio_pcp_mask_map); /* dcb_getrewr_prio_dscp_mask_map - For a given device, find mapping from * priorities to the DSCP values assigned to that priority. */ void dcb_getrewr_prio_dscp_mask_map(const struct net_device *dev, struct dcb_ieee_app_prio_map *p_map) { int ifindex = dev->ifindex; struct dcb_app_type *itr; u8 prio; memset(p_map->map, 0, sizeof(p_map->map)); spin_lock_bh(&dcb_lock); list_for_each_entry(itr, &dcb_rewr_list, list) { if (itr->ifindex == ifindex && itr->app.selector == IEEE_8021QAZ_APP_SEL_DSCP && itr->app.protocol < 64 && itr->app.priority < IEEE_8021QAZ_MAX_TCS) { prio = itr->app.priority; p_map->map[prio] |= 1ULL << itr->app.protocol; } } spin_unlock_bh(&dcb_lock); } EXPORT_SYMBOL(dcb_getrewr_prio_dscp_mask_map); /* * dcb_ieee_getapp_prio_dscp_mask_map - For a given device, find mapping from * priorities to the DSCP values assigned to that priority. Initialize p_map * such that each map element holds a bit mask of DSCP values configured for * that priority by APP entries. */ void dcb_ieee_getapp_prio_dscp_mask_map(const struct net_device *dev, struct dcb_ieee_app_prio_map *p_map) { int ifindex = dev->ifindex; struct dcb_app_type *itr; u8 prio; memset(p_map->map, 0, sizeof(p_map->map)); spin_lock_bh(&dcb_lock); list_for_each_entry(itr, &dcb_app_list, list) { if (itr->ifindex == ifindex && itr->app.selector == IEEE_8021QAZ_APP_SEL_DSCP && itr->app.protocol < 64 && itr->app.priority < IEEE_8021QAZ_MAX_TCS) { prio = itr->app.priority; p_map->map[prio] |= 1ULL << itr->app.protocol; } } spin_unlock_bh(&dcb_lock); } EXPORT_SYMBOL(dcb_ieee_getapp_prio_dscp_mask_map); /* * dcb_ieee_getapp_dscp_prio_mask_map - For a given device, find mapping from * DSCP values to the priorities assigned to that DSCP value. Initialize p_map * such that each map element holds a bit mask of priorities configured for a * given DSCP value by APP entries. */ void dcb_ieee_getapp_dscp_prio_mask_map(const struct net_device *dev, struct dcb_ieee_app_dscp_map *p_map) { int ifindex = dev->ifindex; struct dcb_app_type *itr; memset(p_map->map, 0, sizeof(p_map->map)); spin_lock_bh(&dcb_lock); list_for_each_entry(itr, &dcb_app_list, list) { if (itr->ifindex == ifindex && itr->app.selector == IEEE_8021QAZ_APP_SEL_DSCP && itr->app.protocol < 64 && itr->app.priority < IEEE_8021QAZ_MAX_TCS) p_map->map[itr->app.protocol] |= 1 << itr->app.priority; } spin_unlock_bh(&dcb_lock); } EXPORT_SYMBOL(dcb_ieee_getapp_dscp_prio_mask_map); /* * Per 802.1Q-2014, the selector value of 1 is used for matching on Ethernet * type, with valid PID values >= 1536. A special meaning is then assigned to * protocol value of 0: "default priority. For use when priority is not * otherwise specified". * * dcb_ieee_getapp_default_prio_mask - For a given device, find all APP entries * of the form {$PRIO, ETHERTYPE, 0} and construct a bit mask of all default * priorities set by these entries. */ u8 dcb_ieee_getapp_default_prio_mask(const struct net_device *dev) { int ifindex = dev->ifindex; struct dcb_app_type *itr; u8 mask = 0; spin_lock_bh(&dcb_lock); list_for_each_entry(itr, &dcb_app_list, list) { if (itr->ifindex == ifindex && itr->app.selector == IEEE_8021QAZ_APP_SEL_ETHERTYPE && itr->app.protocol == 0 && itr->app.priority < IEEE_8021QAZ_MAX_TCS) mask |= 1 << itr->app.priority; } spin_unlock_bh(&dcb_lock); return mask; } EXPORT_SYMBOL(dcb_ieee_getapp_default_prio_mask); static void dcbnl_flush_dev(struct net_device *dev) { struct dcb_app_type *itr, *tmp; spin_lock_bh(&dcb_lock); list_for_each_entry_safe(itr, tmp, &dcb_app_list, list) { if (itr->ifindex == dev->ifindex) { list_del(&itr->list); kfree(itr); } } spin_unlock_bh(&dcb_lock); } static int dcbnl_netdevice_event(struct notifier_block *nb, unsigned long event, void *ptr) { struct net_device *dev = netdev_notifier_info_to_dev(ptr); switch (event) { case NETDEV_UNREGISTER: if (!dev->dcbnl_ops) return NOTIFY_DONE; dcbnl_flush_dev(dev); return NOTIFY_OK; default: return NOTIFY_DONE; } } static struct notifier_block dcbnl_nb __read_mostly = { .notifier_call = dcbnl_netdevice_event, }; static const struct rtnl_msg_handler dcbnl_rtnl_msg_handlers[] __initconst = { {.msgtype = RTM_GETDCB, .doit = dcb_doit}, {.msgtype = RTM_SETDCB, .doit = dcb_doit}, }; static int __init dcbnl_init(void) { int err; err = register_netdevice_notifier(&dcbnl_nb); if (err) return err; rtnl_register_many(dcbnl_rtnl_msg_handlers); return 0; } device_initcall(dcbnl_init);
28 29 29 28 29 29 29 29 29 28 29 28 29 29 29 28 29 29 29 29 29 28 29 29 29 29 29 29 29 29 29 29 29 29 29 29 29 29 29 29 29 29 29 29 29 29 29 29 9 29 29 29 29 29 29 29 29 29 29 29 29 29 29 29 29 29 29 29 29 29 29 29 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 1517 1518 1519 1520 1521 1522 1523 1524 1525 1526 1527 1528 1529 1530 1531 1532 1533 1534 1535 1536 1537 1538 1539 1540 1541 1542 1543 1544 1545 1546 1547 1548 1549 1550 1551 1552 1553 1554 1555 1556 1557 1558 1559 1560 1561 1562 1563 1564 1565 1566 1567 1568 1569 1570 1571 1572 1573 1574 1575 1576 1577 1578 1579 1580 1581 1582 1583 1584 1585 1586 1587 1588 1589 1590 1591 1592 1593 1594 1595 1596 1597 1598 1599 1600 1601 1602 1603 1604 1605 1606 1607 1608 1609 1610 1611 1612 1613 1614 1615 1616 1617 1618 1619 1620 1621 1622 1623 1624 1625 1626 1627 1628 1629 1630 1631 1632 1633 1634 1635 1636 1637 1638 1639 1640 1641 1642 1643 1644 1645 1646 1647 1648 1649 1650 1651 1652 1653 1654 1655 1656 1657 1658 1659 1660 1661 1662 1663 1664 1665 1666 1667 1668 1669 1670 1671 1672 1673 1674 1675 1676 1677 1678 1679 1680 1681 1682 1683 1684 1685 1686 1687 1688 1689 1690 1691 1692 1693 1694 1695 1696 1697 1698 1699 1700 1701 1702 1703 1704 1705 1706 1707 1708 1709 1710 1711 1712 1713 1714 1715 1716 1717 1718 1719 1720 1721 1722 1723 1724 1725 1726 1727 1728 1729 1730 1731 1732 1733 1734 1735 1736 1737 1738 1739 1740 1741 1742 1743 1744 1745 1746 1747 1748 1749 1750 1751 1752 1753 1754 1755 1756 1757 1758 1759 1760 1761 1762 1763 1764 1765 1766 1767 1768 1769 1770 1771 1772 1773 1774 1775 1776 1777 1778 1779 1780 1781 1782 1783 1784 1785 1786 1787 1788 1789 1790 1791 1792 1793 1794 1795 1796 1797 1798 1799 1800 1801 1802 1803 1804 1805 1806 1807 1808 1809 1810 1811 1812 1813 1814 1815 1816 1817 1818 1819 1820 1821 1822 1823 1824 1825 1826 1827 1828 1829 1830 1831 1832 1833 1834 1835 1836 1837 1838 1839 1840 1841 1842 1843 1844 1845 1846 1847 1848 1849 1850 1851 1852 1853 1854 1855 1856 1857 1858 1859 1860 1861 1862 1863 1864 1865 1866 1867 1868 1869 1870 1871 1872 1873 1874 1875 1876 1877 1878 1879 1880 1881 1882 1883 1884 1885 1886 1887 1888 1889 1890 1891 1892 1893 1894 1895 1896 1897 1898 1899 1900 1901 1902 1903 1904 1905 1906 1907 1908 1909 1910 1911 1912 1913 1914 1915 1916 1917 1918 1919 1920 1921 1922 1923 1924 1925 1926 1927 1928 1929 1930 1931 1932 1933 1934 1935 1936 1937 1938 1939 1940 1941 1942 1943 1944 1945 1946 1947 1948 1949 1950 1951 1952 1953 1954 1955 1956 1957 1958 1959 1960 1961 1962 1963 1964 1965 1966 1967 1968 1969 1970 1971 1972 1973 1974 1975 1976 1977 1978 1979 1980 1981 1982 1983 1984 1985 1986 1987 1988 1989 1990 1991 1992 1993 1994 1995 1996 1997 1998 1999 2000 2001 2002 2003 2004 2005 2006 2007 2008 2009 2010 2011 2012 2013 2014 2015 2016 2017 2018 2019 2020 2021 2022 2023 2024 2025 2026 2027 2028 2029 2030 2031 2032 2033 2034 2035 2036 2037 2038 2039 2040 2041 2042 2043 2044 2045 2046 2047 2048 2049 2050 2051 2052 2053 2054 2055 2056 2057 2058 2059 2060 2061 2062 2063 2064 2065 2066 2067 2068 2069 2070 2071 2072 2073 2074 2075 2076 2077 2078 2079 2080 2081 2082 2083 2084 2085 2086 2087 2088 2089 2090 2091 2092 2093 2094 2095 2096 2097 2098 2099 2100 2101 2102 2103 2104 2105 2106 2107 2108 2109 2110 2111 2112 2113 2114 2115 2116 2117 2118 2119 2120 2121 2122 2123 2124 2125 2126 2127 2128 2129 2130 2131 2132 2133 2134 2135 2136 2137 2138 2139 2140 2141 2142 2143 2144 2145 2146 2147 2148 2149 2150 2151 2152 2153 2154 2155 2156 2157 2158 2159 2160 2161 2162 2163 2164 2165 2166 2167 2168 2169 2170 2171 2172 2173 2174 2175 2176 2177 2178 2179 2180 2181 2182 2183 2184 2185 2186 2187 2188 2189 2190 2191 2192 2193 2194 2195 2196 2197 2198 2199 2200 2201 2202 2203 2204 2205 2206 2207 2208 2209 2210 2211 2212 2213 2214 2215 2216 2217 2218 2219 2220 2221 2222 2223 2224 2225 2226 2227 2228 2229 2230 2231 2232 2233 2234 2235 2236 2237 2238 2239 2240 2241 2242 2243 2244 2245 2246 2247 2248 2249 2250 2251 2252 2253 2254 2255 2256 2257 2258 2259 2260 2261 2262 2263 2264 2265 2266 2267 2268 2269 2270 2271 2272 2273 2274 2275 2276 2277 2278 2279 2280 2281 2282 2283 2284 2285 2286 2287 2288 2289 2290 2291 2292 2293 2294 2295 2296 2297 2298 2299 2300 2301 2302 2303 2304 2305 2306 2307 2308 2309 2310 2311 2312 2313 2314 2315 2316 2317 2318 2319 2320 2321 2322 2323 2324 2325 2326 2327 2328 2329 2330 2331 2332 2333 2334 2335 2336 2337 2338 2339 2340 2341 2342 2343 2344 2345 2346 2347 2348 2349 2350 2351 2352 2353 2354 2355 2356 2357 2358 2359 2360 2361 2362 2363 2364 2365 2366 2367 2368 2369 2370 2371 2372 2373 2374 2375 2376 2377 2378 2379 2380 2381 2382 2383 2384 2385 2386 2387 2388 2389 2390 2391 2392 2393 2394 2395 2396 2397 2398 2399 2400 2401 2402 2403 2404 2405 2406 2407 2408 2409 2410 2411 2412 2413 2414 2415 2416 2417 2418 2419 2420 2421 2422 2423 2424 2425 2426 2427 2428 2429 2430 2431 2432 2433 2434 2435 2436 2437 2438 2439 2440 2441 2442 2443 2444 2445 2446 2447 2448 2449 2450 2451 2452 2453 2454 2455 2456 2457 2458 2459 2460 2461 2462 2463 2464 2465 2466 2467 2468 2469 2470 2471 2472 2473 2474 2475 2476 2477 2478 2479 2480 2481 2482 2483 2484 2485 2486 2487 2488 2489 2490 2491 2492 2493 2494 2495 2496 2497 2498 2499 2500 2501 2502 2503 2504 2505 2506 2507 2508 2509 2510 2511 2512 2513 2514 2515 2516 2517 2518 2519 2520 2521 2522 2523 2524 2525 2526 2527 2528 2529 2530 2531 2532 2533 2534 2535 2536 2537 2538 2539 2540 2541 2542 2543 2544 2545 2546 2547 2548 2549 2550 2551 2552 2553 2554 2555 2556 2557 2558 2559 2560 2561 2562 2563 2564 2565 2566 2567 2568 2569 2570 2571 2572 2573 2574 2575 2576 2577 2578 2579 2580 2581 2582 2583 2584 2585 2586 2587 2588 2589 2590 2591 2592 2593 2594 2595 2596 2597 2598 2599 2600 2601 2602 2603 2604 2605 2606 2607 2608 2609 2610 2611 2612 2613 2614 2615 2616 2617 2618 2619 2620 2621 2622 2623 2624 2625 2626 2627 2628 2629 2630 2631 2632 2633 2634 2635 2636 2637 2638 2639 2640 2641 2642 2643 2644 2645 2646 2647 2648 2649 2650 2651 2652 2653 2654 2655 2656 2657 2658 2659 2660 2661 2662 2663 2664 2665 2666 2667 2668 2669 2670 2671 2672 2673 2674 2675 2676 2677 2678 2679 2680 2681 2682 2683 2684 2685 2686 2687 2688 2689 2690 2691 2692 2693 2694 2695 2696 2697 2698 2699 2700 2701 2702 2703 2704 2705 2706 2707 2708 2709 2710 2711 2712 2713 2714 2715 2716 2717 2718 2719 2720 2721 2722 2723 2724 2725 2726 2727 2728 2729 2730 2731 2732 2733 2734 2735 2736 2737 2738 2739 2740 2741 2742 2743 2744 2745 2746 2747 2748 2749 2750 2751 2752 2753 2754 2755 2756 2757 2758 2759 2760 2761 2762 2763 2764 2765 2766 2767 2768 2769 2770 2771 2772 2773 2774 2775 2776 2777 2778 2779 2780 2781 2782 2783 2784 2785 2786 2787 2788 2789 2790 2791 2792 2793 2794 2795 2796 2797 2798 2799 2800 2801 2802 2803 2804 2805 2806 2807 2808 2809 2810 2811 2812 2813 2814 2815 2816 2817 2818 2819 2820 2821 2822 2823 2824 2825 2826 2827 2828 2829 2830 2831 2832 2833 2834 2835 2836 2837 2838 2839 2840 2841 2842 2843 2844 2845 2846 2847 2848 2849 2850 2851 2852 2853 2854 2855 2856 2857 2858 2859 2860 2861 2862 2863 2864 2865 2866 2867 2868 2869 2870 2871 2872 2873 2874 2875 2876 2877 2878 2879 2880 2881 2882 2883 2884 2885 2886 2887 2888 2889 2890 2891 2892 2893 2894 2895 2896 2897 2898 2899 2900 2901 2902 2903 2904 2905 2906 2907 2908 2909 2910 2911 2912 2913 2914 2915 2916 2917 2918 2919 2920 2921 2922 2923 2924 2925 2926 2927 2928 2929 2930 2931 2932 2933 2934 2935 2936 2937 2938 2939 2940 2941 2942 2943 2944 2945 2946 2947 2948 2949 2950 2951 2952 2953 2954 2955 2956 2957 2958 2959 2960 2961 2962 2963 2964 2965 2966 2967 2968 2969 2970 2971 2972 2973 2974 2975 2976 2977 2978 2979 2980 2981 2982 2983 2984 2985 2986 2987 2988 2989 2990 2991 2992 2993 2994 2995 2996 2997 2998 2999 3000 3001 3002 3003 3004 3005 3006 3007 3008 3009 3010 3011 3012 3013 3014 3015 3016 3017 3018 3019 3020 3021 3022 3023 3024 3025 3026 3027 3028 3029 3030 3031 3032 3033 3034 3035 3036 3037 3038 3039 3040 3041 3042 3043 3044 3045 3046 3047 3048 3049 3050 3051 3052 3053 3054 3055 3056 3057 3058 3059 3060 3061 3062 3063 3064 3065 3066 3067 3068 3069 3070 3071 3072 3073 3074 3075 3076 3077 3078 3079 3080 3081 3082 3083 3084 3085 3086 3087 3088 3089 3090 3091 3092 3093 3094 3095 3096 3097 3098 3099 3100 3101 3102 3103 3104 3105 3106 3107 3108 3109 3110 3111 3112 3113 3114 3115 3116 3117 3118 3119 3120 3121 3122 3123 3124 3125 3126 3127 3128 3129 3130 3131 3132 3133 3134 3135 3136 3137 3138 3139 3140 3141 3142 3143 3144 3145 3146 3147 3148 3149 3150 3151 3152 3153 3154 3155 3156 3157 3158 3159 3160 3161 3162 3163 3164 3165 3166 3167 3168 3169 3170 3171 3172 3173 3174 3175 3176 3177 3178 3179 3180 3181 3182 3183 3184 3185 3186 3187 3188 3189 3190 3191 3192 3193 3194 3195 3196 3197 3198 3199 3200 3201 3202 3203 3204 3205 3206 3207 3208 3209 3210 3211 3212 3213 3214 3215 3216 3217 3218 3219 3220 3221 3222 3223 3224 3225 3226 3227 3228 3229 3230 3231 3232 3233 3234 3235 3236 3237 3238 3239 3240 3241 3242 3243 3244 3245 3246 3247 3248 3249 3250 3251 3252 3253 3254 3255 3256 3257 3258 3259 3260 3261 3262 3263 3264 3265 3266 3267 3268 3269 3270 3271 3272 3273 3274 3275 3276 3277 3278 3279 3280 3281 3282 3283 3284 3285 3286 3287 3288 3289 3290 3291 3292 3293 3294 3295 3296 3297 3298 3299 3300 3301 3302 3303 3304 3305 3306 3307 3308 3309 3310 3311 3312 3313 3314 3315 3316 3317 3318 3319 3320 3321 3322 3323 3324 3325 3326 3327 3328 3329 3330 3331 3332 3333 3334 3335 3336 3337 3338 3339 3340 3341 3342 3343 3344 3345 3346 3347 3348 3349 3350 3351 3352 3353 3354 3355 3356 3357 3358 3359 3360 3361 3362 3363 3364 3365 3366 3367 3368 3369 3370 3371 3372 3373 3374 3375 3376 3377 3378 3379 3380 3381 3382 3383 3384 3385 3386 3387 3388 3389 3390 3391 3392 3393 3394 3395 3396 3397 3398 3399 3400 3401 3402 3403 3404 3405 3406 3407 3408 3409 3410 3411 3412 3413 3414 3415 3416 3417 3418 3419 3420 3421 3422 3423 3424 3425 3426 3427 3428 3429 3430 3431 3432 3433 3434 3435 3436 3437 3438 3439 3440 3441 3442 3443 3444 3445 3446 3447 3448 3449 3450 3451 3452 3453 3454 3455 3456 3457 3458 3459 3460 3461 3462 3463 3464 3465 3466 3467 3468 3469 3470 3471 3472 3473 3474 3475 3476 3477 3478 3479 3480 3481 3482 3483 3484 3485 3486 3487 3488 3489 3490 3491 3492 3493 3494 3495 3496 3497 3498 3499 3500 3501 3502 3503 3504 3505 3506 3507 3508 3509 3510 3511 3512 3513 3514 3515 3516 3517 3518 3519 3520 3521 3522 3523 3524 3525 3526 3527 3528 3529 3530 3531 3532 3533 3534 3535 3536 3537 3538 3539 3540 3541 3542 3543 3544 3545 3546 3547 3548 3549 3550 3551 3552 3553 3554 3555 3556 3557 3558 3559 3560 3561 3562 3563 3564 3565 3566 3567 3568 3569 3570 3571 3572 3573 3574 3575 3576 3577 3578 3579 3580 3581 3582 3583 3584 3585 3586 3587 3588 3589 3590 3591 3592 3593 3594 3595 3596 3597 3598 3599 3600 3601 3602 3603 3604 3605 3606 3607 3608 3609 3610 3611 3612 3613 3614 3615 3616 3617 3618 3619 3620 3621 3622 3623 3624 3625 3626 3627 3628 3629 3630 3631 3632 3633 3634 3635 3636 3637 3638 3639 3640 3641 3642 3643 3644 3645 3646 3647 3648 3649 3650 3651 3652 3653 3654 3655 3656 3657 3658 3659 3660 3661 3662 3663 3664 3665 3666 3667 3668 3669 3670 3671 3672 3673 3674 3675 3676 3677 3678 3679 3680 3681 3682 3683 3684 3685 3686 3687 3688 3689 3690 3691 3692 3693 3694 3695 3696 3697 3698 3699 3700 3701 3702 3703 3704 3705 3706 3707 3708 3709 3710 3711 3712 3713 3714 3715 3716 3717 3718 3719 3720 3721 3722 3723 3724 3725 3726 3727 3728 3729 3730 3731 3732 3733 3734 3735 3736 3737 3738 3739 3740 3741 3742 3743 3744 3745 3746 3747 3748 3749 3750 3751 3752 3753 3754 3755 3756 3757 3758 3759 3760 3761 3762 3763 3764 3765 3766 3767 3768 3769 3770 3771 3772 3773 3774 3775 3776 3777 3778 3779 3780 3781 3782 3783 3784 3785 3786 3787 3788 3789 3790 3791 3792 3793 3794 3795 3796 3797 3798 3799 3800 3801 3802 3803 3804 3805 3806 3807 3808 3809 3810 3811 3812 3813 3814 3815 3816 3817 3818 3819 3820 3821 3822 3823 3824 3825 3826 3827 3828 3829 3830 3831 3832 3833 3834 3835 3836 3837 3838 3839 3840 3841 3842 3843 3844 3845 3846 3847 3848 3849 3850 3851 3852 3853 3854 3855 3856 3857 3858 3859 3860 3861 3862 3863 3864 3865 3866 3867 3868 3869 3870 3871 3872 3873 3874 3875 3876 3877 3878 3879 3880 3881 3882 3883 3884 3885 3886 3887 3888 3889 3890 3891 3892 3893 3894 3895 3896 3897 3898 3899 3900 3901 3902 3903 3904 3905 3906 3907 3908 3909 3910 3911 3912 3913 3914 3915 3916 3917 3918 3919 3920 3921 3922 3923 3924 3925 3926 3927 3928 3929 3930 3931 3932 3933 3934 3935 3936 3937 3938 3939 3940 3941 3942 3943 3944 3945 3946 3947 3948 3949 3950 3951 3952 3953 3954 3955 3956 3957 3958 3959 3960 3961 3962 3963 3964 3965 3966 3967 3968 3969 3970 3971 3972 3973 3974 3975 3976 3977 3978 3979 3980 3981 3982 3983 3984 3985 3986 3987 3988 3989 3990 3991 3992 3993 3994 3995 3996 3997 3998 3999 4000 4001 4002 4003 4004 4005 4006 4007 4008 4009 4010 4011 4012 4013 4014 4015 4016 4017 4018 4019 4020 4021 4022 4023 4024 4025 4026 4027 4028 4029 4030 4031 4032 4033 4034 4035 4036 4037 4038 4039 4040 4041 4042 4043 4044 4045 4046 4047 4048 4049 4050 4051 4052 4053 4054 4055 4056 4057 4058 4059 4060 4061 4062 4063 4064 4065 4066 4067 4068 4069 4070 4071 4072 4073 4074 4075 4076 4077 4078 4079 4080 4081 4082 4083 4084 4085 4086 4087 4088 4089 4090 4091 4092 4093 4094 4095 4096 4097 4098 4099 4100 4101 4102 4103 4104 4105 4106 4107 4108 4109 4110 4111 4112 4113 4114 4115 4116 4117 4118 4119 4120 4121 4122 4123 4124 4125 4126 4127 4128 4129 4130 4131 4132 4133 4134 4135 4136 4137 4138 4139 4140 4141 4142 4143 4144 4145 4146 4147 4148 4149 4150 4151 4152 4153 4154 4155 4156 4157 4158 4159 4160 4161 4162 4163 4164 4165 4166 4167 4168 4169 4170 4171 4172 4173 4174 4175 4176 4177 4178 4179 4180 4181 4182 4183 4184 4185 4186 4187 4188 4189 4190 4191 4192 4193 4194 4195 4196 4197 4198 4199 4200 4201 4202 4203 4204 4205 4206 4207 4208 4209 4210 4211 4212 4213 4214 4215 4216 4217 4218 4219 4220 4221 4222 4223 4224 4225 4226 4227 4228 4229 4230 4231 4232 4233 4234 4235 4236 4237 4238 4239 4240 4241 4242 4243 4244 4245 4246 4247 4248 4249 4250 4251 4252 4253 4254 4255 4256 4257 4258 4259 4260 4261 4262 4263 4264 4265 4266 4267 4268 4269 4270 4271 4272 4273 4274 4275 4276 4277 4278 4279 4280 4281 4282 4283 4284 4285 4286 4287 4288 4289 4290 4291 4292 4293 4294 4295 4296 4297 4298 4299 4300 4301 4302 4303 4304 4305 4306 4307 4308 4309 4310 4311 4312 4313 4314 4315 4316 4317 4318 4319 4320 4321 4322 4323 4324 4325 4326 4327 4328 4329 4330 4331 4332 4333 4334 4335 4336 4337 4338 4339 4340 4341 4342 4343 4344 4345 4346 4347 4348 4349 4350 4351 4352 4353 4354 4355 4356 4357 4358 4359 4360 4361 4362 4363 4364 4365 4366 4367 4368 4369 4370 4371 4372 4373 4374 4375 4376 4377 4378 4379 4380 4381 4382 4383 4384 4385 4386 4387 4388 4389 4390 4391 4392 4393 4394 4395 4396 4397 4398 4399 4400 4401 4402 4403 4404 4405 4406 4407 4408 4409 4410 4411 4412 4413 4414 4415 4416 4417 4418 4419 4420 4421 4422 4423 4424 4425 4426 4427 4428 4429 4430 4431 4432 4433 4434 4435 4436 4437 4438 4439 4440 4441 4442 4443 4444 4445 4446 4447 4448 4449 4450 4451 4452 4453 4454 4455 4456 4457 4458 4459 4460 4461 4462 4463 4464 4465 4466 4467 4468 4469 4470 4471 4472 4473 4474 4475 4476 4477 4478 4479 4480 4481 4482 4483 4484 4485 4486 4487 4488 4489 4490 4491 4492 4493 4494 4495 4496 4497 4498 4499 4500 4501 4502 4503 4504 4505 4506 4507 4508 4509 4510 4511 4512 4513 4514 4515 4516 4517 4518 4519 4520 4521 4522 4523 4524 4525 4526 4527 4528 4529 4530 4531 4532 4533 4534 4535 4536 4537 4538 4539 4540 4541 4542 4543 4544 4545 4546 4547 4548 4549 4550 4551 4552 4553 4554 4555 4556 4557 4558 4559 4560 4561 4562 4563 4564 4565 4566 4567 4568 4569 4570 4571 4572 4573 4574 4575 4576 4577 4578 4579 4580 4581 4582 4583 4584 4585 4586 4587 4588 4589 4590 4591 4592 4593 4594 4595 4596 4597 4598 4599 4600 4601 4602 4603 4604 4605 4606 4607 4608 4609 4610 4611 4612 4613 4614 4615 4616 4617 4618 4619 4620 4621 4622 4623 4624 4625 4626 4627 4628 4629 4630 4631 4632 4633 4634 4635 4636 4637 4638 4639 4640 4641 4642 4643 4644 4645 4646 4647 4648 4649 4650 4651 4652 4653 4654 4655 4656 4657 4658 4659 4660 4661 4662 4663 4664 4665 4666 4667 4668 4669 4670 4671 4672 4673 4674 4675 4676 4677 4678 4679 4680 4681 4682 4683 4684 4685 4686 4687 4688 4689 4690 4691 4692 4693 4694 4695 4696 4697 4698 4699 4700 4701 4702 4703 4704 4705 4706 4707 4708 4709 4710 4711 4712 4713 4714 4715 4716 4717 4718 4719 4720 4721 4722 4723 4724 4725 4726 4727 4728 4729 4730 4731 4732 4733 4734 4735 4736 4737 4738 4739 4740 4741 4742 4743 4744 4745 4746 4747 4748 4749 4750 4751 4752 4753 4754 4755 4756 4757 4758 4759 4760 4761 4762 4763 4764 4765 4766 4767 4768 4769 4770 4771 4772 4773 4774 4775 4776 4777 4778 4779 4780 4781 4782 4783 4784 4785 4786 4787 4788 4789 4790 4791 4792 4793 4794 4795 4796 4797 4798 4799 4800 4801 4802 4803 4804 4805 4806 4807 4808 4809 4810 4811 4812 4813 4814 4815 4816 4817 4818 4819 4820 4821 4822 4823 4824 4825 4826 4827 4828 4829 4830 4831 4832 4833 4834 4835 4836 4837 4838 4839 4840 4841 4842 4843 4844 4845 4846 4847 4848 4849 4850 4851 4852 4853 4854 4855 4856 4857 4858 4859 4860 4861 4862 4863 4864 4865 4866 4867 4868 4869 4870 4871 4872 4873 4874 4875 4876 4877 4878 4879 4880 4881 4882 4883 4884 4885 4886 4887 4888 4889 4890 4891 4892 4893 4894 4895 4896 4897 4898 4899 4900 4901 4902 4903 4904 4905 4906 4907 4908 4909 4910 4911 4912 4913 4914 4915 4916 4917 4918 4919 4920 4921 4922 4923 4924 4925 4926 4927 4928 4929 4930 4931 4932 4933 4934 4935 4936 4937 4938 4939 4940 4941 4942 4943 4944 4945 4946 4947 4948 4949 4950 4951 4952 4953 4954 4955 4956 4957 4958 4959 4960 4961 4962 4963 4964 4965 4966 4967 4968 4969 4970 4971 4972 4973 4974 4975 4976 4977 4978 4979 4980 4981 4982 4983 4984 4985 4986 4987 4988 4989 4990 4991 4992 4993 4994 4995 4996 4997 4998 4999 5000 5001 5002 5003 5004 5005 5006 5007 5008 5009 5010 5011 5012 5013 5014 5015 5016 5017 5018 5019 5020 5021 5022 5023 5024 5025 5026 5027 5028 5029 5030 5031 5032 5033 5034 5035 5036 5037 5038 5039 5040 5041 5042 5043 5044 5045 5046 5047 5048 5049 5050 5051 5052 // SPDX-License-Identifier: GPL-2.0-only /* * linux/kernel/signal.c * * Copyright (C) 1991, 1992 Linus Torvalds * * 1997-11-02 Modified for POSIX.1b signals by Richard Henderson * * 2003-06-02 Jim Houston - Concurrent Computer Corp. * Changes to use preallocated sigqueue structures * to allow signals to be sent reliably. */ #include <linux/slab.h> #include <linux/export.h> #include <linux/init.h> #include <linux/sched/mm.h> #include <linux/sched/user.h> #include <linux/sched/debug.h> #include <linux/sched/task.h> #include <linux/sched/task_stack.h> #include <linux/sched/cputime.h> #include <linux/file.h> #include <linux/fs.h> #include <linux/mm.h> #include <linux/proc_fs.h> #include <linux/tty.h> #include <linux/binfmts.h> #include <linux/coredump.h> #include <linux/security.h> #include <linux/syscalls.h> #include <linux/ptrace.h> #include <linux/signal.h> #include <linux/signalfd.h> #include <linux/ratelimit.h> #include <linux/task_work.h> #include <linux/capability.h> #include <linux/freezer.h> #include <linux/pid_namespace.h> #include <linux/nsproxy.h> #include <linux/user_namespace.h> #include <linux/uprobes.h> #include <linux/compat.h> #include <linux/cn_proc.h> #include <linux/compiler.h> #include <linux/posix-timers.h> #include <linux/cgroup.h> #include <linux/audit.h> #include <linux/sysctl.h> #include <uapi/linux/pidfd.h> #define CREATE_TRACE_POINTS #include <trace/events/signal.h> #include <asm/param.h> #include <linux/uaccess.h> #include <asm/unistd.h> #include <asm/siginfo.h> #include <asm/cacheflush.h> #include <asm/syscall.h> /* for syscall_get_* */ #include "time/posix-timers.h" /* * SLAB caches for signal bits. */ static struct kmem_cache *sigqueue_cachep; int print_fatal_signals __read_mostly; static void __user *sig_handler(struct task_struct *t, int sig) { return t->sighand->action[sig - 1].sa.sa_handler; } static inline bool sig_handler_ignored(void __user *handler, int sig) { /* Is it explicitly or implicitly ignored? */ return handler == SIG_IGN || (handler == SIG_DFL && sig_kernel_ignore(sig)); } static bool sig_task_ignored(struct task_struct *t, int sig, bool force) { void __user *handler; handler = sig_handler(t, sig); /* SIGKILL and SIGSTOP may not be sent to the global init */ if (unlikely(is_global_init(t) && sig_kernel_only(sig))) return true; if (unlikely(t->signal->flags & SIGNAL_UNKILLABLE) && handler == SIG_DFL && !(force && sig_kernel_only(sig))) return true; /* Only allow kernel generated signals to this kthread */ if (unlikely((t->flags & PF_KTHREAD) && (handler == SIG_KTHREAD_KERNEL) && !force)) return true; return sig_handler_ignored(handler, sig); } static bool sig_ignored(struct task_struct *t, int sig, bool force) { /* * Blocked signals are never ignored, since the * signal handler may change by the time it is * unblocked. */ if (sigismember(&t->blocked, sig) || sigismember(&t->real_blocked, sig)) return false; /* * Tracers may want to know about even ignored signal unless it * is SIGKILL which can't be reported anyway but can be ignored * by SIGNAL_UNKILLABLE task. */ if (t->ptrace && sig != SIGKILL) return false; return sig_task_ignored(t, sig, force); } /* * Re-calculate pending state from the set of locally pending * signals, globally pending signals, and blocked signals. */ static inline bool has_pending_signals(sigset_t *signal, sigset_t *blocked) { unsigned long ready; long i; switch (_NSIG_WORDS) { default: for (i = _NSIG_WORDS, ready = 0; --i >= 0 ;) ready |= signal->sig[i] &~ blocked->sig[i]; break; case 4: ready = signal->sig[3] &~ blocked->sig[3]; ready |= signal->sig[2] &~ blocked->sig[2]; ready |= signal->sig[1] &~ blocked->sig[1]; ready |= signal->sig[0] &~ blocked->sig[0]; break; case 2: ready = signal->sig[1] &~ blocked->sig[1]; ready |= signal->sig[0] &~ blocked->sig[0]; break; case 1: ready = signal->sig[0] &~ blocked->sig[0]; } return ready != 0; } #define PENDING(p,b) has_pending_signals(&(p)->signal, (b)) static bool recalc_sigpending_tsk(struct task_struct *t) { if ((t->jobctl & (JOBCTL_PENDING_MASK | JOBCTL_TRAP_FREEZE)) || PENDING(&t->pending, &t->blocked) || PENDING(&t->signal->shared_pending, &t->blocked) || cgroup_task_frozen(t)) { set_tsk_thread_flag(t, TIF_SIGPENDING); return true; } /* * We must never clear the flag in another thread, or in current * when it's possible the current syscall is returning -ERESTART*. * So we don't clear it here, and only callers who know they should do. */ return false; } void recalc_sigpending(void) { if (!recalc_sigpending_tsk(current) && !freezing(current)) { if (unlikely(test_thread_flag(TIF_SIGPENDING))) clear_thread_flag(TIF_SIGPENDING); } } EXPORT_SYMBOL(recalc_sigpending); void calculate_sigpending(void) { /* Have any signals or users of TIF_SIGPENDING been delayed * until after fork? */ spin_lock_irq(&current->sighand->siglock); set_tsk_thread_flag(current, TIF_SIGPENDING); recalc_sigpending(); spin_unlock_irq(&current->sighand->siglock); } /* Given the mask, find the first available signal that should be serviced. */ #define SYNCHRONOUS_MASK \ (sigmask(SIGSEGV) | sigmask(SIGBUS) | sigmask(SIGILL) | \ sigmask(SIGTRAP) | sigmask(SIGFPE) | sigmask(SIGSYS)) int next_signal(struct sigpending *pending, sigset_t *mask) { unsigned long i, *s, *m, x; int sig = 0; s = pending->signal.sig; m = mask->sig; /* * Handle the first word specially: it contains the * synchronous signals that need to be dequeued first. */ x = *s &~ *m; if (x) { if (x & SYNCHRONOUS_MASK) x &= SYNCHRONOUS_MASK; sig = ffz(~x) + 1; return sig; } switch (_NSIG_WORDS) { default: for (i = 1; i < _NSIG_WORDS; ++i) { x = *++s &~ *++m; if (!x) continue; sig = ffz(~x) + i*_NSIG_BPW + 1; break; } break; case 2: x = s[1] &~ m[1]; if (!x) break; sig = ffz(~x) + _NSIG_BPW + 1; break; case 1: /* Nothing to do */ break; } return sig; } static inline void print_dropped_signal(int sig) { static DEFINE_RATELIMIT_STATE(ratelimit_state, 5 * HZ, 10); if (!print_fatal_signals) return; if (!__ratelimit(&ratelimit_state)) return; pr_info("%s/%d: reached RLIMIT_SIGPENDING, dropped signal %d\n", current->comm, current->pid, sig); } /** * task_set_jobctl_pending - set jobctl pending bits * @task: target task * @mask: pending bits to set * * Clear @mask from @task->jobctl. @mask must be subset of * %JOBCTL_PENDING_MASK | %JOBCTL_STOP_CONSUME | %JOBCTL_STOP_SIGMASK | * %JOBCTL_TRAPPING. If stop signo is being set, the existing signo is * cleared. If @task is already being killed or exiting, this function * becomes noop. * * CONTEXT: * Must be called with @task->sighand->siglock held. * * RETURNS: * %true if @mask is set, %false if made noop because @task was dying. */ bool task_set_jobctl_pending(struct task_struct *task, unsigned long mask) { BUG_ON(mask & ~(JOBCTL_PENDING_MASK | JOBCTL_STOP_CONSUME | JOBCTL_STOP_SIGMASK | JOBCTL_TRAPPING)); BUG_ON((mask & JOBCTL_TRAPPING) && !(mask & JOBCTL_PENDING_MASK)); if (unlikely(fatal_signal_pending(task) || (task->flags & PF_EXITING))) return false; if (mask & JOBCTL_STOP_SIGMASK) task->jobctl &= ~JOBCTL_STOP_SIGMASK; task->jobctl |= mask; return true; } /** * task_clear_jobctl_trapping - clear jobctl trapping bit * @task: target task * * If JOBCTL_TRAPPING is set, a ptracer is waiting for us to enter TRACED. * Clear it and wake up the ptracer. Note that we don't need any further * locking. @task->siglock guarantees that @task->parent points to the * ptracer. * * CONTEXT: * Must be called with @task->sighand->siglock held. */ void task_clear_jobctl_trapping(struct task_struct *task) { if (unlikely(task->jobctl & JOBCTL_TRAPPING)) { task->jobctl &= ~JOBCTL_TRAPPING; smp_mb(); /* advised by wake_up_bit() */ wake_up_bit(&task->jobctl, JOBCTL_TRAPPING_BIT); } } /** * task_clear_jobctl_pending - clear jobctl pending bits * @task: target task * @mask: pending bits to clear * * Clear @mask from @task->jobctl. @mask must be subset of * %JOBCTL_PENDING_MASK. If %JOBCTL_STOP_PENDING is being cleared, other * STOP bits are cleared together. * * If clearing of @mask leaves no stop or trap pending, this function calls * task_clear_jobctl_trapping(). * * CONTEXT: * Must be called with @task->sighand->siglock held. */ void task_clear_jobctl_pending(struct task_struct *task, unsigned long mask) { BUG_ON(mask & ~JOBCTL_PENDING_MASK); if (mask & JOBCTL_STOP_PENDING) mask |= JOBCTL_STOP_CONSUME | JOBCTL_STOP_DEQUEUED; task->jobctl &= ~mask; if (!(task->jobctl & JOBCTL_PENDING_MASK)) task_clear_jobctl_trapping(task); } /** * task_participate_group_stop - participate in a group stop * @task: task participating in a group stop * * @task has %JOBCTL_STOP_PENDING set and is participating in a group stop. * Group stop states are cleared and the group stop count is consumed if * %JOBCTL_STOP_CONSUME was set. If the consumption completes the group * stop, the appropriate `SIGNAL_*` flags are set. * * CONTEXT: * Must be called with @task->sighand->siglock held. * * RETURNS: * %true if group stop completion should be notified to the parent, %false * otherwise. */ static bool task_participate_group_stop(struct task_struct *task) { struct signal_struct *sig = task->signal; bool consume = task->jobctl & JOBCTL_STOP_CONSUME; WARN_ON_ONCE(!(task->jobctl & JOBCTL_STOP_PENDING)); task_clear_jobctl_pending(task, JOBCTL_STOP_PENDING); if (!consume) return false; if (!WARN_ON_ONCE(sig->group_stop_count == 0)) sig->group_stop_count--; /* * Tell the caller to notify completion iff we are entering into a * fresh group stop. Read comment in do_signal_stop() for details. */ if (!sig->group_stop_count && !(sig->flags & SIGNAL_STOP_STOPPED)) { signal_set_stop_flags(sig, SIGNAL_STOP_STOPPED); return true; } return false; } void task_join_group_stop(struct task_struct *task) { unsigned long mask = current->jobctl & JOBCTL_STOP_SIGMASK; struct signal_struct *sig = current->signal; if (sig->group_stop_count) { sig->group_stop_count++; mask |= JOBCTL_STOP_CONSUME; } else if (!(sig->flags & SIGNAL_STOP_STOPPED)) return; /* Have the new thread join an on-going signal group stop */ task_set_jobctl_pending(task, mask | JOBCTL_STOP_PENDING); } static struct ucounts *sig_get_ucounts(struct task_struct *t, int sig, int override_rlimit) { struct ucounts *ucounts; long sigpending; /* * Protect access to @t credentials. This can go away when all * callers hold rcu read lock. * * NOTE! A pending signal will hold on to the user refcount, * and we get/put the refcount only when the sigpending count * changes from/to zero. */ rcu_read_lock(); ucounts = task_ucounts(t); sigpending = inc_rlimit_get_ucounts(ucounts, UCOUNT_RLIMIT_SIGPENDING, override_rlimit); rcu_read_unlock(); if (!sigpending) return NULL; if (unlikely(!override_rlimit && sigpending > task_rlimit(t, RLIMIT_SIGPENDING))) { dec_rlimit_put_ucounts(ucounts, UCOUNT_RLIMIT_SIGPENDING); print_dropped_signal(sig); return NULL; } return ucounts; } static void __sigqueue_init(struct sigqueue *q, struct ucounts *ucounts, const unsigned int sigqueue_flags) { INIT_LIST_HEAD(&q->list); q->flags = sigqueue_flags; q->ucounts = ucounts; } /* * allocate a new signal queue record * - this may be called without locks if and only if t == current, otherwise an * appropriate lock must be held to stop the target task from exiting */ static struct sigqueue *sigqueue_alloc(int sig, struct task_struct *t, gfp_t gfp_flags, int override_rlimit) { struct ucounts *ucounts = sig_get_ucounts(t, sig, override_rlimit); struct sigqueue *q; if (!ucounts) return NULL; q = kmem_cache_alloc(sigqueue_cachep, gfp_flags); if (!q) { dec_rlimit_put_ucounts(ucounts, UCOUNT_RLIMIT_SIGPENDING); return NULL; } __sigqueue_init(q, ucounts, 0); return q; } static void __sigqueue_free(struct sigqueue *q) { if (q->flags & SIGQUEUE_PREALLOC) { posixtimer_sigqueue_putref(q); return; } if (q->ucounts) { dec_rlimit_put_ucounts(q->ucounts, UCOUNT_RLIMIT_SIGPENDING); q->ucounts = NULL; } kmem_cache_free(sigqueue_cachep, q); } void flush_sigqueue(struct sigpending *queue) { struct sigqueue *q; sigemptyset(&queue->signal); while (!list_empty(&queue->list)) { q = list_entry(queue->list.next, struct sigqueue , list); list_del_init(&q->list); __sigqueue_free(q); } } /* * Flush all pending signals for this kthread. */ void flush_signals(struct task_struct *t) { unsigned long flags; spin_lock_irqsave(&t->sighand->siglock, flags); clear_tsk_thread_flag(t, TIF_SIGPENDING); flush_sigqueue(&t->pending); flush_sigqueue(&t->signal->shared_pending); spin_unlock_irqrestore(&t->sighand->siglock, flags); } EXPORT_SYMBOL(flush_signals); void ignore_signals(struct task_struct *t) { int i; for (i = 0; i < _NSIG; ++i) t->sighand->action[i].sa.sa_handler = SIG_IGN; flush_signals(t); } /* * Flush all handlers for a task. */ void flush_signal_handlers(struct task_struct *t, int force_default) { int i; struct k_sigaction *ka = &t->sighand->action[0]; for (i = _NSIG ; i != 0 ; i--) { if (force_default || ka->sa.sa_handler != SIG_IGN) ka->sa.sa_handler = SIG_DFL; ka->sa.sa_flags = 0; #ifdef __ARCH_HAS_SA_RESTORER ka->sa.sa_restorer = NULL; #endif sigemptyset(&ka->sa.sa_mask); ka++; } } bool unhandled_signal(struct task_struct *tsk, int sig) { void __user *handler = tsk->sighand->action[sig-1].sa.sa_handler; if (is_global_init(tsk)) return true; if (handler != SIG_IGN && handler != SIG_DFL) return false; /* If dying, we handle all new signals by ignoring them */ if (fatal_signal_pending(tsk)) return false; /* if ptraced, let the tracer determine */ return !tsk->ptrace; } static void collect_signal(int sig, struct sigpending *list, kernel_siginfo_t *info, struct sigqueue **timer_sigq) { struct sigqueue *q, *first = NULL; /* * Collect the siginfo appropriate to this signal. Check if * there is another siginfo for the same signal. */ list_for_each_entry(q, &list->list, list) { if (q->info.si_signo == sig) { if (first) goto still_pending; first = q; } } sigdelset(&list->signal, sig); if (first) { still_pending: list_del_init(&first->list); copy_siginfo(info, &first->info); /* * posix-timer signals are preallocated and freed when the last * reference count is dropped in posixtimer_deliver_signal() or * immediately on timer deletion when the signal is not pending. * Spare the extra round through __sigqueue_free() which is * ignoring preallocated signals. */ if (unlikely((first->flags & SIGQUEUE_PREALLOC) && (info->si_code == SI_TIMER))) *timer_sigq = first; else __sigqueue_free(first); } else { /* * Ok, it wasn't in the queue. This must be * a fast-pathed signal or we must have been * out of queue space. So zero out the info. */ clear_siginfo(info); info->si_signo = sig; info->si_errno = 0; info->si_code = SI_USER; info->si_pid = 0; info->si_uid = 0; } } static int __dequeue_signal(struct sigpending *pending, sigset_t *mask, kernel_siginfo_t *info, struct sigqueue **timer_sigq) { int sig = next_signal(pending, mask); if (sig) collect_signal(sig, pending, info, timer_sigq); return sig; } /* * Try to dequeue a signal. If a deliverable signal is found fill in the * caller provided siginfo and return the signal number. Otherwise return * 0. */ int dequeue_signal(sigset_t *mask, kernel_siginfo_t *info, enum pid_type *type) { struct task_struct *tsk = current; struct sigqueue *timer_sigq; int signr; lockdep_assert_held(&tsk->sighand->siglock); again: *type = PIDTYPE_PID; timer_sigq = NULL; signr = __dequeue_signal(&tsk->pending, mask, info, &timer_sigq); if (!signr) { *type = PIDTYPE_TGID; signr = __dequeue_signal(&tsk->signal->shared_pending, mask, info, &timer_sigq); if (unlikely(signr == SIGALRM)) posixtimer_rearm_itimer(tsk); } recalc_sigpending(); if (!signr) return 0; if (unlikely(sig_kernel_stop(signr))) { /* * Set a marker that we have dequeued a stop signal. Our * caller might release the siglock and then the pending * stop signal it is about to process is no longer in the * pending bitmasks, but must still be cleared by a SIGCONT * (and overruled by a SIGKILL). So those cases clear this * shared flag after we've set it. Note that this flag may * remain set after the signal we return is ignored or * handled. That doesn't matter because its only purpose * is to alert stop-signal processing code when another * processor has come along and cleared the flag. */ current->jobctl |= JOBCTL_STOP_DEQUEUED; } if (IS_ENABLED(CONFIG_POSIX_TIMERS) && unlikely(timer_sigq)) { if (!posixtimer_deliver_signal(info, timer_sigq)) goto again; } return signr; } EXPORT_SYMBOL_GPL(dequeue_signal); static int dequeue_synchronous_signal(kernel_siginfo_t *info) { struct task_struct *tsk = current; struct sigpending *pending = &tsk->pending; struct sigqueue *q, *sync = NULL; /* * Might a synchronous signal be in the queue? */ if (!((pending->signal.sig[0] & ~tsk->blocked.sig[0]) & SYNCHRONOUS_MASK)) return 0; /* * Return the first synchronous signal in the queue. */ list_for_each_entry(q, &pending->list, list) { /* Synchronous signals have a positive si_code */ if ((q->info.si_code > SI_USER) && (sigmask(q->info.si_signo) & SYNCHRONOUS_MASK)) { sync = q; goto next; } } return 0; next: /* * Check if there is another siginfo for the same signal. */ list_for_each_entry_continue(q, &pending->list, list) { if (q->info.si_signo == sync->info.si_signo) goto still_pending; } sigdelset(&pending->signal, sync->info.si_signo); recalc_sigpending(); still_pending: list_del_init(&sync->list); copy_siginfo(info, &sync->info); __sigqueue_free(sync); return info->si_signo; } /* * Tell a process that it has a new active signal.. * * NOTE! we rely on the previous spin_lock to * lock interrupts for us! We can only be called with * "siglock" held, and the local interrupt must * have been disabled when that got acquired! * * No need to set need_resched since signal event passing * goes through ->blocked */ void signal_wake_up_state(struct task_struct *t, unsigned int state) { lockdep_assert_held(&t->sighand->siglock); set_tsk_thread_flag(t, TIF_SIGPENDING); /* * TASK_WAKEKILL also means wake it up in the stopped/traced/killable * case. We don't check t->state here because there is a race with it * executing another processor and just now entering stopped state. * By using wake_up_state, we ensure the process will wake up and * handle its death signal. */ if (!wake_up_state(t, state | TASK_INTERRUPTIBLE)) kick_process(t); } static inline void posixtimer_sig_ignore(struct task_struct *tsk, struct sigqueue *q); static void sigqueue_free_ignored(struct task_struct *tsk, struct sigqueue *q) { if (likely(!(q->flags & SIGQUEUE_PREALLOC) || q->info.si_code != SI_TIMER)) __sigqueue_free(q); else posixtimer_sig_ignore(tsk, q); } /* Remove signals in mask from the pending set and queue. */ static void flush_sigqueue_mask(struct task_struct *p, sigset_t *mask, struct sigpending *s) { struct sigqueue *q, *n; sigset_t m; lockdep_assert_held(&p->sighand->siglock); sigandsets(&m, mask, &s->signal); if (sigisemptyset(&m)) return; sigandnsets(&s->signal, &s->signal, mask); list_for_each_entry_safe(q, n, &s->list, list) { if (sigismember(mask, q->info.si_signo)) { list_del_init(&q->list); sigqueue_free_ignored(p, q); } } } static inline int is_si_special(const struct kernel_siginfo *info) { return info <= SEND_SIG_PRIV; } static inline bool si_fromuser(const struct kernel_siginfo *info) { return info == SEND_SIG_NOINFO || (!is_si_special(info) && SI_FROMUSER(info)); } /* * called with RCU read lock from check_kill_permission() */ static bool kill_ok_by_cred(struct task_struct *t) { const struct cred *cred = current_cred(); const struct cred *tcred = __task_cred(t); return uid_eq(cred->euid, tcred->suid) || uid_eq(cred->euid, tcred->uid) || uid_eq(cred->uid, tcred->suid) || uid_eq(cred->uid, tcred->uid) || ns_capable(tcred->user_ns, CAP_KILL); } /* * Bad permissions for sending the signal * - the caller must hold the RCU read lock */ static int check_kill_permission(int sig, struct kernel_siginfo *info, struct task_struct *t) { struct pid *sid; int error; if (!valid_signal(sig)) return -EINVAL; if (!si_fromuser(info)) return 0; error = audit_signal_info(sig, t); /* Let audit system see the signal */ if (error) return error; if (!same_thread_group(current, t) && !kill_ok_by_cred(t)) { switch (sig) { case SIGCONT: sid = task_session(t); /* * We don't return the error if sid == NULL. The * task was unhashed, the caller must notice this. */ if (!sid || sid == task_session(current)) break; fallthrough; default: return -EPERM; } } return security_task_kill(t, info, sig, NULL); } /** * ptrace_trap_notify - schedule trap to notify ptracer * @t: tracee wanting to notify tracer * * This function schedules sticky ptrace trap which is cleared on the next * TRAP_STOP to notify ptracer of an event. @t must have been seized by * ptracer. * * If @t is running, STOP trap will be taken. If trapped for STOP and * ptracer is listening for events, tracee is woken up so that it can * re-trap for the new event. If trapped otherwise, STOP trap will be * eventually taken without returning to userland after the existing traps * are finished by PTRACE_CONT. * * CONTEXT: * Must be called with @task->sighand->siglock held. */ static void ptrace_trap_notify(struct task_struct *t) { WARN_ON_ONCE(!(t->ptrace & PT_SEIZED)); lockdep_assert_held(&t->sighand->siglock); task_set_jobctl_pending(t, JOBCTL_TRAP_NOTIFY); ptrace_signal_wake_up(t, t->jobctl & JOBCTL_LISTENING); } /* * Handle magic process-wide effects of stop/continue signals. Unlike * the signal actions, these happen immediately at signal-generation * time regardless of blocking, ignoring, or handling. This does the * actual continuing for SIGCONT, but not the actual stopping for stop * signals. The process stop is done as a signal action for SIG_DFL. * * Returns true if the signal should be actually delivered, otherwise * it should be dropped. */ static bool prepare_signal(int sig, struct task_struct *p, bool force) { struct signal_struct *signal = p->signal; struct task_struct *t; sigset_t flush; if (signal->flags & SIGNAL_GROUP_EXIT) { if (signal->core_state) return sig == SIGKILL; /* * The process is in the middle of dying, drop the signal. */ return false; } else if (sig_kernel_stop(sig)) { /* * This is a stop signal. Remove SIGCONT from all queues. */ siginitset(&flush, sigmask(SIGCONT)); flush_sigqueue_mask(p, &flush, &signal->shared_pending); for_each_thread(p, t) flush_sigqueue_mask(p, &flush, &t->pending); } else if (sig == SIGCONT) { unsigned int why; /* * Remove all stop signals from all queues, wake all threads. */ siginitset(&flush, SIG_KERNEL_STOP_MASK); flush_sigqueue_mask(p, &flush, &signal->shared_pending); for_each_thread(p, t) { flush_sigqueue_mask(p, &flush, &t->pending); task_clear_jobctl_pending(t, JOBCTL_STOP_PENDING); if (likely(!(t->ptrace & PT_SEIZED))) { t->jobctl &= ~JOBCTL_STOPPED; wake_up_state(t, __TASK_STOPPED); } else ptrace_trap_notify(t); } /* * Notify the parent with CLD_CONTINUED if we were stopped. * * If we were in the middle of a group stop, we pretend it * was already finished, and then continued. Since SIGCHLD * doesn't queue we report only CLD_STOPPED, as if the next * CLD_CONTINUED was dropped. */ why = 0; if (signal->flags & SIGNAL_STOP_STOPPED) why |= SIGNAL_CLD_CONTINUED; else if (signal->group_stop_count) why |= SIGNAL_CLD_STOPPED; if (why) { /* * The first thread which returns from do_signal_stop() * will take ->siglock, notice SIGNAL_CLD_MASK, and * notify its parent. See get_signal(). */ signal_set_stop_flags(signal, why | SIGNAL_STOP_CONTINUED); signal->group_stop_count = 0; signal->group_exit_code = 0; } } return !sig_ignored(p, sig, force); } /* * Test if P wants to take SIG. After we've checked all threads with this, * it's equivalent to finding no threads not blocking SIG. Any threads not * blocking SIG were ruled out because they are not running and already * have pending signals. Such threads will dequeue from the shared queue * as soon as they're available, so putting the signal on the shared queue * will be equivalent to sending it to one such thread. */ static inline bool wants_signal(int sig, struct task_struct *p) { if (sigismember(&p->blocked, sig)) return false; if (p->flags & PF_EXITING) return false; if (sig == SIGKILL) return true; if (task_is_stopped_or_traced(p)) return false; return task_curr(p) || !task_sigpending(p); } static void complete_signal(int sig, struct task_struct *p, enum pid_type type) { struct signal_struct *signal = p->signal; struct task_struct *t; /* * Now find a thread we can wake up to take the signal off the queue. * * Try the suggested task first (may or may not be the main thread). */ if (wants_signal(sig, p)) t = p; else if ((type == PIDTYPE_PID) || thread_group_empty(p)) /* * There is just one thread and it does not need to be woken. * It will dequeue unblocked signals before it runs again. */ return; else { /* * Otherwise try to find a suitable thread. */ t = signal->curr_target; while (!wants_signal(sig, t)) { t = next_thread(t); if (t == signal->curr_target) /* * No thread needs to be woken. * Any eligible threads will see * the signal in the queue soon. */ return; } signal->curr_target = t; } /* * Found a killable thread. If the signal will be fatal, * then start taking the whole group down immediately. */ if (sig_fatal(p, sig) && (signal->core_state || !(signal->flags & SIGNAL_GROUP_EXIT)) && !sigismember(&t->real_blocked, sig) && (sig == SIGKILL || !p->ptrace)) { /* * This signal will be fatal to the whole group. */ if (!sig_kernel_coredump(sig)) { /* * Start a group exit and wake everybody up. * This way we don't have other threads * running and doing things after a slower * thread has the fatal signal pending. */ signal->flags = SIGNAL_GROUP_EXIT; signal->group_exit_code = sig; signal->group_stop_count = 0; __for_each_thread(signal, t) { task_clear_jobctl_pending(t, JOBCTL_PENDING_MASK); sigaddset(&t->pending.signal, SIGKILL); signal_wake_up(t, 1); } return; } } /* * The signal is already in the shared-pending queue. * Tell the chosen thread to wake up and dequeue it. */ signal_wake_up(t, sig == SIGKILL); return; } static inline bool legacy_queue(struct sigpending *signals, int sig) { return (sig < SIGRTMIN) && sigismember(&signals->signal, sig); } static int __send_signal_locked(int sig, struct kernel_siginfo *info, struct task_struct *t, enum pid_type type, bool force) { struct sigpending *pending; struct sigqueue *q; int override_rlimit; int ret = 0, result; lockdep_assert_held(&t->sighand->siglock); result = TRACE_SIGNAL_IGNORED; if (!prepare_signal(sig, t, force)) goto ret; pending = (type != PIDTYPE_PID) ? &t->signal->shared_pending : &t->pending; /* * Short-circuit ignored signals and support queuing * exactly one non-rt signal, so that we can get more * detailed information about the cause of the signal. */ result = TRACE_SIGNAL_ALREADY_PENDING; if (legacy_queue(pending, sig)) goto ret; result = TRACE_SIGNAL_DELIVERED; /* * Skip useless siginfo allocation for SIGKILL and kernel threads. */ if ((sig == SIGKILL) || (t->flags & PF_KTHREAD)) goto out_set; /* * Real-time signals must be queued if sent by sigqueue, or * some other real-time mechanism. It is implementation * defined whether kill() does so. We attempt to do so, on * the principle of least surprise, but since kill is not * allowed to fail with EAGAIN when low on memory we just * make sure at least one signal gets delivered and don't * pass on the info struct. */ if (sig < SIGRTMIN) override_rlimit = (is_si_special(info) || info->si_code >= 0); else override_rlimit = 0; q = sigqueue_alloc(sig, t, GFP_ATOMIC, override_rlimit); if (q) { list_add_tail(&q->list, &pending->list); switch ((unsigned long) info) { case (unsigned long) SEND_SIG_NOINFO: clear_siginfo(&q->info); q->info.si_signo = sig; q->info.si_errno = 0; q->info.si_code = SI_USER; q->info.si_pid = task_tgid_nr_ns(current, task_active_pid_ns(t)); rcu_read_lock(); q->info.si_uid = from_kuid_munged(task_cred_xxx(t, user_ns), current_uid()); rcu_read_unlock(); break; case (unsigned long) SEND_SIG_PRIV: clear_siginfo(&q->info); q->info.si_signo = sig; q->info.si_errno = 0; q->info.si_code = SI_KERNEL; q->info.si_pid = 0; q->info.si_uid = 0; break; default: copy_siginfo(&q->info, info); break; } } else if (!is_si_special(info) && sig >= SIGRTMIN && info->si_code != SI_USER) { /* * Queue overflow, abort. We may abort if the * signal was rt and sent by user using something * other than kill(). */ result = TRACE_SIGNAL_OVERFLOW_FAIL; ret = -EAGAIN; goto ret; } else { /* * This is a silent loss of information. We still * send the signal, but the *info bits are lost. */ result = TRACE_SIGNAL_LOSE_INFO; } out_set: signalfd_notify(t, sig); sigaddset(&pending->signal, sig); /* Let multiprocess signals appear after on-going forks */ if (type > PIDTYPE_TGID) { struct multiprocess_signals *delayed; hlist_for_each_entry(delayed, &t->signal->multiprocess, node) { sigset_t *signal = &delayed->signal; /* Can't queue both a stop and a continue signal */ if (sig == SIGCONT) sigdelsetmask(signal, SIG_KERNEL_STOP_MASK); else if (sig_kernel_stop(sig)) sigdelset(signal, SIGCONT); sigaddset(signal, sig); } } complete_signal(sig, t, type); ret: trace_signal_generate(sig, info, t, type != PIDTYPE_PID, result); return ret; } static inline bool has_si_pid_and_uid(struct kernel_siginfo *info) { bool ret = false; switch (siginfo_layout(info->si_signo, info->si_code)) { case SIL_KILL: case SIL_CHLD: case SIL_RT: ret = true; break; case SIL_TIMER: case SIL_POLL: case SIL_FAULT: case SIL_FAULT_TRAPNO: case SIL_FAULT_MCEERR: case SIL_FAULT_BNDERR: case SIL_FAULT_PKUERR: case SIL_FAULT_PERF_EVENT: case SIL_SYS: ret = false; break; } return ret; } int send_signal_locked(int sig, struct kernel_siginfo *info, struct task_struct *t, enum pid_type type) { /* Should SIGKILL or SIGSTOP be received by a pid namespace init? */ bool force = false; if (info == SEND_SIG_NOINFO) { /* Force if sent from an ancestor pid namespace */ force = !task_pid_nr_ns(current, task_active_pid_ns(t)); } else if (info == SEND_SIG_PRIV) { /* Don't ignore kernel generated signals */ force = true; } else if (has_si_pid_and_uid(info)) { /* SIGKILL and SIGSTOP is special or has ids */ struct user_namespace *t_user_ns; rcu_read_lock(); t_user_ns = task_cred_xxx(t, user_ns); if (current_user_ns() != t_user_ns) { kuid_t uid = make_kuid(current_user_ns(), info->si_uid); info->si_uid = from_kuid_munged(t_user_ns, uid); } rcu_read_unlock(); /* A kernel generated signal? */ force = (info->si_code == SI_KERNEL); /* From an ancestor pid namespace? */ if (!task_pid_nr_ns(current, task_active_pid_ns(t))) { info->si_pid = 0; force = true; } } return __send_signal_locked(sig, info, t, type, force); } static void print_fatal_signal(int signr) { struct pt_regs *regs = task_pt_regs(current); struct file *exe_file; exe_file = get_task_exe_file(current); if (exe_file) { pr_info("%pD: %s: potentially unexpected fatal signal %d.\n", exe_file, current->comm, signr); fput(exe_file); } else { pr_info("%s: potentially unexpected fatal signal %d.\n", current->comm, signr); } #if defined(__i386__) && !defined(__arch_um__) pr_info("code at %08lx: ", regs->ip); { int i; for (i = 0; i < 16; i++) { unsigned char insn; if (get_user(insn, (unsigned char *)(regs->ip + i))) break; pr_cont("%02x ", insn); } } pr_cont("\n"); #endif preempt_disable(); show_regs(regs); preempt_enable(); } static int __init setup_print_fatal_signals(char *str) { get_option (&str, &print_fatal_signals); return 1; } __setup("print-fatal-signals=", setup_print_fatal_signals); int do_send_sig_info(int sig, struct kernel_siginfo *info, struct task_struct *p, enum pid_type type) { unsigned long flags; int ret = -ESRCH; if (lock_task_sighand(p, &flags)) { ret = send_signal_locked(sig, info, p, type); unlock_task_sighand(p, &flags); } return ret; } enum sig_handler { HANDLER_CURRENT, /* If reachable use the current handler */ HANDLER_SIG_DFL, /* Always use SIG_DFL handler semantics */ HANDLER_EXIT, /* Only visible as the process exit code */ }; /* * Force a signal that the process can't ignore: if necessary * we unblock the signal and change any SIG_IGN to SIG_DFL. * * Note: If we unblock the signal, we always reset it to SIG_DFL, * since we do not want to have a signal handler that was blocked * be invoked when user space had explicitly blocked it. * * We don't want to have recursive SIGSEGV's etc, for example, * that is why we also clear SIGNAL_UNKILLABLE. */ static int force_sig_info_to_task(struct kernel_siginfo *info, struct task_struct *t, enum sig_handler handler) { unsigned long int flags; int ret, blocked, ignored; struct k_sigaction *action; int sig = info->si_signo; spin_lock_irqsave(&t->sighand->siglock, flags); action = &t->sighand->action[sig-1]; ignored = action->sa.sa_handler == SIG_IGN; blocked = sigismember(&t->blocked, sig); if (blocked || ignored || (handler != HANDLER_CURRENT)) { action->sa.sa_handler = SIG_DFL; if (handler == HANDLER_EXIT) action->sa.sa_flags |= SA_IMMUTABLE; if (blocked) sigdelset(&t->blocked, sig); } /* * Don't clear SIGNAL_UNKILLABLE for traced tasks, users won't expect * debugging to leave init killable. But HANDLER_EXIT is always fatal. */ if (action->sa.sa_handler == SIG_DFL && (!t->ptrace || (handler == HANDLER_EXIT))) t->signal->flags &= ~SIGNAL_UNKILLABLE; ret = send_signal_locked(sig, info, t, PIDTYPE_PID); /* This can happen if the signal was already pending and blocked */ if (!task_sigpending(t)) signal_wake_up(t, 0); spin_unlock_irqrestore(&t->sighand->siglock, flags); return ret; } int force_sig_info(struct kernel_siginfo *info) { return force_sig_info_to_task(info, current, HANDLER_CURRENT); } /* * Nuke all other threads in the group. */ int zap_other_threads(struct task_struct *p) { struct task_struct *t; int count = 0; p->signal->group_stop_count = 0; for_other_threads(p, t) { task_clear_jobctl_pending(t, JOBCTL_PENDING_MASK); count++; /* Don't bother with already dead threads */ if (t->exit_state) continue; sigaddset(&t->pending.signal, SIGKILL); signal_wake_up(t, 1); } return count; } struct sighand_struct *__lock_task_sighand(struct task_struct *tsk, unsigned long *flags) { struct sighand_struct *sighand; rcu_read_lock(); for (;;) { sighand = rcu_dereference(tsk->sighand); if (unlikely(sighand == NULL)) break; /* * This sighand can be already freed and even reused, but * we rely on SLAB_TYPESAFE_BY_RCU and sighand_ctor() which * initializes ->siglock: this slab can't go away, it has * the same object type, ->siglock can't be reinitialized. * * We need to ensure that tsk->sighand is still the same * after we take the lock, we can race with de_thread() or * __exit_signal(). In the latter case the next iteration * must see ->sighand == NULL. */ spin_lock_irqsave(&sighand->siglock, *flags); if (likely(sighand == rcu_access_pointer(tsk->sighand))) break; spin_unlock_irqrestore(&sighand->siglock, *flags); } rcu_read_unlock(); return sighand; } #ifdef CONFIG_LOCKDEP void lockdep_assert_task_sighand_held(struct task_struct *task) { struct sighand_struct *sighand; rcu_read_lock(); sighand = rcu_dereference(task->sighand); if (sighand) lockdep_assert_held(&sighand->siglock); else WARN_ON_ONCE(1); rcu_read_unlock(); } #endif /* * send signal info to all the members of a thread group or to the * individual thread if type == PIDTYPE_PID. */ int group_send_sig_info(int sig, struct kernel_siginfo *info, struct task_struct *p, enum pid_type type) { int ret; rcu_read_lock(); ret = check_kill_permission(sig, info, p); rcu_read_unlock(); if (!ret && sig) ret = do_send_sig_info(sig, info, p, type); return ret; } /* * __kill_pgrp_info() sends a signal to a process group: this is what the tty * control characters do (^C, ^Z etc) * - the caller must hold at least a readlock on tasklist_lock */ int __kill_pgrp_info(int sig, struct kernel_siginfo *info, struct pid *pgrp) { struct task_struct *p = NULL; int ret = -ESRCH; do_each_pid_task(pgrp, PIDTYPE_PGID, p) { int err = group_send_sig_info(sig, info, p, PIDTYPE_PGID); /* * If group_send_sig_info() succeeds at least once ret * becomes 0 and after that the code below has no effect. * Otherwise we return the last err or -ESRCH if this * process group is empty. */ if (ret) ret = err; } while_each_pid_task(pgrp, PIDTYPE_PGID, p); return ret; } static int kill_pid_info_type(int sig, struct kernel_siginfo *info, struct pid *pid, enum pid_type type) { int error = -ESRCH; struct task_struct *p; for (;;) { rcu_read_lock(); p = pid_task(pid, PIDTYPE_PID); if (p) error = group_send_sig_info(sig, info, p, type); rcu_read_unlock(); if (likely(!p || error != -ESRCH)) return error; /* * The task was unhashed in between, try again. If it * is dead, pid_task() will return NULL, if we race with * de_thread() it will find the new leader. */ } } int kill_pid_info(int sig, struct kernel_siginfo *info, struct pid *pid) { return kill_pid_info_type(sig, info, pid, PIDTYPE_TGID); } static int kill_proc_info(int sig, struct kernel_siginfo *info, pid_t pid) { int error; rcu_read_lock(); error = kill_pid_info(sig, info, find_vpid(pid)); rcu_read_unlock(); return error; } static inline bool kill_as_cred_perm(const struct cred *cred, struct task_struct *target) { const struct cred *pcred = __task_cred(target); return uid_eq(cred->euid, pcred->suid) || uid_eq(cred->euid, pcred->uid) || uid_eq(cred->uid, pcred->suid) || uid_eq(cred->uid, pcred->uid); } /* * The usb asyncio usage of siginfo is wrong. The glibc support * for asyncio which uses SI_ASYNCIO assumes the layout is SIL_RT. * AKA after the generic fields: * kernel_pid_t si_pid; * kernel_uid32_t si_uid; * sigval_t si_value; * * Unfortunately when usb generates SI_ASYNCIO it assumes the layout * after the generic fields is: * void __user *si_addr; * * This is a practical problem when there is a 64bit big endian kernel * and a 32bit userspace. As the 32bit address will encoded in the low * 32bits of the pointer. Those low 32bits will be stored at higher * address than appear in a 32 bit pointer. So userspace will not * see the address it was expecting for it's completions. * * There is nothing in the encoding that can allow * copy_siginfo_to_user32 to detect this confusion of formats, so * handle this by requiring the caller of kill_pid_usb_asyncio to * notice when this situration takes place and to store the 32bit * pointer in sival_int, instead of sival_addr of the sigval_t addr * parameter. */ int kill_pid_usb_asyncio(int sig, int errno, sigval_t addr, struct pid *pid, const struct cred *cred) { struct kernel_siginfo info; struct task_struct *p; unsigned long flags; int ret = -EINVAL; if (!valid_signal(sig)) return ret; clear_siginfo(&info); info.si_signo = sig; info.si_errno = errno; info.si_code = SI_ASYNCIO; *((sigval_t *)&info.si_pid) = addr; rcu_read_lock(); p = pid_task(pid, PIDTYPE_PID); if (!p) { ret = -ESRCH; goto out_unlock; } if (!kill_as_cred_perm(cred, p)) { ret = -EPERM; goto out_unlock; } ret = security_task_kill(p, &info, sig, cred); if (ret) goto out_unlock; if (sig) { if (lock_task_sighand(p, &flags)) { ret = __send_signal_locked(sig, &info, p, PIDTYPE_TGID, false); unlock_task_sighand(p, &flags); } else ret = -ESRCH; } out_unlock: rcu_read_unlock(); return ret; } EXPORT_SYMBOL_GPL(kill_pid_usb_asyncio); /* * kill_something_info() interprets pid in interesting ways just like kill(2). * * POSIX specifies that kill(-1,sig) is unspecified, but what we have * is probably wrong. Should make it like BSD or SYSV. */ static int kill_something_info(int sig, struct kernel_siginfo *info, pid_t pid) { int ret; if (pid > 0) return kill_proc_info(sig, info, pid); /* -INT_MIN is undefined. Exclude this case to avoid a UBSAN warning */ if (pid == INT_MIN) return -ESRCH; read_lock(&tasklist_lock); if (pid != -1) { ret = __kill_pgrp_info(sig, info, pid ? find_vpid(-pid) : task_pgrp(current)); } else { int retval = 0, count = 0; struct task_struct * p; for_each_process(p) { if (task_pid_vnr(p) > 1 && !same_thread_group(p, current)) { int err = group_send_sig_info(sig, info, p, PIDTYPE_MAX); ++count; if (err != -EPERM) retval = err; } } ret = count ? retval : -ESRCH; } read_unlock(&tasklist_lock); return ret; } /* * These are for backward compatibility with the rest of the kernel source. */ int send_sig_info(int sig, struct kernel_siginfo *info, struct task_struct *p) { /* * Make sure legacy kernel users don't send in bad values * (normal paths check this in check_kill_permission). */ if (!valid_signal(sig)) return -EINVAL; return do_send_sig_info(sig, info, p, PIDTYPE_PID); } EXPORT_SYMBOL(send_sig_info); #define __si_special(priv) \ ((priv) ? SEND_SIG_PRIV : SEND_SIG_NOINFO) int send_sig(int sig, struct task_struct *p, int priv) { return send_sig_info(sig, __si_special(priv), p); } EXPORT_SYMBOL(send_sig); void force_sig(int sig) { struct kernel_siginfo info; clear_siginfo(&info); info.si_signo = sig; info.si_errno = 0; info.si_code = SI_KERNEL; info.si_pid = 0; info.si_uid = 0; force_sig_info(&info); } EXPORT_SYMBOL(force_sig); void force_fatal_sig(int sig) { struct kernel_siginfo info; clear_siginfo(&info); info.si_signo = sig; info.si_errno = 0; info.si_code = SI_KERNEL; info.si_pid = 0; info.si_uid = 0; force_sig_info_to_task(&info, current, HANDLER_SIG_DFL); } void force_exit_sig(int sig) { struct kernel_siginfo info; clear_siginfo(&info); info.si_signo = sig; info.si_errno = 0; info.si_code = SI_KERNEL; info.si_pid = 0; info.si_uid = 0; force_sig_info_to_task(&info, current, HANDLER_EXIT); } /* * When things go south during signal handling, we * will force a SIGSEGV. And if the signal that caused * the problem was already a SIGSEGV, we'll want to * make sure we don't even try to deliver the signal.. */ void force_sigsegv(int sig) { if (sig == SIGSEGV) force_fatal_sig(SIGSEGV); else force_sig(SIGSEGV); } int force_sig_fault_to_task(int sig, int code, void __user *addr, struct task_struct *t) { struct kernel_siginfo info; clear_siginfo(&info); info.si_signo = sig; info.si_errno = 0; info.si_code = code; info.si_addr = addr; return force_sig_info_to_task(&info, t, HANDLER_CURRENT); } int force_sig_fault(int sig, int code, void __user *addr) { return force_sig_fault_to_task(sig, code, addr, current); } int send_sig_fault(int sig, int code, void __user *addr, struct task_struct *t) { struct kernel_siginfo info; clear_siginfo(&info); info.si_signo = sig; info.si_errno = 0; info.si_code = code; info.si_addr = addr; return send_sig_info(info.si_signo, &info, t); } int force_sig_mceerr(int code, void __user *addr, short lsb) { struct kernel_siginfo info; WARN_ON((code != BUS_MCEERR_AO) && (code != BUS_MCEERR_AR)); clear_siginfo(&info); info.si_signo = SIGBUS; info.si_errno = 0; info.si_code = code; info.si_addr = addr; info.si_addr_lsb = lsb; return force_sig_info(&info); } int send_sig_mceerr(int code, void __user *addr, short lsb, struct task_struct *t) { struct kernel_siginfo info; WARN_ON((code != BUS_MCEERR_AO) && (code != BUS_MCEERR_AR)); clear_siginfo(&info); info.si_signo = SIGBUS; info.si_errno = 0; info.si_code = code; info.si_addr = addr; info.si_addr_lsb = lsb; return send_sig_info(info.si_signo, &info, t); } EXPORT_SYMBOL(send_sig_mceerr); int force_sig_bnderr(void __user *addr, void __user *lower, void __user *upper) { struct kernel_siginfo info; clear_siginfo(&info); info.si_signo = SIGSEGV; info.si_errno = 0; info.si_code = SEGV_BNDERR; info.si_addr = addr; info.si_lower = lower; info.si_upper = upper; return force_sig_info(&info); } #ifdef SEGV_PKUERR int force_sig_pkuerr(void __user *addr, u32 pkey) { struct kernel_siginfo info; clear_siginfo(&info); info.si_signo = SIGSEGV; info.si_errno = 0; info.si_code = SEGV_PKUERR; info.si_addr = addr; info.si_pkey = pkey; return force_sig_info(&info); } #endif int send_sig_perf(void __user *addr, u32 type, u64 sig_data) { struct kernel_siginfo info; clear_siginfo(&info); info.si_signo = SIGTRAP; info.si_errno = 0; info.si_code = TRAP_PERF; info.si_addr = addr; info.si_perf_data = sig_data; info.si_perf_type = type; /* * Signals generated by perf events should not terminate the whole * process if SIGTRAP is blocked, however, delivering the signal * asynchronously is better than not delivering at all. But tell user * space if the signal was asynchronous, so it can clearly be * distinguished from normal synchronous ones. */ info.si_perf_flags = sigismember(&current->blocked, info.si_signo) ? TRAP_PERF_FLAG_ASYNC : 0; return send_sig_info(info.si_signo, &info, current); } /** * force_sig_seccomp - signals the task to allow in-process syscall emulation * @syscall: syscall number to send to userland * @reason: filter-supplied reason code to send to userland (via si_errno) * @force_coredump: true to trigger a coredump * * Forces a SIGSYS with a code of SYS_SECCOMP and related sigsys info. */ int force_sig_seccomp(int syscall, int reason, bool force_coredump) { struct kernel_siginfo info; clear_siginfo(&info); info.si_signo = SIGSYS; info.si_code = SYS_SECCOMP; info.si_call_addr = (void __user *)KSTK_EIP(current); info.si_errno = reason; info.si_arch = syscall_get_arch(current); info.si_syscall = syscall; return force_sig_info_to_task(&info, current, force_coredump ? HANDLER_EXIT : HANDLER_CURRENT); } /* For the crazy architectures that include trap information in * the errno field, instead of an actual errno value. */ int force_sig_ptrace_errno_trap(int errno, void __user *addr) { struct kernel_siginfo info; clear_siginfo(&info); info.si_signo = SIGTRAP; info.si_errno = errno; info.si_code = TRAP_HWBKPT; info.si_addr = addr; return force_sig_info(&info); } /* For the rare architectures that include trap information using * si_trapno. */ int force_sig_fault_trapno(int sig, int code, void __user *addr, int trapno) { struct kernel_siginfo info; clear_siginfo(&info); info.si_signo = sig; info.si_errno = 0; info.si_code = code; info.si_addr = addr; info.si_trapno = trapno; return force_sig_info(&info); } /* For the rare architectures that include trap information using * si_trapno. */ int send_sig_fault_trapno(int sig, int code, void __user *addr, int trapno, struct task_struct *t) { struct kernel_siginfo info; clear_siginfo(&info); info.si_signo = sig; info.si_errno = 0; info.si_code = code; info.si_addr = addr; info.si_trapno = trapno; return send_sig_info(info.si_signo, &info, t); } static int kill_pgrp_info(int sig, struct kernel_siginfo *info, struct pid *pgrp) { int ret; read_lock(&tasklist_lock); ret = __kill_pgrp_info(sig, info, pgrp); read_unlock(&tasklist_lock); return ret; } int kill_pgrp(struct pid *pid, int sig, int priv) { return kill_pgrp_info(sig, __si_special(priv), pid); } EXPORT_SYMBOL(kill_pgrp); int kill_pid(struct pid *pid, int sig, int priv) { return kill_pid_info(sig, __si_special(priv), pid); } EXPORT_SYMBOL(kill_pid); #ifdef CONFIG_POSIX_TIMERS /* * These functions handle POSIX timer signals. POSIX timers use * preallocated sigqueue structs for sending signals. */ static void __flush_itimer_signals(struct sigpending *pending) { sigset_t signal, retain; struct sigqueue *q, *n; signal = pending->signal; sigemptyset(&retain); list_for_each_entry_safe(q, n, &pending->list, list) { int sig = q->info.si_signo; if (likely(q->info.si_code != SI_TIMER)) { sigaddset(&retain, sig); } else { sigdelset(&signal, sig); list_del_init(&q->list); __sigqueue_free(q); } } sigorsets(&pending->signal, &signal, &retain); } void flush_itimer_signals(void) { struct task_struct *tsk = current; guard(spinlock_irqsave)(&tsk->sighand->siglock); __flush_itimer_signals(&tsk->pending); __flush_itimer_signals(&tsk->signal->shared_pending); } bool posixtimer_init_sigqueue(struct sigqueue *q) { struct ucounts *ucounts = sig_get_ucounts(current, -1, 0); if (!ucounts) return false; clear_siginfo(&q->info); __sigqueue_init(q, ucounts, SIGQUEUE_PREALLOC); return true; } static void posixtimer_queue_sigqueue(struct sigqueue *q, struct task_struct *t, enum pid_type type) { struct sigpending *pending; int sig = q->info.si_signo; signalfd_notify(t, sig); pending = (type != PIDTYPE_PID) ? &t->signal->shared_pending : &t->pending; list_add_tail(&q->list, &pending->list); sigaddset(&pending->signal, sig); complete_signal(sig, t, type); } /* * This function is used by POSIX timers to deliver a timer signal. * Where type is PIDTYPE_PID (such as for timers with SIGEV_THREAD_ID * set), the signal must be delivered to the specific thread (queues * into t->pending). * * Where type is not PIDTYPE_PID, signals must be delivered to the * process. In this case, prefer to deliver to current if it is in * the same thread group as the target process and its sighand is * stable, which avoids unnecessarily waking up a potentially idle task. */ static inline struct task_struct *posixtimer_get_target(struct k_itimer *tmr) { struct task_struct *t = pid_task(tmr->it_pid, tmr->it_pid_type); if (t && tmr->it_pid_type != PIDTYPE_PID && same_thread_group(t, current) && !current->exit_state) t = current; return t; } void posixtimer_send_sigqueue(struct k_itimer *tmr) { struct sigqueue *q = &tmr->sigq; int sig = q->info.si_signo; struct task_struct *t; unsigned long flags; int result; guard(rcu)(); t = posixtimer_get_target(tmr); if (!t) return; if (!likely(lock_task_sighand(t, &flags))) return; /* * Update @tmr::sigqueue_seq for posix timer signals with sighand * locked to prevent a race against dequeue_signal(). */ tmr->it_sigqueue_seq = tmr->it_signal_seq; /* * Set the signal delivery status under sighand lock, so that the * ignored signal handling can distinguish between a periodic and a * non-periodic timer. */ tmr->it_sig_periodic = tmr->it_status == POSIX_TIMER_REQUEUE_PENDING; if (!prepare_signal(sig, t, false)) { result = TRACE_SIGNAL_IGNORED; if (!list_empty(&q->list)) { /* * The signal was ignored and blocked. The timer * expiry queued it because blocked signals are * queued independent of the ignored state. * * The unblocking set SIGPENDING, but the signal * was not yet dequeued from the pending list. * So prepare_signal() sees unblocked and ignored, * which ends up here. Leave it queued like a * regular signal. * * The same happens when the task group is exiting * and the signal is already queued. * prepare_signal() treats SIGNAL_GROUP_EXIT as * ignored independent of its queued state. This * gets cleaned up in __exit_signal(). */ goto out; } /* Periodic timers with SIG_IGN are queued on the ignored list */ if (tmr->it_sig_periodic) { /* * Already queued means the timer was rearmed after * the previous expiry got it on the ignore list. * Nothing to do for that case. */ if (hlist_unhashed(&tmr->ignored_list)) { /* * Take a signal reference and queue it on * the ignored list. */ posixtimer_sigqueue_getref(q); posixtimer_sig_ignore(t, q); } } else if (!hlist_unhashed(&tmr->ignored_list)) { /* * Covers the case where a timer was periodic and * then the signal was ignored. Later it was rearmed * as oneshot timer. The previous signal is invalid * now, and this oneshot signal has to be dropped. * Remove it from the ignored list and drop the * reference count as the signal is not longer * queued. */ hlist_del_init(&tmr->ignored_list); posixtimer_putref(tmr); } goto out; } if (unlikely(!list_empty(&q->list))) { /* This holds a reference count already */ result = TRACE_SIGNAL_ALREADY_PENDING; goto out; } /* * If the signal is on the ignore list, it got blocked after it was * ignored earlier. But nothing lifted the ignore. Move it back to * the pending list to be consistent with the regular signal * handling. This already holds a reference count. * * If it's not on the ignore list acquire a reference count. */ if (likely(hlist_unhashed(&tmr->ignored_list))) posixtimer_sigqueue_getref(q); else hlist_del_init(&tmr->ignored_list); posixtimer_queue_sigqueue(q, t, tmr->it_pid_type); result = TRACE_SIGNAL_DELIVERED; out: trace_signal_generate(sig, &q->info, t, tmr->it_pid_type != PIDTYPE_PID, result); unlock_task_sighand(t, &flags); } static inline void posixtimer_sig_ignore(struct task_struct *tsk, struct sigqueue *q) { struct k_itimer *tmr = container_of(q, struct k_itimer, sigq); /* * If the timer is marked deleted already or the signal originates * from a non-periodic timer, then just drop the reference * count. Otherwise queue it on the ignored list. */ if (posixtimer_valid(tmr) && tmr->it_sig_periodic) hlist_add_head(&tmr->ignored_list, &tsk->signal->ignored_posix_timers); else posixtimer_putref(tmr); } static void posixtimer_sig_unignore(struct task_struct *tsk, int sig) { struct hlist_head *head = &tsk->signal->ignored_posix_timers; struct hlist_node *tmp; struct k_itimer *tmr; if (likely(hlist_empty(head))) return; /* * Rearming a timer with sighand lock held is not possible due to * lock ordering vs. tmr::it_lock. Just stick the sigqueue back and * let the signal delivery path deal with it whether it needs to be * rearmed or not. This cannot be decided here w/o dropping sighand * lock and creating a loop retry horror show. */ hlist_for_each_entry_safe(tmr, tmp , head, ignored_list) { struct task_struct *target; /* * tmr::sigq.info.si_signo is immutable, so accessing it * without holding tmr::it_lock is safe. */ if (tmr->sigq.info.si_signo != sig) continue; hlist_del_init(&tmr->ignored_list); /* This should never happen and leaks a reference count */ if (WARN_ON_ONCE(!list_empty(&tmr->sigq.list))) continue; /* * Get the target for the signal. If target is a thread and * has exited by now, drop the reference count. */ guard(rcu)(); target = posixtimer_get_target(tmr); if (target) posixtimer_queue_sigqueue(&tmr->sigq, target, tmr->it_pid_type); else posixtimer_putref(tmr); } } #else /* CONFIG_POSIX_TIMERS */ static inline void posixtimer_sig_ignore(struct task_struct *tsk, struct sigqueue *q) { } static inline void posixtimer_sig_unignore(struct task_struct *tsk, int sig) { } #endif /* !CONFIG_POSIX_TIMERS */ void do_notify_pidfd(struct task_struct *task) { struct pid *pid = task_pid(task); WARN_ON(task->exit_state == 0); __wake_up(&pid->wait_pidfd, TASK_NORMAL, 0, poll_to_key(EPOLLIN | EPOLLRDNORM)); } /* * Let a parent know about the death of a child. * For a stopped/continued status change, use do_notify_parent_cldstop instead. * * Returns true if our parent ignored us and so we've switched to * self-reaping. */ bool do_notify_parent(struct task_struct *tsk, int sig) { struct kernel_siginfo info; unsigned long flags; struct sighand_struct *psig; bool autoreap = false; u64 utime, stime; WARN_ON_ONCE(sig == -1); /* do_notify_parent_cldstop should have been called instead. */ WARN_ON_ONCE(task_is_stopped_or_traced(tsk)); WARN_ON_ONCE(!tsk->ptrace && (tsk->group_leader != tsk || !thread_group_empty(tsk))); /* ptraced, or group-leader without sub-threads */ do_notify_pidfd(tsk); if (sig != SIGCHLD) { /* * This is only possible if parent == real_parent. * Check if it has changed security domain. */ if (tsk->parent_exec_id != READ_ONCE(tsk->parent->self_exec_id)) sig = SIGCHLD; } clear_siginfo(&info); info.si_signo = sig; info.si_errno = 0; /* * We are under tasklist_lock here so our parent is tied to * us and cannot change. * * task_active_pid_ns will always return the same pid namespace * until a task passes through release_task. * * write_lock() currently calls preempt_disable() which is the * same as rcu_read_lock(), but according to Oleg, this is not * correct to rely on this */ rcu_read_lock(); info.si_pid = task_pid_nr_ns(tsk, task_active_pid_ns(tsk->parent)); info.si_uid = from_kuid_munged(task_cred_xxx(tsk->parent, user_ns), task_uid(tsk)); rcu_read_unlock(); task_cputime(tsk, &utime, &stime); info.si_utime = nsec_to_clock_t(utime + tsk->signal->utime); info.si_stime = nsec_to_clock_t(stime + tsk->signal->stime); info.si_status = tsk->exit_code & 0x7f; if (tsk->exit_code & 0x80) info.si_code = CLD_DUMPED; else if (tsk->exit_code & 0x7f) info.si_code = CLD_KILLED; else { info.si_code = CLD_EXITED; info.si_status = tsk->exit_code >> 8; } psig = tsk->parent->sighand; spin_lock_irqsave(&psig->siglock, flags); if (!tsk->ptrace && sig == SIGCHLD && (psig->action[SIGCHLD-1].sa.sa_handler == SIG_IGN || (psig->action[SIGCHLD-1].sa.sa_flags & SA_NOCLDWAIT))) { /* * We are exiting and our parent doesn't care. POSIX.1 * defines special semantics for setting SIGCHLD to SIG_IGN * or setting the SA_NOCLDWAIT flag: we should be reaped * automatically and not left for our parent's wait4 call. * Rather than having the parent do it as a magic kind of * signal handler, we just set this to tell do_exit that we * can be cleaned up without becoming a zombie. Note that * we still call __wake_up_parent in this case, because a * blocked sys_wait4 might now return -ECHILD. * * Whether we send SIGCHLD or not for SA_NOCLDWAIT * is implementation-defined: we do (if you don't want * it, just use SIG_IGN instead). */ autoreap = true; if (psig->action[SIGCHLD-1].sa.sa_handler == SIG_IGN) sig = 0; } /* * Send with __send_signal as si_pid and si_uid are in the * parent's namespaces. */ if (valid_signal(sig) && sig) __send_signal_locked(sig, &info, tsk->parent, PIDTYPE_TGID, false); __wake_up_parent(tsk, tsk->parent); spin_unlock_irqrestore(&psig->siglock, flags); return autoreap; } /** * do_notify_parent_cldstop - notify parent of stopped/continued state change * @tsk: task reporting the state change * @for_ptracer: the notification is for ptracer * @why: CLD_{CONTINUED|STOPPED|TRAPPED} to report * * Notify @tsk's parent that the stopped/continued state has changed. If * @for_ptracer is %false, @tsk's group leader notifies to its real parent. * If %true, @tsk reports to @tsk->parent which should be the ptracer. * * CONTEXT: * Must be called with tasklist_lock at least read locked. */ static void do_notify_parent_cldstop(struct task_struct *tsk, bool for_ptracer, int why) { struct kernel_siginfo info; unsigned long flags; struct task_struct *parent; struct sighand_struct *sighand; u64 utime, stime; if (for_ptracer) { parent = tsk->parent; } else { tsk = tsk->group_leader; parent = tsk->real_parent; } clear_siginfo(&info); info.si_signo = SIGCHLD; info.si_errno = 0; /* * see comment in do_notify_parent() about the following 4 lines */ rcu_read_lock(); info.si_pid = task_pid_nr_ns(tsk, task_active_pid_ns(parent)); info.si_uid = from_kuid_munged(task_cred_xxx(parent, user_ns), task_uid(tsk)); rcu_read_unlock(); task_cputime(tsk, &utime, &stime); info.si_utime = nsec_to_clock_t(utime); info.si_stime = nsec_to_clock_t(stime); info.si_code = why; switch (why) { case CLD_CONTINUED: info.si_status = SIGCONT; break; case CLD_STOPPED: info.si_status = tsk->signal->group_exit_code & 0x7f; break; case CLD_TRAPPED: info.si_status = tsk->exit_code & 0x7f; break; default: BUG(); } sighand = parent->sighand; spin_lock_irqsave(&sighand->siglock, flags); if (sighand->action[SIGCHLD-1].sa.sa_handler != SIG_IGN && !(sighand->action[SIGCHLD-1].sa.sa_flags & SA_NOCLDSTOP)) send_signal_locked(SIGCHLD, &info, parent, PIDTYPE_TGID); /* * Even if SIGCHLD is not generated, we must wake up wait4 calls. */ __wake_up_parent(tsk, parent); spin_unlock_irqrestore(&sighand->siglock, flags); } /* * This must be called with current->sighand->siglock held. * * This should be the path for all ptrace stops. * We always set current->last_siginfo while stopped here. * That makes it a way to test a stopped process for * being ptrace-stopped vs being job-control-stopped. * * Returns the signal the ptracer requested the code resume * with. If the code did not stop because the tracer is gone, * the stop signal remains unchanged unless clear_code. */ static int ptrace_stop(int exit_code, int why, unsigned long message, kernel_siginfo_t *info) __releases(&current->sighand->siglock) __acquires(&current->sighand->siglock) { bool gstop_done = false; if (arch_ptrace_stop_needed()) { /* * The arch code has something special to do before a * ptrace stop. This is allowed to block, e.g. for faults * on user stack pages. We can't keep the siglock while * calling arch_ptrace_stop, so we must release it now. * To preserve proper semantics, we must do this before * any signal bookkeeping like checking group_stop_count. */ spin_unlock_irq(&current->sighand->siglock); arch_ptrace_stop(); spin_lock_irq(&current->sighand->siglock); } /* * After this point ptrace_signal_wake_up or signal_wake_up * will clear TASK_TRACED if ptrace_unlink happens or a fatal * signal comes in. Handle previous ptrace_unlinks and fatal * signals here to prevent ptrace_stop sleeping in schedule. */ if (!current->ptrace || __fatal_signal_pending(current)) return exit_code; set_special_state(TASK_TRACED); current->jobctl |= JOBCTL_TRACED; /* * We're committing to trapping. TRACED should be visible before * TRAPPING is cleared; otherwise, the tracer might fail do_wait(). * Also, transition to TRACED and updates to ->jobctl should be * atomic with respect to siglock and should be done after the arch * hook as siglock is released and regrabbed across it. * * TRACER TRACEE * * ptrace_attach() * [L] wait_on_bit(JOBCTL_TRAPPING) [S] set_special_state(TRACED) * do_wait() * set_current_state() smp_wmb(); * ptrace_do_wait() * wait_task_stopped() * task_stopped_code() * [L] task_is_traced() [S] task_clear_jobctl_trapping(); */ smp_wmb(); current->ptrace_message = message; current->last_siginfo = info; current->exit_code = exit_code; /* * If @why is CLD_STOPPED, we're trapping to participate in a group * stop. Do the bookkeeping. Note that if SIGCONT was delievered * across siglock relocks since INTERRUPT was scheduled, PENDING * could be clear now. We act as if SIGCONT is received after * TASK_TRACED is entered - ignore it. */ if (why == CLD_STOPPED && (current->jobctl & JOBCTL_STOP_PENDING)) gstop_done = task_participate_group_stop(current); /* any trap clears pending STOP trap, STOP trap clears NOTIFY */ task_clear_jobctl_pending(current, JOBCTL_TRAP_STOP); if (info && info->si_code >> 8 == PTRACE_EVENT_STOP) task_clear_jobctl_pending(current, JOBCTL_TRAP_NOTIFY); /* entering a trap, clear TRAPPING */ task_clear_jobctl_trapping(current); spin_unlock_irq(&current->sighand->siglock); read_lock(&tasklist_lock); /* * Notify parents of the stop. * * While ptraced, there are two parents - the ptracer and * the real_parent of the group_leader. The ptracer should * know about every stop while the real parent is only * interested in the completion of group stop. The states * for the two don't interact with each other. Notify * separately unless they're gonna be duplicates. */ if (current->ptrace) do_notify_parent_cldstop(current, true, why); if (gstop_done && (!current->ptrace || ptrace_reparented(current))) do_notify_parent_cldstop(current, false, why); /* * The previous do_notify_parent_cldstop() invocation woke ptracer. * One a PREEMPTION kernel this can result in preemption requirement * which will be fulfilled after read_unlock() and the ptracer will be * put on the CPU. * The ptracer is in wait_task_inactive(, __TASK_TRACED) waiting for * this task wait in schedule(). If this task gets preempted then it * remains enqueued on the runqueue. The ptracer will observe this and * then sleep for a delay of one HZ tick. In the meantime this task * gets scheduled, enters schedule() and will wait for the ptracer. * * This preemption point is not bad from a correctness point of * view but extends the runtime by one HZ tick time due to the * ptracer's sleep. The preempt-disable section ensures that there * will be no preemption between unlock and schedule() and so * improving the performance since the ptracer will observe that * the tracee is scheduled out once it gets on the CPU. * * On PREEMPT_RT locking tasklist_lock does not disable preemption. * Therefore the task can be preempted after do_notify_parent_cldstop() * before unlocking tasklist_lock so there is no benefit in doing this. * * In fact disabling preemption is harmful on PREEMPT_RT because * the spinlock_t in cgroup_enter_frozen() must not be acquired * with preemption disabled due to the 'sleeping' spinlock * substitution of RT. */ if (!IS_ENABLED(CONFIG_PREEMPT_RT)) preempt_disable(); read_unlock(&tasklist_lock); cgroup_enter_frozen(); if (!IS_ENABLED(CONFIG_PREEMPT_RT)) preempt_enable_no_resched(); schedule(); cgroup_leave_frozen(true); /* * We are back. Now reacquire the siglock before touching * last_siginfo, so that we are sure to have synchronized with * any signal-sending on another CPU that wants to examine it. */ spin_lock_irq(&current->sighand->siglock); exit_code = current->exit_code; current->last_siginfo = NULL; current->ptrace_message = 0; current->exit_code = 0; /* LISTENING can be set only during STOP traps, clear it */ current->jobctl &= ~(JOBCTL_LISTENING | JOBCTL_PTRACE_FROZEN); /* * Queued signals ignored us while we were stopped for tracing. * So check for any that we should take before resuming user mode. * This sets TIF_SIGPENDING, but never clears it. */ recalc_sigpending_tsk(current); return exit_code; } static int ptrace_do_notify(int signr, int exit_code, int why, unsigned long message) { kernel_siginfo_t info; clear_siginfo(&info); info.si_signo = signr; info.si_code = exit_code; info.si_pid = task_pid_vnr(current); info.si_uid = from_kuid_munged(current_user_ns(), current_uid()); /* Let the debugger run. */ return ptrace_stop(exit_code, why, message, &info); } int ptrace_notify(int exit_code, unsigned long message) { int signr; BUG_ON((exit_code & (0x7f | ~0xffff)) != SIGTRAP); if (unlikely(task_work_pending(current))) task_work_run(); spin_lock_irq(&current->sighand->siglock); signr = ptrace_do_notify(SIGTRAP, exit_code, CLD_TRAPPED, message); spin_unlock_irq(&current->sighand->siglock); return signr; } /** * do_signal_stop - handle group stop for SIGSTOP and other stop signals * @signr: signr causing group stop if initiating * * If %JOBCTL_STOP_PENDING is not set yet, initiate group stop with @signr * and participate in it. If already set, participate in the existing * group stop. If participated in a group stop (and thus slept), %true is * returned with siglock released. * * If ptraced, this function doesn't handle stop itself. Instead, * %JOBCTL_TRAP_STOP is scheduled and %false is returned with siglock * untouched. The caller must ensure that INTERRUPT trap handling takes * places afterwards. * * CONTEXT: * Must be called with @current->sighand->siglock held, which is released * on %true return. * * RETURNS: * %false if group stop is already cancelled or ptrace trap is scheduled. * %true if participated in group stop. */ static bool do_signal_stop(int signr) __releases(&current->sighand->siglock) { struct signal_struct *sig = current->signal; if (!(current->jobctl & JOBCTL_STOP_PENDING)) { unsigned long gstop = JOBCTL_STOP_PENDING | JOBCTL_STOP_CONSUME; struct task_struct *t; /* signr will be recorded in task->jobctl for retries */ WARN_ON_ONCE(signr & ~JOBCTL_STOP_SIGMASK); if (!likely(current->jobctl & JOBCTL_STOP_DEQUEUED) || unlikely(sig->flags & SIGNAL_GROUP_EXIT) || unlikely(sig->group_exec_task)) return false; /* * There is no group stop already in progress. We must * initiate one now. * * While ptraced, a task may be resumed while group stop is * still in effect and then receive a stop signal and * initiate another group stop. This deviates from the * usual behavior as two consecutive stop signals can't * cause two group stops when !ptraced. That is why we * also check !task_is_stopped(t) below. * * The condition can be distinguished by testing whether * SIGNAL_STOP_STOPPED is already set. Don't generate * group_exit_code in such case. * * This is not necessary for SIGNAL_STOP_CONTINUED because * an intervening stop signal is required to cause two * continued events regardless of ptrace. */ if (!(sig->flags & SIGNAL_STOP_STOPPED)) sig->group_exit_code = signr; sig->group_stop_count = 0; if (task_set_jobctl_pending(current, signr | gstop)) sig->group_stop_count++; for_other_threads(current, t) { /* * Setting state to TASK_STOPPED for a group * stop is always done with the siglock held, * so this check has no races. */ if (!task_is_stopped(t) && task_set_jobctl_pending(t, signr | gstop)) { sig->group_stop_count++; if (likely(!(t->ptrace & PT_SEIZED))) signal_wake_up(t, 0); else ptrace_trap_notify(t); } } } if (likely(!current->ptrace)) { int notify = 0; /* * If there are no other threads in the group, or if there * is a group stop in progress and we are the last to stop, * report to the parent. */ if (task_participate_group_stop(current)) notify = CLD_STOPPED; current->jobctl |= JOBCTL_STOPPED; set_special_state(TASK_STOPPED); spin_unlock_irq(&current->sighand->siglock); /* * Notify the parent of the group stop completion. Because * we're not holding either the siglock or tasklist_lock * here, ptracer may attach inbetween; however, this is for * group stop and should always be delivered to the real * parent of the group leader. The new ptracer will get * its notification when this task transitions into * TASK_TRACED. */ if (notify) { read_lock(&tasklist_lock); do_notify_parent_cldstop(current, false, notify); read_unlock(&tasklist_lock); } /* Now we don't run again until woken by SIGCONT or SIGKILL */ cgroup_enter_frozen(); schedule(); return true; } else { /* * While ptraced, group stop is handled by STOP trap. * Schedule it and let the caller deal with it. */ task_set_jobctl_pending(current, JOBCTL_TRAP_STOP); return false; } } /** * do_jobctl_trap - take care of ptrace jobctl traps * * When PT_SEIZED, it's used for both group stop and explicit * SEIZE/INTERRUPT traps. Both generate PTRACE_EVENT_STOP trap with * accompanying siginfo. If stopped, lower eight bits of exit_code contain * the stop signal; otherwise, %SIGTRAP. * * When !PT_SEIZED, it's used only for group stop trap with stop signal * number as exit_code and no siginfo. * * CONTEXT: * Must be called with @current->sighand->siglock held, which may be * released and re-acquired before returning with intervening sleep. */ static void do_jobctl_trap(void) { struct signal_struct *signal = current->signal; int signr = current->jobctl & JOBCTL_STOP_SIGMASK; if (current->ptrace & PT_SEIZED) { if (!signal->group_stop_count && !(signal->flags & SIGNAL_STOP_STOPPED)) signr = SIGTRAP; WARN_ON_ONCE(!signr); ptrace_do_notify(signr, signr | (PTRACE_EVENT_STOP << 8), CLD_STOPPED, 0); } else { WARN_ON_ONCE(!signr); ptrace_stop(signr, CLD_STOPPED, 0, NULL); } } /** * do_freezer_trap - handle the freezer jobctl trap * * Puts the task into frozen state, if only the task is not about to quit. * In this case it drops JOBCTL_TRAP_FREEZE. * * CONTEXT: * Must be called with @current->sighand->siglock held, * which is always released before returning. */ static void do_freezer_trap(void) __releases(&current->sighand->siglock) { /* * If there are other trap bits pending except JOBCTL_TRAP_FREEZE, * let's make another loop to give it a chance to be handled. * In any case, we'll return back. */ if ((current->jobctl & (JOBCTL_PENDING_MASK | JOBCTL_TRAP_FREEZE)) != JOBCTL_TRAP_FREEZE) { spin_unlock_irq(&current->sighand->siglock); return; } /* * Now we're sure that there is no pending fatal signal and no * pending traps. Clear TIF_SIGPENDING to not get out of schedule() * immediately (if there is a non-fatal signal pending), and * put the task into sleep. */ __set_current_state(TASK_INTERRUPTIBLE|TASK_FREEZABLE); clear_thread_flag(TIF_SIGPENDING); spin_unlock_irq(&current->sighand->siglock); cgroup_enter_frozen(); schedule(); /* * We could've been woken by task_work, run it to clear * TIF_NOTIFY_SIGNAL. The caller will retry if necessary. */ clear_notify_signal(); if (unlikely(task_work_pending(current))) task_work_run(); } static int ptrace_signal(int signr, kernel_siginfo_t *info, enum pid_type type) { /* * We do not check sig_kernel_stop(signr) but set this marker * unconditionally because we do not know whether debugger will * change signr. This flag has no meaning unless we are going * to stop after return from ptrace_stop(). In this case it will * be checked in do_signal_stop(), we should only stop if it was * not cleared by SIGCONT while we were sleeping. See also the * comment in dequeue_signal(). */ current->jobctl |= JOBCTL_STOP_DEQUEUED; signr = ptrace_stop(signr, CLD_TRAPPED, 0, info); /* We're back. Did the debugger cancel the sig? */ if (signr == 0) return signr; /* * Update the siginfo structure if the signal has * changed. If the debugger wanted something * specific in the siginfo structure then it should * have updated *info via PTRACE_SETSIGINFO. */ if (signr != info->si_signo) { clear_siginfo(info); info->si_signo = signr; info->si_errno = 0; info->si_code = SI_USER; rcu_read_lock(); info->si_pid = task_pid_vnr(current->parent); info->si_uid = from_kuid_munged(current_user_ns(), task_uid(current->parent)); rcu_read_unlock(); } /* If the (new) signal is now blocked, requeue it. */ if (sigismember(&current->blocked, signr) || fatal_signal_pending(current)) { send_signal_locked(signr, info, current, type); signr = 0; } return signr; } static void hide_si_addr_tag_bits(struct ksignal *ksig) { switch (siginfo_layout(ksig->sig, ksig->info.si_code)) { case SIL_FAULT: case SIL_FAULT_TRAPNO: case SIL_FAULT_MCEERR: case SIL_FAULT_BNDERR: case SIL_FAULT_PKUERR: case SIL_FAULT_PERF_EVENT: ksig->info.si_addr = arch_untagged_si_addr( ksig->info.si_addr, ksig->sig, ksig->info.si_code); break; case SIL_KILL: case SIL_TIMER: case SIL_POLL: case SIL_CHLD: case SIL_RT: case SIL_SYS: break; } } bool get_signal(struct ksignal *ksig) { struct sighand_struct *sighand = current->sighand; struct signal_struct *signal = current->signal; int signr; clear_notify_signal(); if (unlikely(task_work_pending(current))) task_work_run(); if (!task_sigpending(current)) return false; if (unlikely(uprobe_deny_signal())) return false; /* * Do this once, we can't return to user-mode if freezing() == T. * do_signal_stop() and ptrace_stop() do freezable_schedule() and * thus do not need another check after return. */ try_to_freeze(); relock: spin_lock_irq(&sighand->siglock); /* * Every stopped thread goes here after wakeup. Check to see if * we should notify the parent, prepare_signal(SIGCONT) encodes * the CLD_ si_code into SIGNAL_CLD_MASK bits. */ if (unlikely(signal->flags & SIGNAL_CLD_MASK)) { int why; if (signal->flags & SIGNAL_CLD_CONTINUED) why = CLD_CONTINUED; else why = CLD_STOPPED; signal->flags &= ~SIGNAL_CLD_MASK; spin_unlock_irq(&sighand->siglock); /* * Notify the parent that we're continuing. This event is * always per-process and doesn't make whole lot of sense * for ptracers, who shouldn't consume the state via * wait(2) either, but, for backward compatibility, notify * the ptracer of the group leader too unless it's gonna be * a duplicate. */ read_lock(&tasklist_lock); do_notify_parent_cldstop(current, false, why); if (ptrace_reparented(current->group_leader)) do_notify_parent_cldstop(current->group_leader, true, why); read_unlock(&tasklist_lock); goto relock; } for (;;) { struct k_sigaction *ka; enum pid_type type; /* Has this task already been marked for death? */ if ((signal->flags & SIGNAL_GROUP_EXIT) || signal->group_exec_task) { signr = SIGKILL; sigdelset(&current->pending.signal, SIGKILL); trace_signal_deliver(SIGKILL, SEND_SIG_NOINFO, &sighand->action[SIGKILL-1]); recalc_sigpending(); /* * implies do_group_exit() or return to PF_USER_WORKER, * no need to initialize ksig->info/etc. */ goto fatal; } if (unlikely(current->jobctl & JOBCTL_STOP_PENDING) && do_signal_stop(0)) goto relock; if (unlikely(current->jobctl & (JOBCTL_TRAP_MASK | JOBCTL_TRAP_FREEZE))) { if (current->jobctl & JOBCTL_TRAP_MASK) { do_jobctl_trap(); spin_unlock_irq(&sighand->siglock); } else if (current->jobctl & JOBCTL_TRAP_FREEZE) do_freezer_trap(); goto relock; } /* * If the task is leaving the frozen state, let's update * cgroup counters and reset the frozen bit. */ if (unlikely(cgroup_task_frozen(current))) { spin_unlock_irq(&sighand->siglock); cgroup_leave_frozen(false); goto relock; } /* * Signals generated by the execution of an instruction * need to be delivered before any other pending signals * so that the instruction pointer in the signal stack * frame points to the faulting instruction. */ type = PIDTYPE_PID; signr = dequeue_synchronous_signal(&ksig->info); if (!signr) signr = dequeue_signal(&current->blocked, &ksig->info, &type); if (!signr) break; /* will return 0 */ if (unlikely(current->ptrace) && (signr != SIGKILL) && !(sighand->action[signr -1].sa.sa_flags & SA_IMMUTABLE)) { signr = ptrace_signal(signr, &ksig->info, type); if (!signr) continue; } ka = &sighand->action[signr-1]; /* Trace actually delivered signals. */ trace_signal_deliver(signr, &ksig->info, ka); if (ka->sa.sa_handler == SIG_IGN) /* Do nothing. */ continue; if (ka->sa.sa_handler != SIG_DFL) { /* Run the handler. */ ksig->ka = *ka; if (ka->sa.sa_flags & SA_ONESHOT) ka->sa.sa_handler = SIG_DFL; break; /* will return non-zero "signr" value */ } /* * Now we are doing the default action for this signal. */ if (sig_kernel_ignore(signr)) /* Default is nothing. */ continue; /* * Global init gets no signals it doesn't want. * Container-init gets no signals it doesn't want from same * container. * * Note that if global/container-init sees a sig_kernel_only() * signal here, the signal must have been generated internally * or must have come from an ancestor namespace. In either * case, the signal cannot be dropped. */ if (unlikely(signal->flags & SIGNAL_UNKILLABLE) && !sig_kernel_only(signr)) continue; if (sig_kernel_stop(signr)) { /* * The default action is to stop all threads in * the thread group. The job control signals * do nothing in an orphaned pgrp, but SIGSTOP * always works. Note that siglock needs to be * dropped during the call to is_orphaned_pgrp() * because of lock ordering with tasklist_lock. * This allows an intervening SIGCONT to be posted. * We need to check for that and bail out if necessary. */ if (signr != SIGSTOP) { spin_unlock_irq(&sighand->siglock); /* signals can be posted during this window */ if (is_current_pgrp_orphaned()) goto relock; spin_lock_irq(&sighand->siglock); } if (likely(do_signal_stop(signr))) { /* It released the siglock. */ goto relock; } /* * We didn't actually stop, due to a race * with SIGCONT or something like that. */ continue; } fatal: spin_unlock_irq(&sighand->siglock); if (unlikely(cgroup_task_frozen(current))) cgroup_leave_frozen(true); /* * Anything else is fatal, maybe with a core dump. */ current->flags |= PF_SIGNALED; if (sig_kernel_coredump(signr)) { if (print_fatal_signals) print_fatal_signal(signr); proc_coredump_connector(current); /* * If it was able to dump core, this kills all * other threads in the group and synchronizes with * their demise. If we lost the race with another * thread getting here, it set group_exit_code * first and our do_group_exit call below will use * that value and ignore the one we pass it. */ vfs_coredump(&ksig->info); } /* * PF_USER_WORKER threads will catch and exit on fatal signals * themselves. They have cleanup that must be performed, so we * cannot call do_exit() on their behalf. Note that ksig won't * be properly initialized, PF_USER_WORKER's shouldn't use it. */ if (current->flags & PF_USER_WORKER) goto out; /* * Death signals, no core dump. */ do_group_exit(signr); /* NOTREACHED */ } spin_unlock_irq(&sighand->siglock); ksig->sig = signr; if (signr && !(ksig->ka.sa.sa_flags & SA_EXPOSE_TAGBITS)) hide_si_addr_tag_bits(ksig); out: return signr > 0; } /** * signal_delivered - called after signal delivery to update blocked signals * @ksig: kernel signal struct * @stepping: nonzero if debugger single-step or block-step in use * * This function should be called when a signal has successfully been * delivered. It updates the blocked signals accordingly (@ksig->ka.sa.sa_mask * is always blocked), and the signal itself is blocked unless %SA_NODEFER * is set in @ksig->ka.sa.sa_flags. Tracing is notified. */ static void signal_delivered(struct ksignal *ksig, int stepping) { sigset_t blocked; /* A signal was successfully delivered, and the saved sigmask was stored on the signal frame, and will be restored by sigreturn. So we can simply clear the restore sigmask flag. */ clear_restore_sigmask(); sigorsets(&blocked, &current->blocked, &ksig->ka.sa.sa_mask); if (!(ksig->ka.sa.sa_flags & SA_NODEFER)) sigaddset(&blocked, ksig->sig); set_current_blocked(&blocked); if (current->sas_ss_flags & SS_AUTODISARM) sas_ss_reset(current); if (stepping) ptrace_notify(SIGTRAP, 0); } void signal_setup_done(int failed, struct ksignal *ksig, int stepping) { if (failed) force_sigsegv(ksig->sig); else signal_delivered(ksig, stepping); } /* * It could be that complete_signal() picked us to notify about the * group-wide signal. Other threads should be notified now to take * the shared signals in @which since we will not. */ static void retarget_shared_pending(struct task_struct *tsk, sigset_t *which) { sigset_t retarget; struct task_struct *t; sigandsets(&retarget, &tsk->signal->shared_pending.signal, which); if (sigisemptyset(&retarget)) return; for_other_threads(tsk, t) { if (t->flags & PF_EXITING) continue; if (!has_pending_signals(&retarget, &t->blocked)) continue; /* Remove the signals this thread can handle. */ sigandsets(&retarget, &retarget, &t->blocked); if (!task_sigpending(t)) signal_wake_up(t, 0); if (sigisemptyset(&retarget)) break; } } void exit_signals(struct task_struct *tsk) { int group_stop = 0; sigset_t unblocked; /* * @tsk is about to have PF_EXITING set - lock out users which * expect stable threadgroup. */ cgroup_threadgroup_change_begin(tsk); if (thread_group_empty(tsk) || (tsk->signal->flags & SIGNAL_GROUP_EXIT)) { sched_mm_cid_exit_signals(tsk); tsk->flags |= PF_EXITING; cgroup_threadgroup_change_end(tsk); return; } spin_lock_irq(&tsk->sighand->siglock); /* * From now this task is not visible for group-wide signals, * see wants_signal(), do_signal_stop(). */ sched_mm_cid_exit_signals(tsk); tsk->flags |= PF_EXITING; cgroup_threadgroup_change_end(tsk); if (!task_sigpending(tsk)) goto out; unblocked = tsk->blocked; signotset(&unblocked); retarget_shared_pending(tsk, &unblocked); if (unlikely(tsk->jobctl & JOBCTL_STOP_PENDING) && task_participate_group_stop(tsk)) group_stop = CLD_STOPPED; out: spin_unlock_irq(&tsk->sighand->siglock); /* * If group stop has completed, deliver the notification. This * should always go to the real parent of the group leader. */ if (unlikely(group_stop)) { read_lock(&tasklist_lock); do_notify_parent_cldstop(tsk, false, group_stop); read_unlock(&tasklist_lock); } } /* * System call entry points. */ /** * sys_restart_syscall - restart a system call */ SYSCALL_DEFINE0(restart_syscall) { struct restart_block *restart = &current->restart_block; return restart->fn(restart); } long do_no_restart_syscall(struct restart_block *param) { return -EINTR; } static void __set_task_blocked(struct task_struct *tsk, const sigset_t *newset) { if (task_sigpending(tsk) && !thread_group_empty(tsk)) { sigset_t newblocked; /* A set of now blocked but previously unblocked signals. */ sigandnsets(&newblocked, newset, &current->blocked); retarget_shared_pending(tsk, &newblocked); } tsk->blocked = *newset; recalc_sigpending(); } /** * set_current_blocked - change current->blocked mask * @newset: new mask * * It is wrong to change ->blocked directly, this helper should be used * to ensure the process can't miss a shared signal we are going to block. */ void set_current_blocked(sigset_t *newset) { sigdelsetmask(newset, sigmask(SIGKILL) | sigmask(SIGSTOP)); __set_current_blocked(newset); } void __set_current_blocked(const sigset_t *newset) { struct task_struct *tsk = current; /* * In case the signal mask hasn't changed, there is nothing we need * to do. The current->blocked shouldn't be modified by other task. */ if (sigequalsets(&tsk->blocked, newset)) return; spin_lock_irq(&tsk->sighand->siglock); __set_task_blocked(tsk, newset); spin_unlock_irq(&tsk->sighand->siglock); } /* * This is also useful for kernel threads that want to temporarily * (or permanently) block certain signals. * * NOTE! Unlike the user-mode sys_sigprocmask(), the kernel * interface happily blocks "unblockable" signals like SIGKILL * and friends. */ int sigprocmask(int how, sigset_t *set, sigset_t *oldset) { struct task_struct *tsk = current; sigset_t newset; /* Lockless, only current can change ->blocked, never from irq */ if (oldset) *oldset = tsk->blocked; switch (how) { case SIG_BLOCK: sigorsets(&newset, &tsk->blocked, set); break; case SIG_UNBLOCK: sigandnsets(&newset, &tsk->blocked, set); break; case SIG_SETMASK: newset = *set; break; default: return -EINVAL; } __set_current_blocked(&newset); return 0; } EXPORT_SYMBOL(sigprocmask); /* * The api helps set app-provided sigmasks. * * This is useful for syscalls such as ppoll, pselect, io_pgetevents and * epoll_pwait where a new sigmask is passed from userland for the syscalls. * * Note that it does set_restore_sigmask() in advance, so it must be always * paired with restore_saved_sigmask_unless() before return from syscall. */ int set_user_sigmask(const sigset_t __user *umask, size_t sigsetsize) { sigset_t kmask; if (!umask) return 0; if (sigsetsize != sizeof(sigset_t)) return -EINVAL; if (copy_from_user(&kmask, umask, sizeof(sigset_t))) return -EFAULT; set_restore_sigmask(); current->saved_sigmask = current->blocked; set_current_blocked(&kmask); return 0; } #ifdef CONFIG_COMPAT int set_compat_user_sigmask(const compat_sigset_t __user *umask, size_t sigsetsize) { sigset_t kmask; if (!umask) return 0; if (sigsetsize != sizeof(compat_sigset_t)) return -EINVAL; if (get_compat_sigset(&kmask, umask)) return -EFAULT; set_restore_sigmask(); current->saved_sigmask = current->blocked; set_current_blocked(&kmask); return 0; } #endif /** * sys_rt_sigprocmask - change the list of currently blocked signals * @how: whether to add, remove, or set signals * @nset: stores pending signals * @oset: previous value of signal mask if non-null * @sigsetsize: size of sigset_t type */ SYSCALL_DEFINE4(rt_sigprocmask, int, how, sigset_t __user *, nset, sigset_t __user *, oset, size_t, sigsetsize) { sigset_t old_set, new_set; int error; /* XXX: Don't preclude handling different sized sigset_t's. */ if (sigsetsize != sizeof(sigset_t)) return -EINVAL; old_set = current->blocked; if (nset) { if (copy_from_user(&new_set, nset, sizeof(sigset_t))) return -EFAULT; sigdelsetmask(&new_set, sigmask(SIGKILL)|sigmask(SIGSTOP)); error = sigprocmask(how, &new_set, NULL); if (error) return error; } if (oset) { if (copy_to_user(oset, &old_set, sizeof(sigset_t))) return -EFAULT; } return 0; } #ifdef CONFIG_COMPAT COMPAT_SYSCALL_DEFINE4(rt_sigprocmask, int, how, compat_sigset_t __user *, nset, compat_sigset_t __user *, oset, compat_size_t, sigsetsize) { sigset_t old_set = current->blocked; /* XXX: Don't preclude handling different sized sigset_t's. */ if (sigsetsize != sizeof(sigset_t)) return -EINVAL; if (nset) { sigset_t new_set; int error; if (get_compat_sigset(&new_set, nset)) return -EFAULT; sigdelsetmask(&new_set, sigmask(SIGKILL)|sigmask(SIGSTOP)); error = sigprocmask(how, &new_set, NULL); if (error) return error; } return oset ? put_compat_sigset(oset, &old_set, sizeof(*oset)) : 0; } #endif static void do_sigpending(sigset_t *set) { spin_lock_irq(&current->sighand->siglock); sigorsets(set, &current->pending.signal, &current->signal->shared_pending.signal); spin_unlock_irq(&current->sighand->siglock); /* Outside the lock because only this thread touches it. */ sigandsets(set, &current->blocked, set); } /** * sys_rt_sigpending - examine a pending signal that has been raised * while blocked * @uset: stores pending signals * @sigsetsize: size of sigset_t type or larger */ SYSCALL_DEFINE2(rt_sigpending, sigset_t __user *, uset, size_t, sigsetsize) { sigset_t set; if (sigsetsize > sizeof(*uset)) return -EINVAL; do_sigpending(&set); if (copy_to_user(uset, &set, sigsetsize)) return -EFAULT; return 0; } #ifdef CONFIG_COMPAT COMPAT_SYSCALL_DEFINE2(rt_sigpending, compat_sigset_t __user *, uset, compat_size_t, sigsetsize) { sigset_t set; if (sigsetsize > sizeof(*uset)) return -EINVAL; do_sigpending(&set); return put_compat_sigset(uset, &set, sigsetsize); } #endif static const struct { unsigned char limit, layout; } sig_sicodes[] = { [SIGILL] = { NSIGILL, SIL_FAULT }, [SIGFPE] = { NSIGFPE, SIL_FAULT }, [SIGSEGV] = { NSIGSEGV, SIL_FAULT }, [SIGBUS] = { NSIGBUS, SIL_FAULT }, [SIGTRAP] = { NSIGTRAP, SIL_FAULT }, #if defined(SIGEMT) [SIGEMT] = { NSIGEMT, SIL_FAULT }, #endif [SIGCHLD] = { NSIGCHLD, SIL_CHLD }, [SIGPOLL] = { NSIGPOLL, SIL_POLL }, [SIGSYS] = { NSIGSYS, SIL_SYS }, }; static bool known_siginfo_layout(unsigned sig, int si_code) { if (si_code == SI_KERNEL) return true; else if ((si_code > SI_USER)) { if (sig_specific_sicodes(sig)) { if (si_code <= sig_sicodes[sig].limit) return true; } else if (si_code <= NSIGPOLL) return true; } else if (si_code >= SI_DETHREAD) return true; else if (si_code == SI_ASYNCNL) return true; return false; } enum siginfo_layout siginfo_layout(unsigned sig, int si_code) { enum siginfo_layout layout = SIL_KILL; if ((si_code > SI_USER) && (si_code < SI_KERNEL)) { if ((sig < ARRAY_SIZE(sig_sicodes)) && (si_code <= sig_sicodes[sig].limit)) { layout = sig_sicodes[sig].layout; /* Handle the exceptions */ if ((sig == SIGBUS) && (si_code >= BUS_MCEERR_AR) && (si_code <= BUS_MCEERR_AO)) layout = SIL_FAULT_MCEERR; else if ((sig == SIGSEGV) && (si_code == SEGV_BNDERR)) layout = SIL_FAULT_BNDERR; #ifdef SEGV_PKUERR else if ((sig == SIGSEGV) && (si_code == SEGV_PKUERR)) layout = SIL_FAULT_PKUERR; #endif else if ((sig == SIGTRAP) && (si_code == TRAP_PERF)) layout = SIL_FAULT_PERF_EVENT; else if (IS_ENABLED(CONFIG_SPARC) && (sig == SIGILL) && (si_code == ILL_ILLTRP)) layout = SIL_FAULT_TRAPNO; else if (IS_ENABLED(CONFIG_ALPHA) && ((sig == SIGFPE) || ((sig == SIGTRAP) && (si_code == TRAP_UNK)))) layout = SIL_FAULT_TRAPNO; } else if (si_code <= NSIGPOLL) layout = SIL_POLL; } else { if (si_code == SI_TIMER) layout = SIL_TIMER; else if (si_code == SI_SIGIO) layout = SIL_POLL; else if (si_code < 0) layout = SIL_RT; } return layout; } static inline char __user *si_expansion(const siginfo_t __user *info) { return ((char __user *)info) + sizeof(struct kernel_siginfo); } int copy_siginfo_to_user(siginfo_t __user *to, const kernel_siginfo_t *from) { char __user *expansion = si_expansion(to); if (copy_to_user(to, from , sizeof(struct kernel_siginfo))) return -EFAULT; if (clear_user(expansion, SI_EXPANSION_SIZE)) return -EFAULT; return 0; } static int post_copy_siginfo_from_user(kernel_siginfo_t *info, const siginfo_t __user *from) { if (unlikely(!known_siginfo_layout(info->si_signo, info->si_code))) { char __user *expansion = si_expansion(from); char buf[SI_EXPANSION_SIZE]; int i; /* * An unknown si_code might need more than * sizeof(struct kernel_siginfo) bytes. Verify all of the * extra bytes are 0. This guarantees copy_siginfo_to_user * will return this data to userspace exactly. */ if (copy_from_user(&buf, expansion, SI_EXPANSION_SIZE)) return -EFAULT; for (i = 0; i < SI_EXPANSION_SIZE; i++) { if (buf[i] != 0) return -E2BIG; } } return 0; } static int __copy_siginfo_from_user(int signo, kernel_siginfo_t *to, const siginfo_t __user *from) { if (copy_from_user(to, from, sizeof(struct kernel_siginfo))) return -EFAULT; to->si_signo = signo; return post_copy_siginfo_from_user(to, from); } int copy_siginfo_from_user(kernel_siginfo_t *to, const siginfo_t __user *from) { if (copy_from_user(to, from, sizeof(struct kernel_siginfo))) return -EFAULT; return post_copy_siginfo_from_user(to, from); } #ifdef CONFIG_COMPAT /** * copy_siginfo_to_external32 - copy a kernel siginfo into a compat user siginfo * @to: compat siginfo destination * @from: kernel siginfo source * * Note: This function does not work properly for the SIGCHLD on x32, but * fortunately it doesn't have to. The only valid callers for this function are * copy_siginfo_to_user32, which is overriden for x32 and the coredump code. * The latter does not care because SIGCHLD will never cause a coredump. */ void copy_siginfo_to_external32(struct compat_siginfo *to, const struct kernel_siginfo *from) { memset(to, 0, sizeof(*to)); to->si_signo = from->si_signo; to->si_errno = from->si_errno; to->si_code = from->si_code; switch(siginfo_layout(from->si_signo, from->si_code)) { case SIL_KILL: to->si_pid = from->si_pid; to->si_uid = from->si_uid; break; case SIL_TIMER: to->si_tid = from->si_tid; to->si_overrun = from->si_overrun; to->si_int = from->si_int; break; case SIL_POLL: to->si_band = from->si_band; to->si_fd = from->si_fd; break; case SIL_FAULT: to->si_addr = ptr_to_compat(from->si_addr); break; case SIL_FAULT_TRAPNO: to->si_addr = ptr_to_compat(from->si_addr); to->si_trapno = from->si_trapno; break; case SIL_FAULT_MCEERR: to->si_addr = ptr_to_compat(from->si_addr); to->si_addr_lsb = from->si_addr_lsb; break; case SIL_FAULT_BNDERR: to->si_addr = ptr_to_compat(from->si_addr); to->si_lower = ptr_to_compat(from->si_lower); to->si_upper = ptr_to_compat(from->si_upper); break; case SIL_FAULT_PKUERR: to->si_addr = ptr_to_compat(from->si_addr); to->si_pkey = from->si_pkey; break; case SIL_FAULT_PERF_EVENT: to->si_addr = ptr_to_compat(from->si_addr); to->si_perf_data = from->si_perf_data; to->si_perf_type = from->si_perf_type; to->si_perf_flags = from->si_perf_flags; break; case SIL_CHLD: to->si_pid = from->si_pid; to->si_uid = from->si_uid; to->si_status = from->si_status; to->si_utime = from->si_utime; to->si_stime = from->si_stime; break; case SIL_RT: to->si_pid = from->si_pid; to->si_uid = from->si_uid; to->si_int = from->si_int; break; case SIL_SYS: to->si_call_addr = ptr_to_compat(from->si_call_addr); to->si_syscall = from->si_syscall; to->si_arch = from->si_arch; break; } } int __copy_siginfo_to_user32(struct compat_siginfo __user *to, const struct kernel_siginfo *from) { struct compat_siginfo new; copy_siginfo_to_external32(&new, from); if (copy_to_user(to, &new, sizeof(struct compat_siginfo))) return -EFAULT; return 0; } static int post_copy_siginfo_from_user32(kernel_siginfo_t *to, const struct compat_siginfo *from) { clear_siginfo(to); to->si_signo = from->si_signo; to->si_errno = from->si_errno; to->si_code = from->si_code; switch(siginfo_layout(from->si_signo, from->si_code)) { case SIL_KILL: to->si_pid = from->si_pid; to->si_uid = from->si_uid; break; case SIL_TIMER: to->si_tid = from->si_tid; to->si_overrun = from->si_overrun; to->si_int = from->si_int; break; case SIL_POLL: to->si_band = from->si_band; to->si_fd = from->si_fd; break; case SIL_FAULT: to->si_addr = compat_ptr(from->si_addr); break; case SIL_FAULT_TRAPNO: to->si_addr = compat_ptr(from->si_addr); to->si_trapno = from->si_trapno; break; case SIL_FAULT_MCEERR: to->si_addr = compat_ptr(from->si_addr); to->si_addr_lsb = from->si_addr_lsb; break; case SIL_FAULT_BNDERR: to->si_addr = compat_ptr(from->si_addr); to->si_lower = compat_ptr(from->si_lower); to->si_upper = compat_ptr(from->si_upper); break; case SIL_FAULT_PKUERR: to->si_addr = compat_ptr(from->si_addr); to->si_pkey = from->si_pkey; break; case SIL_FAULT_PERF_EVENT: to->si_addr = compat_ptr(from->si_addr); to->si_perf_data = from->si_perf_data; to->si_perf_type = from->si_perf_type; to->si_perf_flags = from->si_perf_flags; break; case SIL_CHLD: to->si_pid = from->si_pid; to->si_uid = from->si_uid; to->si_status = from->si_status; #ifdef CONFIG_X86_X32_ABI if (in_x32_syscall()) { to->si_utime = from->_sifields._sigchld_x32._utime; to->si_stime = from->_sifields._sigchld_x32._stime; } else #endif { to->si_utime = from->si_utime; to->si_stime = from->si_stime; } break; case SIL_RT: to->si_pid = from->si_pid; to->si_uid = from->si_uid; to->si_int = from->si_int; break; case SIL_SYS: to->si_call_addr = compat_ptr(from->si_call_addr); to->si_syscall = from->si_syscall; to->si_arch = from->si_arch; break; } return 0; } static int __copy_siginfo_from_user32(int signo, struct kernel_siginfo *to, const struct compat_siginfo __user *ufrom) { struct compat_siginfo from; if (copy_from_user(&from, ufrom, sizeof(struct compat_siginfo))) return -EFAULT; from.si_signo = signo; return post_copy_siginfo_from_user32(to, &from); } int copy_siginfo_from_user32(struct kernel_siginfo *to, const struct compat_siginfo __user *ufrom) { struct compat_siginfo from; if (copy_from_user(&from, ufrom, sizeof(struct compat_siginfo))) return -EFAULT; return post_copy_siginfo_from_user32(to, &from); } #endif /* CONFIG_COMPAT */ /** * do_sigtimedwait - wait for queued signals specified in @which * @which: queued signals to wait for * @info: if non-null, the signal's siginfo is returned here * @ts: upper bound on process time suspension */ static int do_sigtimedwait(const sigset_t *which, kernel_siginfo_t *info, const struct timespec64 *ts) { ktime_t *to = NULL, timeout = KTIME_MAX; struct task_struct *tsk = current; sigset_t mask = *which; enum pid_type type; int sig, ret = 0; if (ts) { if (!timespec64_valid(ts)) return -EINVAL; timeout = timespec64_to_ktime(*ts); to = &timeout; } /* * Invert the set of allowed signals to get those we want to block. */ sigdelsetmask(&mask, sigmask(SIGKILL) | sigmask(SIGSTOP)); signotset(&mask); spin_lock_irq(&tsk->sighand->siglock); sig = dequeue_signal(&mask, info, &type); if (!sig && timeout) { /* * None ready, temporarily unblock those we're interested * while we are sleeping in so that we'll be awakened when * they arrive. Unblocking is always fine, we can avoid * set_current_blocked(). */ tsk->real_blocked = tsk->blocked; sigandsets(&tsk->blocked, &tsk->blocked, &mask); recalc_sigpending(); spin_unlock_irq(&tsk->sighand->siglock); __set_current_state(TASK_INTERRUPTIBLE|TASK_FREEZABLE); ret = schedule_hrtimeout_range(to, tsk->timer_slack_ns, HRTIMER_MODE_REL); spin_lock_irq(&tsk->sighand->siglock); __set_task_blocked(tsk, &tsk->real_blocked); sigemptyset(&tsk->real_blocked); sig = dequeue_signal(&mask, info, &type); } spin_unlock_irq(&tsk->sighand->siglock); if (sig) return sig; return ret ? -EINTR : -EAGAIN; } /** * sys_rt_sigtimedwait - synchronously wait for queued signals specified * in @uthese * @uthese: queued signals to wait for * @uinfo: if non-null, the signal's siginfo is returned here * @uts: upper bound on process time suspension * @sigsetsize: size of sigset_t type */ SYSCALL_DEFINE4(rt_sigtimedwait, const sigset_t __user *, uthese, siginfo_t __user *, uinfo, const struct __kernel_timespec __user *, uts, size_t, sigsetsize) { sigset_t these; struct timespec64 ts; kernel_siginfo_t info; int ret; /* XXX: Don't preclude handling different sized sigset_t's. */ if (sigsetsize != sizeof(sigset_t)) return -EINVAL; if (copy_from_user(&these, uthese, sizeof(these))) return -EFAULT; if (uts) { if (get_timespec64(&ts, uts)) return -EFAULT; } ret = do_sigtimedwait(&these, &info, uts ? &ts : NULL); if (ret > 0 && uinfo) { if (copy_siginfo_to_user(uinfo, &info)) ret = -EFAULT; } return ret; } #ifdef CONFIG_COMPAT_32BIT_TIME SYSCALL_DEFINE4(rt_sigtimedwait_time32, const sigset_t __user *, uthese, siginfo_t __user *, uinfo, const struct old_timespec32 __user *, uts, size_t, sigsetsize) { sigset_t these; struct timespec64 ts; kernel_siginfo_t info; int ret; if (sigsetsize != sizeof(sigset_t)) return -EINVAL; if (copy_from_user(&these, uthese, sizeof(these))) return -EFAULT; if (uts) { if (get_old_timespec32(&ts, uts)) return -EFAULT; } ret = do_sigtimedwait(&these, &info, uts ? &ts : NULL); if (ret > 0 && uinfo) { if (copy_siginfo_to_user(uinfo, &info)) ret = -EFAULT; } return ret; } #endif #ifdef CONFIG_COMPAT COMPAT_SYSCALL_DEFINE4(rt_sigtimedwait_time64, compat_sigset_t __user *, uthese, struct compat_siginfo __user *, uinfo, struct __kernel_timespec __user *, uts, compat_size_t, sigsetsize) { sigset_t s; struct timespec64 t; kernel_siginfo_t info; long ret; if (sigsetsize != sizeof(sigset_t)) return -EINVAL; if (get_compat_sigset(&s, uthese)) return -EFAULT; if (uts) { if (get_timespec64(&t, uts)) return -EFAULT; } ret = do_sigtimedwait(&s, &info, uts ? &t : NULL); if (ret > 0 && uinfo) { if (copy_siginfo_to_user32(uinfo, &info)) ret = -EFAULT; } return ret; } #ifdef CONFIG_COMPAT_32BIT_TIME COMPAT_SYSCALL_DEFINE4(rt_sigtimedwait_time32, compat_sigset_t __user *, uthese, struct compat_siginfo __user *, uinfo, struct old_timespec32 __user *, uts, compat_size_t, sigsetsize) { sigset_t s; struct timespec64 t; kernel_siginfo_t info; long ret; if (sigsetsize != sizeof(sigset_t)) return -EINVAL; if (get_compat_sigset(&s, uthese)) return -EFAULT; if (uts) { if (get_old_timespec32(&t, uts)) return -EFAULT; } ret = do_sigtimedwait(&s, &info, uts ? &t : NULL); if (ret > 0 && uinfo) { if (copy_siginfo_to_user32(uinfo, &info)) ret = -EFAULT; } return ret; } #endif #endif static void prepare_kill_siginfo(int sig, struct kernel_siginfo *info, enum pid_type type) { clear_siginfo(info); info->si_signo = sig; info->si_errno = 0; info->si_code = (type == PIDTYPE_PID) ? SI_TKILL : SI_USER; info->si_pid = task_tgid_vnr(current); info->si_uid = from_kuid_munged(current_user_ns(), current_uid()); } /** * sys_kill - send a signal to a process * @pid: the PID of the process * @sig: signal to be sent */ SYSCALL_DEFINE2(kill, pid_t, pid, int, sig) { struct kernel_siginfo info; prepare_kill_siginfo(sig, &info, PIDTYPE_TGID); return kill_something_info(sig, &info, pid); } /* * Verify that the signaler and signalee either are in the same pid namespace * or that the signaler's pid namespace is an ancestor of the signalee's pid * namespace. */ static bool access_pidfd_pidns(struct pid *pid) { struct pid_namespace *active = task_active_pid_ns(current); struct pid_namespace *p = ns_of_pid(pid); for (;;) { if (!p) return false; if (p == active) break; p = p->parent; } return true; } static int copy_siginfo_from_user_any(kernel_siginfo_t *kinfo, siginfo_t __user *info) { #ifdef CONFIG_COMPAT /* * Avoid hooking up compat syscalls and instead handle necessary * conversions here. Note, this is a stop-gap measure and should not be * considered a generic solution. */ if (in_compat_syscall()) return copy_siginfo_from_user32( kinfo, (struct compat_siginfo __user *)info); #endif return copy_siginfo_from_user(kinfo, info); } static struct pid *pidfd_to_pid(const struct file *file) { struct pid *pid; pid = pidfd_pid(file); if (!IS_ERR(pid)) return pid; return tgid_pidfd_to_pid(file); } #define PIDFD_SEND_SIGNAL_FLAGS \ (PIDFD_SIGNAL_THREAD | PIDFD_SIGNAL_THREAD_GROUP | \ PIDFD_SIGNAL_PROCESS_GROUP) static int do_pidfd_send_signal(struct pid *pid, int sig, enum pid_type type, siginfo_t __user *info, unsigned int flags) { kernel_siginfo_t kinfo; switch (flags) { case PIDFD_SIGNAL_THREAD: type = PIDTYPE_PID; break; case PIDFD_SIGNAL_THREAD_GROUP: type = PIDTYPE_TGID; break; case PIDFD_SIGNAL_PROCESS_GROUP: type = PIDTYPE_PGID; break; } if (info) { int ret; ret = copy_siginfo_from_user_any(&kinfo, info); if (unlikely(ret)) return ret; if (unlikely(sig != kinfo.si_signo)) return -EINVAL; /* Only allow sending arbitrary signals to yourself. */ if ((task_pid(current) != pid || type > PIDTYPE_TGID) && (kinfo.si_code >= 0 || kinfo.si_code == SI_TKILL)) return -EPERM; } else { prepare_kill_siginfo(sig, &kinfo, type); } if (type == PIDTYPE_PGID) return kill_pgrp_info(sig, &kinfo, pid); return kill_pid_info_type(sig, &kinfo, pid, type); } /** * sys_pidfd_send_signal - Signal a process through a pidfd * @pidfd: file descriptor of the process * @sig: signal to send * @info: signal info * @flags: future flags * * Send the signal to the thread group or to the individual thread depending * on PIDFD_THREAD. * In the future extension to @flags may be used to override the default scope * of @pidfd. * * Return: 0 on success, negative errno on failure */ SYSCALL_DEFINE4(pidfd_send_signal, int, pidfd, int, sig, siginfo_t __user *, info, unsigned int, flags) { struct pid *pid; enum pid_type type; int ret; /* Enforce flags be set to 0 until we add an extension. */ if (flags & ~PIDFD_SEND_SIGNAL_FLAGS) return -EINVAL; /* Ensure that only a single signal scope determining flag is set. */ if (hweight32(flags & PIDFD_SEND_SIGNAL_FLAGS) > 1) return -EINVAL; switch (pidfd) { case PIDFD_SELF_THREAD: pid = get_task_pid(current, PIDTYPE_PID); type = PIDTYPE_PID; break; case PIDFD_SELF_THREAD_GROUP: pid = get_task_pid(current, PIDTYPE_TGID); type = PIDTYPE_TGID; break; default: { CLASS(fd, f)(pidfd); if (fd_empty(f)) return -EBADF; /* Is this a pidfd? */ pid = pidfd_to_pid(fd_file(f)); if (IS_ERR(pid)) return PTR_ERR(pid); if (!access_pidfd_pidns(pid)) return -EINVAL; /* Infer scope from the type of pidfd. */ if (fd_file(f)->f_flags & PIDFD_THREAD) type = PIDTYPE_PID; else type = PIDTYPE_TGID; return do_pidfd_send_signal(pid, sig, type, info, flags); } } ret = do_pidfd_send_signal(pid, sig, type, info, flags); put_pid(pid); return ret; } static int do_send_specific(pid_t tgid, pid_t pid, int sig, struct kernel_siginfo *info) { struct task_struct *p; int error = -ESRCH; rcu_read_lock(); p = find_task_by_vpid(pid); if (p && (tgid <= 0 || task_tgid_vnr(p) == tgid)) { error = check_kill_permission(sig, info, p); /* * The null signal is a permissions and process existence * probe. No signal is actually delivered. */ if (!error && sig) { error = do_send_sig_info(sig, info, p, PIDTYPE_PID); /* * If lock_task_sighand() failed we pretend the task * dies after receiving the signal. The window is tiny, * and the signal is private anyway. */ if (unlikely(error == -ESRCH)) error = 0; } } rcu_read_unlock(); return error; } static int do_tkill(pid_t tgid, pid_t pid, int sig) { struct kernel_siginfo info; prepare_kill_siginfo(sig, &info, PIDTYPE_PID); return do_send_specific(tgid, pid, sig, &info); } /** * sys_tgkill - send signal to one specific thread * @tgid: the thread group ID of the thread * @pid: the PID of the thread * @sig: signal to be sent * * This syscall also checks the @tgid and returns -ESRCH even if the PID * exists but it's not belonging to the target process anymore. This * method solves the problem of threads exiting and PIDs getting reused. */ SYSCALL_DEFINE3(tgkill, pid_t, tgid, pid_t, pid, int, sig) { /* This is only valid for single tasks */ if (pid <= 0 || tgid <= 0) return -EINVAL; return do_tkill(tgid, pid, sig); } /** * sys_tkill - send signal to one specific task * @pid: the PID of the task * @sig: signal to be sent * * Send a signal to only one task, even if it's a CLONE_THREAD task. */ SYSCALL_DEFINE2(tkill, pid_t, pid, int, sig) { /* This is only valid for single tasks */ if (pid <= 0) return -EINVAL; return do_tkill(0, pid, sig); } static int do_rt_sigqueueinfo(pid_t pid, int sig, kernel_siginfo_t *info) { /* Not even root can pretend to send signals from the kernel. * Nor can they impersonate a kill()/tgkill(), which adds source info. */ if ((info->si_code >= 0 || info->si_code == SI_TKILL) && (task_pid_vnr(current) != pid)) return -EPERM; /* POSIX.1b doesn't mention process groups. */ return kill_proc_info(sig, info, pid); } /** * sys_rt_sigqueueinfo - send signal information to a signal * @pid: the PID of the thread * @sig: signal to be sent * @uinfo: signal info to be sent */ SYSCALL_DEFINE3(rt_sigqueueinfo, pid_t, pid, int, sig, siginfo_t __user *, uinfo) { kernel_siginfo_t info; int ret = __copy_siginfo_from_user(sig, &info, uinfo); if (unlikely(ret)) return ret; return do_rt_sigqueueinfo(pid, sig, &info); } #ifdef CONFIG_COMPAT COMPAT_SYSCALL_DEFINE3(rt_sigqueueinfo, compat_pid_t, pid, int, sig, struct compat_siginfo __user *, uinfo) { kernel_siginfo_t info; int ret = __copy_siginfo_from_user32(sig, &info, uinfo); if (unlikely(ret)) return ret; return do_rt_sigqueueinfo(pid, sig, &info); } #endif static int do_rt_tgsigqueueinfo(pid_t tgid, pid_t pid, int sig, kernel_siginfo_t *info) { /* This is only valid for single tasks */ if (pid <= 0 || tgid <= 0) return -EINVAL; /* Not even root can pretend to send signals from the kernel. * Nor can they impersonate a kill()/tgkill(), which adds source info. */ if ((info->si_code >= 0 || info->si_code == SI_TKILL) && (task_pid_vnr(current) != pid)) return -EPERM; return do_send_specific(tgid, pid, sig, info); } SYSCALL_DEFINE4(rt_tgsigqueueinfo, pid_t, tgid, pid_t, pid, int, sig, siginfo_t __user *, uinfo) { kernel_siginfo_t info; int ret = __copy_siginfo_from_user(sig, &info, uinfo); if (unlikely(ret)) return ret; return do_rt_tgsigqueueinfo(tgid, pid, sig, &info); } #ifdef CONFIG_COMPAT COMPAT_SYSCALL_DEFINE4(rt_tgsigqueueinfo, compat_pid_t, tgid, compat_pid_t, pid, int, sig, struct compat_siginfo __user *, uinfo) { kernel_siginfo_t info; int ret = __copy_siginfo_from_user32(sig, &info, uinfo); if (unlikely(ret)) return ret; return do_rt_tgsigqueueinfo(tgid, pid, sig, &info); } #endif /* * For kthreads only, must not be used if cloned with CLONE_SIGHAND */ void kernel_sigaction(int sig, __sighandler_t action) { spin_lock_irq(&current->sighand->siglock); current->sighand->action[sig - 1].sa.sa_handler = action; if (action == SIG_IGN) { sigset_t mask; sigemptyset(&mask); sigaddset(&mask, sig); flush_sigqueue_mask(current, &mask, &current->signal->shared_pending); flush_sigqueue_mask(current, &mask, &current->pending); recalc_sigpending(); } spin_unlock_irq(&current->sighand->siglock); } EXPORT_SYMBOL(kernel_sigaction); void __weak sigaction_compat_abi(struct k_sigaction *act, struct k_sigaction *oact) { } int do_sigaction(int sig, struct k_sigaction *act, struct k_sigaction *oact) { struct task_struct *p = current, *t; struct k_sigaction *k; sigset_t mask; if (!valid_signal(sig) || sig < 1 || (act && sig_kernel_only(sig))) return -EINVAL; k = &p->sighand->action[sig-1]; spin_lock_irq(&p->sighand->siglock); if (k->sa.sa_flags & SA_IMMUTABLE) { spin_unlock_irq(&p->sighand->siglock); return -EINVAL; } if (oact) *oact = *k; /* * Make sure that we never accidentally claim to support SA_UNSUPPORTED, * e.g. by having an architecture use the bit in their uapi. */ BUILD_BUG_ON(UAPI_SA_FLAGS & SA_UNSUPPORTED); /* * Clear unknown flag bits in order to allow userspace to detect missing * support for flag bits and to allow the kernel to use non-uapi bits * internally. */ if (act) act->sa.sa_flags &= UAPI_SA_FLAGS; if (oact) oact->sa.sa_flags &= UAPI_SA_FLAGS; sigaction_compat_abi(act, oact); if (act) { bool was_ignored = k->sa.sa_handler == SIG_IGN; sigdelsetmask(&act->sa.sa_mask, sigmask(SIGKILL) | sigmask(SIGSTOP)); *k = *act; /* * POSIX 3.3.1.3: * "Setting a signal action to SIG_IGN for a signal that is * pending shall cause the pending signal to be discarded, * whether or not it is blocked." * * "Setting a signal action to SIG_DFL for a signal that is * pending and whose default action is to ignore the signal * (for example, SIGCHLD), shall cause the pending signal to * be discarded, whether or not it is blocked" */ if (sig_handler_ignored(sig_handler(p, sig), sig)) { sigemptyset(&mask); sigaddset(&mask, sig); flush_sigqueue_mask(p, &mask, &p->signal->shared_pending); for_each_thread(p, t) flush_sigqueue_mask(p, &mask, &t->pending); } else if (was_ignored) { posixtimer_sig_unignore(p, sig); } } spin_unlock_irq(&p->sighand->siglock); return 0; } #ifdef CONFIG_DYNAMIC_SIGFRAME static inline void sigaltstack_lock(void) __acquires(&current->sighand->siglock) { spin_lock_irq(&current->sighand->siglock); } static inline void sigaltstack_unlock(void) __releases(&current->sighand->siglock) { spin_unlock_irq(&current->sighand->siglock); } #else static inline void sigaltstack_lock(void) { } static inline void sigaltstack_unlock(void) { } #endif static int do_sigaltstack (const stack_t *ss, stack_t *oss, unsigned long sp, size_t min_ss_size) { struct task_struct *t = current; int ret = 0; if (oss) { memset(oss, 0, sizeof(stack_t)); oss->ss_sp = (void __user *) t->sas_ss_sp; oss->ss_size = t->sas_ss_size; oss->ss_flags = sas_ss_flags(sp) | (current->sas_ss_flags & SS_FLAG_BITS); } if (ss) { void __user *ss_sp = ss->ss_sp; size_t ss_size = ss->ss_size; unsigned ss_flags = ss->ss_flags; int ss_mode; if (unlikely(on_sig_stack(sp))) return -EPERM; ss_mode = ss_flags & ~SS_FLAG_BITS; if (unlikely(ss_mode != SS_DISABLE && ss_mode != SS_ONSTACK && ss_mode != 0)) return -EINVAL; /* * Return before taking any locks if no actual * sigaltstack changes were requested. */ if (t->sas_ss_sp == (unsigned long)ss_sp && t->sas_ss_size == ss_size && t->sas_ss_flags == ss_flags) return 0; sigaltstack_lock(); if (ss_mode == SS_DISABLE) { ss_size = 0; ss_sp = NULL; } else { if (unlikely(ss_size < min_ss_size)) ret = -ENOMEM; if (!sigaltstack_size_valid(ss_size)) ret = -ENOMEM; } if (!ret) { t->sas_ss_sp = (unsigned long) ss_sp; t->sas_ss_size = ss_size; t->sas_ss_flags = ss_flags; } sigaltstack_unlock(); } return ret; } SYSCALL_DEFINE2(sigaltstack,const stack_t __user *,uss, stack_t __user *,uoss) { stack_t new, old; int err; if (uss && copy_from_user(&new, uss, sizeof(stack_t))) return -EFAULT; err = do_sigaltstack(uss ? &new : NULL, uoss ? &old : NULL, current_user_stack_pointer(), MINSIGSTKSZ); if (!err && uoss && copy_to_user(uoss, &old, sizeof(stack_t))) err = -EFAULT; return err; } int restore_altstack(const stack_t __user *uss) { stack_t new; if (copy_from_user(&new, uss, sizeof(stack_t))) return -EFAULT; (void)do_sigaltstack(&new, NULL, current_user_stack_pointer(), MINSIGSTKSZ); /* squash all but EFAULT for now */ return 0; } int __save_altstack(stack_t __user *uss, unsigned long sp) { struct task_struct *t = current; int err = __put_user((void __user *)t->sas_ss_sp, &uss->ss_sp) | __put_user(t->sas_ss_flags, &uss->ss_flags) | __put_user(t->sas_ss_size, &uss->ss_size); return err; } #ifdef CONFIG_COMPAT static int do_compat_sigaltstack(const compat_stack_t __user *uss_ptr, compat_stack_t __user *uoss_ptr) { stack_t uss, uoss; int ret; if (uss_ptr) { compat_stack_t uss32; if (copy_from_user(&uss32, uss_ptr, sizeof(compat_stack_t))) return -EFAULT; uss.ss_sp = compat_ptr(uss32.ss_sp); uss.ss_flags = uss32.ss_flags; uss.ss_size = uss32.ss_size; } ret = do_sigaltstack(uss_ptr ? &uss : NULL, &uoss, compat_user_stack_pointer(), COMPAT_MINSIGSTKSZ); if (ret >= 0 && uoss_ptr) { compat_stack_t old; memset(&old, 0, sizeof(old)); old.ss_sp = ptr_to_compat(uoss.ss_sp); old.ss_flags = uoss.ss_flags; old.ss_size = uoss.ss_size; if (copy_to_user(uoss_ptr, &old, sizeof(compat_stack_t))) ret = -EFAULT; } return ret; } COMPAT_SYSCALL_DEFINE2(sigaltstack, const compat_stack_t __user *, uss_ptr, compat_stack_t __user *, uoss_ptr) { return do_compat_sigaltstack(uss_ptr, uoss_ptr); } int compat_restore_altstack(const compat_stack_t __user *uss) { int err = do_compat_sigaltstack(uss, NULL); /* squash all but -EFAULT for now */ return err == -EFAULT ? err : 0; } int __compat_save_altstack(compat_stack_t __user *uss, unsigned long sp) { int err; struct task_struct *t = current; err = __put_user(ptr_to_compat((void __user *)t->sas_ss_sp), &uss->ss_sp) | __put_user(t->sas_ss_flags, &uss->ss_flags) | __put_user(t->sas_ss_size, &uss->ss_size); return err; } #endif #ifdef __ARCH_WANT_SYS_SIGPENDING /** * sys_sigpending - examine pending signals * @uset: where mask of pending signal is returned */ SYSCALL_DEFINE1(sigpending, old_sigset_t __user *, uset) { sigset_t set; if (sizeof(old_sigset_t) > sizeof(*uset)) return -EINVAL; do_sigpending(&set); if (copy_to_user(uset, &set, sizeof(old_sigset_t))) return -EFAULT; return 0; } #ifdef CONFIG_COMPAT COMPAT_SYSCALL_DEFINE1(sigpending, compat_old_sigset_t __user *, set32) { sigset_t set; do_sigpending(&set); return put_user(set.sig[0], set32); } #endif #endif #ifdef __ARCH_WANT_SYS_SIGPROCMASK /** * sys_sigprocmask - examine and change blocked signals * @how: whether to add, remove, or set signals * @nset: signals to add or remove (if non-null) * @oset: previous value of signal mask if non-null * * Some platforms have their own version with special arguments; * others support only sys_rt_sigprocmask. */ SYSCALL_DEFINE3(sigprocmask, int, how, old_sigset_t __user *, nset, old_sigset_t __user *, oset) { old_sigset_t old_set, new_set; sigset_t new_blocked; old_set = current->blocked.sig[0]; if (nset) { if (copy_from_user(&new_set, nset, sizeof(*nset))) return -EFAULT; new_blocked = current->blocked; switch (how) { case SIG_BLOCK: sigaddsetmask(&new_blocked, new_set); break; case SIG_UNBLOCK: sigdelsetmask(&new_blocked, new_set); break; case SIG_SETMASK: new_blocked.sig[0] = new_set; break; default: return -EINVAL; } set_current_blocked(&new_blocked); } if (oset) { if (copy_to_user(oset, &old_set, sizeof(*oset))) return -EFAULT; } return 0; } #endif /* __ARCH_WANT_SYS_SIGPROCMASK */ #ifndef CONFIG_ODD_RT_SIGACTION /** * sys_rt_sigaction - alter an action taken by a process * @sig: signal to be sent * @act: new sigaction * @oact: used to save the previous sigaction * @sigsetsize: size of sigset_t type */ SYSCALL_DEFINE4(rt_sigaction, int, sig, const struct sigaction __user *, act, struct sigaction __user *, oact, size_t, sigsetsize) { struct k_sigaction new_sa, old_sa; int ret; /* XXX: Don't preclude handling different sized sigset_t's. */ if (sigsetsize != sizeof(sigset_t)) return -EINVAL; if (act && copy_from_user(&new_sa.sa, act, sizeof(new_sa.sa))) return -EFAULT; ret = do_sigaction(sig, act ? &new_sa : NULL, oact ? &old_sa : NULL); if (ret) return ret; if (oact && copy_to_user(oact, &old_sa.sa, sizeof(old_sa.sa))) return -EFAULT; return 0; } #ifdef CONFIG_COMPAT COMPAT_SYSCALL_DEFINE4(rt_sigaction, int, sig, const struct compat_sigaction __user *, act, struct compat_sigaction __user *, oact, compat_size_t, sigsetsize) { struct k_sigaction new_ka, old_ka; #ifdef __ARCH_HAS_SA_RESTORER compat_uptr_t restorer; #endif int ret; /* XXX: Don't preclude handling different sized sigset_t's. */ if (sigsetsize != sizeof(compat_sigset_t)) return -EINVAL; if (act) { compat_uptr_t handler; ret = get_user(handler, &act->sa_handler); new_ka.sa.sa_handler = compat_ptr(handler); #ifdef __ARCH_HAS_SA_RESTORER ret |= get_user(restorer, &act->sa_restorer); new_ka.sa.sa_restorer = compat_ptr(restorer); #endif ret |= get_compat_sigset(&new_ka.sa.sa_mask, &act->sa_mask); ret |= get_user(new_ka.sa.sa_flags, &act->sa_flags); if (ret) return -EFAULT; } ret = do_sigaction(sig, act ? &new_ka : NULL, oact ? &old_ka : NULL); if (!ret && oact) { ret = put_user(ptr_to_compat(old_ka.sa.sa_handler), &oact->sa_handler); ret |= put_compat_sigset(&oact->sa_mask, &old_ka.sa.sa_mask, sizeof(oact->sa_mask)); ret |= put_user(old_ka.sa.sa_flags, &oact->sa_flags); #ifdef __ARCH_HAS_SA_RESTORER ret |= put_user(ptr_to_compat(old_ka.sa.sa_restorer), &oact->sa_restorer); #endif } return ret; } #endif #endif /* !CONFIG_ODD_RT_SIGACTION */ #ifdef CONFIG_OLD_SIGACTION SYSCALL_DEFINE3(sigaction, int, sig, const struct old_sigaction __user *, act, struct old_sigaction __user *, oact) { struct k_sigaction new_ka, old_ka; int ret; if (act) { old_sigset_t mask; if (!access_ok(act, sizeof(*act)) || __get_user(new_ka.sa.sa_handler, &act->sa_handler) || __get_user(new_ka.sa.sa_restorer, &act->sa_restorer) || __get_user(new_ka.sa.sa_flags, &act->sa_flags) || __get_user(mask, &act->sa_mask)) return -EFAULT; #ifdef __ARCH_HAS_KA_RESTORER new_ka.ka_restorer = NULL; #endif siginitset(&new_ka.sa.sa_mask, mask); } ret = do_sigaction(sig, act ? &new_ka : NULL, oact ? &old_ka : NULL); if (!ret && oact) { if (!access_ok(oact, sizeof(*oact)) || __put_user(old_ka.sa.sa_handler, &oact->sa_handler) || __put_user(old_ka.sa.sa_restorer, &oact->sa_restorer) || __put_user(old_ka.sa.sa_flags, &oact->sa_flags) || __put_user(old_ka.sa.sa_mask.sig[0], &oact->sa_mask)) return -EFAULT; } return ret; } #endif #ifdef CONFIG_COMPAT_OLD_SIGACTION COMPAT_SYSCALL_DEFINE3(sigaction, int, sig, const struct compat_old_sigaction __user *, act, struct compat_old_sigaction __user *, oact) { struct k_sigaction new_ka, old_ka; int ret; compat_old_sigset_t mask; compat_uptr_t handler, restorer; if (act) { if (!access_ok(act, sizeof(*act)) || __get_user(handler, &act->sa_handler) || __get_user(restorer, &act->sa_restorer) || __get_user(new_ka.sa.sa_flags, &act->sa_flags) || __get_user(mask, &act->sa_mask)) return -EFAULT; #ifdef __ARCH_HAS_KA_RESTORER new_ka.ka_restorer = NULL; #endif new_ka.sa.sa_handler = compat_ptr(handler); new_ka.sa.sa_restorer = compat_ptr(restorer); siginitset(&new_ka.sa.sa_mask, mask); } ret = do_sigaction(sig, act ? &new_ka : NULL, oact ? &old_ka : NULL); if (!ret && oact) { if (!access_ok(oact, sizeof(*oact)) || __put_user(ptr_to_compat(old_ka.sa.sa_handler), &oact->sa_handler) || __put_user(ptr_to_compat(old_ka.sa.sa_restorer), &oact->sa_restorer) || __put_user(old_ka.sa.sa_flags, &oact->sa_flags) || __put_user(old_ka.sa.sa_mask.sig[0], &oact->sa_mask)) return -EFAULT; } return ret; } #endif #ifdef CONFIG_SGETMASK_SYSCALL /* * For backwards compatibility. Functionality superseded by sigprocmask. */ SYSCALL_DEFINE0(sgetmask) { /* SMP safe */ return current->blocked.sig[0]; } SYSCALL_DEFINE1(ssetmask, int, newmask) { int old = current->blocked.sig[0]; sigset_t newset; siginitset(&newset, newmask); set_current_blocked(&newset); return old; } #endif /* CONFIG_SGETMASK_SYSCALL */ #ifdef __ARCH_WANT_SYS_SIGNAL /* * For backwards compatibility. Functionality superseded by sigaction. */ SYSCALL_DEFINE2(signal, int, sig, __sighandler_t, handler) { struct k_sigaction new_sa, old_sa; int ret; new_sa.sa.sa_handler = handler; new_sa.sa.sa_flags = SA_ONESHOT | SA_NOMASK; sigemptyset(&new_sa.sa.sa_mask); ret = do_sigaction(sig, &new_sa, &old_sa); return ret ? ret : (unsigned long)old_sa.sa.sa_handler; } #endif /* __ARCH_WANT_SYS_SIGNAL */ #ifdef __ARCH_WANT_SYS_PAUSE SYSCALL_DEFINE0(pause) { while (!signal_pending(current)) { __set_current_state(TASK_INTERRUPTIBLE); schedule(); } return -ERESTARTNOHAND; } #endif static int sigsuspend(sigset_t *set) { current->saved_sigmask = current->blocked; set_current_blocked(set); while (!signal_pending(current)) { __set_current_state(TASK_INTERRUPTIBLE); schedule(); } set_restore_sigmask(); return -ERESTARTNOHAND; } /** * sys_rt_sigsuspend - replace the signal mask for a value with the * @unewset value until a signal is received * @unewset: new signal mask value * @sigsetsize: size of sigset_t type */ SYSCALL_DEFINE2(rt_sigsuspend, sigset_t __user *, unewset, size_t, sigsetsize) { sigset_t newset; /* XXX: Don't preclude handling different sized sigset_t's. */ if (sigsetsize != sizeof(sigset_t)) return -EINVAL; if (copy_from_user(&newset, unewset, sizeof(newset))) return -EFAULT; return sigsuspend(&newset); } #ifdef CONFIG_COMPAT COMPAT_SYSCALL_DEFINE2(rt_sigsuspend, compat_sigset_t __user *, unewset, compat_size_t, sigsetsize) { sigset_t newset; /* XXX: Don't preclude handling different sized sigset_t's. */ if (sigsetsize != sizeof(sigset_t)) return -EINVAL; if (get_compat_sigset(&newset, unewset)) return -EFAULT; return sigsuspend(&newset); } #endif #ifdef CONFIG_OLD_SIGSUSPEND SYSCALL_DEFINE1(sigsuspend, old_sigset_t, mask) { sigset_t blocked; siginitset(&blocked, mask); return sigsuspend(&blocked); } #endif #ifdef CONFIG_OLD_SIGSUSPEND3 SYSCALL_DEFINE3(sigsuspend, int, unused1, int, unused2, old_sigset_t, mask) { sigset_t blocked; siginitset(&blocked, mask); return sigsuspend(&blocked); } #endif __weak const char *arch_vma_name(struct vm_area_struct *vma) { return NULL; } static inline void siginfo_buildtime_checks(void) { BUILD_BUG_ON(sizeof(struct siginfo) != SI_MAX_SIZE); /* Verify the offsets in the two siginfos match */ #define CHECK_OFFSET(field) \ BUILD_BUG_ON(offsetof(siginfo_t, field) != offsetof(kernel_siginfo_t, field)) /* kill */ CHECK_OFFSET(si_pid); CHECK_OFFSET(si_uid); /* timer */ CHECK_OFFSET(si_tid); CHECK_OFFSET(si_overrun); CHECK_OFFSET(si_value); /* rt */ CHECK_OFFSET(si_pid); CHECK_OFFSET(si_uid); CHECK_OFFSET(si_value); /* sigchld */ CHECK_OFFSET(si_pid); CHECK_OFFSET(si_uid); CHECK_OFFSET(si_status); CHECK_OFFSET(si_utime); CHECK_OFFSET(si_stime); /* sigfault */ CHECK_OFFSET(si_addr); CHECK_OFFSET(si_trapno); CHECK_OFFSET(si_addr_lsb); CHECK_OFFSET(si_lower); CHECK_OFFSET(si_upper); CHECK_OFFSET(si_pkey); CHECK_OFFSET(si_perf_data); CHECK_OFFSET(si_perf_type); CHECK_OFFSET(si_perf_flags); /* sigpoll */ CHECK_OFFSET(si_band); CHECK_OFFSET(si_fd); /* sigsys */ CHECK_OFFSET(si_call_addr); CHECK_OFFSET(si_syscall); CHECK_OFFSET(si_arch); #undef CHECK_OFFSET /* usb asyncio */ BUILD_BUG_ON(offsetof(struct siginfo, si_pid) != offsetof(struct siginfo, si_addr)); if (sizeof(int) == sizeof(void __user *)) { BUILD_BUG_ON(sizeof_field(struct siginfo, si_pid) != sizeof(void __user *)); } else { BUILD_BUG_ON((sizeof_field(struct siginfo, si_pid) + sizeof_field(struct siginfo, si_uid)) != sizeof(void __user *)); BUILD_BUG_ON(offsetofend(struct siginfo, si_pid) != offsetof(struct siginfo, si_uid)); } #ifdef CONFIG_COMPAT BUILD_BUG_ON(offsetof(struct compat_siginfo, si_pid) != offsetof(struct compat_siginfo, si_addr)); BUILD_BUG_ON(sizeof_field(struct compat_siginfo, si_pid) != sizeof(compat_uptr_t)); BUILD_BUG_ON(sizeof_field(struct compat_siginfo, si_pid) != sizeof_field(struct siginfo, si_pid)); #endif } #if defined(CONFIG_SYSCTL) static const struct ctl_table signal_debug_table[] = { #ifdef CONFIG_SYSCTL_EXCEPTION_TRACE { .procname = "exception-trace", .data = &show_unhandled_signals, .maxlen = sizeof(int), .mode = 0644, .proc_handler = proc_dointvec }, #endif }; static const struct ctl_table signal_table[] = { { .procname = "print-fatal-signals", .data = &print_fatal_signals, .maxlen = sizeof(int), .mode = 0644, .proc_handler = proc_dointvec, }, }; static int __init init_signal_sysctls(void) { register_sysctl_init("debug", signal_debug_table); register_sysctl_init("kernel", signal_table); return 0; } early_initcall(init_signal_sysctls); #endif /* CONFIG_SYSCTL */ void __init signals_init(void) { siginfo_buildtime_checks(); sigqueue_cachep = KMEM_CACHE(sigqueue, SLAB_PANIC | SLAB_ACCOUNT); } #ifdef CONFIG_KGDB_KDB #include <linux/kdb.h> /* * kdb_send_sig - Allows kdb to send signals without exposing * signal internals. This function checks if the required locks are * available before calling the main signal code, to avoid kdb * deadlocks. */ void kdb_send_sig(struct task_struct *t, int sig) { static struct task_struct *kdb_prev_t; int new_t, ret; if (!spin_trylock(&t->sighand->siglock)) { kdb_printf("Can't do kill command now.\n" "The sigmask lock is held somewhere else in " "kernel, try again later\n"); return; } new_t = kdb_prev_t != t; kdb_prev_t = t; if (!task_is_running(t) && new_t) { spin_unlock(&t->sighand->siglock); kdb_printf("Process is not RUNNING, sending a signal from " "kdb risks deadlock\n" "on the run queue locks. " "The signal has _not_ been sent.\n" "Reissue the kill command if you want to risk " "the deadlock.\n"); return; } ret = send_signal_locked(sig, SEND_SIG_PRIV, t, PIDTYPE_PID); spin_unlock(&t->sighand->siglock); if (ret) kdb_printf("Fail to deliver Signal %d to process %d.\n", sig, t->pid); else kdb_printf("Signal %d is sent to process %d.\n", sig, t->pid); } #endif /* CONFIG_KGDB_KDB */
27 27 27 27 478 476 27 29 28 28 28 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 // SPDX-License-Identifier: GPL-2.0-only /* * Copyright (c) 2013 Red Hat, Inc. and Parallels Inc. All rights reserved. * Authors: David Chinner and Glauber Costa * * Generic LRU infrastructure */ #include <linux/kernel.h> #include <linux/module.h> #include <linux/mm.h> #include <linux/list_lru.h> #include <linux/slab.h> #include <linux/mutex.h> #include <linux/memcontrol.h> #include "slab.h" #include "internal.h" #ifdef CONFIG_MEMCG static LIST_HEAD(memcg_list_lrus); static DEFINE_MUTEX(list_lrus_mutex); static inline bool list_lru_memcg_aware(struct list_lru *lru) { return lru->memcg_aware; } static void list_lru_register(struct list_lru *lru) { if (!list_lru_memcg_aware(lru)) return; mutex_lock(&list_lrus_mutex); list_add(&lru->list, &memcg_list_lrus); mutex_unlock(&list_lrus_mutex); } static void list_lru_unregister(struct list_lru *lru) { if (!list_lru_memcg_aware(lru)) return; mutex_lock(&list_lrus_mutex); list_del(&lru->list); mutex_unlock(&list_lrus_mutex); } static int lru_shrinker_id(struct list_lru *lru) { return lru->shrinker_id; } static inline struct list_lru_one * list_lru_from_memcg_idx(struct list_lru *lru, int nid, int idx) { if (list_lru_memcg_aware(lru) && idx >= 0) { struct list_lru_memcg *mlru = xa_load(&lru->xa, idx); return mlru ? &mlru->node[nid] : NULL; } return &lru->node[nid].lru; } static inline bool lock_list_lru(struct list_lru_one *l, bool irq) { if (irq) spin_lock_irq(&l->lock); else spin_lock(&l->lock); if (unlikely(READ_ONCE(l->nr_items) == LONG_MIN)) { if (irq) spin_unlock_irq(&l->lock); else spin_unlock(&l->lock); return false; } return true; } static inline struct list_lru_one * lock_list_lru_of_memcg(struct list_lru *lru, int nid, struct mem_cgroup *memcg, bool irq, bool skip_empty) { struct list_lru_one *l; rcu_read_lock(); again: l = list_lru_from_memcg_idx(lru, nid, memcg_kmem_id(memcg)); if (likely(l) && lock_list_lru(l, irq)) { rcu_read_unlock(); return l; } /* * Caller may simply bail out if raced with reparenting or * may iterate through the list_lru and expect empty slots. */ if (skip_empty) { rcu_read_unlock(); return NULL; } VM_WARN_ON(!css_is_dying(&memcg->css)); memcg = parent_mem_cgroup(memcg); goto again; } static inline void unlock_list_lru(struct list_lru_one *l, bool irq_off) { if (irq_off) spin_unlock_irq(&l->lock); else spin_unlock(&l->lock); } #else static void list_lru_register(struct list_lru *lru) { } static void list_lru_unregister(struct list_lru *lru) { } static int lru_shrinker_id(struct list_lru *lru) { return -1; } static inline bool list_lru_memcg_aware(struct list_lru *lru) { return false; } static inline struct list_lru_one * list_lru_from_memcg_idx(struct list_lru *lru, int nid, int idx) { return &lru->node[nid].lru; } static inline struct list_lru_one * lock_list_lru_of_memcg(struct list_lru *lru, int nid, struct mem_cgroup *memcg, bool irq, bool skip_empty) { struct list_lru_one *l = &lru->node[nid].lru; if (irq) spin_lock_irq(&l->lock); else spin_lock(&l->lock); return l; } static inline void unlock_list_lru(struct list_lru_one *l, bool irq_off) { if (irq_off) spin_unlock_irq(&l->lock); else spin_unlock(&l->lock); } #endif /* CONFIG_MEMCG */ /* The caller must ensure the memcg lifetime. */ bool list_lru_add(struct list_lru *lru, struct list_head *item, int nid, struct mem_cgroup *memcg) { struct list_lru_node *nlru = &lru->node[nid]; struct list_lru_one *l; l = lock_list_lru_of_memcg(lru, nid, memcg, false, false); if (!l) return false; if (list_empty(item)) { list_add_tail(item, &l->list); /* Set shrinker bit if the first element was added */ if (!l->nr_items++) set_shrinker_bit(memcg, nid, lru_shrinker_id(lru)); unlock_list_lru(l, false); atomic_long_inc(&nlru->nr_items); return true; } unlock_list_lru(l, false); return false; } bool list_lru_add_obj(struct list_lru *lru, struct list_head *item) { bool ret; int nid = page_to_nid(virt_to_page(item)); if (list_lru_memcg_aware(lru)) { rcu_read_lock(); ret = list_lru_add(lru, item, nid, mem_cgroup_from_slab_obj(item)); rcu_read_unlock(); } else { ret = list_lru_add(lru, item, nid, NULL); } return ret; } EXPORT_SYMBOL_GPL(list_lru_add_obj); /* The caller must ensure the memcg lifetime. */ bool list_lru_del(struct list_lru *lru, struct list_head *item, int nid, struct mem_cgroup *memcg) { struct list_lru_node *nlru = &lru->node[nid]; struct list_lru_one *l; l = lock_list_lru_of_memcg(lru, nid, memcg, false, false); if (!l) return false; if (!list_empty(item)) { list_del_init(item); l->nr_items--; unlock_list_lru(l, false); atomic_long_dec(&nlru->nr_items); return true; } unlock_list_lru(l, false); return false; } bool list_lru_del_obj(struct list_lru *lru, struct list_head *item) { bool ret; int nid = page_to_nid(virt_to_page(item)); if (list_lru_memcg_aware(lru)) { rcu_read_lock(); ret = list_lru_del(lru, item, nid, mem_cgroup_from_slab_obj(item)); rcu_read_unlock(); } else { ret = list_lru_del(lru, item, nid, NULL); } return ret; } EXPORT_SYMBOL_GPL(list_lru_del_obj); void list_lru_isolate(struct list_lru_one *list, struct list_head *item) { list_del_init(item); list->nr_items--; } EXPORT_SYMBOL_GPL(list_lru_isolate); void list_lru_isolate_move(struct list_lru_one *list, struct list_head *item, struct list_head *head) { list_move(item, head); list->nr_items--; } EXPORT_SYMBOL_GPL(list_lru_isolate_move); unsigned long list_lru_count_one(struct list_lru *lru, int nid, struct mem_cgroup *memcg) { struct list_lru_one *l; long count; rcu_read_lock(); l = list_lru_from_memcg_idx(lru, nid, memcg_kmem_id(memcg)); count = l ? READ_ONCE(l->nr_items) : 0; rcu_read_unlock(); if (unlikely(count < 0)) count = 0; return count; } EXPORT_SYMBOL_GPL(list_lru_count_one); unsigned long list_lru_count_node(struct list_lru *lru, int nid) { struct list_lru_node *nlru; nlru = &lru->node[nid]; return atomic_long_read(&nlru->nr_items); } EXPORT_SYMBOL_GPL(list_lru_count_node); static unsigned long __list_lru_walk_one(struct list_lru *lru, int nid, struct mem_cgroup *memcg, list_lru_walk_cb isolate, void *cb_arg, unsigned long *nr_to_walk, bool irq_off) { struct list_lru_node *nlru = &lru->node[nid]; struct list_lru_one *l = NULL; struct list_head *item, *n; unsigned long isolated = 0; restart: l = lock_list_lru_of_memcg(lru, nid, memcg, irq_off, true); if (!l) return isolated; list_for_each_safe(item, n, &l->list) { enum lru_status ret; /* * decrement nr_to_walk first so that we don't livelock if we * get stuck on large numbers of LRU_RETRY items */ if (!*nr_to_walk) break; --*nr_to_walk; ret = isolate(item, l, cb_arg); switch (ret) { /* * LRU_RETRY, LRU_REMOVED_RETRY and LRU_STOP will drop the lru * lock. List traversal will have to restart from scratch. */ case LRU_RETRY: goto restart; case LRU_REMOVED_RETRY: fallthrough; case LRU_REMOVED: isolated++; atomic_long_dec(&nlru->nr_items); if (ret == LRU_REMOVED_RETRY) goto restart; break; case LRU_ROTATE: list_move_tail(item, &l->list); break; case LRU_SKIP: break; case LRU_STOP: goto out; default: BUG(); } } unlock_list_lru(l, irq_off); out: return isolated; } unsigned long list_lru_walk_one(struct list_lru *lru, int nid, struct mem_cgroup *memcg, list_lru_walk_cb isolate, void *cb_arg, unsigned long *nr_to_walk) { return __list_lru_walk_one(lru, nid, memcg, isolate, cb_arg, nr_to_walk, false); } EXPORT_SYMBOL_GPL(list_lru_walk_one); unsigned long list_lru_walk_one_irq(struct list_lru *lru, int nid, struct mem_cgroup *memcg, list_lru_walk_cb isolate, void *cb_arg, unsigned long *nr_to_walk) { return __list_lru_walk_one(lru, nid, memcg, isolate, cb_arg, nr_to_walk, true); } unsigned long list_lru_walk_node(struct list_lru *lru, int nid, list_lru_walk_cb isolate, void *cb_arg, unsigned long *nr_to_walk) { long isolated = 0; isolated += list_lru_walk_one(lru, nid, NULL, isolate, cb_arg, nr_to_walk); #ifdef CONFIG_MEMCG if (*nr_to_walk > 0 && list_lru_memcg_aware(lru)) { struct list_lru_memcg *mlru; struct mem_cgroup *memcg; unsigned long index; xa_for_each(&lru->xa, index, mlru) { rcu_read_lock(); memcg = mem_cgroup_from_id(index); if (!mem_cgroup_tryget(memcg)) { rcu_read_unlock(); continue; } rcu_read_unlock(); isolated += __list_lru_walk_one(lru, nid, memcg, isolate, cb_arg, nr_to_walk, false); mem_cgroup_put(memcg); if (*nr_to_walk <= 0) break; } } #endif return isolated; } EXPORT_SYMBOL_GPL(list_lru_walk_node); static void init_one_lru(struct list_lru *lru, struct list_lru_one *l) { INIT_LIST_HEAD(&l->list); spin_lock_init(&l->lock); l->nr_items = 0; #ifdef CONFIG_LOCKDEP if (lru->key) lockdep_set_class(&l->lock, lru->key); #endif } #ifdef CONFIG_MEMCG static struct list_lru_memcg *memcg_init_list_lru_one(struct list_lru *lru, gfp_t gfp) { int nid; struct list_lru_memcg *mlru; mlru = kmalloc(struct_size(mlru, node, nr_node_ids), gfp); if (!mlru) return NULL; for_each_node(nid) init_one_lru(lru, &mlru->node[nid]); return mlru; } static inline void memcg_init_list_lru(struct list_lru *lru, bool memcg_aware) { if (memcg_aware) xa_init_flags(&lru->xa, XA_FLAGS_LOCK_IRQ); lru->memcg_aware = memcg_aware; } static void memcg_destroy_list_lru(struct list_lru *lru) { XA_STATE(xas, &lru->xa, 0); struct list_lru_memcg *mlru; if (!list_lru_memcg_aware(lru)) return; xas_lock_irq(&xas); xas_for_each(&xas, mlru, ULONG_MAX) { kfree(mlru); xas_store(&xas, NULL); } xas_unlock_irq(&xas); } static void memcg_reparent_list_lru_one(struct list_lru *lru, int nid, struct list_lru_one *src, struct mem_cgroup *dst_memcg) { int dst_idx = dst_memcg->kmemcg_id; struct list_lru_one *dst; spin_lock_irq(&src->lock); dst = list_lru_from_memcg_idx(lru, nid, dst_idx); spin_lock_nested(&dst->lock, SINGLE_DEPTH_NESTING); list_splice_init(&src->list, &dst->list); if (src->nr_items) { WARN_ON(src->nr_items < 0); dst->nr_items += src->nr_items; set_shrinker_bit(dst_memcg, nid, lru_shrinker_id(lru)); } /* Mark the list_lru_one dead */ src->nr_items = LONG_MIN; spin_unlock(&dst->lock); spin_unlock_irq(&src->lock); } void memcg_reparent_list_lrus(struct mem_cgroup *memcg, struct mem_cgroup *parent) { struct list_lru *lru; int i; mutex_lock(&list_lrus_mutex); list_for_each_entry(lru, &memcg_list_lrus, list) { struct list_lru_memcg *mlru; XA_STATE(xas, &lru->xa, memcg->kmemcg_id); /* * Lock the Xarray to ensure no on going list_lru_memcg * allocation and further allocation will see css_is_dying(). */ xas_lock_irq(&xas); mlru = xas_store(&xas, NULL); xas_unlock_irq(&xas); if (!mlru) continue; /* * With Xarray value set to NULL, holding the lru lock below * prevents list_lru_{add,del,isolate} from touching the lru, * safe to reparent. */ for_each_node(i) memcg_reparent_list_lru_one(lru, i, &mlru->node[i], parent); /* * Here all list_lrus corresponding to the cgroup are guaranteed * to remain empty, we can safely free this lru, any further * memcg_list_lru_alloc() call will simply bail out. */ kvfree_rcu(mlru, rcu); } mutex_unlock(&list_lrus_mutex); } static inline bool memcg_list_lru_allocated(struct mem_cgroup *memcg, struct list_lru *lru) { int idx = memcg->kmemcg_id; return idx < 0 || xa_load(&lru->xa, idx); } int memcg_list_lru_alloc(struct mem_cgroup *memcg, struct list_lru *lru, gfp_t gfp) { unsigned long flags; struct list_lru_memcg *mlru = NULL; struct mem_cgroup *pos, *parent; XA_STATE(xas, &lru->xa, 0); if (!list_lru_memcg_aware(lru) || memcg_list_lru_allocated(memcg, lru)) return 0; gfp &= GFP_RECLAIM_MASK; /* * Because the list_lru can be reparented to the parent cgroup's * list_lru, we should make sure that this cgroup and all its * ancestors have allocated list_lru_memcg. */ do { /* * Keep finding the farest parent that wasn't populated * until found memcg itself. */ pos = memcg; parent = parent_mem_cgroup(pos); while (!memcg_list_lru_allocated(parent, lru)) { pos = parent; parent = parent_mem_cgroup(pos); } if (!mlru) { mlru = memcg_init_list_lru_one(lru, gfp); if (!mlru) return -ENOMEM; } xas_set(&xas, pos->kmemcg_id); do { xas_lock_irqsave(&xas, flags); if (!xas_load(&xas) && !css_is_dying(&pos->css)) { xas_store(&xas, mlru); if (!xas_error(&xas)) mlru = NULL; } xas_unlock_irqrestore(&xas, flags); } while (xas_nomem(&xas, gfp)); } while (pos != memcg && !css_is_dying(&pos->css)); if (unlikely(mlru)) kfree(mlru); return xas_error(&xas); } #else static inline void memcg_init_list_lru(struct list_lru *lru, bool memcg_aware) { } static void memcg_destroy_list_lru(struct list_lru *lru) { } #endif /* CONFIG_MEMCG */ int __list_lru_init(struct list_lru *lru, bool memcg_aware, struct shrinker *shrinker) { int i; #ifdef CONFIG_MEMCG if (shrinker) lru->shrinker_id = shrinker->id; else lru->shrinker_id = -1; if (mem_cgroup_kmem_disabled()) memcg_aware = false; #endif lru->node = kcalloc(nr_node_ids, sizeof(*lru->node), GFP_KERNEL); if (!lru->node) return -ENOMEM; for_each_node(i) init_one_lru(lru, &lru->node[i].lru); memcg_init_list_lru(lru, memcg_aware); list_lru_register(lru); return 0; } EXPORT_SYMBOL_GPL(__list_lru_init); void list_lru_destroy(struct list_lru *lru) { /* Already destroyed or not yet initialized? */ if (!lru->node) return; list_lru_unregister(lru); memcg_destroy_list_lru(lru); kfree(lru->node); lru->node = NULL; #ifdef CONFIG_MEMCG lru->shrinker_id = -1; #endif } EXPORT_SYMBOL_GPL(list_lru_destroy);
12 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 /* * linux/include/linux/console.h * * Copyright (C) 1993 Hamish Macdonald * * This file is subject to the terms and conditions of the GNU General Public * License. See the file COPYING in the main directory of this archive * for more details. * * Changed: * 10-Mar-94: Arno Griffioen: Conversion for vt100 emulator port from PC LINUX */ #ifndef _LINUX_CONSOLE_H_ #define _LINUX_CONSOLE_H_ 1 #include <linux/atomic.h> #include <linux/bits.h> #include <linux/irq_work.h> #include <linux/rculist.h> #include <linux/rcuwait.h> #include <linux/types.h> #include <linux/vesa.h> struct vc_data; struct console_font_op; struct console_font; struct module; struct tty_struct; struct notifier_block; enum con_scroll { SM_UP, SM_DOWN, }; enum vc_intensity; /** * struct consw - callbacks for consoles * * @owner: the module to get references of when this console is used * @con_startup: set up the console and return its name (like VGA, EGA, ...) * @con_init: initialize the console on @vc. @init is true for the very first * call on this @vc. * @con_deinit: deinitialize the console from @vc. * @con_clear: erase @count characters at [@x, @y] on @vc. @count >= 1. * @con_putc: emit one character with attributes @ca to [@x, @y] on @vc. * (optional -- @con_putcs would be called instead) * @con_putcs: emit @count characters with attributes @s to [@x, @y] on @vc. * @con_cursor: enable/disable cursor depending on @enable * @con_scroll: move lines from @top to @bottom in direction @dir by @lines. * Return true if no generic handling should be done. * Invoked by csi_M and printing to the console. * @con_switch: notifier about the console switch; it is supposed to return * true if a redraw is needed. * @con_blank: blank/unblank the console. The target mode is passed in @blank. * @mode_switch is set if changing from/to text/graphics. The hook * is supposed to return true if a redraw is needed. * @con_font_set: set console @vc font to @font with height @vpitch. @flags can * be %KD_FONT_FLAG_DONT_RECALC. (optional) * @con_font_get: fetch the current font on @vc of height @vpitch into @font. * (optional) * @con_font_default: set default font on @vc. @name can be %NULL or font name * to search for. @font can be filled back. (optional) * @con_resize: resize the @vc console to @width x @height. @from_user is true * when this change comes from the user space. * @con_set_palette: sets the palette of the console @vc to @table (optional) * @con_scrolldelta: the contents of the console should be scrolled by @lines. * Invoked by user. (optional) * @con_set_origin: set origin (see &vc_data::vc_origin) of the @vc. If not * provided or returns false, the origin is set to * @vc->vc_screenbuf. (optional) * @con_save_screen: save screen content into @vc->vc_screenbuf. Called e.g. * upon entering graphics. (optional) * @con_build_attr: build attributes based on @color, @intensity and other * parameters. The result is used for both normal and erase * characters. (optional) * @con_invert_region: invert a region of length @count on @vc starting at @p. * (optional) * @con_debug_enter: prepare the console for the debugger. This includes, but * is not limited to, unblanking the console, loading an * appropriate palette, and allowing debugger generated output. * (optional) * @con_debug_leave: restore the console to its pre-debug state as closely as * possible. (optional) */ struct consw { struct module *owner; const char *(*con_startup)(void); void (*con_init)(struct vc_data *vc, bool init); void (*con_deinit)(struct vc_data *vc); void (*con_clear)(struct vc_data *vc, unsigned int y, unsigned int x, unsigned int count); void (*con_putc)(struct vc_data *vc, u16 ca, unsigned int y, unsigned int x); void (*con_putcs)(struct vc_data *vc, const u16 *s, unsigned int count, unsigned int ypos, unsigned int xpos); void (*con_cursor)(struct vc_data *vc, bool enable); bool (*con_scroll)(struct vc_data *vc, unsigned int top, unsigned int bottom, enum con_scroll dir, unsigned int lines); bool (*con_switch)(struct vc_data *vc); bool (*con_blank)(struct vc_data *vc, enum vesa_blank_mode blank, bool mode_switch); int (*con_font_set)(struct vc_data *vc, const struct console_font *font, unsigned int vpitch, unsigned int flags); int (*con_font_get)(struct vc_data *vc, struct console_font *font, unsigned int vpitch); int (*con_font_default)(struct vc_data *vc, struct console_font *font, const char *name); int (*con_resize)(struct vc_data *vc, unsigned int width, unsigned int height, bool from_user); void (*con_set_palette)(struct vc_data *vc, const unsigned char *table); void (*con_scrolldelta)(struct vc_data *vc, int lines); bool (*con_set_origin)(struct vc_data *vc); void (*con_save_screen)(struct vc_data *vc); u8 (*con_build_attr)(struct vc_data *vc, u8 color, enum vc_intensity intensity, bool blink, bool underline, bool reverse, bool italic); void (*con_invert_region)(struct vc_data *vc, u16 *p, int count); void (*con_debug_enter)(struct vc_data *vc); void (*con_debug_leave)(struct vc_data *vc); }; extern const struct consw *conswitchp; extern const struct consw dummy_con; /* dummy console buffer */ extern const struct consw vga_con; /* VGA text console */ extern const struct consw newport_con; /* SGI Newport console */ struct screen_info; #ifdef CONFIG_VGA_CONSOLE void vgacon_register_screen(struct screen_info *si); #else static inline void vgacon_register_screen(struct screen_info *si) { } #endif int con_is_bound(const struct consw *csw); int do_unregister_con_driver(const struct consw *csw); int do_take_over_console(const struct consw *sw, int first, int last, int deflt); void give_up_console(const struct consw *sw); #ifdef CONFIG_VT void con_debug_enter(struct vc_data *vc); void con_debug_leave(void); #else static inline void con_debug_enter(struct vc_data *vc) { } static inline void con_debug_leave(void) { } #endif /* * The interface for a console, or any other device that wants to capture * console messages (printer driver?) */ /** * enum cons_flags - General console flags * @CON_PRINTBUFFER: Used by newly registered consoles to avoid duplicate * output of messages that were already shown by boot * consoles or read by userspace via syslog() syscall. * @CON_CONSDEV: Indicates that the console driver is backing * /dev/console. * @CON_ENABLED: Indicates if a console is allowed to print records. If * false, the console also will not advance to later * records. * @CON_BOOT: Marks the console driver as early console driver which * is used during boot before the real driver becomes * available. It will be automatically unregistered * when the real console driver is registered unless * "keep_bootcon" parameter is used. * @CON_ANYTIME: A misnomed historical flag which tells the core code * that the legacy @console::write callback can be invoked * on a CPU which is marked OFFLINE. That is misleading as * it suggests that there is no contextual limit for * invoking the callback. The original motivation was * readiness of the per-CPU areas. * @CON_BRL: Indicates a braille device which is exempt from * receiving the printk spam for obvious reasons. * @CON_EXTENDED: The console supports the extended output format of * /dev/kmesg which requires a larger output buffer. * @CON_SUSPENDED: Indicates if a console is suspended. If true, the * printing callbacks must not be called. * @CON_NBCON: Console can operate outside of the legacy style console_lock * constraints. */ enum cons_flags { CON_PRINTBUFFER = BIT(0), CON_CONSDEV = BIT(1), CON_ENABLED = BIT(2), CON_BOOT = BIT(3), CON_ANYTIME = BIT(4), CON_BRL = BIT(5), CON_EXTENDED = BIT(6), CON_SUSPENDED = BIT(7), CON_NBCON = BIT(8), }; /** * struct nbcon_state - console state for nbcon consoles * @atom: Compound of the state fields for atomic operations * * @req_prio: The priority of a handover request * @prio: The priority of the current owner * @unsafe: Console is busy in a non takeover region * @unsafe_takeover: A hostile takeover in an unsafe state happened in the * past. The console cannot be safe until re-initialized. * @cpu: The CPU on which the owner runs * * To be used for reading and preparing of the value stored in the nbcon * state variable @console::nbcon_state. * * The @prio and @req_prio fields are particularly important to allow * spin-waiting to timeout and give up without the risk of a waiter being * assigned the lock after giving up. */ struct nbcon_state { union { unsigned int atom; struct { unsigned int prio : 2; unsigned int req_prio : 2; unsigned int unsafe : 1; unsigned int unsafe_takeover : 1; unsigned int cpu : 24; }; }; }; /* * The nbcon_state struct is used to easily create and interpret values that * are stored in the @console::nbcon_state variable. Ensure this struct stays * within the size boundaries of the atomic variable's underlying type in * order to avoid any accidental truncation. */ static_assert(sizeof(struct nbcon_state) <= sizeof(int)); /** * enum nbcon_prio - console owner priority for nbcon consoles * @NBCON_PRIO_NONE: Unused * @NBCON_PRIO_NORMAL: Normal (non-emergency) usage * @NBCON_PRIO_EMERGENCY: Emergency output (WARN/OOPS...) * @NBCON_PRIO_PANIC: Panic output * @NBCON_PRIO_MAX: The number of priority levels * * A higher priority context can takeover the console when it is * in the safe state. The final attempt to flush consoles in panic() * can be allowed to do so even in an unsafe state (Hope and pray). */ enum nbcon_prio { NBCON_PRIO_NONE = 0, NBCON_PRIO_NORMAL, NBCON_PRIO_EMERGENCY, NBCON_PRIO_PANIC, NBCON_PRIO_MAX, }; struct console; struct printk_buffers; /** * struct nbcon_context - Context for console acquire/release * @console: The associated console * @spinwait_max_us: Limit for spin-wait acquire * @prio: Priority of the context * @allow_unsafe_takeover: Allow performing takeover even if unsafe. Can * be used only with NBCON_PRIO_PANIC @prio. It * might cause a system freeze when the console * is used later. * @backlog: Ringbuffer has pending records * @pbufs: Pointer to the text buffer for this context * @seq: The sequence number to print for this context */ struct nbcon_context { /* members set by caller */ struct console *console; unsigned int spinwait_max_us; enum nbcon_prio prio; unsigned int allow_unsafe_takeover : 1; /* members set by emit */ unsigned int backlog : 1; /* members set by acquire */ struct printk_buffers *pbufs; u64 seq; }; /** * struct nbcon_write_context - Context handed to the nbcon write callbacks * @ctxt: The core console context * @outbuf: Pointer to the text buffer for output * @len: Length to write * @unsafe_takeover: If a hostile takeover in an unsafe state has occurred */ struct nbcon_write_context { struct nbcon_context __private ctxt; char *outbuf; unsigned int len; bool unsafe_takeover; }; /** * struct console - The console descriptor structure * @name: The name of the console driver * @write: Legacy write callback to output messages (Optional) * @read: Read callback for console input (Optional) * @device: The underlying TTY device driver (Optional) * @unblank: Callback to unblank the console (Optional) * @setup: Callback for initializing the console (Optional) * @exit: Callback for teardown of the console (Optional) * @match: Callback for matching a console (Optional) * @flags: Console flags. See enum cons_flags * @index: Console index, e.g. port number * @cflag: TTY control mode flags * @ispeed: TTY input speed * @ospeed: TTY output speed * @seq: Sequence number of the next ringbuffer record to print * @dropped: Number of unreported dropped ringbuffer records * @data: Driver private data * @node: hlist node for the console list * * @nbcon_state: State for nbcon consoles * @nbcon_seq: Sequence number of the next record for nbcon to print * @nbcon_device_ctxt: Context available for non-printing operations * @nbcon_prev_seq: Seq num the previous nbcon owner was assigned to print * @pbufs: Pointer to nbcon private buffer * @kthread: Printer kthread for this console * @rcuwait: RCU-safe wait object for @kthread waking * @irq_work: Defer @kthread waking to IRQ work context */ struct console { char name[16]; void (*write)(struct console *co, const char *s, unsigned int count); int (*read)(struct console *co, char *s, unsigned int count); struct tty_driver *(*device)(struct console *co, int *index); void (*unblank)(void); int (*setup)(struct console *co, char *options); int (*exit)(struct console *co); int (*match)(struct console *co, char *name, int idx, char *options); short flags; short index; int cflag; uint ispeed; uint ospeed; u64 seq; unsigned long dropped; void *data; struct hlist_node node; /* nbcon console specific members */ /** * @write_atomic: * * NBCON callback to write out text in any context. (Optional) * * This callback is called with the console already acquired. However, * a higher priority context is allowed to take it over by default. * * The callback must call nbcon_enter_unsafe() and nbcon_exit_unsafe() * around any code where the takeover is not safe, for example, when * manipulating the serial port registers. * * nbcon_enter_unsafe() will fail if the context has lost the console * ownership in the meantime. In this case, the callback is no longer * allowed to go forward. It must back out immediately and carefully. * The buffer content is also no longer trusted since it no longer * belongs to the context. * * The callback should allow the takeover whenever it is safe. It * increases the chance to see messages when the system is in trouble. * If the driver must reacquire ownership in order to finalize or * revert hardware changes, nbcon_reacquire_nobuf() can be used. * However, on reacquire the buffer content is no longer available. A * reacquire cannot be used to resume printing. * * The callback can be called from any context (including NMI). * Therefore it must avoid usage of any locking and instead rely * on the console ownership for synchronization. */ void (*write_atomic)(struct console *con, struct nbcon_write_context *wctxt); /** * @write_thread: * * NBCON callback to write out text in task context. * * This callback must be called only in task context with both * device_lock() and the nbcon console acquired with * NBCON_PRIO_NORMAL. * * The same rules for console ownership verification and unsafe * sections handling applies as with write_atomic(). * * The console ownership handling is necessary for synchronization * against write_atomic() which is synchronized only via the context. * * The device_lock() provides the primary serialization for operations * on the device. It might be as relaxed (mutex)[*] or as tight * (disabled preemption and interrupts) as needed. It allows * the kthread to operate in the least restrictive mode[**]. * * [*] Standalone nbcon_context_try_acquire() is not safe with * the preemption enabled, see nbcon_owner_matches(). But it * can be safe when always called in the preemptive context * under the device_lock(). * * [**] The device_lock() makes sure that nbcon_context_try_acquire() * would never need to spin which is important especially with * PREEMPT_RT. */ void (*write_thread)(struct console *con, struct nbcon_write_context *wctxt); /** * @device_lock: * * NBCON callback to begin synchronization with driver code. * * Console drivers typically must deal with access to the hardware * via user input/output (such as an interactive login shell) and * output of kernel messages via printk() calls. This callback is * called by the printk-subsystem whenever it needs to synchronize * with hardware access by the driver. It should be implemented to * use whatever synchronization mechanism the driver is using for * itself (for example, the port lock for uart serial consoles). * * The callback is always called from task context. It may use any * synchronization method required by the driver. * * IMPORTANT: The callback MUST disable migration. The console driver * may be using a synchronization mechanism that already takes * care of this (such as spinlocks). Otherwise this function must * explicitly call migrate_disable(). * * The flags argument is provided as a convenience to the driver. It * will be passed again to device_unlock(). It can be ignored if the * driver does not need it. */ void (*device_lock)(struct console *con, unsigned long *flags); /** * @device_unlock: * * NBCON callback to finish synchronization with driver code. * * It is the counterpart to device_lock(). * * This callback is always called from task context. It must * appropriately re-enable migration (depending on how device_lock() * disabled migration). * * The flags argument is the value of the same variable that was * passed to device_lock(). */ void (*device_unlock)(struct console *con, unsigned long flags); atomic_t __private nbcon_state; atomic_long_t __private nbcon_seq; struct nbcon_context __private nbcon_device_ctxt; atomic_long_t __private nbcon_prev_seq; struct printk_buffers *pbufs; struct task_struct *kthread; struct rcuwait rcuwait; struct irq_work irq_work; }; #ifdef CONFIG_LOCKDEP extern void lockdep_assert_console_list_lock_held(void); #else static inline void lockdep_assert_console_list_lock_held(void) { } #endif #ifdef CONFIG_DEBUG_LOCK_ALLOC extern bool console_srcu_read_lock_is_held(void); #else static inline bool console_srcu_read_lock_is_held(void) { return 1; } #endif extern int console_srcu_read_lock(void); extern void console_srcu_read_unlock(int cookie); extern void console_list_lock(void) __acquires(console_mutex); extern void console_list_unlock(void) __releases(console_mutex); extern struct hlist_head console_list; /** * console_srcu_read_flags - Locklessly read flags of a possibly registered * console * @con: struct console pointer of console to read flags from * * Locklessly reading @con->flags provides a consistent read value because * there is at most one CPU modifying @con->flags and that CPU is using only * read-modify-write operations to do so. * * Requires console_srcu_read_lock to be held, which implies that @con might * be a registered console. The purpose of holding console_srcu_read_lock is * to guarantee that the console state is valid (CON_SUSPENDED/CON_ENABLED) * and that no exit/cleanup routines will run if the console is currently * undergoing unregistration. * * If the caller is holding the console_list_lock or it is _certain_ that * @con is not and will not become registered, the caller may read * @con->flags directly instead. * * Context: Any context. * Return: The current value of the @con->flags field. */ static inline short console_srcu_read_flags(const struct console *con) { WARN_ON_ONCE(!console_srcu_read_lock_is_held()); /* * The READ_ONCE() matches the WRITE_ONCE() when @flags are modified * for registered consoles with console_srcu_write_flags(). */ return data_race(READ_ONCE(con->flags)); } /** * console_srcu_write_flags - Write flags for a registered console * @con: struct console pointer of console to write flags to * @flags: new flags value to write * * Only use this function to write flags for registered consoles. It * requires holding the console_list_lock. * * Context: Any context. */ static inline void console_srcu_write_flags(struct console *con, short flags) { lockdep_assert_console_list_lock_held(); /* This matches the READ_ONCE() in console_srcu_read_flags(). */ WRITE_ONCE(con->flags, flags); } /* Variant of console_is_registered() when the console_list_lock is held. */ static inline bool console_is_registered_locked(const struct console *con) { lockdep_assert_console_list_lock_held(); return !hlist_unhashed(&con->node); } /* * console_is_registered - Check if the console is registered * @con: struct console pointer of console to check * * Context: Process context. May sleep while acquiring console list lock. * Return: true if the console is in the console list, otherwise false. * * If false is returned for a console that was previously registered, it * can be assumed that the console's unregistration is fully completed, * including the exit() callback after console list removal. */ static inline bool console_is_registered(const struct console *con) { bool ret; console_list_lock(); ret = console_is_registered_locked(con); console_list_unlock(); return ret; } /** * for_each_console_srcu() - Iterator over registered consoles * @con: struct console pointer used as loop cursor * * Although SRCU guarantees the console list will be consistent, the * struct console fields may be updated by other CPUs while iterating. * * Requires console_srcu_read_lock to be held. Can be invoked from * any context. */ #define for_each_console_srcu(con) \ hlist_for_each_entry_srcu(con, &console_list, node, \ console_srcu_read_lock_is_held()) /** * for_each_console() - Iterator over registered consoles * @con: struct console pointer used as loop cursor * * The console list and the &console.flags are immutable while iterating. * * Requires console_list_lock to be held. */ #define for_each_console(con) \ lockdep_assert_console_list_lock_held(); \ hlist_for_each_entry(con, &console_list, node) #ifdef CONFIG_PRINTK extern void nbcon_cpu_emergency_enter(void); extern void nbcon_cpu_emergency_exit(void); extern bool nbcon_can_proceed(struct nbcon_write_context *wctxt); extern bool nbcon_enter_unsafe(struct nbcon_write_context *wctxt); extern bool nbcon_exit_unsafe(struct nbcon_write_context *wctxt); extern void nbcon_reacquire_nobuf(struct nbcon_write_context *wctxt); #else static inline void nbcon_cpu_emergency_enter(void) { } static inline void nbcon_cpu_emergency_exit(void) { } static inline bool nbcon_can_proceed(struct nbcon_write_context *wctxt) { return false; } static inline bool nbcon_enter_unsafe(struct nbcon_write_context *wctxt) { return false; } static inline bool nbcon_exit_unsafe(struct nbcon_write_context *wctxt) { return false; } static inline void nbcon_reacquire_nobuf(struct nbcon_write_context *wctxt) { } #endif extern int console_set_on_cmdline; extern struct console *early_console; enum con_flush_mode { CONSOLE_FLUSH_PENDING, CONSOLE_REPLAY_ALL, }; extern int add_preferred_console(const char *name, const short idx, char *options); extern void console_force_preferred_locked(struct console *con); extern void register_console(struct console *); extern int unregister_console(struct console *); extern void console_lock(void); extern int console_trylock(void); extern void console_unlock(void); extern void console_conditional_schedule(void); extern void console_unblank(void); extern void console_flush_on_panic(enum con_flush_mode mode); extern struct tty_driver *console_device(int *); extern void console_suspend(struct console *); extern void console_resume(struct console *); extern int is_console_locked(void); extern int braille_register_console(struct console *, int index, char *console_options, char *braille_options); extern int braille_unregister_console(struct console *); #ifdef CONFIG_TTY extern void console_sysfs_notify(void); #else static inline void console_sysfs_notify(void) { } #endif extern bool console_suspend_enabled; /* Suspend and resume console messages over PM events */ extern void console_suspend_all(void); extern void console_resume_all(void); int mda_console_init(void); void vcs_make_sysfs(int index); void vcs_remove_sysfs(int index); /* Some debug stub to catch some of the obvious races in the VT code */ #define WARN_CONSOLE_UNLOCKED() \ WARN_ON(!atomic_read(&ignore_console_lock_warning) && \ !is_console_locked() && !oops_in_progress) /* * Increment ignore_console_lock_warning if you need to quiet * WARN_CONSOLE_UNLOCKED() for debugging purposes. */ extern atomic_t ignore_console_lock_warning; extern void console_init(void); /* For deferred console takeover */ void dummycon_register_output_notifier(struct notifier_block *nb); void dummycon_unregister_output_notifier(struct notifier_block *nb); #endif /* _LINUX_CONSOLE_H */
48 6 6 6 6 6 6 6 6 1 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 /* SPDX-License-Identifier: GPL-2.0 */ #ifndef _LINUX_HIGHMEM_H #define _LINUX_HIGHMEM_H #include <linux/fs.h> #include <linux/kernel.h> #include <linux/bug.h> #include <linux/cacheflush.h> #include <linux/kmsan.h> #include <linux/mm.h> #include <linux/uaccess.h> #include <linux/hardirq.h> #include "highmem-internal.h" /** * kmap - Map a page for long term usage * @page: Pointer to the page to be mapped * * Returns: The virtual address of the mapping * * Can only be invoked from preemptible task context because on 32bit * systems with CONFIG_HIGHMEM enabled this function might sleep. * * For systems with CONFIG_HIGHMEM=n and for pages in the low memory area * this returns the virtual address of the direct kernel mapping. * * The returned virtual address is globally visible and valid up to the * point where it is unmapped via kunmap(). The pointer can be handed to * other contexts. * * For highmem pages on 32bit systems this can be slow as the mapping space * is limited and protected by a global lock. In case that there is no * mapping slot available the function blocks until a slot is released via * kunmap(). */ static inline void *kmap(struct page *page); /** * kunmap - Unmap the virtual address mapped by kmap() * @page: Pointer to the page which was mapped by kmap() * * Counterpart to kmap(). A NOOP for CONFIG_HIGHMEM=n and for mappings of * pages in the low memory area. */ static inline void kunmap(struct page *page); /** * kmap_to_page - Get the page for a kmap'ed address * @addr: The address to look up * * Returns: The page which is mapped to @addr. */ static inline struct page *kmap_to_page(void *addr); /** * kmap_flush_unused - Flush all unused kmap mappings in order to * remove stray mappings */ static inline void kmap_flush_unused(void); /** * kmap_local_page - Map a page for temporary usage * @page: Pointer to the page to be mapped * * Returns: The virtual address of the mapping * * Can be invoked from any context, including interrupts. * * Requires careful handling when nesting multiple mappings because the map * management is stack based. The unmap has to be in the reverse order of * the map operation: * * addr1 = kmap_local_page(page1); * addr2 = kmap_local_page(page2); * ... * kunmap_local(addr2); * kunmap_local(addr1); * * Unmapping addr1 before addr2 is invalid and causes malfunction. * * Contrary to kmap() mappings the mapping is only valid in the context of * the caller and cannot be handed to other contexts. * * On CONFIG_HIGHMEM=n kernels and for low memory pages this returns the * virtual address of the direct mapping. Only real highmem pages are * temporarily mapped. * * While kmap_local_page() is significantly faster than kmap() for the highmem * case it comes with restrictions about the pointer validity. * * On HIGHMEM enabled systems mapping a highmem page has the side effect of * disabling migration in order to keep the virtual address stable across * preemption. No caller of kmap_local_page() can rely on this side effect. */ static inline void *kmap_local_page(struct page *page); /** * kmap_local_folio - Map a page in this folio for temporary usage * @folio: The folio containing the page. * @offset: The byte offset within the folio which identifies the page. * * Requires careful handling when nesting multiple mappings because the map * management is stack based. The unmap has to be in the reverse order of * the map operation:: * * addr1 = kmap_local_folio(folio1, offset1); * addr2 = kmap_local_folio(folio2, offset2); * ... * kunmap_local(addr2); * kunmap_local(addr1); * * Unmapping addr1 before addr2 is invalid and causes malfunction. * * Contrary to kmap() mappings the mapping is only valid in the context of * the caller and cannot be handed to other contexts. * * On CONFIG_HIGHMEM=n kernels and for low memory pages this returns the * virtual address of the direct mapping. Only real highmem pages are * temporarily mapped. * * While it is significantly faster than kmap() for the highmem case it * comes with restrictions about the pointer validity. * * On HIGHMEM enabled systems mapping a highmem page has the side effect of * disabling migration in order to keep the virtual address stable across * preemption. No caller of kmap_local_folio() can rely on this side effect. * * Context: Can be invoked from any context. * Return: The virtual address of @offset. */ static inline void *kmap_local_folio(struct folio *folio, size_t offset); /** * kmap_atomic - Atomically map a page for temporary usage - Deprecated! * @page: Pointer to the page to be mapped * * Returns: The virtual address of the mapping * * In fact a wrapper around kmap_local_page() which also disables pagefaults * and, depending on PREEMPT_RT configuration, also CPU migration and * preemption. Therefore users should not count on the latter two side effects. * * Mappings should always be released by kunmap_atomic(). * * Do not use in new code. Use kmap_local_page() instead. * * It is used in atomic context when code wants to access the contents of a * page that might be allocated from high memory (see __GFP_HIGHMEM), for * example a page in the pagecache. The API has two functions, and they * can be used in a manner similar to the following:: * * // Find the page of interest. * struct page *page = find_get_page(mapping, offset); * * // Gain access to the contents of that page. * void *vaddr = kmap_atomic(page); * * // Do something to the contents of that page. * memset(vaddr, 0, PAGE_SIZE); * * // Unmap that page. * kunmap_atomic(vaddr); * * Note that the kunmap_atomic() call takes the result of the kmap_atomic() * call, not the argument. * * If you need to map two pages because you want to copy from one page to * another you need to keep the kmap_atomic calls strictly nested, like: * * vaddr1 = kmap_atomic(page1); * vaddr2 = kmap_atomic(page2); * * memcpy(vaddr1, vaddr2, PAGE_SIZE); * * kunmap_atomic(vaddr2); * kunmap_atomic(vaddr1); */ static inline void *kmap_atomic(struct page *page); /* Highmem related interfaces for management code */ static inline unsigned long nr_free_highpages(void); static inline unsigned long totalhigh_pages(void); #ifndef ARCH_HAS_FLUSH_ANON_PAGE static inline void flush_anon_page(struct vm_area_struct *vma, struct page *page, unsigned long vmaddr) { } #endif #ifndef ARCH_IMPLEMENTS_FLUSH_KERNEL_VMAP_RANGE static inline void flush_kernel_vmap_range(void *vaddr, int size) { } static inline void invalidate_kernel_vmap_range(void *vaddr, int size) { } #endif /* when CONFIG_HIGHMEM is not set these will be plain clear/copy_page */ #ifndef clear_user_highpage static inline void clear_user_highpage(struct page *page, unsigned long vaddr) { void *addr = kmap_local_page(page); clear_user_page(addr, vaddr, page); kunmap_local(addr); } #endif #ifndef vma_alloc_zeroed_movable_folio /** * vma_alloc_zeroed_movable_folio - Allocate a zeroed page for a VMA. * @vma: The VMA the page is to be allocated for. * @vaddr: The virtual address the page will be inserted into. * * This function will allocate a page suitable for inserting into this * VMA at this virtual address. It may be allocated from highmem or * the movable zone. An architecture may provide its own implementation. * * Return: A folio containing one allocated and zeroed page or NULL if * we are out of memory. */ static inline struct folio *vma_alloc_zeroed_movable_folio(struct vm_area_struct *vma, unsigned long vaddr) { struct folio *folio; folio = vma_alloc_folio(GFP_HIGHUSER_MOVABLE, 0, vma, vaddr); if (folio && user_alloc_needs_zeroing()) clear_user_highpage(&folio->page, vaddr); return folio; } #endif static inline void clear_highpage(struct page *page) { void *kaddr = kmap_local_page(page); clear_page(kaddr); kunmap_local(kaddr); } static inline void clear_highpage_kasan_tagged(struct page *page) { void *kaddr = kmap_local_page(page); clear_page(kasan_reset_tag(kaddr)); kunmap_local(kaddr); } #ifndef __HAVE_ARCH_TAG_CLEAR_HIGHPAGE static inline void tag_clear_highpage(struct page *page) { } #endif /* * If we pass in a base or tail page, we can zero up to PAGE_SIZE. * If we pass in a head page, we can zero up to the size of the compound page. */ #ifdef CONFIG_HIGHMEM void zero_user_segments(struct page *page, unsigned start1, unsigned end1, unsigned start2, unsigned end2); #else static inline void zero_user_segments(struct page *page, unsigned start1, unsigned end1, unsigned start2, unsigned end2) { void *kaddr = kmap_local_page(page); unsigned int i; BUG_ON(end1 > page_size(page) || end2 > page_size(page)); if (end1 > start1) memset(kaddr + start1, 0, end1 - start1); if (end2 > start2) memset(kaddr + start2, 0, end2 - start2); kunmap_local(kaddr); for (i = 0; i < compound_nr(page); i++) flush_dcache_page(page + i); } #endif static inline void zero_user_segment(struct page *page, unsigned start, unsigned end) { zero_user_segments(page, start, end, 0, 0); } #ifndef __HAVE_ARCH_COPY_USER_HIGHPAGE static inline void copy_user_highpage(struct page *to, struct page *from, unsigned long vaddr, struct vm_area_struct *vma) { char *vfrom, *vto; vfrom = kmap_local_page(from); vto = kmap_local_page(to); copy_user_page(vto, vfrom, vaddr, to); kmsan_unpoison_memory(page_address(to), PAGE_SIZE); kunmap_local(vto); kunmap_local(vfrom); } #endif #ifndef __HAVE_ARCH_COPY_HIGHPAGE static inline void copy_highpage(struct page *to, struct page *from) { char *vfrom, *vto; vfrom = kmap_local_page(from); vto = kmap_local_page(to); copy_page(vto, vfrom); kmsan_copy_page_meta(to, from); kunmap_local(vto); kunmap_local(vfrom); } #endif #ifdef copy_mc_to_kernel /* * If architecture supports machine check exception handling, define the * #MC versions of copy_user_highpage and copy_highpage. They copy a memory * page with #MC in source page (@from) handled, and return the number * of bytes not copied if there was a #MC, otherwise 0 for success. */ static inline int copy_mc_user_highpage(struct page *to, struct page *from, unsigned long vaddr, struct vm_area_struct *vma) { unsigned long ret; char *vfrom, *vto; vfrom = kmap_local_page(from); vto = kmap_local_page(to); ret = copy_mc_to_kernel(vto, vfrom, PAGE_SIZE); if (!ret) kmsan_unpoison_memory(page_address(to), PAGE_SIZE); kunmap_local(vto); kunmap_local(vfrom); if (ret) memory_failure_queue(page_to_pfn(from), 0); return ret; } static inline int copy_mc_highpage(struct page *to, struct page *from) { unsigned long ret; char *vfrom, *vto; vfrom = kmap_local_page(from); vto = kmap_local_page(to); ret = copy_mc_to_kernel(vto, vfrom, PAGE_SIZE); if (!ret) kmsan_copy_page_meta(to, from); kunmap_local(vto); kunmap_local(vfrom); if (ret) memory_failure_queue(page_to_pfn(from), 0); return ret; } #else static inline int copy_mc_user_highpage(struct page *to, struct page *from, unsigned long vaddr, struct vm_area_struct *vma) { copy_user_highpage(to, from, vaddr, vma); return 0; } static inline int copy_mc_highpage(struct page *to, struct page *from) { copy_highpage(to, from); return 0; } #endif static inline void memcpy_page(struct page *dst_page, size_t dst_off, struct page *src_page, size_t src_off, size_t len) { char *dst = kmap_local_page(dst_page); char *src = kmap_local_page(src_page); VM_BUG_ON(dst_off + len > PAGE_SIZE || src_off + len > PAGE_SIZE); memcpy(dst + dst_off, src + src_off, len); kunmap_local(src); kunmap_local(dst); } static inline void memcpy_folio(struct folio *dst_folio, size_t dst_off, struct folio *src_folio, size_t src_off, size_t len) { VM_BUG_ON(dst_off + len > folio_size(dst_folio)); VM_BUG_ON(src_off + len > folio_size(src_folio)); do { char *dst = kmap_local_folio(dst_folio, dst_off); const char *src = kmap_local_folio(src_folio, src_off); size_t chunk = len; if (folio_test_highmem(dst_folio) && chunk > PAGE_SIZE - offset_in_page(dst_off)) chunk = PAGE_SIZE - offset_in_page(dst_off); if (folio_test_highmem(src_folio) && chunk > PAGE_SIZE - offset_in_page(src_off)) chunk = PAGE_SIZE - offset_in_page(src_off); memcpy(dst, src, chunk); kunmap_local(src); kunmap_local(dst); dst_off += chunk; src_off += chunk; len -= chunk; } while (len > 0); } static inline void memset_page(struct page *page, size_t offset, int val, size_t len) { char *addr = kmap_local_page(page); VM_BUG_ON(offset + len > PAGE_SIZE); memset(addr + offset, val, len); kunmap_local(addr); } static inline void memcpy_from_page(char *to, struct page *page, size_t offset, size_t len) { char *from = kmap_local_page(page); VM_BUG_ON(offset + len > PAGE_SIZE); memcpy(to, from + offset, len); kunmap_local(from); } static inline void memcpy_to_page(struct page *page, size_t offset, const char *from, size_t len) { char *to = kmap_local_page(page); VM_BUG_ON(offset + len > PAGE_SIZE); memcpy(to + offset, from, len); flush_dcache_page(page); kunmap_local(to); } static inline void memzero_page(struct page *page, size_t offset, size_t len) { char *addr = kmap_local_page(page); VM_BUG_ON(offset + len > PAGE_SIZE); memset(addr + offset, 0, len); flush_dcache_page(page); kunmap_local(addr); } /** * memcpy_from_folio - Copy a range of bytes from a folio. * @to: The memory to copy to. * @folio: The folio to read from. * @offset: The first byte in the folio to read. * @len: The number of bytes to copy. */ static inline void memcpy_from_folio(char *to, struct folio *folio, size_t offset, size_t len) { VM_BUG_ON(offset + len > folio_size(folio)); do { const char *from = kmap_local_folio(folio, offset); size_t chunk = len; if (folio_test_partial_kmap(folio) && chunk > PAGE_SIZE - offset_in_page(offset)) chunk = PAGE_SIZE - offset_in_page(offset); memcpy(to, from, chunk); kunmap_local(from); to += chunk; offset += chunk; len -= chunk; } while (len > 0); } /** * memcpy_to_folio - Copy a range of bytes to a folio. * @folio: The folio to write to. * @offset: The first byte in the folio to store to. * @from: The memory to copy from. * @len: The number of bytes to copy. */ static inline void memcpy_to_folio(struct folio *folio, size_t offset, const char *from, size_t len) { VM_BUG_ON(offset + len > folio_size(folio)); do { char *to = kmap_local_folio(folio, offset); size_t chunk = len; if (folio_test_partial_kmap(folio) && chunk > PAGE_SIZE - offset_in_page(offset)) chunk = PAGE_SIZE - offset_in_page(offset); memcpy(to, from, chunk); kunmap_local(to); from += chunk; offset += chunk; len -= chunk; } while (len > 0); flush_dcache_folio(folio); } /** * folio_zero_tail - Zero the tail of a folio. * @folio: The folio to zero. * @offset: The byte offset in the folio to start zeroing at. * @kaddr: The address the folio is currently mapped to. * * If you have already used kmap_local_folio() to map a folio, written * some data to it and now need to zero the end of the folio (and flush * the dcache), you can use this function. If you do not have the * folio kmapped (eg the folio has been partially populated by DMA), * use folio_zero_range() or folio_zero_segment() instead. * * Return: An address which can be passed to kunmap_local(). */ static inline __must_check void *folio_zero_tail(struct folio *folio, size_t offset, void *kaddr) { size_t len = folio_size(folio) - offset; if (folio_test_partial_kmap(folio)) { size_t max = PAGE_SIZE - offset_in_page(offset); while (len > max) { memset(kaddr, 0, max); kunmap_local(kaddr); len -= max; offset += max; max = PAGE_SIZE; kaddr = kmap_local_folio(folio, offset); } } memset(kaddr, 0, len); flush_dcache_folio(folio); return kaddr; } /** * folio_fill_tail - Copy some data to a folio and pad with zeroes. * @folio: The destination folio. * @offset: The offset into @folio at which to start copying. * @from: The data to copy. * @len: How many bytes of data to copy. * * This function is most useful for filesystems which support inline data. * When they want to copy data from the inode into the page cache, this * function does everything for them. It supports large folios even on * HIGHMEM configurations. */ static inline void folio_fill_tail(struct folio *folio, size_t offset, const char *from, size_t len) { char *to = kmap_local_folio(folio, offset); VM_BUG_ON(offset + len > folio_size(folio)); if (folio_test_partial_kmap(folio)) { size_t max = PAGE_SIZE - offset_in_page(offset); while (len > max) { memcpy(to, from, max); kunmap_local(to); len -= max; from += max; offset += max; max = PAGE_SIZE; to = kmap_local_folio(folio, offset); } } memcpy(to, from, len); to = folio_zero_tail(folio, offset + len, to + len); kunmap_local(to); } /** * memcpy_from_file_folio - Copy some bytes from a file folio. * @to: The destination buffer. * @folio: The folio to copy from. * @pos: The position in the file. * @len: The maximum number of bytes to copy. * * Copy up to @len bytes from this folio. This may be limited by PAGE_SIZE * if the folio comes from HIGHMEM, and by the size of the folio. * * Return: The number of bytes copied from the folio. */ static inline size_t memcpy_from_file_folio(char *to, struct folio *folio, loff_t pos, size_t len) { size_t offset = offset_in_folio(folio, pos); char *from = kmap_local_folio(folio, offset); if (folio_test_partial_kmap(folio)) { offset = offset_in_page(offset); len = min_t(size_t, len, PAGE_SIZE - offset); } else len = min(len, folio_size(folio) - offset); memcpy(to, from, len); kunmap_local(from); return len; } /** * folio_zero_segments() - Zero two byte ranges in a folio. * @folio: The folio to write to. * @start1: The first byte to zero. * @xend1: One more than the last byte in the first range. * @start2: The first byte to zero in the second range. * @xend2: One more than the last byte in the second range. */ static inline void folio_zero_segments(struct folio *folio, size_t start1, size_t xend1, size_t start2, size_t xend2) { zero_user_segments(&folio->page, start1, xend1, start2, xend2); } /** * folio_zero_segment() - Zero a byte range in a folio. * @folio: The folio to write to. * @start: The first byte to zero. * @xend: One more than the last byte to zero. */ static inline void folio_zero_segment(struct folio *folio, size_t start, size_t xend) { zero_user_segments(&folio->page, start, xend, 0, 0); } /** * folio_zero_range() - Zero a byte range in a folio. * @folio: The folio to write to. * @start: The first byte to zero. * @length: The number of bytes to zero. */ static inline void folio_zero_range(struct folio *folio, size_t start, size_t length) { zero_user_segments(&folio->page, start, start + length, 0, 0); } /** * folio_release_kmap - Unmap a folio and drop a refcount. * @folio: The folio to release. * @addr: The address previously returned by a call to kmap_local_folio(). * * It is common, eg in directory handling to kmap a folio. This function * unmaps the folio and drops the refcount that was being held to keep the * folio alive while we accessed it. */ static inline void folio_release_kmap(struct folio *folio, void *addr) { kunmap_local(addr); folio_put(folio); } #endif /* _LINUX_HIGHMEM_H */
393 87 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 1517 1518 1519 1520 1521 1522 1523 1524 1525 1526 1527 1528 1529 1530 1531 1532 1533 1534 1535 1536 1537 1538 1539 1540 1541 1542 1543 1544 1545 1546 1547 1548 1549 1550 1551 1552 1553 1554 1555 1556 1557 1558 1559 1560 1561 1562 1563 1564 1565 1566 1567 1568 1569 1570 1571 1572 1573 1574 1575 1576 1577 1578 1579 1580 1581 1582 1583 1584 1585 1586 1587 1588 1589 1590 1591 1592 1593 1594 1595 1596 1597 1598 1599 1600 1601 1602 1603 1604 1605 1606 1607 1608 1609 1610 1611 1612 1613 1614 1615 1616 1617 1618 1619 1620 1621 1622 1623 1624 1625 1626 1627 1628 1629 1630 1631 1632 1633 1634 1635 1636 1637 1638 1639 1640 1641 1642 1643 1644 1645 1646 1647 1648 1649 1650 1651 1652 1653 1654 1655 1656 1657 1658 1659 1660 1661 1662 1663 1664 1665 1666 1667 1668 1669 1670 1671 1672 1673 1674 1675 1676 1677 1678 1679 1680 1681 1682 1683 1684 1685 1686 1687 1688 1689 1690 1691 1692 1693 1694 1695 1696 1697 1698 1699 1700 1701 1702 1703 1704 1705 1706 1707 1708 1709 1710 1711 1712 1713 1714 1715 1716 1717 1718 1719 1720 1721 1722 1723 1724 1725 1726 1727 1728 1729 1730 1731 1732 1733 1734 1735 1736 1737 1738 1739 1740 1741 1742 1743 1744 1745 1746 1747 1748 1749 1750 1751 1752 1753 1754 1755 1756 1757 1758 1759 1760 1761 1762 1763 1764 1765 1766 1767 1768 1769 1770 1771 1772 1773 1774 1775 1776 1777 1778 1779 1780 1781 1782 1783 1784 1785 1786 1787 1788 1789 1790 1791 1792 1793 1794 1795 1796 1797 1798 1799 1800 1801 1802 1803 1804 1805 1806 1807 1808 1809 1810 1811 1812 1813 1814 1815 1816 1817 1818 1819 1820 1821 1822 1823 1824 1825 1826 1827 1828 1829 1830 1831 1832 // SPDX-License-Identifier: GPL-2.0-only /* * linux/drivers/clocksource/arm_arch_timer.c * * Copyright (C) 2011 ARM Ltd. * All Rights Reserved */ #define pr_fmt(fmt) "arch_timer: " fmt #include <linux/init.h> #include <linux/kernel.h> #include <linux/device.h> #include <linux/smp.h> #include <linux/cpu.h> #include <linux/cpu_pm.h> #include <linux/clockchips.h> #include <linux/clocksource.h> #include <linux/clocksource_ids.h> #include <linux/interrupt.h> #include <linux/kstrtox.h> #include <linux/of_irq.h> #include <linux/of_address.h> #include <linux/io.h> #include <linux/slab.h> #include <linux/sched/clock.h> #include <linux/sched_clock.h> #include <linux/acpi.h> #include <linux/arm-smccc.h> #include <linux/ptp_kvm.h> #include <asm/arch_timer.h> #include <asm/virt.h> #include <clocksource/arm_arch_timer.h> #define CNTTIDR 0x08 #define CNTTIDR_VIRT(n) (BIT(1) << ((n) * 4)) #define CNTACR(n) (0x40 + ((n) * 4)) #define CNTACR_RPCT BIT(0) #define CNTACR_RVCT BIT(1) #define CNTACR_RFRQ BIT(2) #define CNTACR_RVOFF BIT(3) #define CNTACR_RWVT BIT(4) #define CNTACR_RWPT BIT(5) #define CNTPCT_LO 0x00 #define CNTVCT_LO 0x08 #define CNTFRQ 0x10 #define CNTP_CVAL_LO 0x20 #define CNTP_CTL 0x2c #define CNTV_CVAL_LO 0x30 #define CNTV_CTL 0x3c /* * The minimum amount of time a generic counter is guaranteed to not roll over * (40 years) */ #define MIN_ROLLOVER_SECS (40ULL * 365 * 24 * 3600) static unsigned arch_timers_present __initdata; struct arch_timer { void __iomem *base; struct clock_event_device evt; }; static struct arch_timer *arch_timer_mem __ro_after_init; #define to_arch_timer(e) container_of(e, struct arch_timer, evt) static u32 arch_timer_rate __ro_after_init; static int arch_timer_ppi[ARCH_TIMER_MAX_TIMER_PPI] __ro_after_init; static const char *arch_timer_ppi_names[ARCH_TIMER_MAX_TIMER_PPI] = { [ARCH_TIMER_PHYS_SECURE_PPI] = "sec-phys", [ARCH_TIMER_PHYS_NONSECURE_PPI] = "phys", [ARCH_TIMER_VIRT_PPI] = "virt", [ARCH_TIMER_HYP_PPI] = "hyp-phys", [ARCH_TIMER_HYP_VIRT_PPI] = "hyp-virt", }; static struct clock_event_device __percpu *arch_timer_evt; static enum arch_timer_ppi_nr arch_timer_uses_ppi __ro_after_init = ARCH_TIMER_VIRT_PPI; static bool arch_timer_c3stop __ro_after_init; static bool arch_timer_mem_use_virtual __ro_after_init; static bool arch_counter_suspend_stop __ro_after_init; #ifdef CONFIG_GENERIC_GETTIMEOFDAY static enum vdso_clock_mode vdso_default = VDSO_CLOCKMODE_ARCHTIMER; #else static enum vdso_clock_mode vdso_default = VDSO_CLOCKMODE_NONE; #endif /* CONFIG_GENERIC_GETTIMEOFDAY */ static cpumask_t evtstrm_available = CPU_MASK_NONE; static bool evtstrm_enable __ro_after_init = IS_ENABLED(CONFIG_ARM_ARCH_TIMER_EVTSTREAM); static int __init early_evtstrm_cfg(char *buf) { return kstrtobool(buf, &evtstrm_enable); } early_param("clocksource.arm_arch_timer.evtstrm", early_evtstrm_cfg); /* * Makes an educated guess at a valid counter width based on the Generic Timer * specification. Of note: * 1) the system counter is at least 56 bits wide * 2) a roll-over time of not less than 40 years * * See 'ARM DDI 0487G.a D11.1.2 ("The system counter")' for more details. */ static int arch_counter_get_width(void) { u64 min_cycles = MIN_ROLLOVER_SECS * arch_timer_rate; /* guarantee the returned width is within the valid range */ return clamp_val(ilog2(min_cycles - 1) + 1, 56, 64); } /* * Architected system timer support. */ static __always_inline void arch_timer_reg_write(int access, enum arch_timer_reg reg, u64 val, struct clock_event_device *clk) { if (access == ARCH_TIMER_MEM_PHYS_ACCESS) { struct arch_timer *timer = to_arch_timer(clk); switch (reg) { case ARCH_TIMER_REG_CTRL: writel_relaxed((u32)val, timer->base + CNTP_CTL); break; case ARCH_TIMER_REG_CVAL: /* * Not guaranteed to be atomic, so the timer * must be disabled at this point. */ writeq_relaxed(val, timer->base + CNTP_CVAL_LO); break; default: BUILD_BUG(); } } else if (access == ARCH_TIMER_MEM_VIRT_ACCESS) { struct arch_timer *timer = to_arch_timer(clk); switch (reg) { case ARCH_TIMER_REG_CTRL: writel_relaxed((u32)val, timer->base + CNTV_CTL); break; case ARCH_TIMER_REG_CVAL: /* Same restriction as above */ writeq_relaxed(val, timer->base + CNTV_CVAL_LO); break; default: BUILD_BUG(); } } else { arch_timer_reg_write_cp15(access, reg, val); } } static __always_inline u32 arch_timer_reg_read(int access, enum arch_timer_reg reg, struct clock_event_device *clk) { u32 val; if (access == ARCH_TIMER_MEM_PHYS_ACCESS) { struct arch_timer *timer = to_arch_timer(clk); switch (reg) { case ARCH_TIMER_REG_CTRL: val = readl_relaxed(timer->base + CNTP_CTL); break; default: BUILD_BUG(); } } else if (access == ARCH_TIMER_MEM_VIRT_ACCESS) { struct arch_timer *timer = to_arch_timer(clk); switch (reg) { case ARCH_TIMER_REG_CTRL: val = readl_relaxed(timer->base + CNTV_CTL); break; default: BUILD_BUG(); } } else { val = arch_timer_reg_read_cp15(access, reg); } return val; } static noinstr u64 raw_counter_get_cntpct_stable(void) { return __arch_counter_get_cntpct_stable(); } static notrace u64 arch_counter_get_cntpct_stable(void) { u64 val; preempt_disable_notrace(); val = __arch_counter_get_cntpct_stable(); preempt_enable_notrace(); return val; } static noinstr u64 arch_counter_get_cntpct(void) { return __arch_counter_get_cntpct(); } static noinstr u64 raw_counter_get_cntvct_stable(void) { return __arch_counter_get_cntvct_stable(); } static notrace u64 arch_counter_get_cntvct_stable(void) { u64 val; preempt_disable_notrace(); val = __arch_counter_get_cntvct_stable(); preempt_enable_notrace(); return val; } static noinstr u64 arch_counter_get_cntvct(void) { return __arch_counter_get_cntvct(); } /* * Default to cp15 based access because arm64 uses this function for * sched_clock() before DT is probed and the cp15 method is guaranteed * to exist on arm64. arm doesn't use this before DT is probed so even * if we don't have the cp15 accessors we won't have a problem. */ u64 (*arch_timer_read_counter)(void) __ro_after_init = arch_counter_get_cntvct; EXPORT_SYMBOL_GPL(arch_timer_read_counter); static u64 arch_counter_read(struct clocksource *cs) { return arch_timer_read_counter(); } static u64 arch_counter_read_cc(struct cyclecounter *cc) { return arch_timer_read_counter(); } static struct clocksource clocksource_counter = { .name = "arch_sys_counter", .id = CSID_ARM_ARCH_COUNTER, .rating = 400, .read = arch_counter_read, .flags = CLOCK_SOURCE_IS_CONTINUOUS, }; static struct cyclecounter cyclecounter __ro_after_init = { .read = arch_counter_read_cc, }; struct ate_acpi_oem_info { char oem_id[ACPI_OEM_ID_SIZE + 1]; char oem_table_id[ACPI_OEM_TABLE_ID_SIZE + 1]; u32 oem_revision; }; #ifdef CONFIG_FSL_ERRATUM_A008585 /* * The number of retries is an arbitrary value well beyond the highest number * of iterations the loop has been observed to take. */ #define __fsl_a008585_read_reg(reg) ({ \ u64 _old, _new; \ int _retries = 200; \ \ do { \ _old = read_sysreg(reg); \ _new = read_sysreg(reg); \ _retries--; \ } while (unlikely(_old != _new) && _retries); \ \ WARN_ON_ONCE(!_retries); \ _new; \ }) static u64 notrace fsl_a008585_read_cntpct_el0(void) { return __fsl_a008585_read_reg(cntpct_el0); } static u64 notrace fsl_a008585_read_cntvct_el0(void) { return __fsl_a008585_read_reg(cntvct_el0); } #endif #ifdef CONFIG_HISILICON_ERRATUM_161010101 /* * Verify whether the value of the second read is larger than the first by * less than 32 is the only way to confirm the value is correct, so clear the * lower 5 bits to check whether the difference is greater than 32 or not. * Theoretically the erratum should not occur more than twice in succession * when reading the system counter, but it is possible that some interrupts * may lead to more than twice read errors, triggering the warning, so setting * the number of retries far beyond the number of iterations the loop has been * observed to take. */ #define __hisi_161010101_read_reg(reg) ({ \ u64 _old, _new; \ int _retries = 50; \ \ do { \ _old = read_sysreg(reg); \ _new = read_sysreg(reg); \ _retries--; \ } while (unlikely((_new - _old) >> 5) && _retries); \ \ WARN_ON_ONCE(!_retries); \ _new; \ }) static u64 notrace hisi_161010101_read_cntpct_el0(void) { return __hisi_161010101_read_reg(cntpct_el0); } static u64 notrace hisi_161010101_read_cntvct_el0(void) { return __hisi_161010101_read_reg(cntvct_el0); } static const struct ate_acpi_oem_info hisi_161010101_oem_info[] = { /* * Note that trailing spaces are required to properly match * the OEM table information. */ { .oem_id = "HISI ", .oem_table_id = "HIP05 ", .oem_revision = 0, }, { .oem_id = "HISI ", .oem_table_id = "HIP06 ", .oem_revision = 0, }, { .oem_id = "HISI ", .oem_table_id = "HIP07 ", .oem_revision = 0, }, { /* Sentinel indicating the end of the OEM array */ }, }; #endif #ifdef CONFIG_ARM64_ERRATUM_858921 static u64 notrace arm64_858921_read_cntpct_el0(void) { u64 old, new; old = read_sysreg(cntpct_el0); new = read_sysreg(cntpct_el0); return (((old ^ new) >> 32) & 1) ? old : new; } static u64 notrace arm64_858921_read_cntvct_el0(void) { u64 old, new; old = read_sysreg(cntvct_el0); new = read_sysreg(cntvct_el0); return (((old ^ new) >> 32) & 1) ? old : new; } #endif #ifdef CONFIG_SUN50I_ERRATUM_UNKNOWN1 /* * The low bits of the counter registers are indeterminate while bit 10 or * greater is rolling over. Since the counter value can jump both backward * (7ff -> 000 -> 800) and forward (7ff -> fff -> 800), ignore register values * with all ones or all zeros in the low bits. Bound the loop by the maximum * number of CPU cycles in 3 consecutive 24 MHz counter periods. */ #define __sun50i_a64_read_reg(reg) ({ \ u64 _val; \ int _retries = 150; \ \ do { \ _val = read_sysreg(reg); \ _retries--; \ } while (((_val + 1) & GENMASK(8, 0)) <= 1 && _retries); \ \ WARN_ON_ONCE(!_retries); \ _val; \ }) static u64 notrace sun50i_a64_read_cntpct_el0(void) { return __sun50i_a64_read_reg(cntpct_el0); } static u64 notrace sun50i_a64_read_cntvct_el0(void) { return __sun50i_a64_read_reg(cntvct_el0); } #endif #ifdef CONFIG_ARM_ARCH_TIMER_OOL_WORKAROUND DEFINE_PER_CPU(const struct arch_timer_erratum_workaround *, timer_unstable_counter_workaround); EXPORT_SYMBOL_GPL(timer_unstable_counter_workaround); static atomic_t timer_unstable_counter_workaround_in_use = ATOMIC_INIT(0); /* * Force the inlining of this function so that the register accesses * can be themselves correctly inlined. */ static __always_inline void erratum_set_next_event_generic(const int access, unsigned long evt, struct clock_event_device *clk) { unsigned long ctrl; u64 cval; ctrl = arch_timer_reg_read(access, ARCH_TIMER_REG_CTRL, clk); ctrl |= ARCH_TIMER_CTRL_ENABLE; ctrl &= ~ARCH_TIMER_CTRL_IT_MASK; if (access == ARCH_TIMER_PHYS_ACCESS) { cval = evt + arch_counter_get_cntpct_stable(); write_sysreg(cval, cntp_cval_el0); } else { cval = evt + arch_counter_get_cntvct_stable(); write_sysreg(cval, cntv_cval_el0); } arch_timer_reg_write(access, ARCH_TIMER_REG_CTRL, ctrl, clk); } static __maybe_unused int erratum_set_next_event_virt(unsigned long evt, struct clock_event_device *clk) { erratum_set_next_event_generic(ARCH_TIMER_VIRT_ACCESS, evt, clk); return 0; } static __maybe_unused int erratum_set_next_event_phys(unsigned long evt, struct clock_event_device *clk) { erratum_set_next_event_generic(ARCH_TIMER_PHYS_ACCESS, evt, clk); return 0; } static const struct arch_timer_erratum_workaround ool_workarounds[] = { #ifdef CONFIG_FSL_ERRATUM_A008585 { .match_type = ate_match_dt, .id = "fsl,erratum-a008585", .desc = "Freescale erratum a005858", .read_cntpct_el0 = fsl_a008585_read_cntpct_el0, .read_cntvct_el0 = fsl_a008585_read_cntvct_el0, .set_next_event_phys = erratum_set_next_event_phys, .set_next_event_virt = erratum_set_next_event_virt, }, #endif #ifdef CONFIG_HISILICON_ERRATUM_161010101 { .match_type = ate_match_dt, .id = "hisilicon,erratum-161010101", .desc = "HiSilicon erratum 161010101", .read_cntpct_el0 = hisi_161010101_read_cntpct_el0, .read_cntvct_el0 = hisi_161010101_read_cntvct_el0, .set_next_event_phys = erratum_set_next_event_phys, .set_next_event_virt = erratum_set_next_event_virt, }, { .match_type = ate_match_acpi_oem_info, .id = hisi_161010101_oem_info, .desc = "HiSilicon erratum 161010101", .read_cntpct_el0 = hisi_161010101_read_cntpct_el0, .read_cntvct_el0 = hisi_161010101_read_cntvct_el0, .set_next_event_phys = erratum_set_next_event_phys, .set_next_event_virt = erratum_set_next_event_virt, }, #endif #ifdef CONFIG_ARM64_ERRATUM_858921 { .match_type = ate_match_local_cap_id, .id = (void *)ARM64_WORKAROUND_858921, .desc = "ARM erratum 858921", .read_cntpct_el0 = arm64_858921_read_cntpct_el0, .read_cntvct_el0 = arm64_858921_read_cntvct_el0, .set_next_event_phys = erratum_set_next_event_phys, .set_next_event_virt = erratum_set_next_event_virt, }, #endif #ifdef CONFIG_SUN50I_ERRATUM_UNKNOWN1 { .match_type = ate_match_dt, .id = "allwinner,erratum-unknown1", .desc = "Allwinner erratum UNKNOWN1", .read_cntpct_el0 = sun50i_a64_read_cntpct_el0, .read_cntvct_el0 = sun50i_a64_read_cntvct_el0, .set_next_event_phys = erratum_set_next_event_phys, .set_next_event_virt = erratum_set_next_event_virt, }, #endif #ifdef CONFIG_ARM64_ERRATUM_1418040 { .match_type = ate_match_local_cap_id, .id = (void *)ARM64_WORKAROUND_1418040, .desc = "ARM erratum 1418040", .disable_compat_vdso = true, }, #endif }; typedef bool (*ate_match_fn_t)(const struct arch_timer_erratum_workaround *, const void *); static bool arch_timer_check_dt_erratum(const struct arch_timer_erratum_workaround *wa, const void *arg) { const struct device_node *np = arg; return of_property_read_bool(np, wa->id); } static bool arch_timer_check_local_cap_erratum(const struct arch_timer_erratum_workaround *wa, const void *arg) { return this_cpu_has_cap((uintptr_t)wa->id); } static bool arch_timer_check_acpi_oem_erratum(const struct arch_timer_erratum_workaround *wa, const void *arg) { static const struct ate_acpi_oem_info empty_oem_info = {}; const struct ate_acpi_oem_info *info = wa->id; const struct acpi_table_header *table = arg; /* Iterate over the ACPI OEM info array, looking for a match */ while (memcmp(info, &empty_oem_info, sizeof(*info))) { if (!memcmp(info->oem_id, table->oem_id, ACPI_OEM_ID_SIZE) && !memcmp(info->oem_table_id, table->oem_table_id, ACPI_OEM_TABLE_ID_SIZE) && info->oem_revision == table->oem_revision) return true; info++; } return false; } static const struct arch_timer_erratum_workaround * arch_timer_iterate_errata(enum arch_timer_erratum_match_type type, ate_match_fn_t match_fn, void *arg) { int i; for (i = 0; i < ARRAY_SIZE(ool_workarounds); i++) { if (ool_workarounds[i].match_type != type) continue; if (match_fn(&ool_workarounds[i], arg)) return &ool_workarounds[i]; } return NULL; } static void arch_timer_enable_workaround(const struct arch_timer_erratum_workaround *wa, bool local) { int i; if (local) { __this_cpu_write(timer_unstable_counter_workaround, wa); } else { for_each_possible_cpu(i) per_cpu(timer_unstable_counter_workaround, i) = wa; } if (wa->read_cntvct_el0 || wa->read_cntpct_el0) atomic_set(&timer_unstable_counter_workaround_in_use, 1); /* * Don't use the vdso fastpath if errata require using the * out-of-line counter accessor. We may change our mind pretty * late in the game (with a per-CPU erratum, for example), so * change both the default value and the vdso itself. */ if (wa->read_cntvct_el0) { clocksource_counter.vdso_clock_mode = VDSO_CLOCKMODE_NONE; vdso_default = VDSO_CLOCKMODE_NONE; } else if (wa->disable_compat_vdso && vdso_default != VDSO_CLOCKMODE_NONE) { vdso_default = VDSO_CLOCKMODE_ARCHTIMER_NOCOMPAT; clocksource_counter.vdso_clock_mode = vdso_default; } } static void arch_timer_check_ool_workaround(enum arch_timer_erratum_match_type type, void *arg) { const struct arch_timer_erratum_workaround *wa, *__wa; ate_match_fn_t match_fn = NULL; bool local = false; switch (type) { case ate_match_dt: match_fn = arch_timer_check_dt_erratum; break; case ate_match_local_cap_id: match_fn = arch_timer_check_local_cap_erratum; local = true; break; case ate_match_acpi_oem_info: match_fn = arch_timer_check_acpi_oem_erratum; break; default: WARN_ON(1); return; } wa = arch_timer_iterate_errata(type, match_fn, arg); if (!wa) return; __wa = __this_cpu_read(timer_unstable_counter_workaround); if (__wa && wa != __wa) pr_warn("Can't enable workaround for %s (clashes with %s\n)", wa->desc, __wa->desc); if (__wa) return; arch_timer_enable_workaround(wa, local); pr_info("Enabling %s workaround for %s\n", local ? "local" : "global", wa->desc); } static bool arch_timer_this_cpu_has_cntvct_wa(void) { return has_erratum_handler(read_cntvct_el0); } static bool arch_timer_counter_has_wa(void) { return atomic_read(&timer_unstable_counter_workaround_in_use); } #else #define arch_timer_check_ool_workaround(t,a) do { } while(0) #define arch_timer_this_cpu_has_cntvct_wa() ({false;}) #define arch_timer_counter_has_wa() ({false;}) #endif /* CONFIG_ARM_ARCH_TIMER_OOL_WORKAROUND */ static __always_inline irqreturn_t timer_handler(const int access, struct clock_event_device *evt) { unsigned long ctrl; ctrl = arch_timer_reg_read(access, ARCH_TIMER_REG_CTRL, evt); if (ctrl & ARCH_TIMER_CTRL_IT_STAT) { ctrl |= ARCH_TIMER_CTRL_IT_MASK; arch_timer_reg_write(access, ARCH_TIMER_REG_CTRL, ctrl, evt); evt->event_handler(evt); return IRQ_HANDLED; } return IRQ_NONE; } static irqreturn_t arch_timer_handler_virt(int irq, void *dev_id) { struct clock_event_device *evt = dev_id; return timer_handler(ARCH_TIMER_VIRT_ACCESS, evt); } static irqreturn_t arch_timer_handler_phys(int irq, void *dev_id) { struct clock_event_device *evt = dev_id; return timer_handler(ARCH_TIMER_PHYS_ACCESS, evt); } static irqreturn_t arch_timer_handler_phys_mem(int irq, void *dev_id) { struct clock_event_device *evt = dev_id; return timer_handler(ARCH_TIMER_MEM_PHYS_ACCESS, evt); } static irqreturn_t arch_timer_handler_virt_mem(int irq, void *dev_id) { struct clock_event_device *evt = dev_id; return timer_handler(ARCH_TIMER_MEM_VIRT_ACCESS, evt); } static __always_inline int arch_timer_shutdown(const int access, struct clock_event_device *clk) { unsigned long ctrl; ctrl = arch_timer_reg_read(access, ARCH_TIMER_REG_CTRL, clk); ctrl &= ~ARCH_TIMER_CTRL_ENABLE; arch_timer_reg_write(access, ARCH_TIMER_REG_CTRL, ctrl, clk); return 0; } static int arch_timer_shutdown_virt(struct clock_event_device *clk) { return arch_timer_shutdown(ARCH_TIMER_VIRT_ACCESS, clk); } static int arch_timer_shutdown_phys(struct clock_event_device *clk) { return arch_timer_shutdown(ARCH_TIMER_PHYS_ACCESS, clk); } static int arch_timer_shutdown_virt_mem(struct clock_event_device *clk) { return arch_timer_shutdown(ARCH_TIMER_MEM_VIRT_ACCESS, clk); } static int arch_timer_shutdown_phys_mem(struct clock_event_device *clk) { return arch_timer_shutdown(ARCH_TIMER_MEM_PHYS_ACCESS, clk); } static __always_inline void set_next_event(const int access, unsigned long evt, struct clock_event_device *clk) { unsigned long ctrl; u64 cnt; ctrl = arch_timer_reg_read(access, ARCH_TIMER_REG_CTRL, clk); ctrl |= ARCH_TIMER_CTRL_ENABLE; ctrl &= ~ARCH_TIMER_CTRL_IT_MASK; if (access == ARCH_TIMER_PHYS_ACCESS) cnt = __arch_counter_get_cntpct(); else cnt = __arch_counter_get_cntvct(); arch_timer_reg_write(access, ARCH_TIMER_REG_CVAL, evt + cnt, clk); arch_timer_reg_write(access, ARCH_TIMER_REG_CTRL, ctrl, clk); } static int arch_timer_set_next_event_virt(unsigned long evt, struct clock_event_device *clk) { set_next_event(ARCH_TIMER_VIRT_ACCESS, evt, clk); return 0; } static int arch_timer_set_next_event_phys(unsigned long evt, struct clock_event_device *clk) { set_next_event(ARCH_TIMER_PHYS_ACCESS, evt, clk); return 0; } static noinstr u64 arch_counter_get_cnt_mem(struct arch_timer *t, int offset_lo) { u32 cnt_lo, cnt_hi, tmp_hi; do { cnt_hi = __le32_to_cpu((__le32 __force)__raw_readl(t->base + offset_lo + 4)); cnt_lo = __le32_to_cpu((__le32 __force)__raw_readl(t->base + offset_lo)); tmp_hi = __le32_to_cpu((__le32 __force)__raw_readl(t->base + offset_lo + 4)); } while (cnt_hi != tmp_hi); return ((u64) cnt_hi << 32) | cnt_lo; } static __always_inline void set_next_event_mem(const int access, unsigned long evt, struct clock_event_device *clk) { struct arch_timer *timer = to_arch_timer(clk); unsigned long ctrl; u64 cnt; ctrl = arch_timer_reg_read(access, ARCH_TIMER_REG_CTRL, clk); /* Timer must be disabled before programming CVAL */ if (ctrl & ARCH_TIMER_CTRL_ENABLE) { ctrl &= ~ARCH_TIMER_CTRL_ENABLE; arch_timer_reg_write(access, ARCH_TIMER_REG_CTRL, ctrl, clk); } ctrl |= ARCH_TIMER_CTRL_ENABLE; ctrl &= ~ARCH_TIMER_CTRL_IT_MASK; if (access == ARCH_TIMER_MEM_VIRT_ACCESS) cnt = arch_counter_get_cnt_mem(timer, CNTVCT_LO); else cnt = arch_counter_get_cnt_mem(timer, CNTPCT_LO); arch_timer_reg_write(access, ARCH_TIMER_REG_CVAL, evt + cnt, clk); arch_timer_reg_write(access, ARCH_TIMER_REG_CTRL, ctrl, clk); } static int arch_timer_set_next_event_virt_mem(unsigned long evt, struct clock_event_device *clk) { set_next_event_mem(ARCH_TIMER_MEM_VIRT_ACCESS, evt, clk); return 0; } static int arch_timer_set_next_event_phys_mem(unsigned long evt, struct clock_event_device *clk) { set_next_event_mem(ARCH_TIMER_MEM_PHYS_ACCESS, evt, clk); return 0; } static u64 __arch_timer_check_delta(void) { #ifdef CONFIG_ARM64 const struct midr_range broken_cval_midrs[] = { /* * XGene-1 implements CVAL in terms of TVAL, meaning * that the maximum timer range is 32bit. Shame on them. * * Note that TVAL is signed, thus has only 31 of its * 32 bits to express magnitude. */ MIDR_REV_RANGE(MIDR_CPU_MODEL(ARM_CPU_IMP_APM, APM_CPU_PART_XGENE), APM_CPU_VAR_POTENZA, 0x0, 0xf), {}, }; if (is_midr_in_range_list(broken_cval_midrs)) { pr_warn_once("Broken CNTx_CVAL_EL1, using 31 bit TVAL instead.\n"); return CLOCKSOURCE_MASK(31); } #endif return CLOCKSOURCE_MASK(arch_counter_get_width()); } static void __arch_timer_setup(unsigned type, struct clock_event_device *clk) { u64 max_delta; clk->features = CLOCK_EVT_FEAT_ONESHOT; if (type == ARCH_TIMER_TYPE_CP15) { typeof(clk->set_next_event) sne; arch_timer_check_ool_workaround(ate_match_local_cap_id, NULL); if (arch_timer_c3stop) clk->features |= CLOCK_EVT_FEAT_C3STOP; clk->name = "arch_sys_timer"; clk->rating = 450; clk->cpumask = cpumask_of(smp_processor_id()); clk->irq = arch_timer_ppi[arch_timer_uses_ppi]; switch (arch_timer_uses_ppi) { case ARCH_TIMER_VIRT_PPI: clk->set_state_shutdown = arch_timer_shutdown_virt; clk->set_state_oneshot_stopped = arch_timer_shutdown_virt; sne = erratum_handler(set_next_event_virt); break; case ARCH_TIMER_PHYS_SECURE_PPI: case ARCH_TIMER_PHYS_NONSECURE_PPI: case ARCH_TIMER_HYP_PPI: clk->set_state_shutdown = arch_timer_shutdown_phys; clk->set_state_oneshot_stopped = arch_timer_shutdown_phys; sne = erratum_handler(set_next_event_phys); break; default: BUG(); } clk->set_next_event = sne; max_delta = __arch_timer_check_delta(); } else { clk->features |= CLOCK_EVT_FEAT_DYNIRQ; clk->name = "arch_mem_timer"; clk->rating = 400; clk->cpumask = cpu_possible_mask; if (arch_timer_mem_use_virtual) { clk->set_state_shutdown = arch_timer_shutdown_virt_mem; clk->set_state_oneshot_stopped = arch_timer_shutdown_virt_mem; clk->set_next_event = arch_timer_set_next_event_virt_mem; } else { clk->set_state_shutdown = arch_timer_shutdown_phys_mem; clk->set_state_oneshot_stopped = arch_timer_shutdown_phys_mem; clk->set_next_event = arch_timer_set_next_event_phys_mem; } max_delta = CLOCKSOURCE_MASK(56); } clk->set_state_shutdown(clk); clockevents_config_and_register(clk, arch_timer_rate, 0xf, max_delta); } static void arch_timer_evtstrm_enable(unsigned int divider) { u32 cntkctl = arch_timer_get_cntkctl(); #ifdef CONFIG_ARM64 /* ECV is likely to require a large divider. Use the EVNTIS flag. */ if (cpus_have_final_cap(ARM64_HAS_ECV) && divider > 15) { cntkctl |= ARCH_TIMER_EVT_INTERVAL_SCALE; divider -= 8; } #endif divider = min(divider, 15U); cntkctl &= ~ARCH_TIMER_EVT_TRIGGER_MASK; /* Set the divider and enable virtual event stream */ cntkctl |= (divider << ARCH_TIMER_EVT_TRIGGER_SHIFT) | ARCH_TIMER_VIRT_EVT_EN; arch_timer_set_cntkctl(cntkctl); arch_timer_set_evtstrm_feature(); cpumask_set_cpu(smp_processor_id(), &evtstrm_available); } static void arch_timer_configure_evtstream(void) { int evt_stream_div, lsb; /* * As the event stream can at most be generated at half the frequency * of the counter, use half the frequency when computing the divider. */ evt_stream_div = arch_timer_rate / ARCH_TIMER_EVT_STREAM_FREQ / 2; /* * Find the closest power of two to the divisor. If the adjacent bit * of lsb (last set bit, starts from 0) is set, then we use (lsb + 1). */ lsb = fls(evt_stream_div) - 1; if (lsb > 0 && (evt_stream_div & BIT(lsb - 1))) lsb++; /* enable event stream */ arch_timer_evtstrm_enable(max(0, lsb)); } static int arch_timer_evtstrm_starting_cpu(unsigned int cpu) { arch_timer_configure_evtstream(); return 0; } static int arch_timer_evtstrm_dying_cpu(unsigned int cpu) { cpumask_clear_cpu(smp_processor_id(), &evtstrm_available); return 0; } static int __init arch_timer_evtstrm_register(void) { if (!arch_timer_evt || !evtstrm_enable) return 0; return cpuhp_setup_state(CPUHP_AP_ARM_ARCH_TIMER_EVTSTRM_STARTING, "clockevents/arm/arch_timer_evtstrm:starting", arch_timer_evtstrm_starting_cpu, arch_timer_evtstrm_dying_cpu); } core_initcall(arch_timer_evtstrm_register); static void arch_counter_set_user_access(void) { u32 cntkctl = arch_timer_get_cntkctl(); /* Disable user access to the timers and both counters */ /* Also disable virtual event stream */ cntkctl &= ~(ARCH_TIMER_USR_PT_ACCESS_EN | ARCH_TIMER_USR_VT_ACCESS_EN | ARCH_TIMER_USR_VCT_ACCESS_EN | ARCH_TIMER_VIRT_EVT_EN | ARCH_TIMER_USR_PCT_ACCESS_EN); /* * Enable user access to the virtual counter if it doesn't * need to be workaround. The vdso may have been already * disabled though. */ if (arch_timer_this_cpu_has_cntvct_wa()) pr_info("CPU%d: Trapping CNTVCT access\n", smp_processor_id()); else cntkctl |= ARCH_TIMER_USR_VCT_ACCESS_EN; arch_timer_set_cntkctl(cntkctl); } static bool arch_timer_has_nonsecure_ppi(void) { return (arch_timer_uses_ppi == ARCH_TIMER_PHYS_SECURE_PPI && arch_timer_ppi[ARCH_TIMER_PHYS_NONSECURE_PPI]); } static u32 check_ppi_trigger(int irq) { u32 flags = irq_get_trigger_type(irq); if (flags != IRQF_TRIGGER_HIGH && flags != IRQF_TRIGGER_LOW) { pr_warn("WARNING: Invalid trigger for IRQ%d, assuming level low\n", irq); pr_warn("WARNING: Please fix your firmware\n"); flags = IRQF_TRIGGER_LOW; } return flags; } static int arch_timer_starting_cpu(unsigned int cpu) { struct clock_event_device *clk = this_cpu_ptr(arch_timer_evt); u32 flags; __arch_timer_setup(ARCH_TIMER_TYPE_CP15, clk); flags = check_ppi_trigger(arch_timer_ppi[arch_timer_uses_ppi]); enable_percpu_irq(arch_timer_ppi[arch_timer_uses_ppi], flags); if (arch_timer_has_nonsecure_ppi()) { flags = check_ppi_trigger(arch_timer_ppi[ARCH_TIMER_PHYS_NONSECURE_PPI]); enable_percpu_irq(arch_timer_ppi[ARCH_TIMER_PHYS_NONSECURE_PPI], flags); } arch_counter_set_user_access(); return 0; } static int validate_timer_rate(void) { if (!arch_timer_rate) return -EINVAL; /* Arch timer frequency < 1MHz can cause trouble */ WARN_ON(arch_timer_rate < 1000000); return 0; } /* * For historical reasons, when probing with DT we use whichever (non-zero) * rate was probed first, and don't verify that others match. If the first node * probed has a clock-frequency property, this overrides the HW register. */ static void __init arch_timer_of_configure_rate(u32 rate, struct device_node *np) { /* Who has more than one independent system counter? */ if (arch_timer_rate) return; if (of_property_read_u32(np, "clock-frequency", &arch_timer_rate)) arch_timer_rate = rate; /* Check the timer frequency. */ if (validate_timer_rate()) pr_warn("frequency not available\n"); } static void __init arch_timer_banner(unsigned type) { pr_info("%s%s%s timer(s) running at %lu.%02luMHz (%s%s%s).\n", type & ARCH_TIMER_TYPE_CP15 ? "cp15" : "", type == (ARCH_TIMER_TYPE_CP15 | ARCH_TIMER_TYPE_MEM) ? " and " : "", type & ARCH_TIMER_TYPE_MEM ? "mmio" : "", (unsigned long)arch_timer_rate / 1000000, (unsigned long)(arch_timer_rate / 10000) % 100, type & ARCH_TIMER_TYPE_CP15 ? (arch_timer_uses_ppi == ARCH_TIMER_VIRT_PPI) ? "virt" : "phys" : "", type == (ARCH_TIMER_TYPE_CP15 | ARCH_TIMER_TYPE_MEM) ? "/" : "", type & ARCH_TIMER_TYPE_MEM ? arch_timer_mem_use_virtual ? "virt" : "phys" : ""); } u32 arch_timer_get_rate(void) { return arch_timer_rate; } bool arch_timer_evtstrm_available(void) { /* * We might get called from a preemptible context. This is fine * because availability of the event stream should be always the same * for a preemptible context and context where we might resume a task. */ return cpumask_test_cpu(raw_smp_processor_id(), &evtstrm_available); } static noinstr u64 arch_counter_get_cntvct_mem(void) { return arch_counter_get_cnt_mem(arch_timer_mem, CNTVCT_LO); } static struct arch_timer_kvm_info arch_timer_kvm_info; struct arch_timer_kvm_info *arch_timer_get_kvm_info(void) { return &arch_timer_kvm_info; } static void __init arch_counter_register(unsigned type) { u64 (*scr)(void); u64 start_count; int width; /* Register the CP15 based counter if we have one */ if (type & ARCH_TIMER_TYPE_CP15) { u64 (*rd)(void); if ((IS_ENABLED(CONFIG_ARM64) && !is_hyp_mode_available()) || arch_timer_uses_ppi == ARCH_TIMER_VIRT_PPI) { if (arch_timer_counter_has_wa()) { rd = arch_counter_get_cntvct_stable; scr = raw_counter_get_cntvct_stable; } else { rd = arch_counter_get_cntvct; scr = arch_counter_get_cntvct; } } else { if (arch_timer_counter_has_wa()) { rd = arch_counter_get_cntpct_stable; scr = raw_counter_get_cntpct_stable; } else { rd = arch_counter_get_cntpct; scr = arch_counter_get_cntpct; } } arch_timer_read_counter = rd; clocksource_counter.vdso_clock_mode = vdso_default; } else { arch_timer_read_counter = arch_counter_get_cntvct_mem; scr = arch_counter_get_cntvct_mem; } width = arch_counter_get_width(); clocksource_counter.mask = CLOCKSOURCE_MASK(width); cyclecounter.mask = CLOCKSOURCE_MASK(width); if (!arch_counter_suspend_stop) clocksource_counter.flags |= CLOCK_SOURCE_SUSPEND_NONSTOP; start_count = arch_timer_read_counter(); clocksource_register_hz(&clocksource_counter, arch_timer_rate); cyclecounter.mult = clocksource_counter.mult; cyclecounter.shift = clocksource_counter.shift; timecounter_init(&arch_timer_kvm_info.timecounter, &cyclecounter, start_count); sched_clock_register(scr, width, arch_timer_rate); } static void arch_timer_stop(struct clock_event_device *clk) { pr_debug("disable IRQ%d cpu #%d\n", clk->irq, smp_processor_id()); disable_percpu_irq(arch_timer_ppi[arch_timer_uses_ppi]); if (arch_timer_has_nonsecure_ppi()) disable_percpu_irq(arch_timer_ppi[ARCH_TIMER_PHYS_NONSECURE_PPI]); } static int arch_timer_dying_cpu(unsigned int cpu) { struct clock_event_device *clk = this_cpu_ptr(arch_timer_evt); arch_timer_stop(clk); return 0; } #ifdef CONFIG_CPU_PM static DEFINE_PER_CPU(unsigned long, saved_cntkctl); static int arch_timer_cpu_pm_notify(struct notifier_block *self, unsigned long action, void *hcpu) { if (action == CPU_PM_ENTER) { __this_cpu_write(saved_cntkctl, arch_timer_get_cntkctl()); cpumask_clear_cpu(smp_processor_id(), &evtstrm_available); } else if (action == CPU_PM_ENTER_FAILED || action == CPU_PM_EXIT) { arch_timer_set_cntkctl(__this_cpu_read(saved_cntkctl)); if (arch_timer_have_evtstrm_feature()) cpumask_set_cpu(smp_processor_id(), &evtstrm_available); } return NOTIFY_OK; } static struct notifier_block arch_timer_cpu_pm_notifier = { .notifier_call = arch_timer_cpu_pm_notify, }; static int __init arch_timer_cpu_pm_init(void) { return cpu_pm_register_notifier(&arch_timer_cpu_pm_notifier); } static void __init arch_timer_cpu_pm_deinit(void) { WARN_ON(cpu_pm_unregister_notifier(&arch_timer_cpu_pm_notifier)); } #else static int __init arch_timer_cpu_pm_init(void) { return 0; } static void __init arch_timer_cpu_pm_deinit(void) { } #endif static int __init arch_timer_register(void) { int err; int ppi; arch_timer_evt = alloc_percpu(struct clock_event_device); if (!arch_timer_evt) { err = -ENOMEM; goto out; } ppi = arch_timer_ppi[arch_timer_uses_ppi]; switch (arch_timer_uses_ppi) { case ARCH_TIMER_VIRT_PPI: err = request_percpu_irq(ppi, arch_timer_handler_virt, "arch_timer", arch_timer_evt); break; case ARCH_TIMER_PHYS_SECURE_PPI: case ARCH_TIMER_PHYS_NONSECURE_PPI: err = request_percpu_irq(ppi, arch_timer_handler_phys, "arch_timer", arch_timer_evt); if (!err && arch_timer_has_nonsecure_ppi()) { ppi = arch_timer_ppi[ARCH_TIMER_PHYS_NONSECURE_PPI]; err = request_percpu_irq(ppi, arch_timer_handler_phys, "arch_timer", arch_timer_evt); if (err) free_percpu_irq(arch_timer_ppi[ARCH_TIMER_PHYS_SECURE_PPI], arch_timer_evt); } break; case ARCH_TIMER_HYP_PPI: err = request_percpu_irq(ppi, arch_timer_handler_phys, "arch_timer", arch_timer_evt); break; default: BUG(); } if (err) { pr_err("can't register interrupt %d (%d)\n", ppi, err); goto out_free; } err = arch_timer_cpu_pm_init(); if (err) goto out_unreg_notify; /* Register and immediately configure the timer on the boot CPU */ err = cpuhp_setup_state(CPUHP_AP_ARM_ARCH_TIMER_STARTING, "clockevents/arm/arch_timer:starting", arch_timer_starting_cpu, arch_timer_dying_cpu); if (err) goto out_unreg_cpupm; return 0; out_unreg_cpupm: arch_timer_cpu_pm_deinit(); out_unreg_notify: free_percpu_irq(arch_timer_ppi[arch_timer_uses_ppi], arch_timer_evt); if (arch_timer_has_nonsecure_ppi()) free_percpu_irq(arch_timer_ppi[ARCH_TIMER_PHYS_NONSECURE_PPI], arch_timer_evt); out_free: free_percpu(arch_timer_evt); arch_timer_evt = NULL; out: return err; } static int __init arch_timer_mem_register(void __iomem *base, unsigned int irq) { int ret; irq_handler_t func; arch_timer_mem = kzalloc(sizeof(*arch_timer_mem), GFP_KERNEL); if (!arch_timer_mem) return -ENOMEM; arch_timer_mem->base = base; arch_timer_mem->evt.irq = irq; __arch_timer_setup(ARCH_TIMER_TYPE_MEM, &arch_timer_mem->evt); if (arch_timer_mem_use_virtual) func = arch_timer_handler_virt_mem; else func = arch_timer_handler_phys_mem; ret = request_irq(irq, func, IRQF_TIMER, "arch_mem_timer", &arch_timer_mem->evt); if (ret) { pr_err("Failed to request mem timer irq\n"); kfree(arch_timer_mem); arch_timer_mem = NULL; } return ret; } static const struct of_device_id arch_timer_of_match[] __initconst = { { .compatible = "arm,armv7-timer", }, { .compatible = "arm,armv8-timer", }, {}, }; static const struct of_device_id arch_timer_mem_of_match[] __initconst = { { .compatible = "arm,armv7-timer-mem", }, {}, }; static bool __init arch_timer_needs_of_probing(void) { struct device_node *dn; bool needs_probing = false; unsigned int mask = ARCH_TIMER_TYPE_CP15 | ARCH_TIMER_TYPE_MEM; /* We have two timers, and both device-tree nodes are probed. */ if ((arch_timers_present & mask) == mask) return false; /* * Only one type of timer is probed, * check if we have another type of timer node in device-tree. */ if (arch_timers_present & ARCH_TIMER_TYPE_CP15) dn = of_find_matching_node(NULL, arch_timer_mem_of_match); else dn = of_find_matching_node(NULL, arch_timer_of_match); if (dn && of_device_is_available(dn)) needs_probing = true; of_node_put(dn); return needs_probing; } static int __init arch_timer_common_init(void) { arch_timer_banner(arch_timers_present); arch_counter_register(arch_timers_present); return arch_timer_arch_init(); } /** * arch_timer_select_ppi() - Select suitable PPI for the current system. * * If HYP mode is available, we know that the physical timer * has been configured to be accessible from PL1. Use it, so * that a guest can use the virtual timer instead. * * On ARMv8.1 with VH extensions, the kernel runs in HYP. VHE * accesses to CNTP_*_EL1 registers are silently redirected to * their CNTHP_*_EL2 counterparts, and use a different PPI * number. * * If no interrupt provided for virtual timer, we'll have to * stick to the physical timer. It'd better be accessible... * For arm64 we never use the secure interrupt. * * Return: a suitable PPI type for the current system. */ static enum arch_timer_ppi_nr __init arch_timer_select_ppi(void) { if (is_kernel_in_hyp_mode()) return ARCH_TIMER_HYP_PPI; if (!is_hyp_mode_available() && arch_timer_ppi[ARCH_TIMER_VIRT_PPI]) return ARCH_TIMER_VIRT_PPI; if (IS_ENABLED(CONFIG_ARM64)) return ARCH_TIMER_PHYS_NONSECURE_PPI; return ARCH_TIMER_PHYS_SECURE_PPI; } static void __init arch_timer_populate_kvm_info(void) { arch_timer_kvm_info.virtual_irq = arch_timer_ppi[ARCH_TIMER_VIRT_PPI]; if (is_kernel_in_hyp_mode()) arch_timer_kvm_info.physical_irq = arch_timer_ppi[ARCH_TIMER_PHYS_NONSECURE_PPI]; } static int __init arch_timer_of_init(struct device_node *np) { int i, irq, ret; u32 rate; bool has_names; if (arch_timers_present & ARCH_TIMER_TYPE_CP15) { pr_warn("multiple nodes in dt, skipping\n"); return 0; } arch_timers_present |= ARCH_TIMER_TYPE_CP15; has_names = of_property_present(np, "interrupt-names"); for (i = ARCH_TIMER_PHYS_SECURE_PPI; i < ARCH_TIMER_MAX_TIMER_PPI; i++) { if (has_names) irq = of_irq_get_byname(np, arch_timer_ppi_names[i]); else irq = of_irq_get(np, i); if (irq > 0) arch_timer_ppi[i] = irq; } arch_timer_populate_kvm_info(); rate = arch_timer_get_cntfrq(); arch_timer_of_configure_rate(rate, np); arch_timer_c3stop = !of_property_read_bool(np, "always-on"); /* Check for globally applicable workarounds */ arch_timer_check_ool_workaround(ate_match_dt, np); /* * If we cannot rely on firmware initializing the timer registers then * we should use the physical timers instead. */ if (IS_ENABLED(CONFIG_ARM) && of_property_read_bool(np, "arm,cpu-registers-not-fw-configured")) arch_timer_uses_ppi = ARCH_TIMER_PHYS_SECURE_PPI; else arch_timer_uses_ppi = arch_timer_select_ppi(); if (!arch_timer_ppi[arch_timer_uses_ppi]) { pr_err("No interrupt available, giving up\n"); return -EINVAL; } /* On some systems, the counter stops ticking when in suspend. */ arch_counter_suspend_stop = of_property_read_bool(np, "arm,no-tick-in-suspend"); ret = arch_timer_register(); if (ret) return ret; if (arch_timer_needs_of_probing()) return 0; return arch_timer_common_init(); } TIMER_OF_DECLARE(armv7_arch_timer, "arm,armv7-timer", arch_timer_of_init); TIMER_OF_DECLARE(armv8_arch_timer, "arm,armv8-timer", arch_timer_of_init); static u32 __init arch_timer_mem_frame_get_cntfrq(struct arch_timer_mem_frame *frame) { void __iomem *base; u32 rate; base = ioremap(frame->cntbase, frame->size); if (!base) { pr_err("Unable to map frame @ %pa\n", &frame->cntbase); return 0; } rate = readl_relaxed(base + CNTFRQ); iounmap(base); return rate; } static struct arch_timer_mem_frame * __init arch_timer_mem_find_best_frame(struct arch_timer_mem *timer_mem) { struct arch_timer_mem_frame *frame, *best_frame = NULL; void __iomem *cntctlbase; u32 cnttidr; int i; cntctlbase = ioremap(timer_mem->cntctlbase, timer_mem->size); if (!cntctlbase) { pr_err("Can't map CNTCTLBase @ %pa\n", &timer_mem->cntctlbase); return NULL; } cnttidr = readl_relaxed(cntctlbase + CNTTIDR); /* * Try to find a virtual capable frame. Otherwise fall back to a * physical capable frame. */ for (i = 0; i < ARCH_TIMER_MEM_MAX_FRAMES; i++) { u32 cntacr = CNTACR_RFRQ | CNTACR_RWPT | CNTACR_RPCT | CNTACR_RWVT | CNTACR_RVOFF | CNTACR_RVCT; frame = &timer_mem->frame[i]; if (!frame->valid) continue; /* Try enabling everything, and see what sticks */ writel_relaxed(cntacr, cntctlbase + CNTACR(i)); cntacr = readl_relaxed(cntctlbase + CNTACR(i)); if ((cnttidr & CNTTIDR_VIRT(i)) && !(~cntacr & (CNTACR_RWVT | CNTACR_RVCT))) { best_frame = frame; arch_timer_mem_use_virtual = true; break; } if (~cntacr & (CNTACR_RWPT | CNTACR_RPCT)) continue; best_frame = frame; } iounmap(cntctlbase); return best_frame; } static int __init arch_timer_mem_frame_register(struct arch_timer_mem_frame *frame) { void __iomem *base; int ret, irq; if (arch_timer_mem_use_virtual) irq = frame->virt_irq; else irq = frame->phys_irq; if (!irq) { pr_err("Frame missing %s irq.\n", arch_timer_mem_use_virtual ? "virt" : "phys"); return -EINVAL; } if (!request_mem_region(frame->cntbase, frame->size, "arch_mem_timer")) return -EBUSY; base = ioremap(frame->cntbase, frame->size); if (!base) { pr_err("Can't map frame's registers\n"); return -ENXIO; } ret = arch_timer_mem_register(base, irq); if (ret) { iounmap(base); return ret; } arch_timers_present |= ARCH_TIMER_TYPE_MEM; return 0; } static int __init arch_timer_mem_of_init(struct device_node *np) { struct arch_timer_mem *timer_mem; struct arch_timer_mem_frame *frame; struct resource res; int ret = -EINVAL; u32 rate; timer_mem = kzalloc(sizeof(*timer_mem), GFP_KERNEL); if (!timer_mem) return -ENOMEM; if (of_address_to_resource(np, 0, &res)) goto out; timer_mem->cntctlbase = res.start; timer_mem->size = resource_size(&res); for_each_available_child_of_node_scoped(np, frame_node) { u32 n; struct arch_timer_mem_frame *frame; if (of_property_read_u32(frame_node, "frame-number", &n)) { pr_err(FW_BUG "Missing frame-number.\n"); goto out; } if (n >= ARCH_TIMER_MEM_MAX_FRAMES) { pr_err(FW_BUG "Wrong frame-number, only 0-%u are permitted.\n", ARCH_TIMER_MEM_MAX_FRAMES - 1); goto out; } frame = &timer_mem->frame[n]; if (frame->valid) { pr_err(FW_BUG "Duplicated frame-number.\n"); goto out; } if (of_address_to_resource(frame_node, 0, &res)) goto out; frame->cntbase = res.start; frame->size = resource_size(&res); frame->virt_irq = irq_of_parse_and_map(frame_node, ARCH_TIMER_VIRT_SPI); frame->phys_irq = irq_of_parse_and_map(frame_node, ARCH_TIMER_PHYS_SPI); frame->valid = true; } frame = arch_timer_mem_find_best_frame(timer_mem); if (!frame) { pr_err("Unable to find a suitable frame in timer @ %pa\n", &timer_mem->cntctlbase); ret = -EINVAL; goto out; } rate = arch_timer_mem_frame_get_cntfrq(frame); arch_timer_of_configure_rate(rate, np); ret = arch_timer_mem_frame_register(frame); if (!ret && !arch_timer_needs_of_probing()) ret = arch_timer_common_init(); out: kfree(timer_mem); return ret; } TIMER_OF_DECLARE(armv7_arch_timer_mem, "arm,armv7-timer-mem", arch_timer_mem_of_init); #ifdef CONFIG_ACPI_GTDT static int __init arch_timer_mem_verify_cntfrq(struct arch_timer_mem *timer_mem) { struct arch_timer_mem_frame *frame; u32 rate; int i; for (i = 0; i < ARCH_TIMER_MEM_MAX_FRAMES; i++) { frame = &timer_mem->frame[i]; if (!frame->valid) continue; rate = arch_timer_mem_frame_get_cntfrq(frame); if (rate == arch_timer_rate) continue; pr_err(FW_BUG "CNTFRQ mismatch: frame @ %pa: (0x%08lx), CPU: (0x%08lx)\n", &frame->cntbase, (unsigned long)rate, (unsigned long)arch_timer_rate); return -EINVAL; } return 0; } static int __init arch_timer_mem_acpi_init(int platform_timer_count) { struct arch_timer_mem *timers, *timer; struct arch_timer_mem_frame *frame, *best_frame = NULL; int timer_count, i, ret = 0; timers = kcalloc(platform_timer_count, sizeof(*timers), GFP_KERNEL); if (!timers) return -ENOMEM; ret = acpi_arch_timer_mem_init(timers, &timer_count); if (ret || !timer_count) goto out; /* * While unlikely, it's theoretically possible that none of the frames * in a timer expose the combination of feature we want. */ for (i = 0; i < timer_count; i++) { timer = &timers[i]; frame = arch_timer_mem_find_best_frame(timer); if (!best_frame) best_frame = frame; ret = arch_timer_mem_verify_cntfrq(timer); if (ret) { pr_err("Disabling MMIO timers due to CNTFRQ mismatch\n"); goto out; } if (!best_frame) /* implies !frame */ /* * Only complain about missing suitable frames if we * haven't already found one in a previous iteration. */ pr_err("Unable to find a suitable frame in timer @ %pa\n", &timer->cntctlbase); } if (best_frame) ret = arch_timer_mem_frame_register(best_frame); out: kfree(timers); return ret; } /* Initialize per-processor generic timer and memory-mapped timer(if present) */ static int __init arch_timer_acpi_init(struct acpi_table_header *table) { int ret, platform_timer_count; if (arch_timers_present & ARCH_TIMER_TYPE_CP15) { pr_warn("already initialized, skipping\n"); return -EINVAL; } arch_timers_present |= ARCH_TIMER_TYPE_CP15; ret = acpi_gtdt_init(table, &platform_timer_count); if (ret) return ret; arch_timer_ppi[ARCH_TIMER_PHYS_NONSECURE_PPI] = acpi_gtdt_map_ppi(ARCH_TIMER_PHYS_NONSECURE_PPI); arch_timer_ppi[ARCH_TIMER_VIRT_PPI] = acpi_gtdt_map_ppi(ARCH_TIMER_VIRT_PPI); arch_timer_ppi[ARCH_TIMER_HYP_PPI] = acpi_gtdt_map_ppi(ARCH_TIMER_HYP_PPI); arch_timer_populate_kvm_info(); /* * When probing via ACPI, we have no mechanism to override the sysreg * CNTFRQ value. This *must* be correct. */ arch_timer_rate = arch_timer_get_cntfrq(); ret = validate_timer_rate(); if (ret) { pr_err(FW_BUG "frequency not available.\n"); return ret; } arch_timer_uses_ppi = arch_timer_select_ppi(); if (!arch_timer_ppi[arch_timer_uses_ppi]) { pr_err("No interrupt available, giving up\n"); return -EINVAL; } /* Always-on capability */ arch_timer_c3stop = acpi_gtdt_c3stop(arch_timer_uses_ppi); /* Check for globally applicable workarounds */ arch_timer_check_ool_workaround(ate_match_acpi_oem_info, table); ret = arch_timer_register(); if (ret) return ret; if (platform_timer_count && arch_timer_mem_acpi_init(platform_timer_count)) pr_err("Failed to initialize memory-mapped timer.\n"); return arch_timer_common_init(); } TIMER_ACPI_DECLARE(arch_timer, ACPI_SIG_GTDT, arch_timer_acpi_init); #endif int kvm_arch_ptp_get_crosststamp(u64 *cycle, struct timespec64 *ts, enum clocksource_ids *cs_id) { struct arm_smccc_res hvc_res; u32 ptp_counter; ktime_t ktime; if (!IS_ENABLED(CONFIG_HAVE_ARM_SMCCC_DISCOVERY)) return -EOPNOTSUPP; if (arch_timer_uses_ppi == ARCH_TIMER_VIRT_PPI) ptp_counter = KVM_PTP_VIRT_COUNTER; else ptp_counter = KVM_PTP_PHYS_COUNTER; arm_smccc_1_1_invoke(ARM_SMCCC_VENDOR_HYP_KVM_PTP_FUNC_ID, ptp_counter, &hvc_res); if ((int)(hvc_res.a0) < 0) return -EOPNOTSUPP; ktime = (u64)hvc_res.a0 << 32 | hvc_res.a1; *ts = ktime_to_timespec64(ktime); if (cycle) *cycle = (u64)hvc_res.a2 << 32 | hvc_res.a3; if (cs_id) *cs_id = CSID_ARM_ARCH_COUNTER; return 0; } EXPORT_SYMBOL_GPL(kvm_arch_ptp_get_crosststamp);
70 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 /* SPDX-License-Identifier: GPL-2.0 */ #ifndef _LINUX_ERR_H #define _LINUX_ERR_H #include <linux/compiler.h> #include <linux/types.h> #include <asm/errno.h> /* * Kernel pointers have redundant information, so we can use a * scheme where we can return either an error code or a normal * pointer with the same return value. * * This should be a per-architecture thing, to allow different * error and pointer decisions. */ #define MAX_ERRNO 4095 #ifndef __ASSEMBLY__ /** * IS_ERR_VALUE - Detect an error pointer. * @x: The pointer to check. * * Like IS_ERR(), but does not generate a compiler warning if result is unused. */ #define IS_ERR_VALUE(x) unlikely((unsigned long)(void *)(x) >= (unsigned long)-MAX_ERRNO) /** * ERR_PTR - Create an error pointer. * @error: A negative error code. * * Encodes @error into a pointer value. Users should consider the result * opaque and not assume anything about how the error is encoded. * * Return: A pointer with @error encoded within its value. */ static inline void * __must_check ERR_PTR(long error) { return (void *) error; } /* Return the pointer in the percpu address space. */ #define ERR_PTR_PCPU(error) ((void __percpu *)(unsigned long)ERR_PTR(error)) /* Cast an error pointer to __iomem. */ #define IOMEM_ERR_PTR(error) (__force void __iomem *)ERR_PTR(error) /** * PTR_ERR - Extract the error code from an error pointer. * @ptr: An error pointer. * Return: The error code within @ptr. */ static inline long __must_check PTR_ERR(__force const void *ptr) { return (long) ptr; } /* Read an error pointer from the percpu address space. */ #define PTR_ERR_PCPU(ptr) (PTR_ERR((const void *)(__force const unsigned long)(ptr))) /** * IS_ERR - Detect an error pointer. * @ptr: The pointer to check. * Return: true if @ptr is an error pointer, false otherwise. */ static inline bool __must_check IS_ERR(__force const void *ptr) { return IS_ERR_VALUE((unsigned long)ptr); } /* Read an error pointer from the percpu address space. */ #define IS_ERR_PCPU(ptr) (IS_ERR((const void *)(__force const unsigned long)(ptr))) /** * IS_ERR_OR_NULL - Detect an error pointer or a null pointer. * @ptr: The pointer to check. * * Like IS_ERR(), but also returns true for a null pointer. */ static inline bool __must_check IS_ERR_OR_NULL(__force const void *ptr) { return unlikely(!ptr) || IS_ERR_VALUE((unsigned long)ptr); } /** * ERR_CAST - Explicitly cast an error-valued pointer to another pointer type * @ptr: The pointer to cast. * * Explicitly cast an error-valued pointer to another pointer type in such a * way as to make it clear that's what's going on. */ static inline void * __must_check ERR_CAST(__force const void *ptr) { /* cast away the const */ return (void *) ptr; } /** * PTR_ERR_OR_ZERO - Extract the error code from a pointer if it has one. * @ptr: A potential error pointer. * * Convenience function that can be used inside a function that returns * an error code to propagate errors received as error pointers. * For example, ``return PTR_ERR_OR_ZERO(ptr);`` replaces: * * .. code-block:: c * * if (IS_ERR(ptr)) * return PTR_ERR(ptr); * else * return 0; * * Return: The error code within @ptr if it is an error pointer; 0 otherwise. */ static inline int __must_check PTR_ERR_OR_ZERO(__force const void *ptr) { if (IS_ERR(ptr)) return PTR_ERR(ptr); else return 0; } #endif #endif /* _LINUX_ERR_H */
8 8 8 8 237 118 118 117 68 118 118 237 237 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 /* SPDX-License-Identifier: GPL-2.0-only */ /* * Based on arch/arm/include/asm/tlbflush.h * * Copyright (C) 1999-2003 Russell King * Copyright (C) 2012 ARM Ltd. */ #ifndef __ASM_TLBFLUSH_H #define __ASM_TLBFLUSH_H #ifndef __ASSEMBLY__ #include <linux/bitfield.h> #include <linux/mm_types.h> #include <linux/sched.h> #include <linux/mmu_notifier.h> #include <asm/cputype.h> #include <asm/mmu.h> /* * Raw TLBI operations. * * Where necessary, use the __tlbi() macro to avoid asm() * boilerplate. Drivers and most kernel code should use the TLB * management routines in preference to the macro below. * * The macro can be used as __tlbi(op) or __tlbi(op, arg), depending * on whether a particular TLBI operation takes an argument or * not. The macros handles invoking the asm with or without the * register argument as appropriate. */ #define __TLBI_0(op, arg) asm (ARM64_ASM_PREAMBLE \ "tlbi " #op "\n" \ ALTERNATIVE("nop\n nop", \ "dsb ish\n tlbi " #op, \ ARM64_WORKAROUND_REPEAT_TLBI, \ CONFIG_ARM64_WORKAROUND_REPEAT_TLBI) \ : : ) #define __TLBI_1(op, arg) asm (ARM64_ASM_PREAMBLE \ "tlbi " #op ", %0\n" \ ALTERNATIVE("nop\n nop", \ "dsb ish\n tlbi " #op ", %0", \ ARM64_WORKAROUND_REPEAT_TLBI, \ CONFIG_ARM64_WORKAROUND_REPEAT_TLBI) \ : : "r" (arg)) #define __TLBI_N(op, arg, n, ...) __TLBI_##n(op, arg) #define __tlbi(op, ...) __TLBI_N(op, ##__VA_ARGS__, 1, 0) #define __tlbi_user(op, arg) do { \ if (arm64_kernel_unmapped_at_el0()) \ __tlbi(op, (arg) | USER_ASID_FLAG); \ } while (0) /* This macro creates a properly formatted VA operand for the TLBI */ #define __TLBI_VADDR(addr, asid) \ ({ \ unsigned long __ta = (addr) >> 12; \ __ta &= GENMASK_ULL(43, 0); \ __ta |= (unsigned long)(asid) << 48; \ __ta; \ }) /* * Get translation granule of the system, which is decided by * PAGE_SIZE. Used by TTL. * - 4KB : 1 * - 16KB : 2 * - 64KB : 3 */ #define TLBI_TTL_TG_4K 1 #define TLBI_TTL_TG_16K 2 #define TLBI_TTL_TG_64K 3 static inline unsigned long get_trans_granule(void) { switch (PAGE_SIZE) { case SZ_4K: return TLBI_TTL_TG_4K; case SZ_16K: return TLBI_TTL_TG_16K; case SZ_64K: return TLBI_TTL_TG_64K; default: return 0; } } /* * Level-based TLBI operations. * * When ARMv8.4-TTL exists, TLBI operations take an additional hint for * the level at which the invalidation must take place. If the level is * wrong, no invalidation may take place. In the case where the level * cannot be easily determined, the value TLBI_TTL_UNKNOWN will perform * a non-hinted invalidation. Any provided level outside the hint range * will also cause fall-back to non-hinted invalidation. * * For Stage-2 invalidation, use the level values provided to that effect * in asm/stage2_pgtable.h. */ #define TLBI_TTL_MASK GENMASK_ULL(47, 44) #define TLBI_TTL_UNKNOWN INT_MAX #define __tlbi_level(op, addr, level) do { \ u64 arg = addr; \ \ if (alternative_has_cap_unlikely(ARM64_HAS_ARMv8_4_TTL) && \ level >= 0 && level <= 3) { \ u64 ttl = level & 3; \ ttl |= get_trans_granule() << 2; \ arg &= ~TLBI_TTL_MASK; \ arg |= FIELD_PREP(TLBI_TTL_MASK, ttl); \ } \ \ __tlbi(op, arg); \ } while(0) #define __tlbi_user_level(op, arg, level) do { \ if (arm64_kernel_unmapped_at_el0()) \ __tlbi_level(op, (arg | USER_ASID_FLAG), level); \ } while (0) /* * This macro creates a properly formatted VA operand for the TLB RANGE. The * value bit assignments are: * * +----------+------+-------+-------+-------+----------------------+ * | ASID | TG | SCALE | NUM | TTL | BADDR | * +-----------------+-------+-------+-------+----------------------+ * |63 48|47 46|45 44|43 39|38 37|36 0| * * The address range is determined by below formula: [BADDR, BADDR + (NUM + 1) * * 2^(5*SCALE + 1) * PAGESIZE) * * Note that the first argument, baddr, is pre-shifted; If LPA2 is in use, BADDR * holds addr[52:16]. Else BADDR holds page number. See for example ARM DDI * 0487J.a section C5.5.60 "TLBI VAE1IS, TLBI VAE1ISNXS, TLB Invalidate by VA, * EL1, Inner Shareable". * */ #define TLBIR_ASID_MASK GENMASK_ULL(63, 48) #define TLBIR_TG_MASK GENMASK_ULL(47, 46) #define TLBIR_SCALE_MASK GENMASK_ULL(45, 44) #define TLBIR_NUM_MASK GENMASK_ULL(43, 39) #define TLBIR_TTL_MASK GENMASK_ULL(38, 37) #define TLBIR_BADDR_MASK GENMASK_ULL(36, 0) #define __TLBI_VADDR_RANGE(baddr, asid, scale, num, ttl) \ ({ \ unsigned long __ta = 0; \ unsigned long __ttl = (ttl >= 1 && ttl <= 3) ? ttl : 0; \ __ta |= FIELD_PREP(TLBIR_BADDR_MASK, baddr); \ __ta |= FIELD_PREP(TLBIR_TTL_MASK, __ttl); \ __ta |= FIELD_PREP(TLBIR_NUM_MASK, num); \ __ta |= FIELD_PREP(TLBIR_SCALE_MASK, scale); \ __ta |= FIELD_PREP(TLBIR_TG_MASK, get_trans_granule()); \ __ta |= FIELD_PREP(TLBIR_ASID_MASK, asid); \ __ta; \ }) /* These macros are used by the TLBI RANGE feature. */ #define __TLBI_RANGE_PAGES(num, scale) \ ((unsigned long)((num) + 1) << (5 * (scale) + 1)) #define MAX_TLBI_RANGE_PAGES __TLBI_RANGE_PAGES(31, 3) /* * Generate 'num' values from -1 to 31 with -1 rejected by the * __flush_tlb_range() loop below. Its return value is only * significant for a maximum of MAX_TLBI_RANGE_PAGES pages. If * 'pages' is more than that, you must iterate over the overall * range. */ #define __TLBI_RANGE_NUM(pages, scale) \ ({ \ int __pages = min((pages), \ __TLBI_RANGE_PAGES(31, (scale))); \ (__pages >> (5 * (scale) + 1)) - 1; \ }) /* * TLB Invalidation * ================ * * This header file implements the low-level TLB invalidation routines * (sometimes referred to as "flushing" in the kernel) for arm64. * * Every invalidation operation uses the following template: * * DSB ISHST // Ensure prior page-table updates have completed * TLBI ... // Invalidate the TLB * DSB ISH // Ensure the TLB invalidation has completed * if (invalidated kernel mappings) * ISB // Discard any instructions fetched from the old mapping * * * The following functions form part of the "core" TLB invalidation API, * as documented in Documentation/core-api/cachetlb.rst: * * flush_tlb_all() * Invalidate the entire TLB (kernel + user) on all CPUs * * flush_tlb_mm(mm) * Invalidate an entire user address space on all CPUs. * The 'mm' argument identifies the ASID to invalidate. * * flush_tlb_range(vma, start, end) * Invalidate the virtual-address range '[start, end)' on all * CPUs for the user address space corresponding to 'vma->mm'. * Note that this operation also invalidates any walk-cache * entries associated with translations for the specified address * range. * * flush_tlb_kernel_range(start, end) * Same as flush_tlb_range(..., start, end), but applies to * kernel mappings rather than a particular user address space. * Whilst not explicitly documented, this function is used when * unmapping pages from vmalloc/io space. * * flush_tlb_page(vma, addr) * Invalidate a single user mapping for address 'addr' in the * address space corresponding to 'vma->mm'. Note that this * operation only invalidates a single, last-level page-table * entry and therefore does not affect any walk-caches. * * * Next, we have some undocumented invalidation routines that you probably * don't want to call unless you know what you're doing: * * local_flush_tlb_all() * Same as flush_tlb_all(), but only applies to the calling CPU. * * __flush_tlb_kernel_pgtable(addr) * Invalidate a single kernel mapping for address 'addr' on all * CPUs, ensuring that any walk-cache entries associated with the * translation are also invalidated. * * __flush_tlb_range(vma, start, end, stride, last_level, tlb_level) * Invalidate the virtual-address range '[start, end)' on all * CPUs for the user address space corresponding to 'vma->mm'. * The invalidation operations are issued at a granularity * determined by 'stride' and only affect any walk-cache entries * if 'last_level' is equal to false. tlb_level is the level at * which the invalidation must take place. If the level is wrong, * no invalidation may take place. In the case where the level * cannot be easily determined, the value TLBI_TTL_UNKNOWN will * perform a non-hinted invalidation. * * * Finally, take a look at asm/tlb.h to see how tlb_flush() is implemented * on top of these routines, since that is our interface to the mmu_gather * API as used by munmap() and friends. */ static inline void local_flush_tlb_all(void) { dsb(nshst); __tlbi(vmalle1); dsb(nsh); isb(); } static inline void flush_tlb_all(void) { dsb(ishst); __tlbi(vmalle1is); dsb(ish); isb(); } static inline void flush_tlb_mm(struct mm_struct *mm) { unsigned long asid; dsb(ishst); asid = __TLBI_VADDR(0, ASID(mm)); __tlbi(aside1is, asid); __tlbi_user(aside1is, asid); dsb(ish); mmu_notifier_arch_invalidate_secondary_tlbs(mm, 0, -1UL); } static inline void __flush_tlb_page_nosync(struct mm_struct *mm, unsigned long uaddr) { unsigned long addr; dsb(ishst); addr = __TLBI_VADDR(uaddr, ASID(mm)); __tlbi(vale1is, addr); __tlbi_user(vale1is, addr); mmu_notifier_arch_invalidate_secondary_tlbs(mm, uaddr & PAGE_MASK, (uaddr & PAGE_MASK) + PAGE_SIZE); } static inline void flush_tlb_page_nosync(struct vm_area_struct *vma, unsigned long uaddr) { return __flush_tlb_page_nosync(vma->vm_mm, uaddr); } static inline void flush_tlb_page(struct vm_area_struct *vma, unsigned long uaddr) { flush_tlb_page_nosync(vma, uaddr); dsb(ish); } static inline bool arch_tlbbatch_should_defer(struct mm_struct *mm) { /* * TLB flush deferral is not required on systems which are affected by * ARM64_WORKAROUND_REPEAT_TLBI, as __tlbi()/__tlbi_user() implementation * will have two consecutive TLBI instructions with a dsb(ish) in between * defeating the purpose (i.e save overall 'dsb ish' cost). */ if (alternative_has_cap_unlikely(ARM64_WORKAROUND_REPEAT_TLBI)) return false; return true; } /* * To support TLB batched flush for multiple pages unmapping, we only send * the TLBI for each page in arch_tlbbatch_add_pending() and wait for the * completion at the end in arch_tlbbatch_flush(). Since we've already issued * TLBI for each page so only a DSB is needed to synchronise its effect on the * other CPUs. * * This will save the time waiting on DSB comparing issuing a TLBI;DSB sequence * for each page. */ static inline void arch_tlbbatch_flush(struct arch_tlbflush_unmap_batch *batch) { dsb(ish); } /* * This is meant to avoid soft lock-ups on large TLB flushing ranges and not * necessarily a performance improvement. */ #define MAX_DVM_OPS PTRS_PER_PTE /* * __flush_tlb_range_op - Perform TLBI operation upon a range * * @op: TLBI instruction that operates on a range (has 'r' prefix) * @start: The start address of the range * @pages: Range as the number of pages from 'start' * @stride: Flush granularity * @asid: The ASID of the task (0 for IPA instructions) * @tlb_level: Translation Table level hint, if known * @tlbi_user: If 'true', call an additional __tlbi_user() * (typically for user ASIDs). 'flase' for IPA instructions * @lpa2: If 'true', the lpa2 scheme is used as set out below * * When the CPU does not support TLB range operations, flush the TLB * entries one by one at the granularity of 'stride'. If the TLB * range ops are supported, then: * * 1. If FEAT_LPA2 is in use, the start address of a range operation must be * 64KB aligned, so flush pages one by one until the alignment is reached * using the non-range operations. This step is skipped if LPA2 is not in * use. * * 2. The minimum range granularity is decided by 'scale', so multiple range * TLBI operations may be required. Start from scale = 3, flush the largest * possible number of pages ((num+1)*2^(5*scale+1)) that fit into the * requested range, then decrement scale and continue until one or zero pages * are left. We must start from highest scale to ensure 64KB start alignment * is maintained in the LPA2 case. * * 3. If there is 1 page remaining, flush it through non-range operations. Range * operations can only span an even number of pages. We save this for last to * ensure 64KB start alignment is maintained for the LPA2 case. */ #define __flush_tlb_range_op(op, start, pages, stride, \ asid, tlb_level, tlbi_user, lpa2) \ do { \ typeof(start) __flush_start = start; \ typeof(pages) __flush_pages = pages; \ int num = 0; \ int scale = 3; \ int shift = lpa2 ? 16 : PAGE_SHIFT; \ unsigned long addr; \ \ while (__flush_pages > 0) { \ if (!system_supports_tlb_range() || \ __flush_pages == 1 || \ (lpa2 && __flush_start != ALIGN(__flush_start, SZ_64K))) { \ addr = __TLBI_VADDR(__flush_start, asid); \ __tlbi_level(op, addr, tlb_level); \ if (tlbi_user) \ __tlbi_user_level(op, addr, tlb_level); \ __flush_start += stride; \ __flush_pages -= stride >> PAGE_SHIFT; \ continue; \ } \ \ num = __TLBI_RANGE_NUM(__flush_pages, scale); \ if (num >= 0) { \ addr = __TLBI_VADDR_RANGE(__flush_start >> shift, asid, \ scale, num, tlb_level); \ __tlbi(r##op, addr); \ if (tlbi_user) \ __tlbi_user(r##op, addr); \ __flush_start += __TLBI_RANGE_PAGES(num, scale) << PAGE_SHIFT; \ __flush_pages -= __TLBI_RANGE_PAGES(num, scale);\ } \ scale--; \ } \ } while (0) #define __flush_s2_tlb_range_op(op, start, pages, stride, tlb_level) \ __flush_tlb_range_op(op, start, pages, stride, 0, tlb_level, false, kvm_lpa2_is_enabled()); static inline bool __flush_tlb_range_limit_excess(unsigned long start, unsigned long end, unsigned long pages, unsigned long stride) { /* * When the system does not support TLB range based flush * operation, (MAX_DVM_OPS - 1) pages can be handled. But * with TLB range based operation, MAX_TLBI_RANGE_PAGES * pages can be handled. */ if ((!system_supports_tlb_range() && (end - start) >= (MAX_DVM_OPS * stride)) || pages > MAX_TLBI_RANGE_PAGES) return true; return false; } static inline void __flush_tlb_range_nosync(struct mm_struct *mm, unsigned long start, unsigned long end, unsigned long stride, bool last_level, int tlb_level) { unsigned long asid, pages; start = round_down(start, stride); end = round_up(end, stride); pages = (end - start) >> PAGE_SHIFT; if (__flush_tlb_range_limit_excess(start, end, pages, stride)) { flush_tlb_mm(mm); return; } dsb(ishst); asid = ASID(mm); if (last_level) __flush_tlb_range_op(vale1is, start, pages, stride, asid, tlb_level, true, lpa2_is_enabled()); else __flush_tlb_range_op(vae1is, start, pages, stride, asid, tlb_level, true, lpa2_is_enabled()); mmu_notifier_arch_invalidate_secondary_tlbs(mm, start, end); } static inline void __flush_tlb_range(struct vm_area_struct *vma, unsigned long start, unsigned long end, unsigned long stride, bool last_level, int tlb_level) { __flush_tlb_range_nosync(vma->vm_mm, start, end, stride, last_level, tlb_level); dsb(ish); } static inline void flush_tlb_range(struct vm_area_struct *vma, unsigned long start, unsigned long end) { /* * We cannot use leaf-only invalidation here, since we may be invalidating * table entries as part of collapsing hugepages or moving page tables. * Set the tlb_level to TLBI_TTL_UNKNOWN because we can not get enough * information here. */ __flush_tlb_range(vma, start, end, PAGE_SIZE, false, TLBI_TTL_UNKNOWN); } static inline void flush_tlb_kernel_range(unsigned long start, unsigned long end) { const unsigned long stride = PAGE_SIZE; unsigned long pages; start = round_down(start, stride); end = round_up(end, stride); pages = (end - start) >> PAGE_SHIFT; if (__flush_tlb_range_limit_excess(start, end, pages, stride)) { flush_tlb_all(); return; } dsb(ishst); __flush_tlb_range_op(vaale1is, start, pages, stride, 0, TLBI_TTL_UNKNOWN, false, lpa2_is_enabled()); dsb(ish); isb(); } /* * Used to invalidate the TLB (walk caches) corresponding to intermediate page * table levels (pgd/pud/pmd). */ static inline void __flush_tlb_kernel_pgtable(unsigned long kaddr) { unsigned long addr = __TLBI_VADDR(kaddr, 0); dsb(ishst); __tlbi(vaae1is, addr); dsb(ish); isb(); } static inline void arch_tlbbatch_add_pending(struct arch_tlbflush_unmap_batch *batch, struct mm_struct *mm, unsigned long start, unsigned long end) { __flush_tlb_range_nosync(mm, start, end, PAGE_SIZE, true, 3); } #endif #endif
31 3 12 3 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 /* SPDX-License-Identifier: GPL-2.0 */ #undef TRACE_SYSTEM #define TRACE_SYSTEM filemap #if !defined(_TRACE_FILEMAP_H) || defined(TRACE_HEADER_MULTI_READ) #define _TRACE_FILEMAP_H #include <linux/types.h> #include <linux/tracepoint.h> #include <linux/mm.h> #include <linux/memcontrol.h> #include <linux/device.h> #include <linux/kdev_t.h> #include <linux/errseq.h> DECLARE_EVENT_CLASS(mm_filemap_op_page_cache, TP_PROTO(struct folio *folio), TP_ARGS(folio), TP_STRUCT__entry( __field(unsigned long, pfn) __field(unsigned long, i_ino) __field(unsigned long, index) __field(dev_t, s_dev) __field(unsigned char, order) ), TP_fast_assign( __entry->pfn = folio_pfn(folio); __entry->i_ino = folio->mapping->host->i_ino; __entry->index = folio->index; if (folio->mapping->host->i_sb) __entry->s_dev = folio->mapping->host->i_sb->s_dev; else __entry->s_dev = folio->mapping->host->i_rdev; __entry->order = folio_order(folio); ), TP_printk("dev %d:%d ino %lx pfn=0x%lx ofs=%lu order=%u", MAJOR(__entry->s_dev), MINOR(__entry->s_dev), __entry->i_ino, __entry->pfn, __entry->index << PAGE_SHIFT, __entry->order) ); DEFINE_EVENT(mm_filemap_op_page_cache, mm_filemap_delete_from_page_cache, TP_PROTO(struct folio *folio), TP_ARGS(folio) ); DEFINE_EVENT(mm_filemap_op_page_cache, mm_filemap_add_to_page_cache, TP_PROTO(struct folio *folio), TP_ARGS(folio) ); DECLARE_EVENT_CLASS(mm_filemap_op_page_cache_range, TP_PROTO( struct address_space *mapping, pgoff_t index, pgoff_t last_index ), TP_ARGS(mapping, index, last_index), TP_STRUCT__entry( __field(unsigned long, i_ino) __field(dev_t, s_dev) __field(unsigned long, index) __field(unsigned long, last_index) ), TP_fast_assign( __entry->i_ino = mapping->host->i_ino; if (mapping->host->i_sb) __entry->s_dev = mapping->host->i_sb->s_dev; else __entry->s_dev = mapping->host->i_rdev; __entry->index = index; __entry->last_index = last_index; ), TP_printk( "dev=%d:%d ino=%lx ofs=%lld-%lld", MAJOR(__entry->s_dev), MINOR(__entry->s_dev), __entry->i_ino, ((loff_t)__entry->index) << PAGE_SHIFT, ((((loff_t)__entry->last_index + 1) << PAGE_SHIFT) - 1) ) ); DEFINE_EVENT(mm_filemap_op_page_cache_range, mm_filemap_get_pages, TP_PROTO( struct address_space *mapping, pgoff_t index, pgoff_t last_index ), TP_ARGS(mapping, index, last_index) ); DEFINE_EVENT(mm_filemap_op_page_cache_range, mm_filemap_map_pages, TP_PROTO( struct address_space *mapping, pgoff_t index, pgoff_t last_index ), TP_ARGS(mapping, index, last_index) ); TRACE_EVENT(mm_filemap_fault, TP_PROTO(struct address_space *mapping, pgoff_t index), TP_ARGS(mapping, index), TP_STRUCT__entry( __field(unsigned long, i_ino) __field(dev_t, s_dev) __field(unsigned long, index) ), TP_fast_assign( __entry->i_ino = mapping->host->i_ino; if (mapping->host->i_sb) __entry->s_dev = mapping->host->i_sb->s_dev; else __entry->s_dev = mapping->host->i_rdev; __entry->index = index; ), TP_printk( "dev=%d:%d ino=%lx ofs=%lld", MAJOR(__entry->s_dev), MINOR(__entry->s_dev), __entry->i_ino, ((loff_t)__entry->index) << PAGE_SHIFT ) ); TRACE_EVENT(filemap_set_wb_err, TP_PROTO(struct address_space *mapping, errseq_t eseq), TP_ARGS(mapping, eseq), TP_STRUCT__entry( __field(unsigned long, i_ino) __field(dev_t, s_dev) __field(errseq_t, errseq) ), TP_fast_assign( __entry->i_ino = mapping->host->i_ino; __entry->errseq = eseq; if (mapping->host->i_sb) __entry->s_dev = mapping->host->i_sb->s_dev; else __entry->s_dev = mapping->host->i_rdev; ), TP_printk("dev=%d:%d ino=0x%lx errseq=0x%x", MAJOR(__entry->s_dev), MINOR(__entry->s_dev), __entry->i_ino, __entry->errseq) ); TRACE_EVENT(file_check_and_advance_wb_err, TP_PROTO(struct file *file, errseq_t old), TP_ARGS(file, old), TP_STRUCT__entry( __field(struct file *, file) __field(unsigned long, i_ino) __field(dev_t, s_dev) __field(errseq_t, old) __field(errseq_t, new) ), TP_fast_assign( __entry->file = file; __entry->i_ino = file->f_mapping->host->i_ino; if (file->f_mapping->host->i_sb) __entry->s_dev = file->f_mapping->host->i_sb->s_dev; else __entry->s_dev = file->f_mapping->host->i_rdev; __entry->old = old; __entry->new = file->f_wb_err; ), TP_printk("file=%p dev=%d:%d ino=0x%lx old=0x%x new=0x%x", __entry->file, MAJOR(__entry->s_dev), MINOR(__entry->s_dev), __entry->i_ino, __entry->old, __entry->new) ); #endif /* _TRACE_FILEMAP_H */ /* This part must be outside protection */ #include <trace/define_trace.h>
8 8 8 8 8 8 8 8 8 3 8 8 3 213 213 213 213 212 211 185 185 185 8 8 8 8 8 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 // SPDX-License-Identifier: GPL-2.0 /* * Copyright 2019 Arm Limited * Author: Andrew Murray <Andrew.Murray@arm.com> */ #include <linux/kvm_host.h> #include <linux/perf_event.h> #include <linux/perf/arm_pmu.h> #include <linux/perf/arm_pmuv3.h> static DEFINE_PER_CPU(struct kvm_pmu_events, kvm_pmu_events); /* * Given the perf event attributes and system type, determine * if we are going to need to switch counters at guest entry/exit. */ static bool kvm_pmu_switch_needed(struct perf_event_attr *attr) { /** * With VHE the guest kernel runs at EL1 and the host at EL2, * where user (EL0) is excluded then we have no reason to switch * counters. */ if (has_vhe() && attr->exclude_user) return false; /* Only switch if attributes are different */ return (attr->exclude_host != attr->exclude_guest); } struct kvm_pmu_events *kvm_get_pmu_events(void) { return this_cpu_ptr(&kvm_pmu_events); } /* * Add events to track that we may want to switch at guest entry/exit * time. */ void kvm_set_pmu_events(u64 set, struct perf_event_attr *attr) { struct kvm_pmu_events *pmu = kvm_get_pmu_events(); if (!system_supports_pmuv3() || !kvm_pmu_switch_needed(attr)) return; if (!attr->exclude_host) pmu->events_host |= set; if (!attr->exclude_guest) pmu->events_guest |= set; } /* * Stop tracking events */ void kvm_clr_pmu_events(u64 clr) { struct kvm_pmu_events *pmu = kvm_get_pmu_events(); if (!system_supports_pmuv3()) return; pmu->events_host &= ~clr; pmu->events_guest &= ~clr; } /* * Read a value direct from PMEVTYPER<idx> where idx is 0-30 * or PMxCFILTR_EL0 where idx is 31-32. */ static u64 kvm_vcpu_pmu_read_evtype_direct(int idx) { if (idx == ARMV8_PMU_CYCLE_IDX) return read_pmccfiltr(); else if (idx == ARMV8_PMU_INSTR_IDX) return read_pmicfiltr(); return read_pmevtypern(idx); } /* * Write a value direct to PMEVTYPER<idx> where idx is 0-30 * or PMxCFILTR_EL0 where idx is 31-32. */ static void kvm_vcpu_pmu_write_evtype_direct(int idx, u32 val) { if (idx == ARMV8_PMU_CYCLE_IDX) write_pmccfiltr(val); else if (idx == ARMV8_PMU_INSTR_IDX) write_pmicfiltr(val); else write_pmevtypern(idx, val); } /* * Modify ARMv8 PMU events to include EL0 counting */ static void kvm_vcpu_pmu_enable_el0(unsigned long events) { u64 typer; u32 counter; for_each_set_bit(counter, &events, ARMPMU_MAX_HWEVENTS) { typer = kvm_vcpu_pmu_read_evtype_direct(counter); typer &= ~ARMV8_PMU_EXCLUDE_EL0; kvm_vcpu_pmu_write_evtype_direct(counter, typer); } } /* * Modify ARMv8 PMU events to exclude EL0 counting */ static void kvm_vcpu_pmu_disable_el0(unsigned long events) { u64 typer; u32 counter; for_each_set_bit(counter, &events, ARMPMU_MAX_HWEVENTS) { typer = kvm_vcpu_pmu_read_evtype_direct(counter); typer |= ARMV8_PMU_EXCLUDE_EL0; kvm_vcpu_pmu_write_evtype_direct(counter, typer); } } /* * On VHE ensure that only guest events have EL0 counting enabled. * This is called from both vcpu_{load,put} and the sysreg handling. * Since the latter is preemptible, special care must be taken to * disable preemption. */ void kvm_vcpu_pmu_restore_guest(struct kvm_vcpu *vcpu) { struct kvm_pmu_events *pmu; u64 events_guest, events_host; if (!system_supports_pmuv3() || !has_vhe()) return; preempt_disable(); pmu = kvm_get_pmu_events(); events_guest = pmu->events_guest; events_host = pmu->events_host; kvm_vcpu_pmu_enable_el0(events_guest); kvm_vcpu_pmu_disable_el0(events_host); preempt_enable(); } /* * On VHE ensure that only host events have EL0 counting enabled */ void kvm_vcpu_pmu_restore_host(struct kvm_vcpu *vcpu) { struct kvm_pmu_events *pmu; u64 events_guest, events_host; if (!system_supports_pmuv3() || !has_vhe()) return; pmu = kvm_get_pmu_events(); events_guest = pmu->events_guest; events_host = pmu->events_host; kvm_vcpu_pmu_enable_el0(events_host); kvm_vcpu_pmu_disable_el0(events_guest); } /* * With VHE, keep track of the PMUSERENR_EL0 value for the host EL0 on the pCPU * where PMUSERENR_EL0 for the guest is loaded, since PMUSERENR_EL0 is switched * to the value for the guest on vcpu_load(). The value for the host EL0 * will be restored on vcpu_put(), before returning to userspace. * This isn't necessary for nVHE, as the register is context switched for * every guest enter/exit. * * Return true if KVM takes care of the register. Otherwise return false. */ bool kvm_set_pmuserenr(u64 val) { struct kvm_cpu_context *hctxt; struct kvm_vcpu *vcpu; if (!system_supports_pmuv3() || !has_vhe()) return false; vcpu = kvm_get_running_vcpu(); if (!vcpu || !vcpu_get_flag(vcpu, PMUSERENR_ON_CPU)) return false; hctxt = host_data_ptr(host_ctxt); ctxt_sys_reg(hctxt, PMUSERENR_EL0) = val; return true; } /* * If we interrupted the guest to update the host PMU context, make * sure we re-apply the guest EL0 state. */ void kvm_vcpu_pmu_resync_el0(void) { struct kvm_vcpu *vcpu; if (!has_vhe() || !in_interrupt()) return; vcpu = kvm_get_running_vcpu(); if (!vcpu) return; kvm_make_request(KVM_REQ_RESYNC_PMU_EL0, vcpu); }
4 1 4 1 4 4 1 4 2 4 4 4 1 1 4 4 211 147 66 212 209 185 60 127 187 188 61 61 65 67 67 60 61 61 1 60 65 67 1 66 67 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 // SPDX-License-Identifier: GPL-2.0-only /* * Copyright (C) 2012-2015 - ARM Ltd * Author: Marc Zyngier <marc.zyngier@arm.com> */ #include <hyp/adjust_pc.h> #include <linux/compiler.h> #include <linux/irqchip/arm-gic-v3.h> #include <linux/kvm_host.h> #include <asm/kvm_emulate.h> #include <asm/kvm_hyp.h> #include <asm/kvm_mmu.h> #define vtr_to_max_lr_idx(v) ((v) & 0xf) #define vtr_to_nr_pre_bits(v) ((((u32)(v) >> 26) & 7) + 1) #define vtr_to_nr_apr_regs(v) (1 << (vtr_to_nr_pre_bits(v) - 5)) u64 __gic_v3_get_lr(unsigned int lr) { switch (lr & 0xf) { case 0: return read_gicreg(ICH_LR0_EL2); case 1: return read_gicreg(ICH_LR1_EL2); case 2: return read_gicreg(ICH_LR2_EL2); case 3: return read_gicreg(ICH_LR3_EL2); case 4: return read_gicreg(ICH_LR4_EL2); case 5: return read_gicreg(ICH_LR5_EL2); case 6: return read_gicreg(ICH_LR6_EL2); case 7: return read_gicreg(ICH_LR7_EL2); case 8: return read_gicreg(ICH_LR8_EL2); case 9: return read_gicreg(ICH_LR9_EL2); case 10: return read_gicreg(ICH_LR10_EL2); case 11: return read_gicreg(ICH_LR11_EL2); case 12: return read_gicreg(ICH_LR12_EL2); case 13: return read_gicreg(ICH_LR13_EL2); case 14: return read_gicreg(ICH_LR14_EL2); case 15: return read_gicreg(ICH_LR15_EL2); } unreachable(); } static void __gic_v3_set_lr(u64 val, int lr) { switch (lr & 0xf) { case 0: write_gicreg(val, ICH_LR0_EL2); break; case 1: write_gicreg(val, ICH_LR1_EL2); break; case 2: write_gicreg(val, ICH_LR2_EL2); break; case 3: write_gicreg(val, ICH_LR3_EL2); break; case 4: write_gicreg(val, ICH_LR4_EL2); break; case 5: write_gicreg(val, ICH_LR5_EL2); break; case 6: write_gicreg(val, ICH_LR6_EL2); break; case 7: write_gicreg(val, ICH_LR7_EL2); break; case 8: write_gicreg(val, ICH_LR8_EL2); break; case 9: write_gicreg(val, ICH_LR9_EL2); break; case 10: write_gicreg(val, ICH_LR10_EL2); break; case 11: write_gicreg(val, ICH_LR11_EL2); break; case 12: write_gicreg(val, ICH_LR12_EL2); break; case 13: write_gicreg(val, ICH_LR13_EL2); break; case 14: write_gicreg(val, ICH_LR14_EL2); break; case 15: write_gicreg(val, ICH_LR15_EL2); break; } } static void __vgic_v3_write_ap0rn(u32 val, int n) { switch (n) { case 0: write_gicreg(val, ICH_AP0R0_EL2); break; case 1: write_gicreg(val, ICH_AP0R1_EL2); break; case 2: write_gicreg(val, ICH_AP0R2_EL2); break; case 3: write_gicreg(val, ICH_AP0R3_EL2); break; } } static void __vgic_v3_write_ap1rn(u32 val, int n) { switch (n) { case 0: write_gicreg(val, ICH_AP1R0_EL2); break; case 1: write_gicreg(val, ICH_AP1R1_EL2); break; case 2: write_gicreg(val, ICH_AP1R2_EL2); break; case 3: write_gicreg(val, ICH_AP1R3_EL2); break; } } static u32 __vgic_v3_read_ap0rn(int n) { u32 val; switch (n) { case 0: val = read_gicreg(ICH_AP0R0_EL2); break; case 1: val = read_gicreg(ICH_AP0R1_EL2); break; case 2: val = read_gicreg(ICH_AP0R2_EL2); break; case 3: val = read_gicreg(ICH_AP0R3_EL2); break; default: unreachable(); } return val; } static u32 __vgic_v3_read_ap1rn(int n) { u32 val; switch (n) { case 0: val = read_gicreg(ICH_AP1R0_EL2); break; case 1: val = read_gicreg(ICH_AP1R1_EL2); break; case 2: val = read_gicreg(ICH_AP1R2_EL2); break; case 3: val = read_gicreg(ICH_AP1R3_EL2); break; default: unreachable(); } return val; } void __vgic_v3_save_state(struct vgic_v3_cpu_if *cpu_if) { u64 used_lrs = cpu_if->used_lrs; /* * Make sure stores to the GIC via the memory mapped interface * are now visible to the system register interface when reading the * LRs, and when reading back the VMCR on non-VHE systems. */ if (used_lrs || !has_vhe()) { if (!cpu_if->vgic_sre) { dsb(sy); isb(); } } if (used_lrs || cpu_if->its_vpe.its_vm) { int i; u32 elrsr; elrsr = read_gicreg(ICH_ELRSR_EL2); write_gicreg(cpu_if->vgic_hcr & ~ICH_HCR_EL2_En, ICH_HCR_EL2); for (i = 0; i < used_lrs; i++) { if (elrsr & (1 << i)) cpu_if->vgic_lr[i] &= ~ICH_LR_STATE; else cpu_if->vgic_lr[i] = __gic_v3_get_lr(i); __gic_v3_set_lr(0, i); } } } void __vgic_v3_restore_state(struct vgic_v3_cpu_if *cpu_if) { u64 used_lrs = cpu_if->used_lrs; int i; if (used_lrs || cpu_if->its_vpe.its_vm) { write_gicreg(cpu_if->vgic_hcr, ICH_HCR_EL2); for (i = 0; i < used_lrs; i++) __gic_v3_set_lr(cpu_if->vgic_lr[i], i); } /* * Ensure that writes to the LRs, and on non-VHE systems ensure that * the write to the VMCR in __vgic_v3_activate_traps(), will have * reached the (re)distributors. This ensure the guest will read the * correct values from the memory-mapped interface. */ if (used_lrs || !has_vhe()) { if (!cpu_if->vgic_sre) { isb(); dsb(sy); } } } void __vgic_v3_activate_traps(struct vgic_v3_cpu_if *cpu_if) { /* * VFIQEn is RES1 if ICC_SRE_EL1.SRE is 1. This causes a * Group0 interrupt (as generated in GICv2 mode) to be * delivered as a FIQ to the guest, with potentially fatal * consequences. So we must make sure that ICC_SRE_EL1 has * been actually programmed with the value we want before * starting to mess with the rest of the GIC, and VMCR_EL2 in * particular. This logic must be called before * __vgic_v3_restore_state(). * * However, if the vgic is disabled (ICH_HCR_EL2.EN==0), no GIC is * provisioned at all. In order to prevent illegal accesses to the * system registers to trap to EL1 (duh), force ICC_SRE_EL1.SRE to 1 * so that the trap bits can take effect. Yes, we *loves* the GIC. */ if (!(cpu_if->vgic_hcr & ICH_HCR_EL2_En)) { write_gicreg(ICC_SRE_EL1_SRE, ICC_SRE_EL1); isb(); } else if (!cpu_if->vgic_sre) { write_gicreg(0, ICC_SRE_EL1); isb(); write_gicreg(cpu_if->vgic_vmcr, ICH_VMCR_EL2); if (has_vhe()) { /* * Ensure that the write to the VMCR will have reached * the (re)distributors. This ensure the guest will * read the correct values from the memory-mapped * interface. */ isb(); dsb(sy); } } /* Only disable SRE if the host implements the GICv2 interface */ if (static_branch_unlikely(&vgic_v3_has_v2_compat)) { /* * Prevent the guest from touching the ICC_SRE_EL1 system * register. Note that this may not have any effect, as * ICC_SRE_EL2.Enable being RAO/WI is a valid implementation. */ write_gicreg(read_gicreg(ICC_SRE_EL2) & ~ICC_SRE_EL2_ENABLE, ICC_SRE_EL2); } /* * If we need to trap system registers, we must write * ICH_HCR_EL2 anyway, even if no interrupts are being * injected. Note that this also applies if we don't expect * any system register access (no vgic at all). */ if (static_branch_unlikely(&vgic_v3_cpuif_trap) || cpu_if->its_vpe.its_vm || !cpu_if->vgic_sre) write_gicreg(cpu_if->vgic_hcr, ICH_HCR_EL2); } void __vgic_v3_deactivate_traps(struct vgic_v3_cpu_if *cpu_if) { u64 val; if (!cpu_if->vgic_sre) { cpu_if->vgic_vmcr = read_gicreg(ICH_VMCR_EL2); } /* Only restore SRE if the host implements the GICv2 interface */ if (static_branch_unlikely(&vgic_v3_has_v2_compat)) { val = read_gicreg(ICC_SRE_EL2); write_gicreg(val | ICC_SRE_EL2_ENABLE, ICC_SRE_EL2); if (!cpu_if->vgic_sre) { /* Make sure ENABLE is set at EL2 before setting SRE at EL1 */ isb(); write_gicreg(1, ICC_SRE_EL1); } } /* * If we were trapping system registers, we enabled the VGIC even if * no interrupts were being injected, and we disable it again here. */ if (static_branch_unlikely(&vgic_v3_cpuif_trap) || cpu_if->its_vpe.its_vm || !cpu_if->vgic_sre) write_gicreg(0, ICH_HCR_EL2); } static void __vgic_v3_save_aprs(struct vgic_v3_cpu_if *cpu_if) { u64 val; u32 nr_pre_bits; val = read_gicreg(ICH_VTR_EL2); nr_pre_bits = vtr_to_nr_pre_bits(val); switch (nr_pre_bits) { case 7: cpu_if->vgic_ap0r[3] = __vgic_v3_read_ap0rn(3); cpu_if->vgic_ap0r[2] = __vgic_v3_read_ap0rn(2); fallthrough; case 6: cpu_if->vgic_ap0r[1] = __vgic_v3_read_ap0rn(1); fallthrough; default: cpu_if->vgic_ap0r[0] = __vgic_v3_read_ap0rn(0); } switch (nr_pre_bits) { case 7: cpu_if->vgic_ap1r[3] = __vgic_v3_read_ap1rn(3); cpu_if->vgic_ap1r[2] = __vgic_v3_read_ap1rn(2); fallthrough; case 6: cpu_if->vgic_ap1r[1] = __vgic_v3_read_ap1rn(1); fallthrough; default: cpu_if->vgic_ap1r[0] = __vgic_v3_read_ap1rn(0); } } static void __vgic_v3_restore_aprs(struct vgic_v3_cpu_if *cpu_if) { u64 val; u32 nr_pre_bits; val = read_gicreg(ICH_VTR_EL2); nr_pre_bits = vtr_to_nr_pre_bits(val); switch (nr_pre_bits) { case 7: __vgic_v3_write_ap0rn(cpu_if->vgic_ap0r[3], 3); __vgic_v3_write_ap0rn(cpu_if->vgic_ap0r[2], 2); fallthrough; case 6: __vgic_v3_write_ap0rn(cpu_if->vgic_ap0r[1], 1); fallthrough; default: __vgic_v3_write_ap0rn(cpu_if->vgic_ap0r[0], 0); } switch (nr_pre_bits) { case 7: __vgic_v3_write_ap1rn(cpu_if->vgic_ap1r[3], 3); __vgic_v3_write_ap1rn(cpu_if->vgic_ap1r[2], 2); fallthrough; case 6: __vgic_v3_write_ap1rn(cpu_if->vgic_ap1r[1], 1); fallthrough; default: __vgic_v3_write_ap1rn(cpu_if->vgic_ap1r[0], 0); } } void __vgic_v3_init_lrs(void) { int max_lr_idx = vtr_to_max_lr_idx(read_gicreg(ICH_VTR_EL2)); int i; for (i = 0; i <= max_lr_idx; i++) __gic_v3_set_lr(0, i); } /* * Return the GIC CPU configuration: * - [31:0] ICH_VTR_EL2 * - [62:32] RES0 * - [63] MMIO (GICv2) capable */ u64 __vgic_v3_get_gic_config(void) { u64 val, sre; unsigned long flags = 0; /* * In compat mode, we cannot access ICC_SRE_EL1 at any EL * other than EL1 itself; just return the * ICH_VTR_EL2. ICC_IDR0_EL1 is only implemented on a GICv5 * system, so we first check if we have GICv5 support. */ if (cpus_have_final_cap(ARM64_HAS_GICV5_CPUIF)) return read_gicreg(ICH_VTR_EL2); sre = read_gicreg(ICC_SRE_EL1); /* * To check whether we have a MMIO-based (GICv2 compatible) * CPU interface, we need to disable the system register * view. * * Table 11-2 "Permitted ICC_SRE_ELx.SRE settings" indicates * that to be able to set ICC_SRE_EL1.SRE to 0, all the * interrupt overrides must be set. You've got to love this. * * As we always run VHE with HCR_xMO set, no extra xMO * manipulation is required in that case. * * To safely disable SRE, we have to prevent any interrupt * from firing (which would be deadly). This only makes sense * on VHE, as interrupts are already masked for nVHE as part * of the exception entry to EL2. */ if (has_vhe()) { flags = local_daif_save(); } else { sysreg_clear_set_hcr(0, HCR_AMO | HCR_FMO | HCR_IMO); isb(); } write_gicreg(0, ICC_SRE_EL1); isb(); val = read_gicreg(ICC_SRE_EL1); write_gicreg(sre, ICC_SRE_EL1); isb(); if (has_vhe()) { local_daif_restore(flags); } else { sysreg_clear_set_hcr(HCR_AMO | HCR_FMO | HCR_IMO, 0); isb(); } val = (val & ICC_SRE_EL1_SRE) ? 0 : (1ULL << 63); val |= read_gicreg(ICH_VTR_EL2); return val; } static void __vgic_v3_compat_mode_enable(void) { if (!cpus_have_final_cap(ARM64_HAS_GICV5_CPUIF)) return; sysreg_clear_set_s(SYS_ICH_VCTLR_EL2, 0, ICH_VCTLR_EL2_V3); /* Wait for V3 to become enabled */ isb(); } static u64 __vgic_v3_read_vmcr(void) { return read_gicreg(ICH_VMCR_EL2); } static void __vgic_v3_write_vmcr(u32 vmcr) { write_gicreg(vmcr, ICH_VMCR_EL2); } void __vgic_v3_save_vmcr_aprs(struct vgic_v3_cpu_if *cpu_if) { __vgic_v3_save_aprs(cpu_if); if (cpu_if->vgic_sre) cpu_if->vgic_vmcr = __vgic_v3_read_vmcr(); } void __vgic_v3_restore_vmcr_aprs(struct vgic_v3_cpu_if *cpu_if) { __vgic_v3_compat_mode_enable(); /* * If dealing with a GICv2 emulation on GICv3, VMCR_EL2.VFIQen * is dependent on ICC_SRE_EL1.SRE, and we have to perform the * VMCR_EL2 save/restore in the world switch. */ if (cpu_if->vgic_sre) __vgic_v3_write_vmcr(cpu_if->vgic_vmcr); __vgic_v3_restore_aprs(cpu_if); } static int __vgic_v3_bpr_min(void) { /* See Pseudocode for VPriorityGroup */ return 8 - vtr_to_nr_pre_bits(read_gicreg(ICH_VTR_EL2)); } static int __vgic_v3_get_group(struct kvm_vcpu *vcpu) { u64 esr = kvm_vcpu_get_esr(vcpu); u8 crm = (esr & ESR_ELx_SYS64_ISS_CRM_MASK) >> ESR_ELx_SYS64_ISS_CRM_SHIFT; return crm != 8; } #define GICv3_IDLE_PRIORITY 0xff static int __vgic_v3_highest_priority_lr(struct kvm_vcpu *vcpu, u32 vmcr, u64 *lr_val) { unsigned int used_lrs = vcpu->arch.vgic_cpu.vgic_v3.used_lrs; u8 priority = GICv3_IDLE_PRIORITY; int i, lr = -1; for (i = 0; i < used_lrs; i++) { u64 val = __gic_v3_get_lr(i); u8 lr_prio = (val & ICH_LR_PRIORITY_MASK) >> ICH_LR_PRIORITY_SHIFT; /* Not pending in the state? */ if ((val & ICH_LR_STATE) != ICH_LR_PENDING_BIT) continue; /* Group-0 interrupt, but Group-0 disabled? */ if (!(val & ICH_LR_GROUP) && !(vmcr & ICH_VMCR_ENG0_MASK)) continue; /* Group-1 interrupt, but Group-1 disabled? */ if ((val & ICH_LR_GROUP) && !(vmcr & ICH_VMCR_ENG1_MASK)) continue; /* Not the highest priority? */ if (lr_prio >= priority) continue; /* This is a candidate */ priority = lr_prio; *lr_val = val; lr = i; } if (lr == -1) *lr_val = ICC_IAR1_EL1_SPURIOUS; return lr; } static int __vgic_v3_find_active_lr(struct kvm_vcpu *vcpu, int intid, u64 *lr_val) { unsigned int used_lrs = vcpu->arch.vgic_cpu.vgic_v3.used_lrs; int i; for (i = 0; i < used_lrs; i++) { u64 val = __gic_v3_get_lr(i); if ((val & ICH_LR_VIRTUAL_ID_MASK) == intid && (val & ICH_LR_ACTIVE_BIT)) { *lr_val = val; return i; } } *lr_val = ICC_IAR1_EL1_SPURIOUS; return -1; } static int __vgic_v3_get_highest_active_priority(void) { u8 nr_apr_regs = vtr_to_nr_apr_regs(read_gicreg(ICH_VTR_EL2)); u32 hap = 0; int i; for (i = 0; i < nr_apr_regs; i++) { u32 val; /* * The ICH_AP0Rn_EL2 and ICH_AP1Rn_EL2 registers * contain the active priority levels for this VCPU * for the maximum number of supported priority * levels, and we return the full priority level only * if the BPR is programmed to its minimum, otherwise * we return a combination of the priority level and * subpriority, as determined by the setting of the * BPR, but without the full subpriority. */ val = __vgic_v3_read_ap0rn(i); val |= __vgic_v3_read_ap1rn(i); if (!val) { hap += 32; continue; } return (hap + __ffs(val)) << __vgic_v3_bpr_min(); } return GICv3_IDLE_PRIORITY; } static unsigned int __vgic_v3_get_bpr0(u32 vmcr) { return (vmcr & ICH_VMCR_BPR0_MASK) >> ICH_VMCR_BPR0_SHIFT; } static unsigned int __vgic_v3_get_bpr1(u32 vmcr) { unsigned int bpr; if (vmcr & ICH_VMCR_CBPR_MASK) { bpr = __vgic_v3_get_bpr0(vmcr); if (bpr < 7) bpr++; } else { bpr = (vmcr & ICH_VMCR_BPR1_MASK) >> ICH_VMCR_BPR1_SHIFT; } return bpr; } /* * Convert a priority to a preemption level, taking the relevant BPR * into account by zeroing the sub-priority bits. */ static u8 __vgic_v3_pri_to_pre(u8 pri, u32 vmcr, int grp) { unsigned int bpr; if (!grp) bpr = __vgic_v3_get_bpr0(vmcr) + 1; else bpr = __vgic_v3_get_bpr1(vmcr); return pri & (GENMASK(7, 0) << bpr); } /* * The priority value is independent of any of the BPR values, so we * normalize it using the minimal BPR value. This guarantees that no * matter what the guest does with its BPR, we can always set/get the * same value of a priority. */ static void __vgic_v3_set_active_priority(u8 pri, u32 vmcr, int grp) { u8 pre, ap; u32 val; int apr; pre = __vgic_v3_pri_to_pre(pri, vmcr, grp); ap = pre >> __vgic_v3_bpr_min(); apr = ap / 32; if (!grp) { val = __vgic_v3_read_ap0rn(apr); __vgic_v3_write_ap0rn(val | BIT(ap % 32), apr); } else { val = __vgic_v3_read_ap1rn(apr); __vgic_v3_write_ap1rn(val | BIT(ap % 32), apr); } } static int __vgic_v3_clear_highest_active_priority(void) { u8 nr_apr_regs = vtr_to_nr_apr_regs(read_gicreg(ICH_VTR_EL2)); u32 hap = 0; int i; for (i = 0; i < nr_apr_regs; i++) { u32 ap0, ap1; int c0, c1; ap0 = __vgic_v3_read_ap0rn(i); ap1 = __vgic_v3_read_ap1rn(i); if (!ap0 && !ap1) { hap += 32; continue; } c0 = ap0 ? __ffs(ap0) : 32; c1 = ap1 ? __ffs(ap1) : 32; /* Always clear the LSB, which is the highest priority */ if (c0 < c1) { ap0 &= ~BIT(c0); __vgic_v3_write_ap0rn(ap0, i); hap += c0; } else { ap1 &= ~BIT(c1); __vgic_v3_write_ap1rn(ap1, i); hap += c1; } /* Rescale to 8 bits of priority */ return hap << __vgic_v3_bpr_min(); } return GICv3_IDLE_PRIORITY; } static void __vgic_v3_read_iar(struct kvm_vcpu *vcpu, u32 vmcr, int rt) { u64 lr_val; u8 lr_prio, pmr; int lr, grp; grp = __vgic_v3_get_group(vcpu); lr = __vgic_v3_highest_priority_lr(vcpu, vmcr, &lr_val); if (lr < 0) goto spurious; if (grp != !!(lr_val & ICH_LR_GROUP)) goto spurious; pmr = (vmcr & ICH_VMCR_PMR_MASK) >> ICH_VMCR_PMR_SHIFT; lr_prio = (lr_val & ICH_LR_PRIORITY_MASK) >> ICH_LR_PRIORITY_SHIFT; if (pmr <= lr_prio) goto spurious; if (__vgic_v3_get_highest_active_priority() <= __vgic_v3_pri_to_pre(lr_prio, vmcr, grp)) goto spurious; lr_val &= ~ICH_LR_STATE; lr_val |= ICH_LR_ACTIVE_BIT; __gic_v3_set_lr(lr_val, lr); __vgic_v3_set_active_priority(lr_prio, vmcr, grp); vcpu_set_reg(vcpu, rt, lr_val & ICH_LR_VIRTUAL_ID_MASK); return; spurious: vcpu_set_reg(vcpu, rt, ICC_IAR1_EL1_SPURIOUS); } static void __vgic_v3_clear_active_lr(int lr, u64 lr_val) { lr_val &= ~ICH_LR_ACTIVE_BIT; if (lr_val & ICH_LR_HW) { u32 pid; pid = (lr_val & ICH_LR_PHYS_ID_MASK) >> ICH_LR_PHYS_ID_SHIFT; gic_write_dir(pid); } __gic_v3_set_lr(lr_val, lr); } static void __vgic_v3_bump_eoicount(void) { u32 hcr; hcr = read_gicreg(ICH_HCR_EL2); hcr += 1 << ICH_HCR_EL2_EOIcount_SHIFT; write_gicreg(hcr, ICH_HCR_EL2); } static void __vgic_v3_write_dir(struct kvm_vcpu *vcpu, u32 vmcr, int rt) { u32 vid = vcpu_get_reg(vcpu, rt); u64 lr_val; int lr; /* EOImode == 0, nothing to be done here */ if (!(vmcr & ICH_VMCR_EOIM_MASK)) return; /* No deactivate to be performed on an LPI */ if (vid >= VGIC_MIN_LPI) return; lr = __vgic_v3_find_active_lr(vcpu, vid, &lr_val); if (lr == -1) { __vgic_v3_bump_eoicount(); return; } __vgic_v3_clear_active_lr(lr, lr_val); } static void __vgic_v3_write_eoir(struct kvm_vcpu *vcpu, u32 vmcr, int rt) { u32 vid = vcpu_get_reg(vcpu, rt); u64 lr_val; u8 lr_prio, act_prio; int lr, grp; grp = __vgic_v3_get_group(vcpu); /* Drop priority in any case */ act_prio = __vgic_v3_clear_highest_active_priority(); lr = __vgic_v3_find_active_lr(vcpu, vid, &lr_val); if (lr == -1) { /* Do not bump EOIcount for LPIs that aren't in the LRs */ if (!(vid >= VGIC_MIN_LPI)) __vgic_v3_bump_eoicount(); return; } /* EOImode == 1 and not an LPI, nothing to be done here */ if ((vmcr & ICH_VMCR_EOIM_MASK) && !(vid >= VGIC_MIN_LPI)) return; lr_prio = (lr_val & ICH_LR_PRIORITY_MASK) >> ICH_LR_PRIORITY_SHIFT; /* If priorities or group do not match, the guest has fscked-up. */ if (grp != !!(lr_val & ICH_LR_GROUP) || __vgic_v3_pri_to_pre(lr_prio, vmcr, grp) != act_prio) return; /* Let's now perform the deactivation */ __vgic_v3_clear_active_lr(lr, lr_val); } static void __vgic_v3_read_igrpen0(struct kvm_vcpu *vcpu, u32 vmcr, int rt) { vcpu_set_reg(vcpu, rt, !!(vmcr & ICH_VMCR_ENG0_MASK)); } static void __vgic_v3_read_igrpen1(struct kvm_vcpu *vcpu, u32 vmcr, int rt) { vcpu_set_reg(vcpu, rt, !!(vmcr & ICH_VMCR_ENG1_MASK)); } static void __vgic_v3_write_igrpen0(struct kvm_vcpu *vcpu, u32 vmcr, int rt) { u64 val = vcpu_get_reg(vcpu, rt); if (val & 1) vmcr |= ICH_VMCR_ENG0_MASK; else vmcr &= ~ICH_VMCR_ENG0_MASK; __vgic_v3_write_vmcr(vmcr); } static void __vgic_v3_write_igrpen1(struct kvm_vcpu *vcpu, u32 vmcr, int rt) { u64 val = vcpu_get_reg(vcpu, rt); if (val & 1) vmcr |= ICH_VMCR_ENG1_MASK; else vmcr &= ~ICH_VMCR_ENG1_MASK; __vgic_v3_write_vmcr(vmcr); } static void __vgic_v3_read_bpr0(struct kvm_vcpu *vcpu, u32 vmcr, int rt) { vcpu_set_reg(vcpu, rt, __vgic_v3_get_bpr0(vmcr)); } static void __vgic_v3_read_bpr1(struct kvm_vcpu *vcpu, u32 vmcr, int rt) { vcpu_set_reg(vcpu, rt, __vgic_v3_get_bpr1(vmcr)); } static void __vgic_v3_write_bpr0(struct kvm_vcpu *vcpu, u32 vmcr, int rt) { u64 val = vcpu_get_reg(vcpu, rt); u8 bpr_min = __vgic_v3_bpr_min() - 1; /* Enforce BPR limiting */ if (val < bpr_min) val = bpr_min; val <<= ICH_VMCR_BPR0_SHIFT; val &= ICH_VMCR_BPR0_MASK; vmcr &= ~ICH_VMCR_BPR0_MASK; vmcr |= val; __vgic_v3_write_vmcr(vmcr); } static void __vgic_v3_write_bpr1(struct kvm_vcpu *vcpu, u32 vmcr, int rt) { u64 val = vcpu_get_reg(vcpu, rt); u8 bpr_min = __vgic_v3_bpr_min(); if (vmcr & ICH_VMCR_CBPR_MASK) return; /* Enforce BPR limiting */ if (val < bpr_min) val = bpr_min; val <<= ICH_VMCR_BPR1_SHIFT; val &= ICH_VMCR_BPR1_MASK; vmcr &= ~ICH_VMCR_BPR1_MASK; vmcr |= val; __vgic_v3_write_vmcr(vmcr); } static void __vgic_v3_read_apxrn(struct kvm_vcpu *vcpu, int rt, int n) { u32 val; if (!__vgic_v3_get_group(vcpu)) val = __vgic_v3_read_ap0rn(n); else val = __vgic_v3_read_ap1rn(n); vcpu_set_reg(vcpu, rt, val); } static void __vgic_v3_write_apxrn(struct kvm_vcpu *vcpu, int rt, int n) { u32 val = vcpu_get_reg(vcpu, rt); if (!__vgic_v3_get_group(vcpu)) __vgic_v3_write_ap0rn(val, n); else __vgic_v3_write_ap1rn(val, n); } static void __vgic_v3_read_apxr0(struct kvm_vcpu *vcpu, u32 vmcr, int rt) { __vgic_v3_read_apxrn(vcpu, rt, 0); } static void __vgic_v3_read_apxr1(struct kvm_vcpu *vcpu, u32 vmcr, int rt) { __vgic_v3_read_apxrn(vcpu, rt, 1); } static void __vgic_v3_read_apxr2(struct kvm_vcpu *vcpu, u32 vmcr, int rt) { __vgic_v3_read_apxrn(vcpu, rt, 2); } static void __vgic_v3_read_apxr3(struct kvm_vcpu *vcpu, u32 vmcr, int rt) { __vgic_v3_read_apxrn(vcpu, rt, 3); } static void __vgic_v3_write_apxr0(struct kvm_vcpu *vcpu, u32 vmcr, int rt) { __vgic_v3_write_apxrn(vcpu, rt, 0); } static void __vgic_v3_write_apxr1(struct kvm_vcpu *vcpu, u32 vmcr, int rt) { __vgic_v3_write_apxrn(vcpu, rt, 1); } static void __vgic_v3_write_apxr2(struct kvm_vcpu *vcpu, u32 vmcr, int rt) { __vgic_v3_write_apxrn(vcpu, rt, 2); } static void __vgic_v3_write_apxr3(struct kvm_vcpu *vcpu, u32 vmcr, int rt) { __vgic_v3_write_apxrn(vcpu, rt, 3); } static void __vgic_v3_read_hppir(struct kvm_vcpu *vcpu, u32 vmcr, int rt) { u64 lr_val; int lr, lr_grp, grp; grp = __vgic_v3_get_group(vcpu); lr = __vgic_v3_highest_priority_lr(vcpu, vmcr, &lr_val); if (lr == -1) goto spurious; lr_grp = !!(lr_val & ICH_LR_GROUP); if (lr_grp != grp) lr_val = ICC_IAR1_EL1_SPURIOUS; spurious: vcpu_set_reg(vcpu, rt, lr_val & ICH_LR_VIRTUAL_ID_MASK); } static void __vgic_v3_read_pmr(struct kvm_vcpu *vcpu, u32 vmcr, int rt) { vmcr &= ICH_VMCR_PMR_MASK; vmcr >>= ICH_VMCR_PMR_SHIFT; vcpu_set_reg(vcpu, rt, vmcr); } static void __vgic_v3_write_pmr(struct kvm_vcpu *vcpu, u32 vmcr, int rt) { u32 val = vcpu_get_reg(vcpu, rt); val <<= ICH_VMCR_PMR_SHIFT; val &= ICH_VMCR_PMR_MASK; vmcr &= ~ICH_VMCR_PMR_MASK; vmcr |= val; write_gicreg(vmcr, ICH_VMCR_EL2); } static void __vgic_v3_read_rpr(struct kvm_vcpu *vcpu, u32 vmcr, int rt) { u32 val = __vgic_v3_get_highest_active_priority(); vcpu_set_reg(vcpu, rt, val); } static void __vgic_v3_read_ctlr(struct kvm_vcpu *vcpu, u32 vmcr, int rt) { u32 vtr, val; vtr = read_gicreg(ICH_VTR_EL2); /* PRIbits */ val = ((vtr >> 29) & 7) << ICC_CTLR_EL1_PRI_BITS_SHIFT; /* IDbits */ val |= ((vtr >> 23) & 7) << ICC_CTLR_EL1_ID_BITS_SHIFT; /* A3V */ val |= ((vtr >> 21) & 1) << ICC_CTLR_EL1_A3V_SHIFT; /* EOImode */ val |= ((vmcr & ICH_VMCR_EOIM_MASK) >> ICH_VMCR_EOIM_SHIFT) << ICC_CTLR_EL1_EOImode_SHIFT; /* CBPR */ val |= (vmcr & ICH_VMCR_CBPR_MASK) >> ICH_VMCR_CBPR_SHIFT; vcpu_set_reg(vcpu, rt, val); } static void __vgic_v3_write_ctlr(struct kvm_vcpu *vcpu, u32 vmcr, int rt) { u32 val = vcpu_get_reg(vcpu, rt); if (val & ICC_CTLR_EL1_CBPR_MASK) vmcr |= ICH_VMCR_CBPR_MASK; else vmcr &= ~ICH_VMCR_CBPR_MASK; if (val & ICC_CTLR_EL1_EOImode_MASK) vmcr |= ICH_VMCR_EOIM_MASK; else vmcr &= ~ICH_VMCR_EOIM_MASK; write_gicreg(vmcr, ICH_VMCR_EL2); } static bool __vgic_v3_check_trap_forwarding(struct kvm_vcpu *vcpu, u32 sysreg, bool is_read) { u64 ich_hcr; if (!is_nested_ctxt(vcpu)) return false; ich_hcr = __vcpu_sys_reg(vcpu, ICH_HCR_EL2); switch (sysreg) { case SYS_ICC_IGRPEN0_EL1: if (is_read && (__vcpu_sys_reg(vcpu, HFGRTR_EL2) & HFGRTR_EL2_ICC_IGRPENn_EL1)) return true; if (!is_read && (__vcpu_sys_reg(vcpu, HFGWTR_EL2) & HFGWTR_EL2_ICC_IGRPENn_EL1)) return true; fallthrough; case SYS_ICC_AP0Rn_EL1(0): case SYS_ICC_AP0Rn_EL1(1): case SYS_ICC_AP0Rn_EL1(2): case SYS_ICC_AP0Rn_EL1(3): case SYS_ICC_BPR0_EL1: case SYS_ICC_EOIR0_EL1: case SYS_ICC_HPPIR0_EL1: case SYS_ICC_IAR0_EL1: return ich_hcr & ICH_HCR_EL2_TALL0; case SYS_ICC_IGRPEN1_EL1: if (is_read && (__vcpu_sys_reg(vcpu, HFGRTR_EL2) & HFGRTR_EL2_ICC_IGRPENn_EL1)) return true; if (!is_read && (__vcpu_sys_reg(vcpu, HFGWTR_EL2) & HFGWTR_EL2_ICC_IGRPENn_EL1)) return true; fallthrough; case SYS_ICC_AP1Rn_EL1(0): case SYS_ICC_AP1Rn_EL1(1): case SYS_ICC_AP1Rn_EL1(2): case SYS_ICC_AP1Rn_EL1(3): case SYS_ICC_BPR1_EL1: case SYS_ICC_EOIR1_EL1: case SYS_ICC_HPPIR1_EL1: case SYS_ICC_IAR1_EL1: return ich_hcr & ICH_HCR_EL2_TALL1; case SYS_ICC_DIR_EL1: if (ich_hcr & ICH_HCR_EL2_TDIR) return true; fallthrough; case SYS_ICC_RPR_EL1: case SYS_ICC_CTLR_EL1: case SYS_ICC_PMR_EL1: return ich_hcr & ICH_HCR_EL2_TC; default: return false; } } int __vgic_v3_perform_cpuif_access(struct kvm_vcpu *vcpu) { int rt; u64 esr; u32 vmcr; void (*fn)(struct kvm_vcpu *, u32, int); bool is_read; u32 sysreg; if (kern_hyp_va(vcpu->kvm)->arch.vgic.vgic_model != KVM_DEV_TYPE_ARM_VGIC_V3) return 0; esr = kvm_vcpu_get_esr(vcpu); if (vcpu_mode_is_32bit(vcpu)) { if (!kvm_condition_valid(vcpu)) { __kvm_skip_instr(vcpu); return 1; } sysreg = esr_cp15_to_sysreg(esr); } else { sysreg = esr_sys64_to_sysreg(esr); } is_read = (esr & ESR_ELx_SYS64_ISS_DIR_MASK) == ESR_ELx_SYS64_ISS_DIR_READ; if (__vgic_v3_check_trap_forwarding(vcpu, sysreg, is_read)) return 0; switch (sysreg) { case SYS_ICC_IAR0_EL1: case SYS_ICC_IAR1_EL1: if (unlikely(!is_read)) return 0; fn = __vgic_v3_read_iar; break; case SYS_ICC_EOIR0_EL1: case SYS_ICC_EOIR1_EL1: if (unlikely(is_read)) return 0; fn = __vgic_v3_write_eoir; break; case SYS_ICC_IGRPEN1_EL1: if (is_read) fn = __vgic_v3_read_igrpen1; else fn = __vgic_v3_write_igrpen1; break; case SYS_ICC_BPR1_EL1: if (is_read) fn = __vgic_v3_read_bpr1; else fn = __vgic_v3_write_bpr1; break; case SYS_ICC_AP0Rn_EL1(0): case SYS_ICC_AP1Rn_EL1(0): if (is_read) fn = __vgic_v3_read_apxr0; else fn = __vgic_v3_write_apxr0; break; case SYS_ICC_AP0Rn_EL1(1): case SYS_ICC_AP1Rn_EL1(1): if (is_read) fn = __vgic_v3_read_apxr1; else fn = __vgic_v3_write_apxr1; break; case SYS_ICC_AP0Rn_EL1(2): case SYS_ICC_AP1Rn_EL1(2): if (is_read) fn = __vgic_v3_read_apxr2; else fn = __vgic_v3_write_apxr2; break; case SYS_ICC_AP0Rn_EL1(3): case SYS_ICC_AP1Rn_EL1(3): if (is_read) fn = __vgic_v3_read_apxr3; else fn = __vgic_v3_write_apxr3; break; case SYS_ICC_HPPIR0_EL1: case SYS_ICC_HPPIR1_EL1: if (unlikely(!is_read)) return 0; fn = __vgic_v3_read_hppir; break; case SYS_ICC_IGRPEN0_EL1: if (is_read) fn = __vgic_v3_read_igrpen0; else fn = __vgic_v3_write_igrpen0; break; case SYS_ICC_BPR0_EL1: if (is_read) fn = __vgic_v3_read_bpr0; else fn = __vgic_v3_write_bpr0; break; case SYS_ICC_DIR_EL1: if (unlikely(is_read)) return 0; fn = __vgic_v3_write_dir; break; case SYS_ICC_RPR_EL1: if (unlikely(!is_read)) return 0; fn = __vgic_v3_read_rpr; break; case SYS_ICC_CTLR_EL1: if (is_read) fn = __vgic_v3_read_ctlr; else fn = __vgic_v3_write_ctlr; break; case SYS_ICC_PMR_EL1: if (is_read) fn = __vgic_v3_read_pmr; else fn = __vgic_v3_write_pmr; break; default: return 0; } vmcr = __vgic_v3_read_vmcr(); rt = kvm_vcpu_sys_get_rt(vcpu); fn(vcpu, vmcr, rt); __kvm_skip_instr(vcpu); return 1; }
12 213 213 214 213 30 12 12 7 7 14 14 14 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 /* SPDX-License-Identifier: GPL-2.0-or-later */ /* Red Black Trees (C) 1999 Andrea Arcangeli <andrea@suse.de> linux/include/linux/rbtree.h To use rbtrees you'll have to implement your own insert and search cores. This will avoid us to use callbacks and to drop drammatically performances. I know it's not the cleaner way, but in C (not in C++) to get performances and genericity... See Documentation/core-api/rbtree.rst for documentation and samples. */ #ifndef _LINUX_RBTREE_H #define _LINUX_RBTREE_H #include <linux/container_of.h> #include <linux/rbtree_types.h> #include <linux/stddef.h> #include <linux/rcupdate.h> #define rb_parent(r) ((struct rb_node *)((r)->__rb_parent_color & ~3)) #define rb_entry(ptr, type, member) container_of(ptr, type, member) #define RB_EMPTY_ROOT(root) (READ_ONCE((root)->rb_node) == NULL) /* 'empty' nodes are nodes that are known not to be inserted in an rbtree */ #define RB_EMPTY_NODE(node) \ ((node)->__rb_parent_color == (unsigned long)(node)) #define RB_CLEAR_NODE(node) \ ((node)->__rb_parent_color = (unsigned long)(node)) extern void rb_insert_color(struct rb_node *, struct rb_root *); extern void rb_erase(struct rb_node *, struct rb_root *); /* Find logical next and previous nodes in a tree */ extern struct rb_node *rb_next(const struct rb_node *); extern struct rb_node *rb_prev(const struct rb_node *); extern struct rb_node *rb_first(const struct rb_root *); extern struct rb_node *rb_last(const struct rb_root *); /* Postorder iteration - always visit the parent after its children */ extern struct rb_node *rb_first_postorder(const struct rb_root *); extern struct rb_node *rb_next_postorder(const struct rb_node *); /* Fast replacement of a single node without remove/rebalance/add/rebalance */ extern void rb_replace_node(struct rb_node *victim, struct rb_node *new, struct rb_root *root); extern void rb_replace_node_rcu(struct rb_node *victim, struct rb_node *new, struct rb_root *root); static inline void rb_link_node(struct rb_node *node, struct rb_node *parent, struct rb_node **rb_link) { node->__rb_parent_color = (unsigned long)parent; node->rb_left = node->rb_right = NULL; *rb_link = node; } static inline void rb_link_node_rcu(struct rb_node *node, struct rb_node *parent, struct rb_node **rb_link) { node->__rb_parent_color = (unsigned long)parent; node->rb_left = node->rb_right = NULL; rcu_assign_pointer(*rb_link, node); } #define rb_entry_safe(ptr, type, member) \ ({ typeof(ptr) ____ptr = (ptr); \ ____ptr ? rb_entry(____ptr, type, member) : NULL; \ }) /** * rbtree_postorder_for_each_entry_safe - iterate in post-order over rb_root of * given type allowing the backing memory of @pos to be invalidated * * @pos: the 'type *' to use as a loop cursor. * @n: another 'type *' to use as temporary storage * @root: 'rb_root *' of the rbtree. * @field: the name of the rb_node field within 'type'. * * rbtree_postorder_for_each_entry_safe() provides a similar guarantee as * list_for_each_entry_safe() and allows the iteration to continue independent * of changes to @pos by the body of the loop. * * Note, however, that it cannot handle other modifications that re-order the * rbtree it is iterating over. This includes calling rb_erase() on @pos, as * rb_erase() may rebalance the tree, causing us to miss some nodes. */ #define rbtree_postorder_for_each_entry_safe(pos, n, root, field) \ for (pos = rb_entry_safe(rb_first_postorder(root), typeof(*pos), field); \ pos && ({ n = rb_entry_safe(rb_next_postorder(&pos->field), \ typeof(*pos), field); 1; }); \ pos = n) /* Same as rb_first(), but O(1) */ #define rb_first_cached(root) (root)->rb_leftmost static inline void rb_insert_color_cached(struct rb_node *node, struct rb_root_cached *root, bool leftmost) { if (leftmost) root->rb_leftmost = node; rb_insert_color(node, &root->rb_root); } static inline struct rb_node * rb_erase_cached(struct rb_node *node, struct rb_root_cached *root) { struct rb_node *leftmost = NULL; if (root->rb_leftmost == node) leftmost = root->rb_leftmost = rb_next(node); rb_erase(node, &root->rb_root); return leftmost; } static inline void rb_replace_node_cached(struct rb_node *victim, struct rb_node *new, struct rb_root_cached *root) { if (root->rb_leftmost == victim) root->rb_leftmost = new; rb_replace_node(victim, new, &root->rb_root); } /* * The below helper functions use 2 operators with 3 different * calling conventions. The operators are related like: * * comp(a->key,b) < 0 := less(a,b) * comp(a->key,b) > 0 := less(b,a) * comp(a->key,b) == 0 := !less(a,b) && !less(b,a) * * If these operators define a partial order on the elements we make no * guarantee on which of the elements matching the key is found. See * rb_find(). * * The reason for this is to allow the find() interface without requiring an * on-stack dummy object, which might not be feasible due to object size. */ /** * rb_add_cached() - insert @node into the leftmost cached tree @tree * @node: node to insert * @tree: leftmost cached tree to insert @node into * @less: operator defining the (partial) node order * * Returns @node when it is the new leftmost, or NULL. */ static __always_inline struct rb_node * rb_add_cached(struct rb_node *node, struct rb_root_cached *tree, bool (*less)(struct rb_node *, const struct rb_node *)) { struct rb_node **link = &tree->rb_root.rb_node; struct rb_node *parent = NULL; bool leftmost = true; while (*link) { parent = *link; if (less(node, parent)) { link = &parent->rb_left; } else { link = &parent->rb_right; leftmost = false; } } rb_link_node(node, parent, link); rb_insert_color_cached(node, tree, leftmost); return leftmost ? node : NULL; } /** * rb_add() - insert @node into @tree * @node: node to insert * @tree: tree to insert @node into * @less: operator defining the (partial) node order */ static __always_inline void rb_add(struct rb_node *node, struct rb_root *tree, bool (*less)(struct rb_node *, const struct rb_node *)) { struct rb_node **link = &tree->rb_node; struct rb_node *parent = NULL; while (*link) { parent = *link; if (less(node, parent)) link = &parent->rb_left; else link = &parent->rb_right; } rb_link_node(node, parent, link); rb_insert_color(node, tree); } /** * rb_find_add_cached() - find equivalent @node in @tree, or add @node * @node: node to look-for / insert * @tree: tree to search / modify * @cmp: operator defining the node order * * Returns the rb_node matching @node, or NULL when no match is found and @node * is inserted. */ static __always_inline struct rb_node * rb_find_add_cached(struct rb_node *node, struct rb_root_cached *tree, int (*cmp)(const struct rb_node *new, const struct rb_node *exist)) { bool leftmost = true; struct rb_node **link = &tree->rb_root.rb_node; struct rb_node *parent = NULL; int c; while (*link) { parent = *link; c = cmp(node, parent); if (c < 0) { link = &parent->rb_left; } else if (c > 0) { link = &parent->rb_right; leftmost = false; } else { return parent; } } rb_link_node(node, parent, link); rb_insert_color_cached(node, tree, leftmost); return NULL; } /** * rb_find_add() - find equivalent @node in @tree, or add @node * @node: node to look-for / insert * @tree: tree to search / modify * @cmp: operator defining the node order * * Returns the rb_node matching @node, or NULL when no match is found and @node * is inserted. */ static __always_inline struct rb_node * rb_find_add(struct rb_node *node, struct rb_root *tree, int (*cmp)(struct rb_node *, const struct rb_node *)) { struct rb_node **link = &tree->rb_node; struct rb_node *parent = NULL; int c; while (*link) { parent = *link; c = cmp(node, parent); if (c < 0) link = &parent->rb_left; else if (c > 0) link = &parent->rb_right; else return parent; } rb_link_node(node, parent, link); rb_insert_color(node, tree); return NULL; } /** * rb_find_add_rcu() - find equivalent @node in @tree, or add @node * @node: node to look-for / insert * @tree: tree to search / modify * @cmp: operator defining the node order * * Adds a Store-Release for link_node. * * Returns the rb_node matching @node, or NULL when no match is found and @node * is inserted. */ static __always_inline struct rb_node * rb_find_add_rcu(struct rb_node *node, struct rb_root *tree, int (*cmp)(struct rb_node *, const struct rb_node *)) { struct rb_node **link = &tree->rb_node; struct rb_node *parent = NULL; int c; while (*link) { parent = *link; c = cmp(node, parent); if (c < 0) link = &parent->rb_left; else if (c > 0) link = &parent->rb_right; else return parent; } rb_link_node_rcu(node, parent, link); rb_insert_color(node, tree); return NULL; } /** * rb_find() - find @key in tree @tree * @key: key to match * @tree: tree to search * @cmp: operator defining the node order * * Returns the rb_node matching @key or NULL. */ static __always_inline struct rb_node * rb_find(const void *key, const struct rb_root *tree, int (*cmp)(const void *key, const struct rb_node *)) { struct rb_node *node = tree->rb_node; while (node) { int c = cmp(key, node); if (c < 0) node = node->rb_left; else if (c > 0) node = node->rb_right; else return node; } return NULL; } /** * rb_find_rcu() - find @key in tree @tree * @key: key to match * @tree: tree to search * @cmp: operator defining the node order * * Notably, tree descent vs concurrent tree rotations is unsound and can result * in false-negatives. * * Returns the rb_node matching @key or NULL. */ static __always_inline struct rb_node * rb_find_rcu(const void *key, const struct rb_root *tree, int (*cmp)(const void *key, const struct rb_node *)) { struct rb_node *node = tree->rb_node; while (node) { int c = cmp(key, node); if (c < 0) node = rcu_dereference_raw(node->rb_left); else if (c > 0) node = rcu_dereference_raw(node->rb_right); else return node; } return NULL; } /** * rb_find_first() - find the first @key in @tree * @key: key to match * @tree: tree to search * @cmp: operator defining node order * * Returns the leftmost node matching @key, or NULL. */ static __always_inline struct rb_node * rb_find_first(const void *key, const struct rb_root *tree, int (*cmp)(const void *key, const struct rb_node *)) { struct rb_node *node = tree->rb_node; struct rb_node *match = NULL; while (node) { int c = cmp(key, node); if (c <= 0) { if (!c) match = node; node = node->rb_left; } else if (c > 0) { node = node->rb_right; } } return match; } /** * rb_next_match() - find the next @key in @tree * @key: key to match * @tree: tree to search * @cmp: operator defining node order * * Returns the next node matching @key, or NULL. */ static __always_inline struct rb_node * rb_next_match(const void *key, struct rb_node *node, int (*cmp)(const void *key, const struct rb_node *)) { node = rb_next(node); if (node && cmp(key, node)) node = NULL; return node; } /** * rb_for_each() - iterates a subtree matching @key * @node: iterator * @key: key to match * @tree: tree to search * @cmp: operator defining node order */ #define rb_for_each(node, key, tree, cmp) \ for ((node) = rb_find_first((key), (tree), (cmp)); \ (node); (node) = rb_next_match((key), (node), (cmp))) #endif /* _LINUX_RBTREE_H */
21 20 5 5 21 15 15 15 16 7 7 12 12 11 6 6 6 6 6 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 1517 1518 1519 1520 1521 1522 1523 1524 1525 1526 1527 1528 1529 1530 1531 1532 1533 1534 1535 1536 1537 1538 1539 1540 1541 1542 1543 1544 1545 1546 1547 1548 1549 1550 1551 1552 1553 1554 1555 1556 1557 1558 1559 1560 1561 1562 1563 1564 1565 1566 1567 1568 1569 1570 1571 1572 1573 1574 1575 1576 1577 1578 1579 1580 1581 1582 1583 1584 1585 1586 1587 1588 1589 1590 1591 1592 1593 1594 1595 1596 1597 1598 1599 1600 1601 1602 1603 1604 1605 1606 1607 1608 1609 1610 1611 1612 1613 1614 1615 1616 1617 1618 1619 1620 1621 1622 1623 1624 1625 1626 1627 1628 1629 1630 1631 1632 1633 1634 1635 1636 1637 1638 1639 1640 1641 1642 1643 1644 1645 1646 1647 1648 1649 1650 1651 1652 1653 1654 1655 1656 1657 1658 1659 1660 1661 1662 1663 1664 1665 1666 1667 1668 1669 1670 1671 1672 1673 1674 1675 1676 1677 1678 1679 1680 1681 1682 1683 1684 1685 1686 1687 1688 1689 1690 1691 1692 1693 1694 1695 1696 1697 1698 1699 1700 1701 1702 1703 1704 1705 1706 1707 1708 1709 1710 1711 1712 1713 1714 1715 1716 1717 1718 1719 1720 1721 1722 1723 1724 1725 1726 1727 1728 1729 1730 1731 1732 1733 1734 1735 1736 1737 1738 1739 1740 1741 1742 1743 1744 1745 1746 1747 1748 1749 1750 1751 1752 1753 1754 1755 1756 1757 1758 1759 1760 1761 1762 1763 1764 1765 1766 1767 1768 1769 1770 1771 1772 1773 1774 1775 1776 1777 1778 1779 1780 1781 1782 1783 1784 1785 1786 1787 1788 1789 1790 1791 1792 1793 1794 1795 1796 1797 1798 1799 1800 1801 1802 1803 1804 1805 1806 1807 1808 1809 1810 1811 1812 1813 1814 1815 1816 1817 1818 1819 1820 1821 1822 1823 1824 1825 1826 1827 1828 1829 1830 1831 1832 1833 1834 1835 1836 1837 1838 1839 1840 1841 1842 1843 1844 1845 1846 1847 1848 1849 1850 1851 1852 1853 1854 1855 1856 1857 1858 1859 1860 1861 1862 1863 1864 1865 1866 1867 1868 1869 1870 1871 1872 1873 1874 1875 1876 1877 1878 1879 1880 1881 1882 1883 1884 1885 1886 1887 1888 1889 1890 1891 1892 1893 1894 1895 1896 1897 1898 1899 1900 1901 1902 1903 1904 1905 1906 1907 1908 1909 1910 1911 1912 1913 1914 1915 1916 1917 1918 1919 1920 1921 1922 1923 1924 1925 1926 1927 1928 1929 1930 // SPDX-License-Identifier: GPL-2.0-only #include <linux/export.h> #include <linux/bvec.h> #include <linux/fault-inject-usercopy.h> #include <linux/uio.h> #include <linux/pagemap.h> #include <linux/highmem.h> #include <linux/slab.h> #include <linux/vmalloc.h> #include <linux/splice.h> #include <linux/compat.h> #include <linux/scatterlist.h> #include <linux/instrumented.h> #include <linux/iov_iter.h> static __always_inline size_t copy_to_user_iter(void __user *iter_to, size_t progress, size_t len, void *from, void *priv2) { if (should_fail_usercopy()) return len; if (access_ok(iter_to, len)) { from += progress; instrument_copy_to_user(iter_to, from, len); len = raw_copy_to_user(iter_to, from, len); } return len; } static __always_inline size_t copy_to_user_iter_nofault(void __user *iter_to, size_t progress, size_t len, void *from, void *priv2) { ssize_t res; if (should_fail_usercopy()) return len; from += progress; res = copy_to_user_nofault(iter_to, from, len); return res < 0 ? len : res; } static __always_inline size_t copy_from_user_iter(void __user *iter_from, size_t progress, size_t len, void *to, void *priv2) { size_t res = len; if (should_fail_usercopy()) return len; if (access_ok(iter_from, len)) { to += progress; instrument_copy_from_user_before(to, iter_from, len); res = raw_copy_from_user(to, iter_from, len); instrument_copy_from_user_after(to, iter_from, len, res); } return res; } static __always_inline size_t memcpy_to_iter(void *iter_to, size_t progress, size_t len, void *from, void *priv2) { memcpy(iter_to, from + progress, len); return 0; } static __always_inline size_t memcpy_from_iter(void *iter_from, size_t progress, size_t len, void *to, void *priv2) { memcpy(to + progress, iter_from, len); return 0; } /* * fault_in_iov_iter_readable - fault in iov iterator for reading * @i: iterator * @size: maximum length * * Fault in one or more iovecs of the given iov_iter, to a maximum length of * @size. For each iovec, fault in each page that constitutes the iovec. * * Returns the number of bytes not faulted in (like copy_to_user() and * copy_from_user()). * * Always returns 0 for non-userspace iterators. */ size_t fault_in_iov_iter_readable(const struct iov_iter *i, size_t size) { if (iter_is_ubuf(i)) { size_t n = min(size, iov_iter_count(i)); n -= fault_in_readable(i->ubuf + i->iov_offset, n); return size - n; } else if (iter_is_iovec(i)) { size_t count = min(size, iov_iter_count(i)); const struct iovec *p; size_t skip; size -= count; for (p = iter_iov(i), skip = i->iov_offset; count; p++, skip = 0) { size_t len = min(count, p->iov_len - skip); size_t ret; if (unlikely(!len)) continue; ret = fault_in_readable(p->iov_base + skip, len); count -= len - ret; if (ret) break; } return count + size; } return 0; } EXPORT_SYMBOL(fault_in_iov_iter_readable); /* * fault_in_iov_iter_writeable - fault in iov iterator for writing * @i: iterator * @size: maximum length * * Faults in the iterator using get_user_pages(), i.e., without triggering * hardware page faults. This is primarily useful when we already know that * some or all of the pages in @i aren't in memory. * * Returns the number of bytes not faulted in, like copy_to_user() and * copy_from_user(). * * Always returns 0 for non-user-space iterators. */ size_t fault_in_iov_iter_writeable(const struct iov_iter *i, size_t size) { if (iter_is_ubuf(i)) { size_t n = min(size, iov_iter_count(i)); n -= fault_in_safe_writeable(i->ubuf + i->iov_offset, n); return size - n; } else if (iter_is_iovec(i)) { size_t count = min(size, iov_iter_count(i)); const struct iovec *p; size_t skip; size -= count; for (p = iter_iov(i), skip = i->iov_offset; count; p++, skip = 0) { size_t len = min(count, p->iov_len - skip); size_t ret; if (unlikely(!len)) continue; ret = fault_in_safe_writeable(p->iov_base + skip, len); count -= len - ret; if (ret) break; } return count + size; } return 0; } EXPORT_SYMBOL(fault_in_iov_iter_writeable); void iov_iter_init(struct iov_iter *i, unsigned int direction, const struct iovec *iov, unsigned long nr_segs, size_t count) { WARN_ON(direction & ~(READ | WRITE)); *i = (struct iov_iter) { .iter_type = ITER_IOVEC, .nofault = false, .data_source = direction, .__iov = iov, .nr_segs = nr_segs, .iov_offset = 0, .count = count }; } EXPORT_SYMBOL(iov_iter_init); size_t _copy_to_iter(const void *addr, size_t bytes, struct iov_iter *i) { if (WARN_ON_ONCE(i->data_source)) return 0; if (user_backed_iter(i)) might_fault(); return iterate_and_advance(i, bytes, (void *)addr, copy_to_user_iter, memcpy_to_iter); } EXPORT_SYMBOL(_copy_to_iter); #ifdef CONFIG_ARCH_HAS_COPY_MC static __always_inline size_t copy_to_user_iter_mc(void __user *iter_to, size_t progress, size_t len, void *from, void *priv2) { if (access_ok(iter_to, len)) { from += progress; instrument_copy_to_user(iter_to, from, len); len = copy_mc_to_user(iter_to, from, len); } return len; } static __always_inline size_t memcpy_to_iter_mc(void *iter_to, size_t progress, size_t len, void *from, void *priv2) { return copy_mc_to_kernel(iter_to, from + progress, len); } /** * _copy_mc_to_iter - copy to iter with source memory error exception handling * @addr: source kernel address * @bytes: total transfer length * @i: destination iterator * * The pmem driver deploys this for the dax operation * (dax_copy_to_iter()) for dax reads (bypass page-cache and the * block-layer). Upon #MC read(2) aborts and returns EIO or the bytes * successfully copied. * * The main differences between this and typical _copy_to_iter(). * * * Typical tail/residue handling after a fault retries the copy * byte-by-byte until the fault happens again. Re-triggering machine * checks is potentially fatal so the implementation uses source * alignment and poison alignment assumptions to avoid re-triggering * hardware exceptions. * * * ITER_KVEC and ITER_BVEC can return short copies. Compare to * copy_to_iter() where only ITER_IOVEC attempts might return a short copy. * * Return: number of bytes copied (may be %0) */ size_t _copy_mc_to_iter(const void *addr, size_t bytes, struct iov_iter *i) { if (WARN_ON_ONCE(i->data_source)) return 0; if (user_backed_iter(i)) might_fault(); return iterate_and_advance(i, bytes, (void *)addr, copy_to_user_iter_mc, memcpy_to_iter_mc); } EXPORT_SYMBOL_GPL(_copy_mc_to_iter); #endif /* CONFIG_ARCH_HAS_COPY_MC */ static __always_inline size_t __copy_from_iter(void *addr, size_t bytes, struct iov_iter *i) { return iterate_and_advance(i, bytes, addr, copy_from_user_iter, memcpy_from_iter); } size_t _copy_from_iter(void *addr, size_t bytes, struct iov_iter *i) { if (WARN_ON_ONCE(!i->data_source)) return 0; if (user_backed_iter(i)) might_fault(); return __copy_from_iter(addr, bytes, i); } EXPORT_SYMBOL(_copy_from_iter); static __always_inline size_t copy_from_user_iter_nocache(void __user *iter_from, size_t progress, size_t len, void *to, void *priv2) { return __copy_from_user_inatomic_nocache(to + progress, iter_from, len); } size_t _copy_from_iter_nocache(void *addr, size_t bytes, struct iov_iter *i) { if (WARN_ON_ONCE(!i->data_source)) return 0; return iterate_and_advance(i, bytes, addr, copy_from_user_iter_nocache, memcpy_from_iter); } EXPORT_SYMBOL(_copy_from_iter_nocache); #ifdef CONFIG_ARCH_HAS_UACCESS_FLUSHCACHE static __always_inline size_t copy_from_user_iter_flushcache(void __user *iter_from, size_t progress, size_t len, void *to, void *priv2) { return __copy_from_user_flushcache(to + progress, iter_from, len); } static __always_inline size_t memcpy_from_iter_flushcache(void *iter_from, size_t progress, size_t len, void *to, void *priv2) { memcpy_flushcache(to + progress, iter_from, len); return 0; } /** * _copy_from_iter_flushcache - write destination through cpu cache * @addr: destination kernel address * @bytes: total transfer length * @i: source iterator * * The pmem driver arranges for filesystem-dax to use this facility via * dax_copy_from_iter() for ensuring that writes to persistent memory * are flushed through the CPU cache. It is differentiated from * _copy_from_iter_nocache() in that guarantees all data is flushed for * all iterator types. The _copy_from_iter_nocache() only attempts to * bypass the cache for the ITER_IOVEC case, and on some archs may use * instructions that strand dirty-data in the cache. * * Return: number of bytes copied (may be %0) */ size_t _copy_from_iter_flushcache(void *addr, size_t bytes, struct iov_iter *i) { if (WARN_ON_ONCE(!i->data_source)) return 0; return iterate_and_advance(i, bytes, addr, copy_from_user_iter_flushcache, memcpy_from_iter_flushcache); } EXPORT_SYMBOL_GPL(_copy_from_iter_flushcache); #endif static inline bool page_copy_sane(struct page *page, size_t offset, size_t n) { struct page *head; size_t v = n + offset; /* * The general case needs to access the page order in order * to compute the page size. * However, we mostly deal with order-0 pages and thus can * avoid a possible cache line miss for requests that fit all * page orders. */ if (n <= v && v <= PAGE_SIZE) return true; head = compound_head(page); v += (page - head) << PAGE_SHIFT; if (WARN_ON(n > v || v > page_size(head))) return false; return true; } size_t copy_page_to_iter(struct page *page, size_t offset, size_t bytes, struct iov_iter *i) { size_t res = 0; if (!page_copy_sane(page, offset, bytes)) return 0; if (WARN_ON_ONCE(i->data_source)) return 0; page += offset / PAGE_SIZE; // first subpage offset %= PAGE_SIZE; while (1) { void *kaddr = kmap_local_page(page); size_t n = min(bytes, (size_t)PAGE_SIZE - offset); n = _copy_to_iter(kaddr + offset, n, i); kunmap_local(kaddr); res += n; bytes -= n; if (!bytes || !n) break; offset += n; if (offset == PAGE_SIZE) { page++; offset = 0; } } return res; } EXPORT_SYMBOL(copy_page_to_iter); size_t copy_page_to_iter_nofault(struct page *page, unsigned offset, size_t bytes, struct iov_iter *i) { size_t res = 0; if (!page_copy_sane(page, offset, bytes)) return 0; if (WARN_ON_ONCE(i->data_source)) return 0; page += offset / PAGE_SIZE; // first subpage offset %= PAGE_SIZE; while (1) { void *kaddr = kmap_local_page(page); size_t n = min(bytes, (size_t)PAGE_SIZE - offset); n = iterate_and_advance(i, n, kaddr + offset, copy_to_user_iter_nofault, memcpy_to_iter); kunmap_local(kaddr); res += n; bytes -= n; if (!bytes || !n) break; offset += n; if (offset == PAGE_SIZE) { page++; offset = 0; } } return res; } EXPORT_SYMBOL(copy_page_to_iter_nofault); size_t copy_page_from_iter(struct page *page, size_t offset, size_t bytes, struct iov_iter *i) { size_t res = 0; if (!page_copy_sane(page, offset, bytes)) return 0; page += offset / PAGE_SIZE; // first subpage offset %= PAGE_SIZE; while (1) { void *kaddr = kmap_local_page(page); size_t n = min(bytes, (size_t)PAGE_SIZE - offset); n = _copy_from_iter(kaddr + offset, n, i); kunmap_local(kaddr); res += n; bytes -= n; if (!bytes || !n) break; offset += n; if (offset == PAGE_SIZE) { page++; offset = 0; } } return res; } EXPORT_SYMBOL(copy_page_from_iter); static __always_inline size_t zero_to_user_iter(void __user *iter_to, size_t progress, size_t len, void *priv, void *priv2) { return clear_user(iter_to, len); } static __always_inline size_t zero_to_iter(void *iter_to, size_t progress, size_t len, void *priv, void *priv2) { memset(iter_to, 0, len); return 0; } size_t iov_iter_zero(size_t bytes, struct iov_iter *i) { return iterate_and_advance(i, bytes, NULL, zero_to_user_iter, zero_to_iter); } EXPORT_SYMBOL(iov_iter_zero); size_t copy_folio_from_iter_atomic(struct folio *folio, size_t offset, size_t bytes, struct iov_iter *i) { size_t n, copied = 0; if (!page_copy_sane(&folio->page, offset, bytes)) return 0; if (WARN_ON_ONCE(!i->data_source)) return 0; do { char *to = kmap_local_folio(folio, offset); n = bytes - copied; if (folio_test_partial_kmap(folio) && n > PAGE_SIZE - offset_in_page(offset)) n = PAGE_SIZE - offset_in_page(offset); pagefault_disable(); n = __copy_from_iter(to, n, i); pagefault_enable(); kunmap_local(to); copied += n; offset += n; } while (copied != bytes && n > 0); return copied; } EXPORT_SYMBOL(copy_folio_from_iter_atomic); static void iov_iter_bvec_advance(struct iov_iter *i, size_t size) { const struct bio_vec *bvec, *end; if (!i->count) return; i->count -= size; size += i->iov_offset; for (bvec = i->bvec, end = bvec + i->nr_segs; bvec < end; bvec++) { if (likely(size < bvec->bv_len)) break; size -= bvec->bv_len; } i->iov_offset = size; i->nr_segs -= bvec - i->bvec; i->bvec = bvec; } static void iov_iter_iovec_advance(struct iov_iter *i, size_t size) { const struct iovec *iov, *end; if (!i->count) return; i->count -= size; size += i->iov_offset; // from beginning of current segment for (iov = iter_iov(i), end = iov + i->nr_segs; iov < end; iov++) { if (likely(size < iov->iov_len)) break; size -= iov->iov_len; } i->iov_offset = size; i->nr_segs -= iov - iter_iov(i); i->__iov = iov; } static void iov_iter_folioq_advance(struct iov_iter *i, size_t size) { const struct folio_queue *folioq = i->folioq; unsigned int slot = i->folioq_slot; if (!i->count) return; i->count -= size; if (slot >= folioq_nr_slots(folioq)) { folioq = folioq->next; slot = 0; } size += i->iov_offset; /* From beginning of current segment. */ do { size_t fsize = folioq_folio_size(folioq, slot); if (likely(size < fsize)) break; size -= fsize; slot++; if (slot >= folioq_nr_slots(folioq) && folioq->next) { folioq = folioq->next; slot = 0; } } while (size); i->iov_offset = size; i->folioq_slot = slot; i->folioq = folioq; } void iov_iter_advance(struct iov_iter *i, size_t size) { if (unlikely(i->count < size)) size = i->count; if (likely(iter_is_ubuf(i)) || unlikely(iov_iter_is_xarray(i))) { i->iov_offset += size; i->count -= size; } else if (likely(iter_is_iovec(i) || iov_iter_is_kvec(i))) { /* iovec and kvec have identical layouts */ iov_iter_iovec_advance(i, size); } else if (iov_iter_is_bvec(i)) { iov_iter_bvec_advance(i, size); } else if (iov_iter_is_folioq(i)) { iov_iter_folioq_advance(i, size); } else if (iov_iter_is_discard(i)) { i->count -= size; } } EXPORT_SYMBOL(iov_iter_advance); static void iov_iter_folioq_revert(struct iov_iter *i, size_t unroll) { const struct folio_queue *folioq = i->folioq; unsigned int slot = i->folioq_slot; for (;;) { size_t fsize; if (slot == 0) { folioq = folioq->prev; slot = folioq_nr_slots(folioq); } slot--; fsize = folioq_folio_size(folioq, slot); if (unroll <= fsize) { i->iov_offset = fsize - unroll; break; } unroll -= fsize; } i->folioq_slot = slot; i->folioq = folioq; } void iov_iter_revert(struct iov_iter *i, size_t unroll) { if (!unroll) return; if (WARN_ON(unroll > MAX_RW_COUNT)) return; i->count += unroll; if (unlikely(iov_iter_is_discard(i))) return; if (unroll <= i->iov_offset) { i->iov_offset -= unroll; return; } unroll -= i->iov_offset; if (iov_iter_is_xarray(i) || iter_is_ubuf(i)) { BUG(); /* We should never go beyond the start of the specified * range since we might then be straying into pages that * aren't pinned. */ } else if (iov_iter_is_bvec(i)) { const struct bio_vec *bvec = i->bvec; while (1) { size_t n = (--bvec)->bv_len; i->nr_segs++; if (unroll <= n) { i->bvec = bvec; i->iov_offset = n - unroll; return; } unroll -= n; } } else if (iov_iter_is_folioq(i)) { i->iov_offset = 0; iov_iter_folioq_revert(i, unroll); } else { /* same logics for iovec and kvec */ const struct iovec *iov = iter_iov(i); while (1) { size_t n = (--iov)->iov_len; i->nr_segs++; if (unroll <= n) { i->__iov = iov; i->iov_offset = n - unroll; return; } unroll -= n; } } } EXPORT_SYMBOL(iov_iter_revert); /* * Return the count of just the current iov_iter segment. */ size_t iov_iter_single_seg_count(const struct iov_iter *i) { if (i->nr_segs > 1) { if (likely(iter_is_iovec(i) || iov_iter_is_kvec(i))) return min(i->count, iter_iov(i)->iov_len - i->iov_offset); if (iov_iter_is_bvec(i)) return min(i->count, i->bvec->bv_len - i->iov_offset); } if (unlikely(iov_iter_is_folioq(i))) return !i->count ? 0 : umin(folioq_folio_size(i->folioq, i->folioq_slot), i->count); return i->count; } EXPORT_SYMBOL(iov_iter_single_seg_count); void iov_iter_kvec(struct iov_iter *i, unsigned int direction, const struct kvec *kvec, unsigned long nr_segs, size_t count) { WARN_ON(direction & ~(READ | WRITE)); *i = (struct iov_iter){ .iter_type = ITER_KVEC, .data_source = direction, .kvec = kvec, .nr_segs = nr_segs, .iov_offset = 0, .count = count }; } EXPORT_SYMBOL(iov_iter_kvec); void iov_iter_bvec(struct iov_iter *i, unsigned int direction, const struct bio_vec *bvec, unsigned long nr_segs, size_t count) { WARN_ON(direction & ~(READ | WRITE)); *i = (struct iov_iter){ .iter_type = ITER_BVEC, .data_source = direction, .bvec = bvec, .nr_segs = nr_segs, .iov_offset = 0, .count = count }; } EXPORT_SYMBOL(iov_iter_bvec); /** * iov_iter_folio_queue - Initialise an I/O iterator to use the folios in a folio queue * @i: The iterator to initialise. * @direction: The direction of the transfer. * @folioq: The starting point in the folio queue. * @first_slot: The first slot in the folio queue to use * @offset: The offset into the folio in the first slot to start at * @count: The size of the I/O buffer in bytes. * * Set up an I/O iterator to either draw data out of the pages attached to an * inode or to inject data into those pages. The pages *must* be prevented * from evaporation, either by taking a ref on them or locking them by the * caller. */ void iov_iter_folio_queue(struct iov_iter *i, unsigned int direction, const struct folio_queue *folioq, unsigned int first_slot, unsigned int offset, size_t count) { BUG_ON(direction & ~1); *i = (struct iov_iter) { .iter_type = ITER_FOLIOQ, .data_source = direction, .folioq = folioq, .folioq_slot = first_slot, .count = count, .iov_offset = offset, }; } EXPORT_SYMBOL(iov_iter_folio_queue); /** * iov_iter_xarray - Initialise an I/O iterator to use the pages in an xarray * @i: The iterator to initialise. * @direction: The direction of the transfer. * @xarray: The xarray to access. * @start: The start file position. * @count: The size of the I/O buffer in bytes. * * Set up an I/O iterator to either draw data out of the pages attached to an * inode or to inject data into those pages. The pages *must* be prevented * from evaporation, either by taking a ref on them or locking them by the * caller. */ void iov_iter_xarray(struct iov_iter *i, unsigned int direction, struct xarray *xarray, loff_t start, size_t count) { BUG_ON(direction & ~1); *i = (struct iov_iter) { .iter_type = ITER_XARRAY, .data_source = direction, .xarray = xarray, .xarray_start = start, .count = count, .iov_offset = 0 }; } EXPORT_SYMBOL(iov_iter_xarray); /** * iov_iter_discard - Initialise an I/O iterator that discards data * @i: The iterator to initialise. * @direction: The direction of the transfer. * @count: The size of the I/O buffer in bytes. * * Set up an I/O iterator that just discards everything that's written to it. * It's only available as a READ iterator. */ void iov_iter_discard(struct iov_iter *i, unsigned int direction, size_t count) { BUG_ON(direction != READ); *i = (struct iov_iter){ .iter_type = ITER_DISCARD, .data_source = false, .count = count, .iov_offset = 0 }; } EXPORT_SYMBOL(iov_iter_discard); static bool iov_iter_aligned_iovec(const struct iov_iter *i, unsigned addr_mask, unsigned len_mask) { const struct iovec *iov = iter_iov(i); size_t size = i->count; size_t skip = i->iov_offset; do { size_t len = iov->iov_len - skip; if (len > size) len = size; if (len & len_mask) return false; if ((unsigned long)(iov->iov_base + skip) & addr_mask) return false; iov++; size -= len; skip = 0; } while (size); return true; } static bool iov_iter_aligned_bvec(const struct iov_iter *i, unsigned addr_mask, unsigned len_mask) { const struct bio_vec *bvec = i->bvec; unsigned skip = i->iov_offset; size_t size = i->count; do { size_t len = bvec->bv_len - skip; if (len > size) len = size; if (len & len_mask) return false; if ((unsigned long)(bvec->bv_offset + skip) & addr_mask) return false; bvec++; size -= len; skip = 0; } while (size); return true; } /** * iov_iter_is_aligned() - Check if the addresses and lengths of each segments * are aligned to the parameters. * * @i: &struct iov_iter to restore * @addr_mask: bit mask to check against the iov element's addresses * @len_mask: bit mask to check against the iov element's lengths * * Return: false if any addresses or lengths intersect with the provided masks */ bool iov_iter_is_aligned(const struct iov_iter *i, unsigned addr_mask, unsigned len_mask) { if (likely(iter_is_ubuf(i))) { if (i->count & len_mask) return false; if ((unsigned long)(i->ubuf + i->iov_offset) & addr_mask) return false; return true; } if (likely(iter_is_iovec(i) || iov_iter_is_kvec(i))) return iov_iter_aligned_iovec(i, addr_mask, len_mask); if (iov_iter_is_bvec(i)) return iov_iter_aligned_bvec(i, addr_mask, len_mask); /* With both xarray and folioq types, we're dealing with whole folios. */ if (iov_iter_is_xarray(i)) { if (i->count & len_mask) return false; if ((i->xarray_start + i->iov_offset) & addr_mask) return false; } if (iov_iter_is_folioq(i)) { if (i->count & len_mask) return false; if (i->iov_offset & addr_mask) return false; } return true; } EXPORT_SYMBOL_GPL(iov_iter_is_aligned); static unsigned long iov_iter_alignment_iovec(const struct iov_iter *i) { const struct iovec *iov = iter_iov(i); unsigned long res = 0; size_t size = i->count; size_t skip = i->iov_offset; do { size_t len = iov->iov_len - skip; if (len) { res |= (unsigned long)iov->iov_base + skip; if (len > size) len = size; res |= len; size -= len; } iov++; skip = 0; } while (size); return res; } static unsigned long iov_iter_alignment_bvec(const struct iov_iter *i) { const struct bio_vec *bvec = i->bvec; unsigned res = 0; size_t size = i->count; unsigned skip = i->iov_offset; do { size_t len = bvec->bv_len - skip; res |= (unsigned long)bvec->bv_offset + skip; if (len > size) len = size; res |= len; bvec++; size -= len; skip = 0; } while (size); return res; } unsigned long iov_iter_alignment(const struct iov_iter *i) { if (likely(iter_is_ubuf(i))) { size_t size = i->count; if (size) return ((unsigned long)i->ubuf + i->iov_offset) | size; return 0; } /* iovec and kvec have identical layouts */ if (likely(iter_is_iovec(i) || iov_iter_is_kvec(i))) return iov_iter_alignment_iovec(i); if (iov_iter_is_bvec(i)) return iov_iter_alignment_bvec(i); /* With both xarray and folioq types, we're dealing with whole folios. */ if (iov_iter_is_folioq(i)) return i->iov_offset | i->count; if (iov_iter_is_xarray(i)) return (i->xarray_start + i->iov_offset) | i->count; return 0; } EXPORT_SYMBOL(iov_iter_alignment); unsigned long iov_iter_gap_alignment(const struct iov_iter *i) { unsigned long res = 0; unsigned long v = 0; size_t size = i->count; unsigned k; if (iter_is_ubuf(i)) return 0; if (WARN_ON(!iter_is_iovec(i))) return ~0U; for (k = 0; k < i->nr_segs; k++) { const struct iovec *iov = iter_iov(i) + k; if (iov->iov_len) { unsigned long base = (unsigned long)iov->iov_base; if (v) // if not the first one res |= base | v; // this start | previous end v = base + iov->iov_len; if (size <= iov->iov_len) break; size -= iov->iov_len; } } return res; } EXPORT_SYMBOL(iov_iter_gap_alignment); static int want_pages_array(struct page ***res, size_t size, size_t start, unsigned int maxpages) { unsigned int count = DIV_ROUND_UP(size + start, PAGE_SIZE); if (count > maxpages) count = maxpages; WARN_ON(!count); // caller should've prevented that if (!*res) { *res = kvmalloc_array(count, sizeof(struct page *), GFP_KERNEL); if (!*res) return 0; } return count; } static ssize_t iter_folioq_get_pages(struct iov_iter *iter, struct page ***ppages, size_t maxsize, unsigned maxpages, size_t *_start_offset) { const struct folio_queue *folioq = iter->folioq; struct page **pages; unsigned int slot = iter->folioq_slot; size_t extracted = 0, count = iter->count, iov_offset = iter->iov_offset; if (slot >= folioq_nr_slots(folioq)) { folioq = folioq->next; slot = 0; if (WARN_ON(iov_offset != 0)) return -EIO; } maxpages = want_pages_array(ppages, maxsize, iov_offset & ~PAGE_MASK, maxpages); if (!maxpages) return -ENOMEM; *_start_offset = iov_offset & ~PAGE_MASK; pages = *ppages; for (;;) { struct folio *folio = folioq_folio(folioq, slot); size_t offset = iov_offset, fsize = folioq_folio_size(folioq, slot); size_t part = PAGE_SIZE - offset % PAGE_SIZE; if (offset < fsize) { part = umin(part, umin(maxsize - extracted, fsize - offset)); count -= part; iov_offset += part; extracted += part; *pages = folio_page(folio, offset / PAGE_SIZE); get_page(*pages); pages++; maxpages--; } if (maxpages == 0 || extracted >= maxsize) break; if (iov_offset >= fsize) { iov_offset = 0; slot++; if (slot == folioq_nr_slots(folioq) && folioq->next) { folioq = folioq->next; slot = 0; } } } iter->count = count; iter->iov_offset = iov_offset; iter->folioq = folioq; iter->folioq_slot = slot; return extracted; } static ssize_t iter_xarray_populate_pages(struct page **pages, struct xarray *xa, pgoff_t index, unsigned int nr_pages) { XA_STATE(xas, xa, index); struct folio *folio; unsigned int ret = 0; rcu_read_lock(); for (folio = xas_load(&xas); folio; folio = xas_next(&xas)) { if (xas_retry(&xas, folio)) continue; /* Has the folio moved or been split? */ if (unlikely(folio != xas_reload(&xas))) { xas_reset(&xas); continue; } pages[ret] = folio_file_page(folio, xas.xa_index); folio_get(folio); if (++ret == nr_pages) break; } rcu_read_unlock(); return ret; } static ssize_t iter_xarray_get_pages(struct iov_iter *i, struct page ***pages, size_t maxsize, unsigned maxpages, size_t *_start_offset) { unsigned nr, offset, count; pgoff_t index; loff_t pos; pos = i->xarray_start + i->iov_offset; index = pos >> PAGE_SHIFT; offset = pos & ~PAGE_MASK; *_start_offset = offset; count = want_pages_array(pages, maxsize, offset, maxpages); if (!count) return -ENOMEM; nr = iter_xarray_populate_pages(*pages, i->xarray, index, count); if (nr == 0) return 0; maxsize = min_t(size_t, nr * PAGE_SIZE - offset, maxsize); i->iov_offset += maxsize; i->count -= maxsize; return maxsize; } /* must be done on non-empty ITER_UBUF or ITER_IOVEC one */ static unsigned long first_iovec_segment(const struct iov_iter *i, size_t *size) { size_t skip; long k; if (iter_is_ubuf(i)) return (unsigned long)i->ubuf + i->iov_offset; for (k = 0, skip = i->iov_offset; k < i->nr_segs; k++, skip = 0) { const struct iovec *iov = iter_iov(i) + k; size_t len = iov->iov_len - skip; if (unlikely(!len)) continue; if (*size > len) *size = len; return (unsigned long)iov->iov_base + skip; } BUG(); // if it had been empty, we wouldn't get called } /* must be done on non-empty ITER_BVEC one */ static struct page *first_bvec_segment(const struct iov_iter *i, size_t *size, size_t *start) { struct page *page; size_t skip = i->iov_offset, len; len = i->bvec->bv_len - skip; if (*size > len) *size = len; skip += i->bvec->bv_offset; page = i->bvec->bv_page + skip / PAGE_SIZE; *start = skip % PAGE_SIZE; return page; } static ssize_t __iov_iter_get_pages_alloc(struct iov_iter *i, struct page ***pages, size_t maxsize, unsigned int maxpages, size_t *start) { unsigned int n, gup_flags = 0; if (maxsize > i->count) maxsize = i->count; if (!maxsize) return 0; if (maxsize > MAX_RW_COUNT) maxsize = MAX_RW_COUNT; if (likely(user_backed_iter(i))) { unsigned long addr; int res; if (iov_iter_rw(i) != WRITE) gup_flags |= FOLL_WRITE; if (i->nofault) gup_flags |= FOLL_NOFAULT; addr = first_iovec_segment(i, &maxsize); *start = addr % PAGE_SIZE; addr &= PAGE_MASK; n = want_pages_array(pages, maxsize, *start, maxpages); if (!n) return -ENOMEM; res = get_user_pages_fast(addr, n, gup_flags, *pages); if (unlikely(res <= 0)) return res; maxsize = min_t(size_t, maxsize, res * PAGE_SIZE - *start); iov_iter_advance(i, maxsize); return maxsize; } if (iov_iter_is_bvec(i)) { struct page **p; struct page *page; page = first_bvec_segment(i, &maxsize, start); n = want_pages_array(pages, maxsize, *start, maxpages); if (!n) return -ENOMEM; p = *pages; for (int k = 0; k < n; k++) { struct folio *folio = page_folio(page + k); p[k] = page + k; if (!folio_test_slab(folio)) folio_get(folio); } maxsize = min_t(size_t, maxsize, n * PAGE_SIZE - *start); i->count -= maxsize; i->iov_offset += maxsize; if (i->iov_offset == i->bvec->bv_len) { i->iov_offset = 0; i->bvec++; i->nr_segs--; } return maxsize; } if (iov_iter_is_folioq(i)) return iter_folioq_get_pages(i, pages, maxsize, maxpages, start); if (iov_iter_is_xarray(i)) return iter_xarray_get_pages(i, pages, maxsize, maxpages, start); return -EFAULT; } ssize_t iov_iter_get_pages2(struct iov_iter *i, struct page **pages, size_t maxsize, unsigned maxpages, size_t *start) { if (!maxpages) return 0; BUG_ON(!pages); return __iov_iter_get_pages_alloc(i, &pages, maxsize, maxpages, start); } EXPORT_SYMBOL(iov_iter_get_pages2); ssize_t iov_iter_get_pages_alloc2(struct iov_iter *i, struct page ***pages, size_t maxsize, size_t *start) { ssize_t len; *pages = NULL; len = __iov_iter_get_pages_alloc(i, pages, maxsize, ~0U, start); if (len <= 0) { kvfree(*pages); *pages = NULL; } return len; } EXPORT_SYMBOL(iov_iter_get_pages_alloc2); static int iov_npages(const struct iov_iter *i, int maxpages) { size_t skip = i->iov_offset, size = i->count; const struct iovec *p; int npages = 0; for (p = iter_iov(i); size; skip = 0, p++) { unsigned offs = offset_in_page(p->iov_base + skip); size_t len = min(p->iov_len - skip, size); if (len) { size -= len; npages += DIV_ROUND_UP(offs + len, PAGE_SIZE); if (unlikely(npages > maxpages)) return maxpages; } } return npages; } static int bvec_npages(const struct iov_iter *i, int maxpages) { size_t skip = i->iov_offset, size = i->count; const struct bio_vec *p; int npages = 0; for (p = i->bvec; size; skip = 0, p++) { unsigned offs = (p->bv_offset + skip) % PAGE_SIZE; size_t len = min(p->bv_len - skip, size); size -= len; npages += DIV_ROUND_UP(offs + len, PAGE_SIZE); if (unlikely(npages > maxpages)) return maxpages; } return npages; } int iov_iter_npages(const struct iov_iter *i, int maxpages) { if (unlikely(!i->count)) return 0; if (likely(iter_is_ubuf(i))) { unsigned offs = offset_in_page(i->ubuf + i->iov_offset); int npages = DIV_ROUND_UP(offs + i->count, PAGE_SIZE); return min(npages, maxpages); } /* iovec and kvec have identical layouts */ if (likely(iter_is_iovec(i) || iov_iter_is_kvec(i))) return iov_npages(i, maxpages); if (iov_iter_is_bvec(i)) return bvec_npages(i, maxpages); if (iov_iter_is_folioq(i)) { unsigned offset = i->iov_offset % PAGE_SIZE; int npages = DIV_ROUND_UP(offset + i->count, PAGE_SIZE); return min(npages, maxpages); } if (iov_iter_is_xarray(i)) { unsigned offset = (i->xarray_start + i->iov_offset) % PAGE_SIZE; int npages = DIV_ROUND_UP(offset + i->count, PAGE_SIZE); return min(npages, maxpages); } return 0; } EXPORT_SYMBOL(iov_iter_npages); const void *dup_iter(struct iov_iter *new, struct iov_iter *old, gfp_t flags) { *new = *old; if (iov_iter_is_bvec(new)) return new->bvec = kmemdup(new->bvec, new->nr_segs * sizeof(struct bio_vec), flags); else if (iov_iter_is_kvec(new) || iter_is_iovec(new)) /* iovec and kvec have identical layout */ return new->__iov = kmemdup(new->__iov, new->nr_segs * sizeof(struct iovec), flags); return NULL; } EXPORT_SYMBOL(dup_iter); static __noclone int copy_compat_iovec_from_user(struct iovec *iov, const struct iovec __user *uvec, u32 nr_segs) { const struct compat_iovec __user *uiov = (const struct compat_iovec __user *)uvec; int ret = -EFAULT; u32 i; if (!user_access_begin(uiov, nr_segs * sizeof(*uiov))) return -EFAULT; for (i = 0; i < nr_segs; i++) { compat_uptr_t buf; compat_ssize_t len; unsafe_get_user(len, &uiov[i].iov_len, uaccess_end); unsafe_get_user(buf, &uiov[i].iov_base, uaccess_end); /* check for compat_size_t not fitting in compat_ssize_t .. */ if (len < 0) { ret = -EINVAL; goto uaccess_end; } iov[i].iov_base = compat_ptr(buf); iov[i].iov_len = len; } ret = 0; uaccess_end: user_access_end(); return ret; } static __noclone int copy_iovec_from_user(struct iovec *iov, const struct iovec __user *uiov, unsigned long nr_segs) { int ret = -EFAULT; if (!user_access_begin(uiov, nr_segs * sizeof(*uiov))) return -EFAULT; do { void __user *buf; ssize_t len; unsafe_get_user(len, &uiov->iov_len, uaccess_end); unsafe_get_user(buf, &uiov->iov_base, uaccess_end); /* check for size_t not fitting in ssize_t .. */ if (unlikely(len < 0)) { ret = -EINVAL; goto uaccess_end; } iov->iov_base = buf; iov->iov_len = len; uiov++; iov++; } while (--nr_segs); ret = 0; uaccess_end: user_access_end(); return ret; } struct iovec *iovec_from_user(const struct iovec __user *uvec, unsigned long nr_segs, unsigned long fast_segs, struct iovec *fast_iov, bool compat) { struct iovec *iov = fast_iov; int ret; /* * SuS says "The readv() function *may* fail if the iovcnt argument was * less than or equal to 0, or greater than {IOV_MAX}. Linux has * traditionally returned zero for zero segments, so... */ if (nr_segs == 0) return iov; if (nr_segs > UIO_MAXIOV) return ERR_PTR(-EINVAL); if (nr_segs > fast_segs) { iov = kmalloc_array(nr_segs, sizeof(struct iovec), GFP_KERNEL); if (!iov) return ERR_PTR(-ENOMEM); } if (unlikely(compat)) ret = copy_compat_iovec_from_user(iov, uvec, nr_segs); else ret = copy_iovec_from_user(iov, uvec, nr_segs); if (ret) { if (iov != fast_iov) kfree(iov); return ERR_PTR(ret); } return iov; } /* * Single segment iovec supplied by the user, import it as ITER_UBUF. */ static ssize_t __import_iovec_ubuf(int type, const struct iovec __user *uvec, struct iovec **iovp, struct iov_iter *i, bool compat) { struct iovec *iov = *iovp; ssize_t ret; *iovp = NULL; if (compat) ret = copy_compat_iovec_from_user(iov, uvec, 1); else ret = copy_iovec_from_user(iov, uvec, 1); if (unlikely(ret)) return ret; ret = import_ubuf(type, iov->iov_base, iov->iov_len, i); if (unlikely(ret)) return ret; return i->count; } ssize_t __import_iovec(int type, const struct iovec __user *uvec, unsigned nr_segs, unsigned fast_segs, struct iovec **iovp, struct iov_iter *i, bool compat) { ssize_t total_len = 0; unsigned long seg; struct iovec *iov; if (nr_segs == 1) return __import_iovec_ubuf(type, uvec, iovp, i, compat); iov = iovec_from_user(uvec, nr_segs, fast_segs, *iovp, compat); if (IS_ERR(iov)) { *iovp = NULL; return PTR_ERR(iov); } /* * According to the Single Unix Specification we should return EINVAL if * an element length is < 0 when cast to ssize_t or if the total length * would overflow the ssize_t return value of the system call. * * Linux caps all read/write calls to MAX_RW_COUNT, and avoids the * overflow case. */ for (seg = 0; seg < nr_segs; seg++) { ssize_t len = (ssize_t)iov[seg].iov_len; if (!access_ok(iov[seg].iov_base, len)) { if (iov != *iovp) kfree(iov); *iovp = NULL; return -EFAULT; } if (len > MAX_RW_COUNT - total_len) { len = MAX_RW_COUNT - total_len; iov[seg].iov_len = len; } total_len += len; } iov_iter_init(i, type, iov, nr_segs, total_len); if (iov == *iovp) *iovp = NULL; else *iovp = iov; return total_len; } /** * import_iovec() - Copy an array of &struct iovec from userspace * into the kernel, check that it is valid, and initialize a new * &struct iov_iter iterator to access it. * * @type: One of %READ or %WRITE. * @uvec: Pointer to the userspace array. * @nr_segs: Number of elements in userspace array. * @fast_segs: Number of elements in @iov. * @iovp: (input and output parameter) Pointer to pointer to (usually small * on-stack) kernel array. * @i: Pointer to iterator that will be initialized on success. * * If the array pointed to by *@iov is large enough to hold all @nr_segs, * then this function places %NULL in *@iov on return. Otherwise, a new * array will be allocated and the result placed in *@iov. This means that * the caller may call kfree() on *@iov regardless of whether the small * on-stack array was used or not (and regardless of whether this function * returns an error or not). * * Return: Negative error code on error, bytes imported on success */ ssize_t import_iovec(int type, const struct iovec __user *uvec, unsigned nr_segs, unsigned fast_segs, struct iovec **iovp, struct iov_iter *i) { return __import_iovec(type, uvec, nr_segs, fast_segs, iovp, i, in_compat_syscall()); } EXPORT_SYMBOL(import_iovec); int import_ubuf(int rw, void __user *buf, size_t len, struct iov_iter *i) { if (len > MAX_RW_COUNT) len = MAX_RW_COUNT; if (unlikely(!access_ok(buf, len))) return -EFAULT; iov_iter_ubuf(i, rw, buf, len); return 0; } EXPORT_SYMBOL_GPL(import_ubuf); /** * iov_iter_restore() - Restore a &struct iov_iter to the same state as when * iov_iter_save_state() was called. * * @i: &struct iov_iter to restore * @state: state to restore from * * Used after iov_iter_save_state() to bring restore @i, if operations may * have advanced it. * * Note: only works on ITER_IOVEC, ITER_BVEC, and ITER_KVEC */ void iov_iter_restore(struct iov_iter *i, struct iov_iter_state *state) { if (WARN_ON_ONCE(!iov_iter_is_bvec(i) && !iter_is_iovec(i) && !iter_is_ubuf(i)) && !iov_iter_is_kvec(i)) return; i->iov_offset = state->iov_offset; i->count = state->count; if (iter_is_ubuf(i)) return; /* * For the *vec iters, nr_segs + iov is constant - if we increment * the vec, then we also decrement the nr_segs count. Hence we don't * need to track both of these, just one is enough and we can deduct * the other from that. ITER_KVEC and ITER_IOVEC are the same struct * size, so we can just increment the iov pointer as they are unionzed. * ITER_BVEC _may_ be the same size on some archs, but on others it is * not. Be safe and handle it separately. */ BUILD_BUG_ON(sizeof(struct iovec) != sizeof(struct kvec)); if (iov_iter_is_bvec(i)) i->bvec -= state->nr_segs - i->nr_segs; else i->__iov -= state->nr_segs - i->nr_segs; i->nr_segs = state->nr_segs; } /* * Extract a list of contiguous pages from an ITER_FOLIOQ iterator. This does * not get references on the pages, nor does it get a pin on them. */ static ssize_t iov_iter_extract_folioq_pages(struct iov_iter *i, struct page ***pages, size_t maxsize, unsigned int maxpages, iov_iter_extraction_t extraction_flags, size_t *offset0) { const struct folio_queue *folioq = i->folioq; struct page **p; unsigned int nr = 0; size_t extracted = 0, offset, slot = i->folioq_slot; if (slot >= folioq_nr_slots(folioq)) { folioq = folioq->next; slot = 0; if (WARN_ON(i->iov_offset != 0)) return -EIO; } offset = i->iov_offset & ~PAGE_MASK; *offset0 = offset; maxpages = want_pages_array(pages, maxsize, offset, maxpages); if (!maxpages) return -ENOMEM; p = *pages; for (;;) { struct folio *folio = folioq_folio(folioq, slot); size_t offset = i->iov_offset, fsize = folioq_folio_size(folioq, slot); size_t part = PAGE_SIZE - offset % PAGE_SIZE; if (offset < fsize) { part = umin(part, umin(maxsize - extracted, fsize - offset)); i->count -= part; i->iov_offset += part; extracted += part; p[nr++] = folio_page(folio, offset / PAGE_SIZE); } if (nr >= maxpages || extracted >= maxsize) break; if (i->iov_offset >= fsize) { i->iov_offset = 0; slot++; if (slot == folioq_nr_slots(folioq) && folioq->next) { folioq = folioq->next; slot = 0; } } } i->folioq = folioq; i->folioq_slot = slot; return extracted; } /* * Extract a list of contiguous pages from an ITER_XARRAY iterator. This does not * get references on the pages, nor does it get a pin on them. */ static ssize_t iov_iter_extract_xarray_pages(struct iov_iter *i, struct page ***pages, size_t maxsize, unsigned int maxpages, iov_iter_extraction_t extraction_flags, size_t *offset0) { struct page **p; struct folio *folio; unsigned int nr = 0, offset; loff_t pos = i->xarray_start + i->iov_offset; XA_STATE(xas, i->xarray, pos >> PAGE_SHIFT); offset = pos & ~PAGE_MASK; *offset0 = offset; maxpages = want_pages_array(pages, maxsize, offset, maxpages); if (!maxpages) return -ENOMEM; p = *pages; rcu_read_lock(); for (folio = xas_load(&xas); folio; folio = xas_next(&xas)) { if (xas_retry(&xas, folio)) continue; /* Has the folio moved or been split? */ if (unlikely(folio != xas_reload(&xas))) { xas_reset(&xas); continue; } p[nr++] = folio_file_page(folio, xas.xa_index); if (nr == maxpages) break; } rcu_read_unlock(); maxsize = min_t(size_t, nr * PAGE_SIZE - offset, maxsize); iov_iter_advance(i, maxsize); return maxsize; } /* * Extract a list of virtually contiguous pages from an ITER_BVEC iterator. * This does not get references on the pages, nor does it get a pin on them. */ static ssize_t iov_iter_extract_bvec_pages(struct iov_iter *i, struct page ***pages, size_t maxsize, unsigned int maxpages, iov_iter_extraction_t extraction_flags, size_t *offset0) { size_t skip = i->iov_offset, size = 0; struct bvec_iter bi; int k = 0; if (i->nr_segs == 0) return 0; if (i->iov_offset == i->bvec->bv_len) { i->iov_offset = 0; i->nr_segs--; i->bvec++; skip = 0; } bi.bi_idx = 0; bi.bi_size = maxsize; bi.bi_bvec_done = skip; maxpages = want_pages_array(pages, maxsize, skip, maxpages); while (bi.bi_size && bi.bi_idx < i->nr_segs) { struct bio_vec bv = bvec_iter_bvec(i->bvec, bi); /* * The iov_iter_extract_pages interface only allows an offset * into the first page. Break out of the loop if we see an * offset into subsequent pages, the caller will have to call * iov_iter_extract_pages again for the reminder. */ if (k) { if (bv.bv_offset) break; } else { *offset0 = bv.bv_offset; } (*pages)[k++] = bv.bv_page; size += bv.bv_len; if (k >= maxpages) break; /* * We are done when the end of the bvec doesn't align to a page * boundary as that would create a hole in the returned space. * The caller will handle this with another call to * iov_iter_extract_pages. */ if (bv.bv_offset + bv.bv_len != PAGE_SIZE) break; bvec_iter_advance_single(i->bvec, &bi, bv.bv_len); } iov_iter_advance(i, size); return size; } /* * Extract a list of virtually contiguous pages from an ITER_KVEC iterator. * This does not get references on the pages, nor does it get a pin on them. */ static ssize_t iov_iter_extract_kvec_pages(struct iov_iter *i, struct page ***pages, size_t maxsize, unsigned int maxpages, iov_iter_extraction_t extraction_flags, size_t *offset0) { struct page **p, *page; const void *kaddr; size_t skip = i->iov_offset, offset, len, size; int k; for (;;) { if (i->nr_segs == 0) return 0; size = min(maxsize, i->kvec->iov_len - skip); if (size) break; i->iov_offset = 0; i->nr_segs--; i->kvec++; skip = 0; } kaddr = i->kvec->iov_base + skip; offset = (unsigned long)kaddr & ~PAGE_MASK; *offset0 = offset; maxpages = want_pages_array(pages, size, offset, maxpages); if (!maxpages) return -ENOMEM; p = *pages; kaddr -= offset; len = offset + size; for (k = 0; k < maxpages; k++) { size_t seg = min_t(size_t, len, PAGE_SIZE); if (is_vmalloc_or_module_addr(kaddr)) page = vmalloc_to_page(kaddr); else page = virt_to_page(kaddr); p[k] = page; len -= seg; kaddr += PAGE_SIZE; } size = min_t(size_t, size, maxpages * PAGE_SIZE - offset); iov_iter_advance(i, size); return size; } /* * Extract a list of contiguous pages from a user iterator and get a pin on * each of them. This should only be used if the iterator is user-backed * (IOBUF/UBUF). * * It does not get refs on the pages, but the pages must be unpinned by the * caller once the transfer is complete. * * This is safe to be used where background IO/DMA *is* going to be modifying * the buffer; using a pin rather than a ref makes forces fork() to give the * child a copy of the page. */ static ssize_t iov_iter_extract_user_pages(struct iov_iter *i, struct page ***pages, size_t maxsize, unsigned int maxpages, iov_iter_extraction_t extraction_flags, size_t *offset0) { unsigned long addr; unsigned int gup_flags = 0; size_t offset; int res; if (i->data_source == ITER_DEST) gup_flags |= FOLL_WRITE; if (extraction_flags & ITER_ALLOW_P2PDMA) gup_flags |= FOLL_PCI_P2PDMA; if (i->nofault) gup_flags |= FOLL_NOFAULT; addr = first_iovec_segment(i, &maxsize); *offset0 = offset = addr % PAGE_SIZE; addr &= PAGE_MASK; maxpages = want_pages_array(pages, maxsize, offset, maxpages); if (!maxpages) return -ENOMEM; res = pin_user_pages_fast(addr, maxpages, gup_flags, *pages); if (unlikely(res <= 0)) return res; maxsize = min_t(size_t, maxsize, res * PAGE_SIZE - offset); iov_iter_advance(i, maxsize); return maxsize; } /** * iov_iter_extract_pages - Extract a list of contiguous pages from an iterator * @i: The iterator to extract from * @pages: Where to return the list of pages * @maxsize: The maximum amount of iterator to extract * @maxpages: The maximum size of the list of pages * @extraction_flags: Flags to qualify request * @offset0: Where to return the starting offset into (*@pages)[0] * * Extract a list of contiguous pages from the current point of the iterator, * advancing the iterator. The maximum number of pages and the maximum amount * of page contents can be set. * * If *@pages is NULL, a page list will be allocated to the required size and * *@pages will be set to its base. If *@pages is not NULL, it will be assumed * that the caller allocated a page list at least @maxpages in size and this * will be filled in. * * @extraction_flags can have ITER_ALLOW_P2PDMA set to request peer-to-peer DMA * be allowed on the pages extracted. * * The iov_iter_extract_will_pin() function can be used to query how cleanup * should be performed. * * Extra refs or pins on the pages may be obtained as follows: * * (*) If the iterator is user-backed (ITER_IOVEC/ITER_UBUF), pins will be * added to the pages, but refs will not be taken. * iov_iter_extract_will_pin() will return true. * * (*) If the iterator is ITER_KVEC, ITER_BVEC, ITER_FOLIOQ or ITER_XARRAY, the * pages are merely listed; no extra refs or pins are obtained. * iov_iter_extract_will_pin() will return 0. * * Note also: * * (*) Use with ITER_DISCARD is not supported as that has no content. * * On success, the function sets *@pages to the new pagelist, if allocated, and * sets *offset0 to the offset into the first page. * * It may also return -ENOMEM and -EFAULT. */ ssize_t iov_iter_extract_pages(struct iov_iter *i, struct page ***pages, size_t maxsize, unsigned int maxpages, iov_iter_extraction_t extraction_flags, size_t *offset0) { maxsize = min_t(size_t, min_t(size_t, maxsize, i->count), MAX_RW_COUNT); if (!maxsize) return 0; if (likely(user_backed_iter(i))) return iov_iter_extract_user_pages(i, pages, maxsize, maxpages, extraction_flags, offset0); if (iov_iter_is_kvec(i)) return iov_iter_extract_kvec_pages(i, pages, maxsize, maxpages, extraction_flags, offset0); if (iov_iter_is_bvec(i)) return iov_iter_extract_bvec_pages(i, pages, maxsize, maxpages, extraction_flags, offset0); if (iov_iter_is_folioq(i)) return iov_iter_extract_folioq_pages(i, pages, maxsize, maxpages, extraction_flags, offset0); if (iov_iter_is_xarray(i)) return iov_iter_extract_xarray_pages(i, pages, maxsize, maxpages, extraction_flags, offset0); return -EFAULT; } EXPORT_SYMBOL_GPL(iov_iter_extract_pages);
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 /* SPDX-License-Identifier: GPL-2.0 OR Linux-OpenIB */ /* * Copyright (c) 2016 Mellanox Technologies Ltd. All rights reserved. * Copyright (c) 2015 System Fabric Works, Inc. All rights reserved. */ #ifndef RXE_VERBS_H #define RXE_VERBS_H #include <linux/interrupt.h> #include <linux/workqueue.h> #include "rxe_pool.h" #include "rxe_task.h" #include "rxe_hw_counters.h" static inline int pkey_match(u16 key1, u16 key2) { return (((key1 & 0x7fff) != 0) && ((key1 & 0x7fff) == (key2 & 0x7fff)) && ((key1 & 0x8000) || (key2 & 0x8000))) ? 1 : 0; } /* Return >0 if psn_a > psn_b * 0 if psn_a == psn_b * <0 if psn_a < psn_b */ static inline int psn_compare(u32 psn_a, u32 psn_b) { s32 diff; diff = (psn_a - psn_b) << 8; return diff; } struct rxe_ucontext { struct ib_ucontext ibuc; struct rxe_pool_elem elem; }; struct rxe_pd { struct ib_pd ibpd; struct rxe_pool_elem elem; }; struct rxe_ah { struct ib_ah ibah; struct rxe_pool_elem elem; struct rxe_av av; bool is_user; int ah_num; }; struct rxe_cqe { union { struct ib_wc ibwc; struct ib_uverbs_wc uibwc; }; }; struct rxe_cq { struct ib_cq ibcq; struct rxe_pool_elem elem; struct rxe_queue *queue; spinlock_t cq_lock; u8 notify; bool is_user; atomic_t num_wq; }; enum wqe_state { wqe_state_posted, wqe_state_processing, wqe_state_pending, wqe_state_done, wqe_state_error, }; struct rxe_sq { int max_wr; int max_sge; int max_inline; spinlock_t sq_lock; /* guard queue */ struct rxe_queue *queue; }; struct rxe_rq { int max_wr; int max_sge; spinlock_t producer_lock; /* guard queue producer */ spinlock_t consumer_lock; /* guard queue consumer */ struct rxe_queue *queue; }; struct rxe_srq { struct ib_srq ibsrq; struct rxe_pool_elem elem; struct rxe_pd *pd; struct rxe_rq rq; u32 srq_num; int limit; int error; }; struct rxe_req_info { int wqe_index; u32 psn; int opcode; atomic_t rd_atomic; int wait_fence; int need_rd_atomic; int wait_psn; int need_retry; int wait_for_rnr_timer; int noack_pkts; int again; }; struct rxe_comp_info { u32 psn; int opcode; int timeout; int timeout_retry; int started_retry; u32 retry_cnt; u32 rnr_retry; }; /* responder states */ enum resp_states { RESPST_NONE, RESPST_GET_REQ, RESPST_CHK_PSN, RESPST_CHK_OP_SEQ, RESPST_CHK_OP_VALID, RESPST_CHK_RESOURCE, RESPST_CHK_LENGTH, RESPST_CHK_RKEY, RESPST_EXECUTE, RESPST_READ_REPLY, RESPST_ATOMIC_REPLY, RESPST_ATOMIC_WRITE_REPLY, RESPST_PROCESS_FLUSH, RESPST_COMPLETE, RESPST_ACKNOWLEDGE, RESPST_CLEANUP, RESPST_DUPLICATE_REQUEST, RESPST_ERR_MALFORMED_WQE, RESPST_ERR_UNSUPPORTED_OPCODE, RESPST_ERR_MISALIGNED_ATOMIC, RESPST_ERR_PSN_OUT_OF_SEQ, RESPST_ERR_MISSING_OPCODE_FIRST, RESPST_ERR_MISSING_OPCODE_LAST_C, RESPST_ERR_MISSING_OPCODE_LAST_D1E, RESPST_ERR_TOO_MANY_RDMA_ATM_REQ, RESPST_ERR_RNR, RESPST_ERR_RKEY_VIOLATION, RESPST_ERR_INVALIDATE_RKEY, RESPST_ERR_LENGTH, RESPST_ERR_CQ_OVERFLOW, RESPST_ERROR, RESPST_DONE, RESPST_EXIT, }; enum rdatm_res_state { rdatm_res_state_next, rdatm_res_state_new, rdatm_res_state_replay, }; struct resp_res { int type; int replay; u32 first_psn; u32 last_psn; u32 cur_psn; enum rdatm_res_state state; union { struct { u64 orig_val; } atomic; struct { u64 va_org; u32 rkey; u32 length; u64 va; u32 resid; } read; struct { u32 length; u64 va; u8 type; u8 level; } flush; }; }; struct rxe_resp_info { u32 msn; u32 psn; u32 ack_psn; int opcode; int drop_msg; int goto_error; int sent_psn_nak; enum ib_wc_status status; u8 aeth_syndrome; /* Receive only */ struct rxe_recv_wqe *wqe; /* RDMA read / atomic only */ u64 va; u64 offset; struct rxe_mr *mr; u32 resid; u32 rkey; u32 length; /* SRQ only */ struct { struct rxe_recv_wqe wqe; struct ib_sge sge[RXE_MAX_SGE]; } srq_wqe; /* Responder resources. It's a circular list where the oldest * resource is dropped first. */ struct resp_res *resources; unsigned int res_head; unsigned int res_tail; struct resp_res *res; }; struct rxe_qp { struct ib_qp ibqp; struct rxe_pool_elem elem; struct ib_qp_attr attr; unsigned int valid; unsigned int mtu; bool is_user; struct rxe_pd *pd; struct rxe_srq *srq; struct rxe_cq *scq; struct rxe_cq *rcq; enum ib_sig_type sq_sig_type; struct rxe_sq sq; struct rxe_rq rq; struct socket *sk; u32 dst_cookie; u16 src_port; struct rxe_av pri_av; struct rxe_av alt_av; atomic_t mcg_num; struct sk_buff_head req_pkts; struct sk_buff_head resp_pkts; struct rxe_task send_task; struct rxe_task recv_task; struct rxe_req_info req; struct rxe_comp_info comp; struct rxe_resp_info resp; atomic_t ssn; atomic_t skb_out; int need_req_skb; /* Timer for retranmitting packet when ACKs have been lost. RC * only. The requester sets it when it is not already * started. The responder resets it whenever an ack is * received. */ struct timer_list retrans_timer; u64 qp_timeout_jiffies; /* Timer for handling RNR NAKS. */ struct timer_list rnr_nak_timer; spinlock_t state_lock; /* guard requester and completer */ struct execute_work cleanup_work; }; enum { RXE_ACCESS_REMOTE = IB_ACCESS_REMOTE_READ | IB_ACCESS_REMOTE_WRITE | IB_ACCESS_REMOTE_ATOMIC, RXE_ACCESS_SUPPORTED_MR = RXE_ACCESS_REMOTE | IB_ACCESS_LOCAL_WRITE | IB_ACCESS_MW_BIND | IB_ACCESS_ON_DEMAND | IB_ACCESS_FLUSH_GLOBAL | IB_ACCESS_FLUSH_PERSISTENT | IB_ACCESS_OPTIONAL, RXE_ACCESS_SUPPORTED_QP = RXE_ACCESS_SUPPORTED_MR, RXE_ACCESS_SUPPORTED_MW = RXE_ACCESS_SUPPORTED_MR | IB_ZERO_BASED, }; enum rxe_mr_state { RXE_MR_STATE_INVALID, RXE_MR_STATE_FREE, RXE_MR_STATE_VALID, }; enum rxe_mr_copy_dir { RXE_TO_MR_OBJ, RXE_FROM_MR_OBJ, }; enum rxe_mr_lookup_type { RXE_LOOKUP_LOCAL, RXE_LOOKUP_REMOTE, }; enum rxe_rereg { RXE_MR_REREG_SUPPORTED = IB_MR_REREG_PD | IB_MR_REREG_ACCESS, }; static inline int rkey_is_mw(u32 rkey) { u32 index = rkey >> 8; return (index >= RXE_MIN_MW_INDEX) && (index <= RXE_MAX_MW_INDEX); } struct rxe_mr { struct rxe_pool_elem elem; struct ib_mr ibmr; struct ib_umem *umem; u32 lkey; u32 rkey; enum rxe_mr_state state; int access; atomic_t num_mw; unsigned int page_offset; unsigned int page_shift; u64 page_mask; u32 num_buf; u32 nbuf; struct xarray page_list; }; static inline unsigned int mr_page_size(struct rxe_mr *mr) { return mr ? mr->ibmr.page_size : PAGE_SIZE; } enum rxe_mw_state { RXE_MW_STATE_INVALID = RXE_MR_STATE_INVALID, RXE_MW_STATE_FREE = RXE_MR_STATE_FREE, RXE_MW_STATE_VALID = RXE_MR_STATE_VALID, }; struct rxe_mw { struct ib_mw ibmw; struct rxe_pool_elem elem; spinlock_t lock; enum rxe_mw_state state; struct rxe_qp *qp; /* Type 2 only */ struct rxe_mr *mr; u32 rkey; int access; u64 addr; u64 length; }; struct rxe_mcg { struct rb_node node; struct kref ref_cnt; struct rxe_dev *rxe; struct list_head qp_list; union ib_gid mgid; atomic_t qp_num; u32 qkey; u16 pkey; }; struct rxe_mca { struct list_head qp_list; struct rxe_qp *qp; }; struct rxe_port { struct ib_port_attr attr; __be64 port_guid; __be64 subnet_prefix; spinlock_t port_lock; /* guard port */ unsigned int mtu_cap; /* special QPs */ u32 qp_gsi_index; }; #define RXE_PORT 1 struct rxe_dev { struct ib_device ib_dev; struct ib_device_attr attr; int max_ucontext; int max_inline_data; struct mutex usdev_lock; char raw_gid[ETH_ALEN]; struct rxe_pool uc_pool; struct rxe_pool pd_pool; struct rxe_pool ah_pool; struct rxe_pool srq_pool; struct rxe_pool qp_pool; struct rxe_pool cq_pool; struct rxe_pool mr_pool; struct rxe_pool mw_pool; /* multicast support */ spinlock_t mcg_lock; struct rb_root mcg_tree; atomic_t mcg_num; atomic_t mcg_attach; spinlock_t pending_lock; /* guard pending_mmaps */ struct list_head pending_mmaps; spinlock_t mmap_offset_lock; /* guard mmap_offset */ u64 mmap_offset; atomic64_t stats_counters[RXE_NUM_OF_COUNTERS]; struct rxe_port port; }; static inline struct net_device *rxe_ib_device_get_netdev(struct ib_device *dev) { return ib_device_get_netdev(dev, RXE_PORT); } static inline void rxe_counter_inc(struct rxe_dev *rxe, enum rxe_counters index) { atomic64_inc(&rxe->stats_counters[index]); } static inline struct rxe_dev *to_rdev(struct ib_device *dev) { return dev ? container_of(dev, struct rxe_dev, ib_dev) : NULL; } static inline struct rxe_ucontext *to_ruc(struct ib_ucontext *uc) { return uc ? container_of(uc, struct rxe_ucontext, ibuc) : NULL; } static inline struct rxe_pd *to_rpd(struct ib_pd *pd) { return pd ? container_of(pd, struct rxe_pd, ibpd) : NULL; } static inline struct rxe_ah *to_rah(struct ib_ah *ah) { return ah ? container_of(ah, struct rxe_ah, ibah) : NULL; } static inline struct rxe_srq *to_rsrq(struct ib_srq *srq) { return srq ? container_of(srq, struct rxe_srq, ibsrq) : NULL; } static inline struct rxe_qp *to_rqp(struct ib_qp *qp) { return qp ? container_of(qp, struct rxe_qp, ibqp) : NULL; } static inline struct rxe_cq *to_rcq(struct ib_cq *cq) { return cq ? container_of(cq, struct rxe_cq, ibcq) : NULL; } static inline struct rxe_mr *to_rmr(struct ib_mr *mr) { return mr ? container_of(mr, struct rxe_mr, ibmr) : NULL; } static inline struct rxe_mw *to_rmw(struct ib_mw *mw) { return mw ? container_of(mw, struct rxe_mw, ibmw) : NULL; } static inline struct rxe_pd *rxe_ah_pd(struct rxe_ah *ah) { return to_rpd(ah->ibah.pd); } static inline struct rxe_pd *mr_pd(struct rxe_mr *mr) { return to_rpd(mr->ibmr.pd); } static inline struct rxe_pd *rxe_mw_pd(struct rxe_mw *mw) { return to_rpd(mw->ibmw.pd); } int rxe_register_device(struct rxe_dev *rxe, const char *ibdev_name, struct net_device *ndev); #endif /* RXE_VERBS_H */
2 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 /* SPDX-License-Identifier: GPL-2.0-only */ /* * Based on arch/arm/include/asm/cacheflush.h * * Copyright (C) 1999-2002 Russell King. * Copyright (C) 2012 ARM Ltd. */ #ifndef __ASM_CACHEFLUSH_H #define __ASM_CACHEFLUSH_H #include <linux/kgdb.h> #include <linux/mm.h> /* * This flag is used to indicate that the page pointed to by a pte is clean * and does not require cleaning before returning it to the user. */ #define PG_dcache_clean PG_arch_1 /* * MM Cache Management * =================== * * The arch/arm64/mm/cache.S implements these methods. * * Start addresses are inclusive and end addresses are exclusive; start * addresses should be rounded down, end addresses up. * * See Documentation/core-api/cachetlb.rst for more information. Please note that * the implementation assumes non-aliasing VIPT D-cache and (aliasing) * VIPT I-cache. * * All functions below apply to the interval [start, end) * - start - virtual start address (inclusive) * - end - virtual end address (exclusive) * * caches_clean_inval_pou(start, end) * * Ensure coherency between the I-cache and the D-cache region to * the Point of Unification. * * caches_clean_inval_user_pou(start, end) * * Ensure coherency between the I-cache and the D-cache region to * the Point of Unification. * Use only if the region might access user memory. * * icache_inval_pou(start, end) * * Invalidate I-cache region to the Point of Unification. * * dcache_clean_inval_poc(start, end) * * Clean and invalidate D-cache region to the Point of Coherency. * * dcache_inval_poc(start, end) * * Invalidate D-cache region to the Point of Coherency. * * dcache_clean_poc(start, end) * * Clean D-cache region to the Point of Coherency. * * dcache_clean_pop(start, end) * * Clean D-cache region to the Point of Persistence. * * dcache_clean_pou(start, end) * * Clean D-cache region to the Point of Unification. */ extern void caches_clean_inval_pou(unsigned long start, unsigned long end); extern void icache_inval_pou(unsigned long start, unsigned long end); extern void dcache_clean_inval_poc(unsigned long start, unsigned long end); extern void dcache_inval_poc(unsigned long start, unsigned long end); extern void dcache_clean_poc(unsigned long start, unsigned long end); extern void dcache_clean_pop(unsigned long start, unsigned long end); extern void dcache_clean_pou(unsigned long start, unsigned long end); extern long caches_clean_inval_user_pou(unsigned long start, unsigned long end); extern void sync_icache_aliases(unsigned long start, unsigned long end); static inline void flush_icache_range(unsigned long start, unsigned long end) { caches_clean_inval_pou(start, end); /* * IPI all online CPUs so that they undergo a context synchronization * event and are forced to refetch the new instructions. */ /* * KGDB performs cache maintenance with interrupts disabled, so we * will deadlock trying to IPI the secondary CPUs. In theory, we can * set CACHE_FLUSH_IS_SAFE to 0 to avoid this known issue, but that * just means that KGDB will elide the maintenance altogether! As it * turns out, KGDB uses IPIs to round-up the secondary CPUs during * the patching operation, so we don't need extra IPIs here anyway. * In which case, add a KGDB-specific bodge and return early. */ if (in_dbg_master()) return; kick_all_cpus_sync(); } #define flush_icache_range flush_icache_range /* * Copy user data from/to a page which is mapped into a different * processes address space. Really, we want to allow our "user * space" model to handle this. */ extern void copy_to_user_page(struct vm_area_struct *, struct page *, unsigned long, void *, const void *, unsigned long); #define copy_to_user_page copy_to_user_page /* * flush_dcache_folio is used when the kernel has written to the page * cache page at virtual address page->virtual. * * If this page isn't mapped (ie, folio_mapping == NULL), or it might * have userspace mappings, then we _must_ always clean + invalidate * the dcache entries associated with the kernel mapping. * * Otherwise we can defer the operation, and clean the cache when we are * about to change to user space. This is the same method as used on SPARC64. * See update_mmu_cache for the user space part. */ #define ARCH_IMPLEMENTS_FLUSH_DCACHE_PAGE 1 extern void flush_dcache_page(struct page *); void flush_dcache_folio(struct folio *); #define flush_dcache_folio flush_dcache_folio static __always_inline void icache_inval_all_pou(void) { if (alternative_has_cap_unlikely(ARM64_HAS_CACHE_DIC)) return; asm("ic ialluis"); dsb(ish); } #include <asm-generic/cacheflush.h> #endif /* __ASM_CACHEFLUSH_H */
14 13 14 6 6 6 14 8 14 14 8 8 6 6 6 6 6 6 6 6 6 13 9 2 12 2 3 3 11 11 3 8 11 2 13 13 8 3 8 14 8 14 14 13 13 13 13 13 13 13 13 13 13 13 12 14 13 14 14 13 13 13 13 14 13 13 13 13 15 13 12 15 14 14 14 8 14 14 12 14 14 14 3 3 14 13 14 14 14 14 14 14 13 14 14 14 13 8 14 14 3 3 3 3 3 3 3 3 3 3 3 3 8 8 14 11 11 11 3 8 8 8 8 8 8 8 8 8 8 8 8 14 11 11 14 14 13 8 1 2 2 3 3 3 3 3 3 3 1 2 1 2 2 3 3 3 10 10 10 8 8 8 8 8 8 8 3 8 8 8 8 3 8 8 14 14 14 12 8 14 14 14 14 14 14 14 14 13 14 14 14 14 14 14 14 14 14 14 14 14 14 14 9 10 14 3 3 3 3 3 3 3 3 7 3 7 7 8 8 8 8 8 8 14 14 14 14 8 8 8 14 14 14 14 14 14 14 14 14 14 14 14 14 8 14 14 13 8 8 8 8 8 3 3 3 14 14 14 14 14 14 14 13 14 14 14 14 14 14 2 2 2 2 2 2 2 2 2 5 3 2 1 1 1 13 13 14 13 13 14 13 12 12 12 3 1 2 2 2 2 14 3 3 3 3 3 2 3 3 3 3 3 3 1 2 3 14 14 14 14 14 3 3 3 3 3 3 3 3 3 2 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 5 5 5 8 8 8 8 8 3 3 3 3 3 3 3 215 215 8 3 3 8 8 8 8 3 3 14 14 13 14 14 14 14 13 14 14 14 14 14 14 13 14 14 14 14 14 14 14 14 14 14 13 12 11 14 14 14 14 14 14 14 13 14 14 14 14 14 14 14 14 14 14 14 14 13 14 13 14 14 14 14 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 1517 1518 1519 1520 1521 1522 1523 1524 1525 1526 1527 1528 1529 1530 1531 1532 1533 1534 1535 1536 1537 1538 1539 1540 1541 1542 1543 1544 1545 1546 1547 1548 1549 1550 1551 1552 1553 1554 1555 1556 1557 1558 1559 1560 1561 1562 1563 1564 1565 1566 1567 1568 1569 1570 1571 1572 1573 1574 1575 1576 1577 1578 1579 1580 1581 1582 1583 1584 1585 1586 1587 1588 1589 1590 1591 1592 1593 1594 1595 1596 1597 1598 1599 1600 1601 1602 1603 1604 1605 1606 1607 1608 1609 1610 1611 1612 1613 1614 1615 1616 1617 1618 1619 1620 1621 1622 1623 1624 1625 1626 1627 1628 1629 1630 1631 1632 1633 1634 1635 1636 1637 1638 1639 1640 1641 1642 1643 1644 1645 1646 1647 1648 1649 1650 1651 1652 1653 1654 1655 1656 1657 1658 1659 1660 1661 1662 1663 1664 1665 1666 1667 1668 1669 1670 1671 1672 1673 1674 1675 1676 1677 1678 1679 1680 1681 1682 1683 1684 1685 1686 1687 1688 1689 1690 1691 1692 1693 1694 1695 1696 1697 1698 1699 1700 1701 1702 1703 1704 1705 1706 1707 1708 1709 1710 1711 1712 1713 1714 1715 1716 1717 1718 1719 1720 1721 1722 1723 1724 1725 1726 1727 1728 1729 1730 1731 1732 1733 1734 1735 1736 1737 1738 1739 1740 1741 1742 1743 1744 1745 1746 1747 1748 1749 1750 1751 1752 1753 1754 1755 1756 1757 1758 1759 1760 1761 1762 1763 1764 1765 1766 1767 1768 1769 1770 1771 1772 1773 1774 1775 1776 1777 1778 1779 1780 1781 1782 1783 1784 1785 1786 1787 1788 1789 1790 1791 1792 1793 1794 1795 1796 1797 1798 1799 1800 1801 1802 1803 1804 1805 1806 1807 1808 1809 1810 1811 1812 1813 1814 1815 1816 1817 1818 1819 1820 1821 1822 1823 1824 1825 1826 1827 1828 1829 1830 1831 1832 1833 1834 1835 1836 1837 1838 1839 1840 1841 1842 1843 1844 1845 1846 1847 1848 1849 1850 1851 1852 1853 1854 1855 1856 1857 1858 1859 1860 1861 1862 1863 1864 1865 1866 1867 1868 1869 1870 1871 1872 1873 1874 1875 1876 1877 1878 1879 1880 1881 1882 1883 1884 1885 1886 1887 1888 1889 1890 1891 1892 1893 1894 1895 1896 1897 1898 1899 1900 1901 1902 1903 1904 1905 1906 1907 1908 1909 1910 1911 1912 1913 1914 1915 1916 1917 1918 1919 1920 1921 1922 1923 1924 1925 1926 1927 1928 1929 1930 1931 1932 1933 1934 1935 1936 1937 1938 1939 1940 1941 1942 1943 1944 1945 1946 1947 1948 1949 1950 1951 1952 1953 1954 1955 1956 1957 1958 1959 1960 1961 1962 1963 1964 1965 1966 1967 1968 1969 1970 1971 1972 1973 1974 1975 1976 1977 1978 1979 1980 1981 1982 1983 1984 1985 1986 1987 1988 1989 1990 1991 1992 1993 1994 1995 1996 1997 1998 1999 2000 2001 2002 2003 2004 2005 2006 2007 2008 2009 2010 2011 2012 2013 2014 2015 2016 2017 2018 2019 2020 2021 2022 2023 2024 2025 2026 2027 2028 2029 2030 2031 2032 2033 2034 2035 2036 2037 2038 2039 2040 2041 2042 2043 2044 2045 2046 2047 2048 2049 2050 2051 2052 2053 2054 2055 2056 2057 2058 2059 2060 2061 2062 2063 2064 2065 2066 2067 2068 2069 2070 2071 2072 2073 2074 2075 2076 2077 2078 2079 2080 2081 2082 2083 2084 2085 2086 2087 2088 2089 2090 2091 2092 2093 2094 2095 2096 2097 2098 2099 2100 2101 2102 2103 2104 2105 2106 2107 2108 2109 2110 2111 2112 2113 2114 2115 2116 2117 2118 2119 2120 2121 2122 2123 2124 2125 2126 2127 2128 2129 2130 2131 2132 2133 2134 2135 2136 2137 2138 2139 2140 2141 2142 2143 2144 2145 2146 2147 2148 2149 2150 2151 2152 2153 2154 2155 2156 2157 2158 2159 2160 2161 2162 2163 2164 2165 2166 2167 2168 2169 2170 2171 2172 2173 2174 2175 2176 2177 2178 2179 2180 2181 2182 2183 2184 2185 2186 2187 2188 2189 2190 2191 2192 2193 2194 2195 2196 2197 2198 2199 2200 2201 2202 2203 2204 2205 2206 2207 2208 2209 2210 2211 2212 2213 2214 2215 2216 2217 2218 2219 2220 2221 2222 2223 2224 2225 2226 2227 2228 2229 2230 2231 2232 2233 2234 2235 2236 2237 2238 2239 2240 2241 2242 2243 2244 2245 2246 2247 2248 2249 2250 2251 2252 2253 2254 2255 2256 2257 2258 2259 2260 2261 2262 2263 2264 2265 2266 2267 2268 2269 2270 2271 2272 2273 2274 2275 2276 2277 2278 2279 2280 2281 2282 2283 2284 2285 2286 2287 2288 2289 2290 2291 2292 2293 2294 2295 2296 2297 2298 2299 2300 2301 2302 2303 2304 2305 2306 2307 2308 2309 2310 2311 2312 2313 2314 2315 2316 2317 2318 2319 2320 2321 2322 2323 2324 2325 2326 2327 2328 2329 2330 2331 2332 2333 2334 2335 2336 2337 2338 2339 2340 2341 2342 2343 2344 2345 2346 2347 2348 2349 2350 2351 2352 2353 2354 2355 2356 2357 2358 2359 2360 2361 2362 2363 2364 2365 2366 2367 2368 2369 2370 2371 2372 2373 2374 2375 2376 2377 2378 2379 2380 2381 2382 2383 2384 2385 2386 2387 2388 2389 2390 2391 2392 2393 2394 2395 2396 2397 2398 2399 2400 2401 2402 2403 2404 2405 2406 2407 2408 2409 2410 2411 2412 2413 2414 2415 2416 2417 2418 2419 2420 2421 2422 2423 2424 2425 2426 2427 2428 2429 2430 2431 2432 2433 2434 2435 2436 2437 2438 2439 2440 2441 2442 2443 2444 2445 2446 2447 2448 2449 2450 2451 2452 2453 2454 2455 2456 2457 2458 2459 2460 2461 2462 2463 2464 2465 2466 2467 2468 2469 2470 2471 2472 2473 2474 2475 2476 2477 2478 2479 2480 2481 2482 2483 2484 2485 2486 2487 2488 2489 2490 2491 2492 2493 2494 2495 2496 2497 2498 2499 2500 2501 2502 2503 2504 2505 2506 2507 2508 2509 2510 2511 2512 2513 2514 2515 2516 2517 2518 2519 2520 2521 2522 2523 2524 2525 2526 2527 2528 2529 2530 2531 2532 2533 2534 2535 2536 2537 2538 2539 2540 2541 2542 2543 2544 2545 2546 2547 2548 2549 2550 2551 2552 2553 2554 2555 2556 2557 2558 2559 2560 2561 2562 2563 2564 2565 2566 2567 2568 2569 2570 2571 2572 2573 2574 2575 2576 2577 2578 2579 2580 2581 2582 2583 2584 2585 2586 2587 2588 2589 2590 2591 2592 2593 2594 2595 2596 2597 2598 2599 2600 2601 2602 2603 2604 2605 2606 2607 2608 2609 2610 2611 2612 2613 2614 2615 2616 2617 2618 2619 2620 2621 2622 2623 2624 2625 2626 2627 2628 2629 2630 2631 2632 2633 2634 2635 2636 2637 2638 2639 2640 2641 2642 2643 2644 2645 2646 2647 2648 2649 2650 2651 2652 2653 2654 2655 2656 2657 2658 2659 2660 2661 2662 2663 2664 2665 2666 2667 2668 2669 2670 2671 2672 2673 2674 2675 2676 2677 2678 2679 2680 2681 2682 2683 2684 2685 2686 2687 2688 2689 2690 2691 2692 2693 2694 2695 2696 2697 2698 2699 2700 2701 2702 2703 2704 2705 2706 2707 2708 2709 2710 2711 2712 2713 2714 2715 2716 2717 2718 2719 2720 2721 2722 2723 2724 2725 2726 2727 2728 2729 2730 2731 2732 2733 2734 2735 2736 2737 2738 2739 2740 2741 2742 2743 2744 2745 2746 2747 2748 2749 2750 2751 2752 2753 2754 2755 2756 2757 2758 2759 2760 2761 2762 2763 2764 2765 2766 2767 2768 2769 2770 2771 2772 2773 2774 2775 2776 2777 2778 2779 2780 2781 2782 2783 2784 2785 2786 2787 2788 2789 2790 2791 2792 2793 2794 2795 2796 2797 2798 2799 2800 2801 2802 2803 2804 2805 2806 2807 2808 2809 2810 2811 2812 2813 2814 2815 2816 2817 2818 2819 2820 2821 2822 2823 2824 2825 2826 2827 2828 2829 2830 2831 2832 2833 2834 2835 2836 2837 2838 2839 2840 2841 2842 2843 2844 2845 2846 2847 2848 2849 2850 2851 2852 2853 2854 2855 2856 2857 2858 2859 2860 2861 2862 2863 2864 2865 2866 2867 2868 2869 2870 2871 2872 2873 2874 2875 2876 2877 2878 2879 2880 2881 2882 2883 2884 2885 2886 2887 2888 2889 2890 2891 2892 2893 2894 2895 2896 2897 2898 2899 2900 2901 2902 2903 2904 2905 2906 2907 2908 2909 2910 2911 2912 2913 2914 2915 2916 2917 2918 2919 2920 2921 2922 2923 2924 2925 2926 2927 2928 2929 2930 2931 2932 2933 2934 2935 2936 2937 2938 2939 2940 2941 2942 2943 2944 2945 2946 2947 2948 2949 2950 2951 2952 2953 2954 2955 2956 2957 2958 2959 2960 2961 2962 2963 2964 2965 2966 2967 2968 2969 2970 2971 2972 2973 2974 2975 2976 2977 2978 2979 2980 2981 2982 2983 2984 2985 2986 2987 2988 2989 2990 2991 2992 2993 2994 2995 2996 2997 2998 2999 3000 3001 3002 3003 3004 3005 3006 3007 3008 3009 3010 3011 3012 3013 3014 3015 3016 3017 3018 3019 3020 3021 3022 3023 3024 3025 3026 3027 3028 3029 3030 3031 3032 3033 3034 3035 3036 3037 3038 3039 3040 3041 3042 3043 3044 3045 3046 3047 3048 3049 3050 3051 3052 3053 3054 3055 3056 3057 3058 3059 3060 3061 3062 3063 3064 3065 3066 3067 3068 3069 3070 3071 3072 3073 3074 3075 3076 3077 3078 3079 3080 3081 3082 3083 3084 3085 3086 3087 3088 3089 3090 3091 3092 3093 3094 3095 3096 3097 3098 3099 3100 3101 3102 3103 3104 3105 3106 3107 3108 3109 3110 3111 3112 3113 3114 3115 3116 3117 3118 3119 3120 3121 3122 3123 3124 3125 3126 3127 3128 3129 3130 3131 3132 3133 3134 3135 3136 3137 3138 3139 3140 3141 3142 3143 3144 3145 3146 3147 3148 3149 3150 3151 3152 3153 3154 3155 3156 3157 3158 3159 3160 3161 3162 3163 3164 3165 3166 3167 3168 3169 3170 3171 3172 3173 3174 3175 3176 3177 3178 3179 3180 3181 3182 3183 3184 3185 3186 3187 3188 3189 3190 3191 3192 3193 3194 3195 3196 3197 3198 3199 3200 3201 3202 3203 3204 3205 3206 3207 3208 3209 3210 3211 3212 3213 3214 3215 3216 3217 3218 3219 3220 3221 3222 3223 3224 3225 3226 3227 3228 3229 3230 3231 3232 3233 3234 3235 3236 3237 3238 3239 3240 3241 3242 3243 3244 3245 3246 3247 3248 3249 3250 3251 3252 3253 3254 3255 3256 3257 3258 3259 3260 3261 3262 3263 3264 3265 3266 3267 3268 3269 3270 3271 3272 3273 3274 3275 3276 3277 3278 3279 3280 3281 3282 3283 3284 3285 3286 3287 3288 3289 3290 3291 3292 3293 3294 3295 3296 3297 3298 3299 3300 3301 3302 3303 3304 3305 3306 3307 3308 3309 3310 3311 3312 3313 3314 3315 3316 3317 3318 3319 3320 3321 3322 3323 3324 3325 3326 3327 3328 3329 3330 3331 3332 3333 3334 3335 3336 3337 3338 3339 3340 3341 3342 3343 3344 3345 3346 3347 3348 3349 3350 3351 3352 3353 3354 3355 3356 3357 3358 3359 3360 3361 3362 3363 3364 3365 3366 3367 3368 3369 3370 3371 3372 3373 3374 3375 3376 3377 3378 3379 3380 3381 3382 3383 3384 3385 3386 3387 3388 3389 3390 3391 3392 3393 3394 3395 3396 3397 3398 3399 3400 3401 3402 3403 3404 3405 3406 3407 3408 3409 3410 3411 3412 3413 3414 3415 3416 3417 3418 3419 3420 3421 3422 3423 3424 3425 3426 3427 3428 3429 3430 3431 3432 3433 3434 3435 3436 3437 3438 3439 3440 3441 3442 3443 3444 3445 3446 3447 3448 3449 3450 3451 3452 3453 3454 3455 3456 3457 3458 3459 3460 3461 3462 3463 3464 3465 3466 3467 3468 3469 3470 3471 3472 3473 3474 3475 3476 3477 3478 3479 3480 3481 3482 3483 3484 3485 3486 3487 3488 3489 3490 3491 3492 3493 3494 3495 3496 3497 3498 3499 3500 3501 3502 3503 3504 3505 3506 3507 3508 3509 3510 3511 3512 3513 3514 3515 3516 3517 3518 3519 3520 3521 3522 3523 3524 3525 3526 3527 3528 3529 3530 3531 3532 3533 3534 3535 3536 3537 3538 3539 3540 3541 3542 3543 3544 3545 3546 3547 3548 3549 3550 3551 3552 3553 3554 3555 3556 3557 3558 3559 3560 3561 3562 3563 3564 3565 3566 3567 3568 3569 3570 3571 3572 3573 3574 3575 3576 3577 3578 3579 3580 3581 3582 3583 3584 3585 3586 3587 3588 3589 3590 3591 3592 3593 3594 3595 3596 3597 3598 3599 3600 3601 3602 3603 3604 3605 3606 3607 3608 3609 3610 3611 3612 3613 3614 3615 3616 3617 3618 3619 3620 3621 3622 3623 3624 3625 3626 3627 3628 3629 3630 3631 3632 3633 3634 3635 3636 3637 3638 3639 3640 3641 3642 3643 3644 3645 3646 3647 3648 3649 3650 3651 3652 3653 3654 3655 3656 3657 3658 3659 3660 3661 3662 3663 3664 3665 3666 3667 3668 3669 3670 3671 3672 3673 3674 3675 3676 3677 3678 3679 3680 3681 3682 3683 3684 3685 3686 3687 3688 3689 3690 3691 3692 3693 3694 3695 3696 3697 3698 3699 3700 3701 3702 3703 3704 3705 3706 3707 3708 3709 3710 3711 3712 3713 3714 3715 3716 3717 3718 3719 3720 3721 3722 3723 3724 3725 3726 3727 3728 3729 3730 3731 3732 3733 3734 3735 3736 3737 3738 3739 3740 3741 3742 3743 3744 3745 3746 3747 3748 3749 3750 3751 3752 3753 3754 3755 3756 3757 3758 3759 3760 3761 3762 3763 3764 3765 3766 3767 3768 3769 3770 3771 3772 3773 3774 3775 3776 3777 3778 3779 3780 3781 3782 3783 3784 3785 3786 3787 3788 3789 3790 3791 3792 3793 3794 3795 3796 3797 3798 3799 3800 3801 3802 3803 3804 3805 3806 3807 3808 3809 3810 3811 3812 3813 3814 3815 3816 3817 3818 3819 3820 3821 3822 3823 3824 3825 3826 3827 3828 3829 3830 3831 3832 3833 3834 3835 3836 3837 3838 3839 3840 3841 3842 3843 3844 3845 3846 3847 3848 3849 3850 3851 3852 3853 3854 3855 3856 3857 3858 3859 3860 3861 3862 3863 3864 3865 3866 3867 3868 3869 3870 3871 3872 3873 3874 3875 3876 3877 3878 3879 3880 3881 3882 3883 3884 3885 3886 3887 3888 3889 3890 3891 3892 3893 3894 3895 3896 3897 3898 3899 3900 3901 3902 3903 3904 3905 3906 3907 3908 3909 3910 3911 3912 3913 3914 3915 3916 3917 3918 3919 3920 3921 3922 3923 3924 3925 3926 3927 3928 3929 3930 3931 3932 3933 3934 3935 3936 3937 3938 3939 3940 3941 3942 3943 3944 3945 3946 3947 3948 3949 3950 3951 3952 3953 3954 3955 3956 3957 3958 3959 3960 3961 3962 3963 3964 3965 3966 3967 3968 3969 3970 3971 3972 3973 3974 3975 3976 3977 3978 3979 3980 3981 3982 3983 3984 3985 3986 3987 3988 3989 3990 3991 3992 3993 3994 3995 3996 3997 3998 3999 4000 4001 4002 4003 4004 4005 4006 4007 4008 4009 4010 4011 4012 4013 4014 4015 4016 4017 4018 4019 4020 4021 4022 4023 4024 4025 4026 4027 4028 4029 4030 4031 4032 4033 4034 4035 4036 4037 4038 4039 4040 4041 4042 4043 4044 4045 4046 4047 4048 4049 4050 4051 4052 4053 4054 4055 4056 4057 4058 4059 4060 4061 4062 4063 4064 4065 4066 4067 4068 4069 4070 4071 4072 4073 4074 4075 4076 4077 4078 4079 4080 4081 4082 4083 4084 4085 4086 4087 4088 4089 4090 4091 4092 4093 4094 4095 4096 4097 4098 4099 4100 4101 4102 4103 4104 4105 4106 4107 4108 4109 4110 4111 4112 4113 4114 4115 4116 4117 4118 4119 4120 4121 4122 4123 4124 4125 4126 4127 4128 4129 4130 4131 4132 4133 4134 4135 4136 4137 4138 4139 4140 4141 4142 4143 4144 4145 4146 4147 4148 4149 4150 4151 4152 4153 4154 4155 4156 4157 4158 4159 4160 4161 4162 4163 4164 4165 4166 4167 4168 4169 4170 4171 4172 4173 4174 4175 4176 4177 4178 4179 4180 4181 4182 4183 4184 4185 4186 4187 4188 4189 4190 4191 4192 4193 4194 4195 4196 4197 4198 4199 4200 4201 4202 4203 4204 4205 4206 4207 4208 4209 4210 4211 4212 4213 4214 4215 4216 4217 4218 4219 4220 4221 4222 4223 4224 4225 4226 4227 4228 4229 4230 4231 4232 4233 4234 4235 4236 4237 4238 4239 4240 4241 4242 4243 4244 4245 4246 4247 4248 4249 4250 4251 4252 4253 4254 4255 4256 4257 4258 4259 4260 4261 4262 4263 4264 4265 4266 4267 4268 4269 4270 4271 4272 4273 4274 4275 4276 4277 4278 4279 4280 4281 4282 4283 4284 4285 4286 4287 4288 4289 4290 4291 4292 4293 4294 4295 4296 4297 4298 4299 4300 4301 4302 4303 4304 4305 4306 4307 4308 4309 4310 4311 4312 4313 4314 4315 4316 4317 4318 4319 4320 4321 4322 4323 4324 4325 4326 4327 4328 4329 4330 4331 4332 4333 4334 4335 4336 4337 4338 4339 4340 4341 4342 4343 4344 4345 4346 4347 4348 4349 4350 4351 4352 4353 4354 4355 4356 4357 4358 4359 4360 4361 4362 4363 4364 4365 4366 4367 4368 4369 4370 4371 4372 4373 4374 4375 4376 4377 4378 4379 4380 4381 4382 4383 4384 4385 4386 4387 4388 4389 4390 4391 4392 4393 4394 4395 4396 4397 4398 4399 4400 4401 4402 4403 4404 4405 4406 4407 4408 4409 4410 4411 4412 4413 4414 4415 4416 4417 4418 4419 4420 4421 4422 4423 4424 4425 4426 4427 4428 4429 4430 4431 4432 4433 4434 4435 4436 4437 4438 4439 4440 4441 4442 4443 4444 4445 4446 4447 4448 4449 4450 4451 4452 4453 4454 4455 4456 4457 4458 4459 4460 4461 4462 4463 4464 4465 4466 4467 4468 4469 4470 4471 4472 4473 4474 4475 4476 4477 4478 4479 4480 4481 4482 4483 4484 4485 4486 4487 4488 4489 4490 4491 4492 4493 4494 4495 4496 4497 4498 4499 4500 4501 4502 4503 4504 4505 4506 4507 4508 4509 4510 4511 4512 4513 4514 4515 4516 4517 4518 4519 4520 4521 4522 4523 4524 4525 4526 4527 4528 4529 4530 4531 4532 4533 4534 4535 4536 4537 4538 4539 4540 4541 4542 4543 4544 4545 4546 4547 4548 4549 4550 4551 4552 4553 4554 4555 4556 4557 4558 4559 4560 4561 4562 4563 4564 4565 4566 4567 4568 4569 4570 4571 4572 4573 4574 4575 4576 4577 4578 4579 4580 4581 4582 4583 4584 4585 4586 4587 4588 4589 4590 4591 4592 4593 4594 4595 4596 4597 4598 4599 4600 4601 4602 4603 4604 4605 4606 4607 4608 4609 4610 4611 4612 4613 4614 4615 4616 4617 4618 4619 4620 4621 4622 4623 4624 4625 4626 4627 4628 4629 4630 4631 4632 4633 4634 4635 4636 4637 4638 4639 4640 4641 4642 4643 4644 4645 4646 4647 4648 4649 4650 4651 4652 4653 4654 4655 4656 4657 4658 4659 4660 4661 4662 4663 4664 4665 4666 4667 4668 4669 4670 4671 4672 4673 4674 4675 4676 4677 4678 4679 4680 4681 4682 4683 4684 4685 4686 4687 4688 4689 4690 4691 4692 4693 4694 4695 4696 4697 4698 4699 4700 4701 4702 4703 4704 4705 4706 4707 4708 4709 4710 4711 4712 4713 4714 4715 4716 4717 4718 4719 4720 4721 4722 4723 4724 4725 4726 4727 4728 4729 4730 4731 4732 4733 4734 4735 4736 4737 4738 4739 4740 4741 4742 4743 4744 4745 4746 4747 4748 4749 4750 4751 4752 4753 4754 4755 4756 4757 4758 4759 4760 4761 4762 4763 4764 4765 4766 4767 4768 4769 4770 4771 4772 4773 4774 4775 4776 4777 4778 4779 4780 4781 4782 4783 4784 4785 4786 4787 4788 4789 4790 4791 4792 4793 4794 4795 4796 4797 4798 4799 4800 4801 4802 4803 4804 4805 4806 4807 4808 4809 4810 4811 4812 4813 4814 4815 4816 4817 4818 4819 4820 4821 4822 4823 4824 4825 4826 4827 4828 4829 4830 4831 4832 4833 4834 4835 4836 4837 4838 4839 4840 4841 4842 4843 4844 4845 4846 4847 4848 4849 4850 4851 4852 4853 4854 4855 4856 4857 4858 4859 4860 4861 4862 4863 4864 4865 4866 4867 4868 4869 4870 4871 4872 4873 4874 4875 4876 4877 4878 4879 4880 4881 4882 4883 4884 4885 4886 4887 4888 4889 4890 4891 4892 4893 4894 4895 4896 4897 4898 4899 4900 4901 4902 4903 4904 4905 4906 4907 4908 4909 4910 4911 4912 4913 4914 4915 4916 4917 4918 4919 4920 4921 4922 4923 4924 4925 4926 4927 4928 4929 4930 4931 4932 4933 4934 4935 4936 4937 4938 4939 4940 4941 4942 4943 4944 4945 4946 4947 4948 4949 4950 4951 4952 4953 4954 4955 4956 4957 4958 4959 4960 4961 4962 4963 4964 4965 4966 4967 4968 4969 4970 4971 4972 4973 4974 4975 4976 4977 4978 4979 4980 4981 4982 4983 4984 4985 4986 4987 4988 4989 4990 4991 4992 4993 4994 4995 4996 4997 4998 4999 5000 5001 5002 5003 5004 5005 5006 5007 5008 5009 5010 5011 5012 5013 5014 5015 5016 5017 5018 5019 5020 5021 5022 5023 5024 5025 5026 5027 5028 5029 5030 5031 5032 5033 5034 5035 5036 5037 5038 5039 5040 5041 5042 5043 5044 5045 5046 5047 5048 5049 5050 5051 5052 5053 5054 5055 5056 5057 5058 5059 5060 5061 5062 5063 5064 5065 5066 5067 5068 5069 5070 5071 5072 5073 5074 5075 5076 5077 5078 5079 5080 5081 5082 5083 5084 5085 5086 5087 5088 5089 5090 5091 5092 5093 5094 5095 5096 5097 5098 5099 5100 5101 5102 5103 5104 5105 5106 5107 5108 5109 5110 5111 5112 5113 5114 5115 5116 5117 5118 5119 5120 5121 5122 5123 5124 5125 5126 5127 5128 5129 5130 5131 5132 5133 5134 5135 5136 5137 5138 5139 5140 5141 5142 5143 5144 5145 5146 5147 5148 5149 5150 5151 5152 5153 5154 5155 5156 5157 5158 5159 5160 5161 5162 5163 5164 5165 5166 5167 5168 5169 5170 5171 5172 5173 5174 5175 5176 5177 5178 5179 5180 5181 5182 5183 5184 5185 5186 5187 5188 5189 5190 5191 5192 5193 5194 5195 5196 5197 5198 5199 5200 5201 5202 5203 5204 5205 5206 5207 5208 5209 5210 5211 5212 5213 5214 5215 5216 5217 5218 5219 5220 5221 5222 5223 5224 5225 5226 5227 5228 5229 5230 5231 5232 5233 5234 5235 5236 5237 5238 5239 5240 5241 5242 5243 5244 5245 5246 5247 5248 5249 5250 5251 5252 5253 5254 5255 5256 5257 5258 5259 5260 5261 5262 5263 5264 5265 5266 5267 5268 5269 5270 5271 5272 5273 5274 5275 5276 5277 5278 5279 5280 5281 5282 5283 5284 5285 5286 5287 5288 5289 5290 5291 5292 5293 5294 5295 5296 5297 5298 5299 5300 5301 5302 5303 5304 5305 5306 5307 5308 5309 5310 5311 5312 5313 5314 5315 5316 5317 5318 5319 5320 5321 5322 5323 5324 5325 5326 5327 5328 5329 5330 5331 5332 5333 5334 5335 5336 5337 5338 5339 5340 5341 5342 5343 5344 5345 5346 5347 5348 5349 5350 5351 5352 5353 5354 5355 5356 5357 5358 5359 5360 5361 5362 5363 5364 5365 5366 5367 5368 5369 5370 5371 5372 5373 5374 5375 5376 5377 5378 5379 5380 5381 5382 5383 5384 5385 5386 5387 5388 5389 5390 5391 5392 5393 5394 5395 5396 5397 5398 5399 5400 5401 5402 5403 5404 5405 5406 5407 5408 5409 5410 5411 5412 5413 5414 5415 5416 5417 5418 5419 5420 5421 5422 5423 5424 5425 5426 5427 5428 5429 5430 5431 5432 5433 5434 5435 5436 5437 5438 5439 5440 5441 5442 5443 5444 5445 5446 5447 5448 5449 5450 5451 5452 5453 5454 5455 5456 5457 5458 5459 5460 5461 5462 5463 5464 5465 5466 5467 5468 5469 5470 5471 5472 5473 5474 5475 5476 5477 5478 5479 5480 5481 5482 5483 5484 5485 5486 5487 5488 5489 5490 5491 5492 5493 5494 5495 5496 5497 5498 5499 5500 5501 5502 5503 5504 5505 5506 5507 5508 5509 5510 5511 5512 5513 5514 5515 5516 5517 5518 5519 5520 5521 5522 5523 5524 5525 5526 5527 5528 5529 5530 5531 5532 5533 5534 5535 5536 5537 5538 5539 5540 5541 5542 5543 5544 5545 5546 5547 5548 5549 5550 5551 5552 5553 5554 5555 5556 5557 5558 5559 5560 5561 5562 5563 5564 5565 5566 5567 5568 5569 5570 5571 5572 5573 5574 5575 5576 5577 5578 5579 5580 5581 5582 5583 5584 5585 5586 5587 5588 5589 5590 5591 5592 5593 5594 5595 5596 5597 5598 5599 5600 5601 5602 5603 5604 5605 5606 5607 5608 5609 5610 5611 5612 5613 5614 5615 5616 5617 5618 5619 5620 5621 5622 5623 5624 5625 5626 5627 5628 5629 5630 5631 5632 5633 5634 5635 5636 5637 5638 5639 5640 5641 5642 5643 5644 5645 5646 5647 5648 5649 5650 5651 5652 5653 5654 5655 5656 5657 5658 5659 5660 5661 5662 5663 5664 5665 5666 5667 5668 5669 5670 5671 5672 5673 5674 5675 5676 5677 5678 5679 5680 5681 5682 5683 5684 5685 5686 5687 5688 5689 5690 5691 5692 5693 5694 5695 5696 5697 5698 5699 5700 5701 5702 5703 5704 5705 5706 5707 5708 5709 5710 5711 5712 5713 5714 5715 5716 5717 5718 5719 5720 5721 5722 5723 5724 5725 5726 5727 5728 5729 5730 5731 5732 5733 5734 5735 5736 5737 5738 5739 5740 5741 5742 5743 5744 5745 5746 5747 5748 5749 5750 5751 5752 5753 5754 5755 5756 5757 5758 5759 5760 5761 5762 5763 5764 5765 5766 5767 5768 5769 5770 5771 5772 5773 5774 5775 5776 5777 5778 5779 5780 5781 5782 5783 5784 5785 5786 5787 5788 5789 5790 5791 5792 5793 5794 5795 5796 5797 5798 5799 5800 5801 5802 5803 5804 5805 5806 5807 5808 5809 5810 5811 5812 5813 5814 5815 5816 5817 5818 5819 5820 5821 5822 5823 5824 5825 5826 5827 5828 5829 5830 5831 5832 5833 5834 5835 5836 5837 5838 5839 5840 5841 5842 5843 5844 5845 5846 5847 5848 5849 5850 5851 5852 5853 5854 5855 5856 5857 5858 5859 5860 5861 5862 5863 5864 5865 5866 5867 5868 5869 5870 5871 5872 5873 5874 5875 5876 5877 5878 5879 5880 5881 5882 5883 5884 5885 5886 5887 5888 5889 5890 5891 5892 5893 5894 5895 5896 5897 5898 5899 5900 5901 5902 5903 5904 5905 5906 5907 5908 5909 5910 5911 5912 5913 5914 5915 5916 5917 5918 5919 5920 5921 5922 5923 5924 5925 5926 5927 5928 5929 5930 5931 5932 5933 5934 5935 5936 5937 5938 5939 5940 5941 5942 5943 5944 5945 5946 5947 5948 5949 5950 5951 5952 5953 5954 5955 5956 5957 5958 5959 5960 5961 5962 5963 5964 5965 5966 5967 5968 5969 5970 5971 5972 5973 5974 5975 5976 5977 5978 5979 5980 5981 5982 5983 5984 5985 5986 5987 5988 5989 5990 5991 5992 5993 5994 5995 5996 5997 5998 5999 6000 6001 6002 6003 6004 6005 6006 6007 6008 6009 6010 6011 6012 6013 6014 6015 6016 6017 6018 6019 6020 6021 6022 6023 6024 6025 6026 6027 6028 6029 6030 6031 6032 6033 6034 6035 6036 6037 6038 6039 6040 6041 6042 6043 6044 6045 6046 6047 6048 6049 6050 6051 6052 6053 6054 6055 6056 6057 6058 6059 6060 6061 6062 6063 6064 6065 6066 6067 6068 6069 6070 6071 6072 6073 6074 6075 6076 6077 6078 6079 6080 6081 6082 6083 6084 6085 6086 6087 6088 6089 6090 6091 6092 6093 6094 6095 6096 6097 6098 6099 6100 6101 6102 6103 6104 6105 6106 6107 6108 6109 6110 6111 6112 6113 6114 6115 6116 6117 6118 6119 6120 6121 6122 6123 6124 6125 6126 6127 6128 6129 6130 6131 6132 6133 6134 6135 6136 6137 6138 6139 6140 6141 6142 6143 6144 6145 6146 6147 6148 6149 6150 6151 6152 6153 6154 6155 6156 6157 6158 6159 6160 6161 6162 6163 6164 6165 6166 6167 6168 6169 6170 6171 6172 6173 6174 6175 6176 6177 6178 6179 6180 6181 6182 6183 6184 6185 6186 6187 6188 6189 6190 6191 6192 6193 6194 6195 6196 6197 6198 6199 6200 6201 6202 6203 6204 6205 6206 6207 6208 6209 6210 6211 6212 6213 6214 6215 6216 6217 6218 6219 6220 6221 6222 6223 6224 6225 6226 6227 6228 6229 6230 6231 6232 6233 6234 6235 6236 6237 6238 6239 6240 6241 6242 6243 6244 6245 6246 6247 6248 6249 6250 6251 6252 6253 6254 6255 6256 6257 6258 6259 6260 6261 6262 6263 6264 6265 6266 6267 6268 6269 6270 6271 6272 6273 6274 6275 6276 6277 6278 6279 6280 6281 6282 6283 6284 6285 6286 6287 6288 6289 6290 6291 6292 6293 6294 6295 6296 6297 6298 6299 6300 6301 6302 6303 6304 6305 6306 6307 6308 6309 6310 6311 6312 6313 6314 6315 6316 6317 6318 6319 6320 6321 6322 6323 6324 6325 6326 6327 6328 6329 6330 6331 6332 6333 6334 6335 6336 6337 6338 6339 6340 6341 6342 6343 6344 6345 6346 6347 6348 6349 6350 6351 6352 6353 6354 6355 6356 6357 6358 6359 6360 6361 6362 6363 6364 6365 6366 6367 6368 6369 6370 6371 6372 6373 6374 6375 6376 6377 6378 6379 6380 6381 6382 6383 6384 6385 6386 6387 6388 6389 6390 6391 6392 6393 6394 6395 6396 6397 6398 6399 6400 6401 6402 6403 6404 6405 6406 6407 6408 6409 6410 6411 6412 6413 6414 6415 6416 6417 6418 6419 6420 6421 6422 6423 6424 6425 6426 6427 6428 6429 6430 6431 6432 6433 6434 6435 6436 6437 6438 6439 6440 6441 6442 6443 6444 6445 6446 6447 6448 6449 6450 6451 6452 6453 6454 6455 6456 6457 6458 6459 6460 6461 6462 6463 6464 6465 6466 6467 6468 6469 6470 6471 6472 6473 6474 6475 6476 6477 6478 6479 6480 6481 6482 6483 6484 6485 6486 6487 6488 6489 6490 6491 6492 6493 6494 6495 6496 6497 6498 6499 6500 6501 6502 6503 6504 6505 6506 6507 6508 6509 6510 6511 6512 6513 6514 6515 6516 6517 6518 6519 6520 6521 6522 6523 6524 6525 6526 6527 6528 6529 6530 6531 6532 6533 6534 6535 6536 6537 6538 6539 6540 6541 6542 6543 6544 6545 6546 6547 6548 6549 6550 6551 6552 6553 6554 6555 6556 6557 6558 6559 6560 6561 6562 6563 6564 6565 6566 6567 6568 6569 6570 6571 6572 6573 6574 6575 6576 6577 6578 6579 6580 6581 6582 6583 6584 6585 6586 6587 6588 6589 6590 6591 6592 6593 6594 6595 6596 6597 6598 6599 6600 6601 6602 6603 6604 6605 6606 6607 6608 6609 6610 6611 6612 6613 6614 6615 6616 6617 6618 6619 6620 6621 6622 6623 6624 6625 6626 6627 6628 6629 6630 6631 6632 6633 6634 6635 6636 6637 6638 6639 6640 6641 6642 6643 6644 6645 6646 6647 6648 6649 6650 6651 6652 6653 6654 6655 6656 6657 6658 6659 6660 6661 6662 6663 6664 6665 6666 6667 6668 6669 6670 6671 6672 6673 6674 6675 6676 6677 6678 6679 6680 6681 6682 6683 6684 6685 6686 6687 6688 6689 6690 6691 6692 6693 6694 6695 6696 6697 6698 6699 6700 6701 6702 6703 6704 6705 6706 6707 6708 6709 6710 6711 6712 6713 6714 6715 6716 6717 6718 6719 6720 6721 6722 6723 6724 6725 6726 6727 6728 6729 6730 6731 6732 6733 6734 6735 6736 6737 6738 6739 6740 6741 6742 6743 6744 6745 6746 6747 6748 6749 6750 6751 6752 6753 6754 6755 6756 6757 6758 6759 6760 6761 6762 6763 6764 6765 6766 6767 6768 6769 6770 6771 6772 6773 6774 6775 6776 6777 6778 6779 6780 6781 6782 6783 6784 6785 6786 6787 6788 6789 6790 6791 6792 6793 6794 6795 6796 6797 6798 6799 6800 6801 6802 6803 6804 6805 6806 6807 6808 6809 6810 6811 6812 6813 6814 6815 6816 6817 6818 6819 6820 6821 6822 6823 6824 6825 6826 6827 6828 6829 6830 6831 6832 6833 6834 6835 6836 6837 6838 6839 6840 6841 6842 6843 6844 6845 6846 6847 6848 6849 6850 6851 6852 6853 6854 6855 6856 6857 6858 6859 6860 6861 6862 6863 6864 6865 6866 6867 6868 6869 6870 6871 6872 6873 6874 6875 6876 6877 6878 6879 6880 6881 6882 6883 6884 6885 6886 6887 6888 6889 6890 6891 6892 6893 6894 6895 6896 6897 6898 6899 6900 6901 6902 6903 6904 6905 6906 6907 6908 6909 6910 6911 6912 6913 6914 6915 6916 6917 6918 6919 6920 6921 6922 6923 6924 6925 6926 6927 6928 6929 6930 6931 6932 6933 6934 6935 6936 6937 6938 6939 6940 6941 6942 6943 6944 6945 6946 6947 6948 6949 6950 6951 6952 6953 6954 6955 6956 6957 6958 6959 6960 6961 6962 6963 6964 6965 6966 6967 6968 6969 6970 6971 6972 6973 6974 6975 6976 6977 6978 6979 6980 6981 6982 6983 6984 6985 6986 6987 6988 6989 6990 6991 6992 6993 6994 6995 6996 6997 6998 6999 7000 7001 7002 7003 7004 7005 7006 7007 7008 7009 7010 7011 7012 7013 7014 7015 7016 7017 7018 7019 7020 7021 7022 7023 7024 7025 7026 7027 7028 7029 7030 7031 7032 7033 7034 7035 7036 7037 7038 7039 7040 7041 7042 7043 7044 7045 7046 7047 7048 7049 7050 7051 7052 7053 7054 7055 7056 7057 7058 7059 7060 7061 7062 7063 7064 7065 7066 7067 7068 7069 7070 7071 7072 7073 7074 7075 7076 7077 7078 7079 7080 7081 7082 7083 7084 7085 7086 7087 7088 7089 7090 7091 7092 7093 7094 7095 7096 7097 7098 7099 7100 7101 7102 7103 7104 7105 7106 7107 7108 7109 7110 7111 7112 7113 7114 7115 7116 7117 7118 7119 7120 7121 7122 7123 7124 7125 7126 7127 7128 7129 7130 7131 7132 7133 7134 7135 7136 7137 7138 7139 7140 7141 7142 7143 7144 7145 7146 7147 7148 7149 7150 7151 7152 7153 7154 7155 7156 7157 7158 7159 7160 7161 7162 7163 7164 7165 7166 7167 7168 7169 7170 7171 7172 7173 7174 7175 7176 7177 7178 7179 7180 7181 7182 7183 7184 7185 7186 7187 7188 7189 7190 7191 7192 7193 7194 7195 7196 7197 7198 7199 7200 7201 7202 7203 7204 7205 7206 7207 7208 7209 7210 7211 7212 7213 7214 7215 7216 7217 7218 7219 7220 7221 7222 7223 7224 7225 7226 7227 7228 7229 7230 7231 7232 7233 7234 7235 7236 7237 7238 7239 7240 7241 7242 7243 7244 7245 7246 7247 7248 7249 7250 7251 7252 7253 7254 7255 7256 7257 7258 7259 7260 7261 7262 7263 7264 7265 7266 7267 7268 7269 7270 7271 7272 7273 7274 7275 7276 7277 7278 7279 7280 7281 7282 7283 7284 7285 7286 7287 7288 7289 7290 7291 7292 7293 7294 7295 7296 7297 7298 7299 7300 7301 7302 7303 7304 7305 7306 7307 7308 7309 7310 7311 7312 7313 7314 7315 7316 7317 7318 7319 7320 7321 7322 7323 7324 7325 7326 7327 7328 7329 7330 7331 7332 7333 7334 7335 7336 7337 7338 7339 7340 7341 7342 7343 7344 7345 7346 7347 7348 7349 7350 7351 7352 7353 7354 7355 7356 7357 7358 7359 7360 7361 7362 7363 7364 7365 7366 7367 7368 7369 7370 7371 7372 7373 7374 7375 7376 7377 7378 7379 7380 7381 7382 7383 7384 7385 7386 7387 7388 7389 7390 7391 7392 7393 7394 7395 7396 7397 7398 7399 7400 7401 7402 7403 7404 7405 7406 7407 7408 7409 7410 7411 7412 7413 7414 7415 7416 7417 7418 7419 7420 7421 7422 7423 7424 7425 7426 7427 7428 7429 7430 7431 7432 7433 7434 7435 7436 7437 7438 7439 7440 7441 7442 7443 7444 7445 7446 7447 7448 7449 7450 7451 7452 7453 7454 7455 7456 7457 7458 7459 7460 7461 7462 7463 7464 7465 7466 7467 7468 7469 7470 7471 7472 7473 7474 7475 7476 7477 7478 7479 7480 7481 7482 7483 7484 7485 7486 7487 7488 7489 7490 7491 7492 7493 7494 7495 7496 7497 7498 7499 7500 7501 7502 7503 7504 7505 7506 7507 7508 7509 7510 7511 7512 7513 7514 7515 7516 7517 7518 7519 7520 7521 7522 7523 7524 7525 7526 7527 7528 7529 7530 7531 7532 7533 7534 7535 7536 7537 7538 7539 7540 7541 7542 7543 7544 7545 7546 7547 7548 7549 7550 7551 7552 7553 7554 7555 7556 7557 7558 7559 7560 7561 7562 7563 7564 7565 7566 7567 7568 7569 7570 7571 7572 7573 7574 7575 7576 7577 7578 7579 7580 7581 7582 7583 7584 7585 7586 7587 7588 7589 7590 7591 7592 7593 7594 7595 7596 7597 7598 7599 7600 7601 7602 7603 7604 7605 7606 7607 7608 7609 7610 7611 7612 7613 7614 7615 7616 7617 7618 7619 7620 7621 7622 7623 7624 7625 7626 7627 7628 7629 7630 7631 7632 7633 7634 7635 7636 7637 7638 7639 7640 7641 7642 7643 7644 7645 7646 7647 7648 7649 7650 7651 7652 7653 7654 7655 7656 7657 7658 7659 7660 7661 7662 7663 7664 7665 7666 7667 7668 7669 7670 7671 7672 7673 7674 7675 7676 7677 7678 7679 7680 7681 7682 7683 7684 7685 7686 7687 7688 7689 7690 7691 7692 7693 7694 7695 7696 7697 7698 7699 7700 7701 7702 7703 7704 7705 7706 7707 7708 7709 7710 7711 7712 7713 7714 7715 7716 7717 7718 7719 7720 7721 7722 7723 7724 7725 7726 7727 7728 7729 7730 7731 7732 7733 7734 7735 7736 7737 7738 7739 7740 7741 7742 7743 7744 7745 7746 7747 7748 7749 7750 7751 7752 7753 7754 7755 7756 7757 7758 7759 7760 7761 7762 7763 7764 7765 7766 7767 7768 7769 7770 7771 7772 7773 7774 7775 7776 7777 7778 7779 7780 7781 7782 7783 7784 7785 7786 7787 7788 7789 7790 7791 7792 7793 7794 7795 7796 7797 7798 7799 7800 7801 7802 7803 7804 7805 7806 7807 7808 7809 7810 7811 7812 7813 7814 7815 7816 7817 7818 7819 7820 7821 7822 7823 7824 7825 7826 7827 7828 7829 7830 7831 7832 7833 7834 7835 7836 7837 7838 7839 7840 7841 7842 7843 7844 7845 7846 7847 7848 7849 7850 7851 7852 7853 7854 7855 7856 7857 7858 7859 7860 7861 7862 7863 7864 7865 7866 7867 7868 7869 7870 7871 7872 7873 7874 7875 7876 7877 7878 7879 7880 7881 7882 7883 7884 7885 7886 7887 7888 7889 7890 7891 7892 7893 7894 7895 7896 7897 7898 7899 7900 7901 7902 7903 7904 7905 7906 7907 7908 7909 7910 7911 7912 7913 7914 7915 7916 7917 7918 7919 7920 7921 7922 7923 7924 7925 7926 7927 7928 7929 7930 7931 7932 7933 7934 7935 7936 7937 7938 7939 7940 7941 7942 7943 7944 7945 7946 7947 7948 7949 7950 7951 7952 7953 7954 7955 7956 7957 7958 7959 7960 7961 7962 7963 7964 7965 7966 7967 7968 7969 7970 7971 7972 7973 7974 7975 7976 7977 7978 7979 7980 7981 7982 7983 7984 7985 7986 7987 7988 7989 7990 7991 7992 7993 7994 7995 7996 7997 7998 7999 8000 8001 8002 8003 8004 8005 8006 8007 8008 8009 8010 8011 8012 8013 8014 8015 8016 8017 8018 8019 8020 8021 8022 8023 8024 8025 8026 8027 8028 8029 8030 8031 8032 8033 8034 8035 8036 8037 8038 8039 8040 8041 8042 8043 8044 8045 8046 8047 8048 8049 8050 8051 8052 8053 8054 8055 8056 8057 8058 8059 8060 8061 8062 8063 8064 8065 8066 8067 8068 8069 8070 8071 8072 8073 8074 8075 8076 8077 8078 8079 8080 8081 8082 8083 8084 8085 8086 8087 8088 8089 8090 8091 8092 8093 8094 8095 8096 8097 8098 8099 8100 8101 8102 8103 8104 8105 8106 8107 8108 8109 8110 8111 8112 8113 8114 8115 8116 8117 8118 8119 8120 8121 8122 8123 8124 8125 8126 8127 8128 8129 8130 8131 8132 8133 8134 8135 8136 8137 8138 8139 8140 8141 8142 8143 8144 8145 8146 8147 8148 8149 8150 8151 8152 8153 8154 8155 8156 8157 8158 8159 8160 8161 8162 8163 8164 8165 8166 8167 8168 8169 8170 8171 8172 8173 8174 8175 8176 8177 8178 8179 8180 8181 8182 8183 8184 8185 8186 8187 8188 8189 8190 8191 8192 8193 8194 8195 8196 8197 8198 8199 8200 8201 8202 8203 8204 8205 8206 8207 8208 8209 8210 8211 8212 8213 8214 8215 8216 8217 8218 8219 8220 8221 8222 8223 8224 8225 8226 8227 8228 8229 8230 8231 8232 8233 8234 8235 8236 8237 8238 8239 8240 8241 8242 8243 8244 8245 8246 8247 8248 8249 8250 8251 8252 8253 8254 8255 8256 8257 8258 8259 8260 8261 8262 8263 8264 8265 8266 8267 8268 8269 8270 8271 8272 8273 8274 8275 8276 8277 8278 8279 8280 8281 8282 8283 8284 8285 8286 8287 8288 8289 8290 8291 8292 8293 8294 8295 8296 8297 8298 8299 8300 8301 8302 8303 8304 8305 8306 8307 8308 8309 8310 8311 8312 8313 8314 8315 8316 8317 8318 8319 8320 8321 8322 8323 8324 8325 8326 8327 8328 8329 8330 8331 8332 8333 8334 8335 8336 8337 8338 8339 8340 8341 8342 8343 8344 8345 8346 8347 8348 8349 8350 8351 8352 8353 8354 8355 8356 8357 8358 8359 8360 8361 8362 8363 8364 8365 8366 8367 8368 8369 8370 8371 8372 8373 8374 8375 8376 8377 8378 8379 8380 8381 8382 8383 8384 8385 8386 8387 8388 8389 8390 8391 8392 8393 8394 8395 8396 8397 8398 8399 8400 8401 8402 8403 8404 8405 8406 8407 8408 8409 8410 8411 8412 8413 8414 8415 8416 8417 8418 8419 8420 8421 8422 8423 8424 8425 8426 8427 8428 8429 8430 8431 8432 8433 8434 8435 8436 8437 8438 8439 8440 8441 8442 8443 8444 8445 8446 8447 8448 8449 8450 8451 8452 8453 8454 8455 8456 8457 8458 8459 8460 8461 8462 8463 8464 8465 8466 8467 8468 8469 8470 8471 8472 8473 8474 8475 8476 8477 8478 8479 8480 8481 8482 8483 8484 8485 8486 8487 8488 8489 8490 8491 8492 8493 8494 8495 8496 8497 8498 8499 8500 8501 8502 8503 8504 8505 8506 8507 8508 8509 8510 8511 8512 8513 8514 8515 8516 8517 8518 8519 8520 8521 8522 8523 8524 8525 8526 8527 8528 8529 8530 8531 8532 8533 8534 8535 8536 8537 8538 8539 8540 8541 8542 8543 8544 8545 8546 8547 8548 8549 8550 8551 8552 8553 8554 8555 8556 8557 8558 8559 8560 8561 8562 8563 8564 8565 8566 8567 8568 8569 8570 8571 8572 8573 8574 8575 8576 8577 8578 8579 8580 8581 8582 8583 8584 8585 8586 8587 8588 8589 8590 8591 8592 8593 8594 8595 8596 8597 8598 8599 8600 8601 8602 8603 8604 8605 8606 8607 8608 8609 8610 8611 8612 8613 8614 8615 8616 8617 8618 8619 8620 8621 8622 8623 8624 8625 8626 8627 8628 8629 8630 8631 8632 8633 8634 8635 8636 8637 8638 8639 8640 8641 8642 8643 8644 8645 8646 8647 8648 8649 8650 8651 8652 8653 8654 8655 8656 8657 8658 8659 8660 8661 8662 8663 8664 8665 8666 8667 8668 8669 8670 8671 8672 8673 8674 8675 8676 8677 8678 8679 8680 8681 8682 8683 8684 8685 8686 8687 8688 8689 8690 8691 8692 8693 8694 8695 8696 8697 8698 8699 8700 8701 8702 8703 8704 8705 8706 8707 8708 8709 8710 8711 8712 8713 8714 8715 8716 8717 8718 8719 8720 8721 8722 8723 8724 8725 8726 8727 8728 8729 8730 8731 8732 8733 8734 8735 8736 8737 8738 8739 8740 8741 8742 8743 8744 8745 8746 8747 8748 8749 8750 8751 8752 8753 8754 8755 8756 8757 8758 8759 8760 8761 8762 8763 8764 8765 8766 8767 8768 8769 8770 8771 8772 8773 8774 8775 8776 8777 8778 8779 8780 8781 8782 8783 8784 8785 8786 8787 8788 8789 8790 8791 8792 8793 8794 8795 8796 8797 8798 8799 8800 8801 8802 8803 8804 8805 8806 8807 8808 8809 8810 8811 8812 8813 8814 8815 8816 8817 8818 8819 8820 8821 8822 8823 8824 8825 8826 8827 8828 8829 8830 8831 8832 8833 8834 8835 8836 8837 8838 8839 8840 8841 8842 8843 8844 8845 8846 8847 8848 8849 8850 8851 8852 8853 8854 8855 8856 8857 8858 8859 8860 8861 8862 8863 8864 8865 8866 8867 8868 8869 8870 8871 8872 8873 8874 8875 8876 8877 8878 8879 8880 8881 8882 8883 8884 8885 8886 8887 8888 8889 8890 8891 8892 8893 8894 8895 8896 8897 8898 8899 8900 8901 8902 8903 8904 8905 8906 8907 8908 8909 8910 8911 8912 8913 8914 8915 8916 8917 8918 8919 8920 8921 8922 8923 8924 8925 8926 8927 8928 8929 8930 8931 8932 8933 8934 8935 8936 8937 8938 8939 8940 8941 8942 8943 8944 8945 8946 8947 8948 8949 8950 8951 8952 8953 8954 8955 8956 8957 8958 8959 8960 8961 8962 8963 8964 8965 8966 8967 8968 8969 8970 8971 8972 8973 8974 8975 8976 8977 8978 8979 8980 8981 8982 8983 8984 8985 8986 8987 8988 8989 8990 8991 8992 8993 8994 8995 8996 8997 8998 8999 9000 9001 9002 9003 9004 9005 9006 9007 9008 9009 9010 9011 9012 9013 9014 9015 9016 9017 9018 9019 9020 9021 9022 9023 9024 9025 9026 9027 9028 9029 9030 9031 9032 9033 9034 9035 9036 9037 9038 9039 9040 9041 9042 9043 9044 9045 9046 9047 9048 9049 9050 9051 9052 9053 9054 9055 9056 9057 9058 9059 9060 9061 9062 9063 9064 9065 9066 9067 9068 9069 9070 9071 9072 9073 9074 9075 9076 9077 9078 9079 9080 9081 9082 9083 9084 9085 9086 9087 9088 9089 9090 9091 9092 9093 9094 9095 9096 9097 9098 9099 9100 9101 9102 9103 9104 9105 9106 9107 9108 9109 9110 9111 9112 9113 9114 9115 9116 9117 9118 9119 9120 9121 9122 9123 9124 9125 9126 9127 9128 9129 9130 9131 9132 9133 9134 9135 9136 9137 9138 9139 9140 9141 9142 9143 9144 9145 9146 9147 9148 9149 9150 9151 9152 9153 9154 9155 9156 9157 9158 9159 9160 9161 9162 9163 9164 9165 9166 9167 9168 9169 9170 9171 9172 9173 9174 9175 9176 9177 9178 9179 9180 9181 9182 9183 9184 9185 9186 9187 9188 9189 9190 9191 9192 9193 9194 9195 9196 9197 9198 9199 9200 9201 9202 9203 9204 9205 9206 9207 9208 9209 9210 9211 9212 9213 9214 9215 9216 9217 9218 9219 9220 9221 9222 9223 9224 9225 9226 9227 9228 9229 9230 9231 9232 9233 9234 9235 9236 9237 9238 9239 9240 9241 9242 9243 9244 9245 9246 9247 9248 9249 9250 9251 9252 9253 9254 9255 9256 9257 9258 9259 9260 9261 9262 9263 9264 9265 9266 9267 9268 9269 9270 9271 9272 9273 9274 9275 9276 9277 9278 9279 9280 9281 9282 9283 9284 9285 9286 9287 9288 9289 9290 9291 9292 9293 9294 9295 9296 9297 9298 9299 9300 9301 9302 9303 9304 9305 9306 9307 9308 9309 9310 9311 9312 9313 9314 9315 9316 9317 9318 9319 9320 9321 9322 9323 9324 9325 9326 9327 9328 9329 9330 9331 9332 9333 9334 9335 9336 9337 9338 9339 9340 9341 9342 9343 9344 9345 9346 9347 9348 9349 9350 9351 9352 9353 9354 9355 9356 9357 9358 9359 9360 9361 9362 9363 9364 9365 9366 9367 9368 9369 9370 9371 9372 9373 9374 9375 9376 9377 9378 9379 9380 9381 9382 9383 9384 9385 9386 9387 9388 9389 9390 9391 9392 9393 9394 9395 9396 9397 9398 9399 9400 9401 9402 9403 9404 9405 9406 9407 9408 9409 9410 9411 9412 9413 9414 9415 9416 9417 9418 9419 9420 9421 9422 9423 9424 9425 9426 9427 9428 9429 9430 9431 9432 9433 9434 9435 9436 9437 9438 9439 9440 9441 9442 9443 9444 9445 9446 9447 9448 9449 9450 9451 9452 9453 9454 9455 9456 9457 9458 9459 9460 9461 9462 9463 9464 9465 9466 9467 9468 9469 9470 9471 9472 9473 9474 9475 9476 9477 9478 9479 9480 9481 9482 9483 9484 9485 9486 9487 9488 9489 9490 9491 9492 9493 9494 9495 9496 9497 9498 9499 9500 9501 9502 9503 9504 9505 9506 9507 9508 9509 9510 9511 9512 9513 9514 9515 9516 9517 9518 9519 9520 9521 9522 9523 9524 9525 9526 9527 9528 9529 9530 9531 9532 9533 9534 9535 9536 9537 9538 9539 9540 9541 9542 9543 9544 9545 9546 9547 9548 9549 9550 9551 9552 9553 9554 9555 9556 9557 9558 9559 9560 9561 9562 9563 9564 9565 9566 9567 9568 9569 9570 9571 9572 9573 9574 9575 9576 9577 9578 9579 9580 9581 9582 9583 9584 9585 9586 9587 9588 9589 9590 9591 9592 9593 9594 9595 9596 9597 9598 9599 9600 9601 9602 9603 9604 9605 9606 9607 9608 9609 9610 9611 9612 9613 9614 9615 9616 9617 9618 9619 9620 9621 9622 9623 9624 9625 9626 9627 9628 9629 9630 9631 9632 9633 9634 9635 9636 9637 9638 9639 9640 9641 9642 9643 9644 9645 9646 9647 9648 9649 9650 9651 9652 9653 9654 9655 9656 9657 9658 9659 9660 9661 9662 9663 9664 9665 9666 9667 9668 9669 9670 9671 9672 9673 9674 9675 9676 9677 9678 9679 9680 9681 9682 9683 9684 9685 9686 9687 9688 9689 9690 9691 9692 9693 9694 9695 9696 9697 9698 9699 9700 9701 9702 9703 9704 9705 9706 9707 9708 9709 9710 9711 9712 9713 9714 9715 9716 9717 9718 9719 9720 9721 9722 9723 9724 9725 9726 9727 9728 9729 9730 9731 9732 9733 9734 9735 9736 9737 9738 9739 9740 9741 9742 9743 9744 9745 9746 9747 9748 9749 9750 9751 9752 9753 9754 9755 9756 9757 9758 9759 9760 9761 9762 9763 9764 9765 9766 9767 9768 9769 9770 9771 9772 9773 9774 9775 9776 9777 9778 9779 9780 9781 9782 9783 9784 9785 9786 9787 9788 9789 9790 9791 9792 9793 9794 9795 9796 9797 9798 9799 9800 9801 9802 9803 9804 9805 9806 9807 9808 9809 9810 9811 9812 9813 9814 9815 9816 9817 9818 9819 9820 9821 9822 9823 9824 9825 9826 9827 9828 9829 9830 9831 9832 9833 9834 9835 9836 9837 9838 9839 9840 9841 9842 9843 9844 9845 9846 9847 9848 9849 9850 9851 9852 9853 9854 9855 9856 9857 9858 9859 9860 9861 9862 9863 9864 9865 9866 9867 9868 9869 9870 9871 9872 9873 9874 9875 9876 9877 9878 9879 9880 9881 9882 9883 9884 9885 9886 9887 9888 9889 9890 9891 9892 9893 9894 9895 9896 9897 9898 9899 9900 9901 9902 9903 9904 9905 9906 9907 9908 9909 9910 9911 9912 9913 9914 9915 9916 9917 9918 9919 9920 9921 9922 9923 9924 9925 9926 9927 9928 9929 9930 9931 9932 9933 9934 9935 9936 9937 9938 9939 9940 9941 9942 9943 9944 9945 9946 9947 9948 9949 9950 9951 9952 9953 9954 9955 9956 9957 9958 9959 9960 9961 9962 9963 9964 9965 9966 9967 9968 9969 9970 9971 9972 9973 9974 9975 9976 9977 9978 9979 9980 9981 9982 9983 9984 9985 9986 9987 9988 9989 9990 9991 9992 9993 9994 9995 9996 9997 9998 9999 10000 10001 10002 10003 10004 10005 10006 10007 10008 10009 10010 10011 10012 10013 10014 10015 10016 10017 10018 10019 10020 10021 10022 10023 10024 10025 10026 10027 10028 10029 10030 10031 10032 10033 10034 10035 10036 10037 10038 10039 10040 10041 10042 10043 10044 10045 10046 10047 10048 10049 10050 10051 10052 10053 10054 10055 10056 10057 10058 10059 10060 10061 10062 10063 10064 10065 10066 10067 10068 10069 10070 10071 10072 10073 10074 10075 10076 10077 10078 10079 10080 10081 10082 10083 10084 10085 10086 10087 10088 10089 10090 10091 10092 10093 10094 10095 10096 10097 10098 10099 10100 10101 10102 10103 10104 10105 10106 10107 10108 10109 10110 10111 10112 10113 10114 10115 10116 10117 10118 10119 10120 10121 10122 10123 10124 10125 10126 10127 10128 10129 10130 10131 10132 10133 10134 10135 10136 10137 10138 10139 10140 10141 10142 10143 10144 10145 10146 10147 10148 10149 10150 10151 10152 10153 10154 10155 10156 10157 10158 10159 10160 10161 10162 10163 10164 10165 10166 10167 10168 10169 10170 10171 10172 10173 10174 10175 10176 10177 10178 10179 10180 10181 10182 10183 10184 10185 10186 10187 10188 10189 10190 10191 10192 10193 10194 10195 10196 10197 10198 10199 10200 10201 10202 10203 10204 10205 10206 10207 10208 10209 10210 10211 10212 10213 10214 10215 10216 10217 10218 10219 10220 10221 10222 10223 10224 10225 10226 10227 10228 10229 10230 10231 10232 10233 10234 10235 10236 10237 10238 10239 10240 10241 10242 10243 10244 10245 10246 10247 10248 10249 10250 10251 10252 10253 10254 10255 10256 10257 10258 10259 10260 10261 10262 10263 10264 10265 10266 10267 10268 10269 10270 10271 10272 10273 10274 10275 10276 10277 10278 10279 10280 10281 10282 10283 10284 10285 10286 10287 10288 10289 10290 10291 10292 10293 10294 10295 10296 10297 10298 10299 10300 10301 10302 10303 10304 10305 10306 10307 10308 10309 10310 10311 10312 10313 10314 10315 10316 10317 10318 10319 10320 10321 10322 10323 10324 10325 10326 10327 10328 10329 10330 10331 10332 10333 10334 10335 10336 10337 10338 10339 10340 10341 10342 10343 10344 10345 10346 10347 10348 10349 10350 10351 10352 10353 10354 10355 10356 10357 10358 10359 10360 10361 10362 10363 10364 10365 10366 10367 10368 10369 10370 10371 10372 10373 10374 10375 10376 10377 10378 10379 10380 10381 10382 10383 10384 10385 10386 10387 10388 10389 10390 10391 10392 10393 10394 10395 10396 10397 10398 10399 10400 10401 10402 10403 10404 10405 10406 10407 10408 10409 10410 10411 10412 10413 10414 10415 10416 10417 10418 10419 10420 10421 10422 10423 10424 10425 10426 10427 10428 10429 10430 10431 10432 10433 10434 10435 10436 10437 10438 10439 10440 10441 10442 10443 10444 10445 10446 10447 10448 10449 10450 10451 10452 10453 10454 10455 10456 10457 10458 10459 10460 10461 10462 10463 10464 10465 10466 10467 10468 10469 10470 10471 10472 10473 10474 10475 10476 10477 10478 10479 10480 10481 10482 10483 10484 10485 10486 10487 10488 10489 10490 10491 10492 10493 10494 10495 10496 10497 10498 10499 10500 10501 10502 10503 10504 10505 10506 10507 10508 10509 10510 10511 10512 10513 10514 10515 10516 10517 10518 10519 10520 10521 10522 10523 10524 10525 10526 10527 10528 10529 10530 10531 10532 10533 10534 10535 10536 10537 10538 10539 10540 10541 10542 10543 10544 10545 10546 10547 10548 10549 10550 10551 10552 10553 10554 10555 10556 10557 10558 10559 10560 10561 10562 10563 10564 10565 10566 10567 10568 10569 10570 10571 10572 10573 10574 10575 10576 10577 10578 10579 10580 10581 10582 10583 10584 10585 10586 10587 10588 10589 10590 10591 10592 10593 10594 10595 10596 10597 10598 10599 10600 10601 10602 10603 10604 10605 10606 10607 10608 10609 10610 10611 10612 10613 10614 10615 10616 10617 10618 10619 10620 10621 10622 10623 10624 10625 10626 10627 10628 10629 10630 10631 10632 10633 10634 10635 10636 10637 10638 10639 10640 10641 10642 10643 10644 10645 10646 10647 10648 10649 10650 10651 10652 10653 10654 10655 10656 10657 10658 10659 10660 10661 10662 10663 10664 10665 10666 10667 10668 10669 10670 10671 10672 10673 10674 10675 10676 10677 10678 10679 10680 10681 10682 10683 10684 10685 10686 10687 10688 10689 10690 10691 10692 10693 10694 10695 10696 10697 10698 10699 10700 10701 10702 10703 10704 10705 10706 10707 10708 10709 10710 10711 10712 10713 10714 10715 10716 10717 10718 10719 10720 10721 10722 10723 10724 10725 10726 10727 10728 10729 10730 10731 10732 10733 10734 10735 10736 10737 10738 10739 10740 10741 10742 10743 10744 10745 10746 10747 10748 10749 10750 10751 10752 10753 10754 10755 10756 10757 10758 10759 10760 10761 10762 10763 10764 10765 10766 10767 10768 10769 10770 10771 10772 10773 10774 10775 10776 10777 10778 10779 10780 10781 10782 10783 10784 10785 10786 10787 10788 10789 10790 10791 10792 10793 10794 10795 10796 10797 10798 10799 10800 10801 10802 10803 10804 10805 10806 10807 10808 10809 10810 10811 10812 10813 10814 10815 10816 10817 10818 10819 10820 10821 10822 10823 10824 10825 10826 10827 10828 10829 10830 10831 10832 10833 10834 10835 10836 10837 10838 10839 10840 10841 10842 10843 10844 10845 10846 10847 10848 10849 10850 10851 10852 10853 10854 10855 10856 10857 10858 10859 10860 10861 10862 10863 10864 10865 10866 10867 10868 10869 10870 10871 10872 10873 10874 10875 10876 10877 10878 10879 10880 10881 10882 10883 10884 10885 10886 10887 10888 10889 10890 10891 10892 10893 10894 10895 10896 10897 10898 10899 10900 10901 10902 10903 10904 10905 10906 10907 10908 10909 10910 10911 10912 10913 10914 10915 10916 10917 10918 10919 10920 10921 10922 10923 10924 10925 10926 10927 10928 10929 10930 10931 10932 10933 10934 10935 10936 10937 10938 10939 10940 10941 10942 10943 10944 10945 10946 10947 10948 10949 10950 10951 10952 10953 10954 10955 10956 10957 10958 10959 10960 10961 10962 10963 10964 10965 10966 10967 10968 10969 10970 10971 10972 10973 10974 10975 10976 10977 10978 10979 10980 10981 10982 10983 10984 10985 10986 10987 10988 10989 10990 10991 10992 10993 10994 10995 10996 10997 10998 10999 11000 11001 11002 11003 11004 11005 11006 11007 11008 11009 11010 11011 11012 11013 11014 11015 11016 11017 11018 11019 11020 11021 11022 11023 11024 11025 11026 11027 11028 11029 11030 11031 11032 11033 11034 11035 11036 11037 11038 11039 11040 11041 11042 11043 11044 11045 11046 11047 11048 11049 11050 11051 11052 11053 11054 11055 11056 11057 11058 11059 11060 11061 11062 11063 11064 11065 11066 11067 11068 11069 11070 11071 11072 11073 11074 11075 11076 11077 11078 11079 11080 11081 11082 11083 11084 11085 11086 11087 11088 11089 11090 11091 11092 11093 11094 11095 11096 11097 11098 11099 11100 11101 11102 11103 11104 11105 11106 11107 11108 11109 11110 11111 11112 11113 11114 11115 11116 11117 11118 11119 11120 11121 11122 11123 11124 11125 11126 11127 11128 11129 11130 11131 11132 11133 11134 11135 11136 11137 11138 11139 11140 11141 11142 11143 11144 11145 11146 11147 11148 11149 11150 11151 11152 11153 11154 11155 11156 11157 11158 11159 11160 11161 11162 11163 11164 11165 11166 11167 11168 11169 11170 11171 11172 11173 11174 11175 11176 11177 11178 11179 11180 11181 11182 11183 11184 11185 11186 11187 11188 11189 11190 11191 11192 11193 11194 11195 11196 11197 11198 11199 11200 11201 11202 11203 11204 11205 11206 11207 11208 11209 11210 11211 11212 11213 11214 11215 11216 11217 11218 11219 11220 11221 11222 11223 11224 11225 11226 11227 11228 11229 11230 11231 11232 11233 11234 11235 11236 11237 11238 11239 11240 11241 11242 11243 11244 11245 11246 11247 11248 11249 11250 11251 11252 11253 11254 11255 11256 11257 11258 11259 11260 11261 11262 11263 11264 11265 11266 11267 11268 11269 11270 11271 11272 11273 11274 11275 11276 11277 11278 11279 11280 11281 11282 11283 11284 11285 11286 11287 11288 11289 11290 11291 11292 11293 11294 11295 11296 11297 11298 11299 11300 11301 11302 11303 11304 11305 11306 11307 11308 11309 11310 11311 11312 11313 11314 11315 11316 11317 11318 11319 11320 11321 11322 11323 11324 11325 11326 11327 11328 11329 11330 11331 11332 11333 11334 11335 11336 11337 11338 11339 11340 11341 11342 11343 11344 11345 11346 11347 11348 11349 11350 11351 11352 11353 11354 11355 11356 11357 11358 11359 11360 11361 11362 11363 11364 11365 11366 11367 11368 11369 11370 11371 11372 11373 11374 11375 11376 11377 11378 11379 11380 11381 11382 11383 11384 11385 11386 11387 11388 11389 11390 11391 11392 11393 11394 11395 11396 11397 11398 11399 11400 11401 11402 11403 11404 11405 11406 11407 11408 11409 11410 11411 11412 11413 11414 11415 11416 11417 11418 11419 11420 11421 11422 11423 11424 11425 11426 11427 11428 11429 11430 11431 11432 11433 11434 11435 11436 11437 11438 11439 11440 11441 11442 11443 11444 11445 11446 11447 11448 11449 11450 11451 11452 11453 11454 11455 11456 11457 11458 11459 11460 11461 11462 11463 11464 11465 11466 11467 11468 11469 11470 11471 11472 11473 11474 11475 11476 11477 11478 11479 11480 11481 11482 11483 11484 11485 11486 11487 11488 11489 11490 11491 11492 11493 11494 11495 11496 11497 11498 11499 11500 11501 11502 11503 11504 11505 11506 11507 11508 11509 11510 11511 11512 11513 11514 11515 11516 11517 11518 11519 11520 11521 11522 11523 11524 11525 11526 11527 11528 11529 11530 11531 11532 11533 11534 11535 11536 11537 11538 11539 11540 11541 11542 11543 11544 11545 11546 11547 11548 11549 11550 11551 11552 11553 11554 11555 11556 11557 11558 11559 11560 11561 11562 11563 11564 11565 11566 11567 11568 11569 11570 11571 11572 11573 11574 11575 11576 11577 11578 11579 11580 11581 11582 11583 11584 11585 11586 11587 11588 11589 11590 11591 11592 11593 11594 11595 11596 11597 11598 11599 11600 11601 11602 11603 11604 11605 11606 11607 11608 11609 11610 11611 11612 11613 11614 11615 11616 11617 11618 11619 11620 11621 11622 11623 11624 11625 11626 11627 11628 11629 11630 11631 11632 11633 11634 11635 11636 11637 11638 11639 11640 11641 11642 11643 11644 11645 11646 11647 11648 11649 11650 11651 11652 11653 11654 11655 11656 11657 11658 11659 11660 11661 11662 11663 11664 11665 11666 11667 11668 11669 11670 11671 11672 11673 11674 11675 11676 11677 11678 11679 11680 11681 11682 11683 11684 11685 11686 11687 11688 11689 11690 11691 11692 11693 11694 11695 11696 11697 11698 11699 11700 11701 11702 11703 11704 11705 11706 11707 11708 11709 11710 11711 11712 11713 11714 11715 11716 11717 11718 11719 11720 11721 11722 11723 11724 11725 11726 11727 11728 11729 11730 11731 11732 11733 11734 11735 11736 11737 11738 11739 11740 11741 11742 11743 11744 11745 11746 11747 11748 11749 11750 11751 11752 11753 11754 11755 11756 11757 11758 11759 11760 11761 11762 11763 11764 11765 11766 11767 11768 11769 11770 11771 11772 11773 11774 11775 11776 11777 11778 11779 11780 11781 11782 11783 11784 11785 11786 11787 11788 11789 11790 11791 11792 11793 11794 11795 11796 11797 11798 11799 11800 11801 11802 11803 11804 11805 11806 11807 11808 11809 11810 11811 11812 11813 11814 11815 11816 11817 11818 11819 11820 11821 11822 11823 11824 11825 11826 11827 11828 11829 11830 11831 11832 11833 11834 11835 11836 11837 11838 11839 11840 11841 11842 11843 11844 11845 11846 11847 11848 11849 11850 11851 11852 11853 11854 11855 11856 11857 11858 11859 11860 11861 11862 11863 11864 11865 11866 11867 11868 11869 11870 11871 11872 11873 11874 11875 11876 11877 11878 11879 11880 11881 11882 11883 11884 11885 11886 11887 11888 11889 11890 11891 11892 11893 11894 11895 11896 11897 11898 11899 11900 11901 11902 11903 11904 11905 11906 11907 11908 11909 11910 11911 11912 11913 11914 11915 11916 11917 11918 11919 11920 11921 11922 11923 11924 11925 11926 11927 11928 11929 11930 11931 11932 11933 11934 11935 11936 11937 11938 11939 11940 11941 11942 11943 11944 11945 11946 11947 11948 11949 11950 11951 11952 11953 11954 11955 11956 11957 11958 11959 11960 11961 11962 11963 11964 11965 11966 11967 11968 11969 11970 11971 11972 11973 11974 11975 11976 11977 11978 11979 11980 11981 11982 11983 11984 11985 11986 11987 11988 11989 11990 11991 11992 11993 11994 11995 11996 11997 11998 11999 12000 12001 12002 12003 12004 12005 12006 12007 12008 12009 12010 12011 12012 12013 12014 12015 12016 12017 12018 12019 12020 12021 12022 12023 12024 12025 12026 12027 12028 12029 12030 12031 12032 12033 12034 12035 12036 12037 12038 12039 12040 12041 12042 12043 12044 12045 12046 12047 12048 12049 12050 12051 12052 12053 12054 12055 12056 12057 12058 12059 12060 12061 12062 12063 12064 12065 12066 12067 12068 12069 12070 12071 12072 12073 12074 12075 12076 12077 12078 12079 12080 12081 12082 12083 12084 12085 12086 12087 12088 12089 12090 12091 12092 12093 12094 12095 12096 12097 12098 12099 12100 12101 12102 12103 12104 12105 12106 12107 12108 12109 12110 12111 12112 12113 12114 12115 12116 12117 12118 12119 12120 12121 12122 12123 12124 12125 12126 12127 12128 12129 12130 12131 12132 12133 12134 12135 12136 12137 12138 12139 12140 12141 12142 12143 12144 12145 12146 12147 12148 12149 12150 12151 12152 12153 12154 12155 12156 12157 12158 12159 12160 12161 12162 12163 12164 12165 12166 12167 12168 12169 12170 12171 12172 12173 12174 12175 12176 12177 12178 12179 12180 12181 12182 12183 12184 12185 12186 12187 12188 12189 12190 12191 12192 12193 12194 12195 12196 12197 12198 12199 12200 12201 12202 12203 12204 12205 12206 12207 12208 12209 12210 12211 12212 12213 12214 12215 12216 12217 12218 12219 12220 12221 12222 12223 12224 12225 12226 12227 12228 12229 12230 12231 12232 12233 12234 12235 12236 12237 12238 12239 12240 12241 12242 12243 12244 12245 12246 12247 12248 12249 12250 12251 12252 12253 12254 12255 12256 12257 12258 12259 12260 12261 12262 12263 12264 12265 12266 12267 12268 12269 12270 12271 12272 12273 12274 12275 12276 12277 12278 12279 12280 12281 12282 12283 12284 12285 12286 12287 12288 12289 12290 12291 12292 12293 12294 12295 12296 12297 12298 12299 12300 12301 12302 12303 12304 12305 12306 12307 12308 12309 12310 12311 12312 12313 12314 12315 12316 12317 12318 12319 12320 12321 12322 12323 12324 12325 12326 12327 12328 12329 12330 12331 12332 12333 12334 12335 12336 12337 12338 12339 12340 12341 12342 12343 12344 12345 12346 12347 12348 12349 12350 12351 12352 12353 12354 12355 12356 12357 12358 12359 12360 12361 12362 12363 12364 12365 12366 12367 12368 12369 12370 12371 12372 12373 12374 12375 12376 12377 12378 12379 12380 12381 12382 12383 12384 12385 12386 12387 12388 12389 12390 12391 12392 12393 12394 12395 12396 12397 12398 12399 12400 12401 12402 12403 12404 12405 12406 12407 12408 12409 12410 12411 12412 12413 12414 12415 12416 12417 12418 12419 12420 12421 12422 12423 12424 12425 12426 12427 12428 12429 12430 12431 12432 12433 12434 12435 12436 12437 12438 12439 12440 12441 12442 12443 12444 12445 12446 12447 12448 12449 12450 12451 12452 12453 12454 12455 12456 12457 12458 12459 12460 12461 12462 12463 12464 12465 12466 12467 12468 12469 12470 12471 12472 12473 12474 12475 12476 12477 12478 12479 12480 12481 12482 12483 12484 12485 12486 12487 12488 12489 12490 12491 12492 12493 12494 12495 12496 12497 12498 12499 12500 12501 12502 12503 12504 12505 12506 12507 12508 12509 12510 12511 12512 12513 12514 12515 12516 12517 12518 12519 12520 12521 12522 12523 12524 12525 12526 12527 12528 12529 12530 12531 12532 12533 12534 12535 12536 12537 12538 12539 12540 12541 12542 12543 12544 12545 12546 12547 12548 12549 12550 12551 12552 12553 12554 12555 12556 12557 12558 12559 12560 12561 12562 12563 12564 12565 12566 12567 12568 12569 12570 12571 12572 12573 12574 12575 12576 12577 12578 12579 12580 12581 12582 12583 12584 12585 12586 12587 12588 12589 12590 12591 12592 12593 12594 12595 12596 12597 12598 12599 12600 12601 12602 12603 12604 12605 12606 12607 12608 12609 12610 12611 12612 12613 12614 12615 12616 12617 12618 12619 12620 12621 12622 12623 12624 12625 12626 12627 12628 12629 12630 12631 12632 12633 12634 12635 12636 12637 12638 12639 12640 12641 12642 12643 12644 12645 12646 12647 12648 12649 12650 12651 12652 12653 12654 12655 12656 12657 12658 12659 12660 12661 12662 12663 12664 12665 12666 12667 12668 12669 12670 12671 12672 12673 12674 12675 12676 12677 12678 12679 12680 12681 12682 12683 12684 12685 12686 12687 12688 12689 12690 12691 12692 12693 12694 12695 12696 12697 12698 12699 12700 12701 12702 12703 12704 12705 12706 12707 12708 12709 12710 12711 12712 12713 12714 12715 12716 12717 12718 12719 12720 12721 12722 12723 12724 12725 12726 12727 12728 12729 12730 12731 12732 12733 12734 12735 12736 12737 12738 12739 12740 12741 12742 12743 12744 12745 12746 12747 12748 12749 12750 12751 12752 12753 12754 12755 12756 12757 12758 12759 12760 12761 12762 12763 12764 12765 12766 12767 12768 12769 12770 12771 12772 12773 12774 12775 12776 12777 12778 12779 12780 12781 12782 12783 12784 12785 12786 12787 12788 12789 12790 12791 12792 12793 12794 12795 12796 12797 12798 12799 12800 12801 12802 12803 12804 12805 12806 12807 12808 12809 12810 12811 12812 12813 12814 12815 12816 12817 12818 12819 12820 12821 12822 12823 12824 12825 12826 12827 12828 12829 12830 12831 12832 12833 12834 12835 12836 12837 12838 12839 12840 12841 12842 12843 12844 12845 12846 12847 12848 12849 12850 12851 12852 12853 12854 12855 12856 12857 12858 12859 12860 12861 12862 12863 12864 12865 12866 12867 12868 12869 12870 12871 12872 12873 12874 12875 12876 12877 12878 12879 12880 12881 12882 12883 12884 12885 12886 12887 12888 12889 12890 12891 12892 12893 12894 12895 12896 12897 12898 12899 12900 12901 12902 12903 12904 12905 12906 12907 12908 12909 12910 12911 12912 12913 12914 12915 12916 12917 12918 12919 12920 12921 12922 12923 12924 12925 12926 12927 12928 12929 12930 12931 12932 12933 12934 12935 12936 12937 12938 12939 12940 12941 12942 12943 12944 12945 12946 12947 12948 12949 12950 12951 12952 12953 12954 12955 12956 12957 12958 12959 12960 12961 12962 12963 12964 12965 12966 12967 12968 12969 12970 12971 12972 12973 12974 12975 12976 12977 12978 12979 12980 12981 12982 12983 12984 12985 12986 12987 12988 12989 12990 12991 12992 12993 12994 12995 12996 12997 12998 12999 13000 13001 13002 13003 13004 13005 13006 13007 13008 13009 13010 13011 13012 13013 13014 13015 13016 13017 13018 13019 13020 13021 13022 13023 13024 13025 13026 13027 13028 13029 13030 13031 13032 13033 13034 13035 13036 13037 13038 13039 13040 13041 13042 13043 13044 13045 13046 13047 13048 13049 13050 13051 13052 13053 13054 13055 13056 13057 13058 13059 13060 13061 13062 13063 13064 13065 13066 13067 13068 13069 13070 13071 13072 13073 13074 13075 13076 13077 13078 13079 13080 13081 13082 13083 13084 13085 13086 13087 13088 13089 13090 13091 13092 13093 13094 13095 13096 13097 13098 13099 13100 13101 13102 13103 13104 13105 13106 13107 13108 13109 13110 13111 13112 13113 13114 13115 13116 13117 13118 13119 13120 13121 13122 13123 13124 13125 13126 13127 13128 13129 13130 13131 13132 13133 13134 13135 13136 13137 13138 13139 13140 13141 13142 13143 13144 13145 13146 13147 13148 13149 13150 13151 13152 13153 13154 13155 13156 13157 13158 13159 13160 13161 13162 13163 13164 13165 13166 13167 13168 13169 13170 13171 13172 13173 13174 13175 13176 13177 13178 13179 13180 13181 13182 13183 13184 13185 13186 13187 13188 13189 13190 13191 13192 13193 13194 13195 13196 13197 13198 13199 13200 13201 13202 13203 13204 13205 13206 13207 13208 13209 13210 13211 13212 13213 13214 13215 13216 13217 13218 13219 13220 13221 13222 13223 13224 13225 13226 13227 13228 13229 13230 13231 13232 13233 13234 13235 13236 13237 13238 13239 13240 13241 13242 13243 13244 13245 13246 13247 13248 13249 13250 13251 13252 13253 13254 13255 13256 13257 13258 13259 13260 13261 13262 13263 13264 13265 13266 13267 13268 13269 13270 13271 13272 13273 13274 13275 13276 13277 13278 13279 13280 13281 13282 13283 13284 13285 13286 13287 13288 13289 13290 13291 13292 13293 13294 13295 13296 13297 13298 13299 13300 13301 13302 13303 13304 13305 13306 13307 13308 13309 13310 13311 13312 13313 13314 13315 13316 13317 13318 13319 13320 13321 13322 13323 13324 13325 13326 13327 13328 13329 13330 13331 13332 13333 13334 13335 13336 13337 13338 13339 13340 13341 13342 13343 13344 13345 13346 13347 13348 13349 13350 13351 13352 13353 13354 13355 13356 13357 13358 13359 13360 13361 13362 13363 13364 13365 13366 13367 13368 13369 13370 13371 13372 13373 13374 13375 13376 13377 13378 13379 13380 13381 13382 13383 13384 13385 13386 13387 13388 13389 13390 13391 13392 13393 13394 13395 13396 13397 13398 13399 13400 13401 13402 13403 13404 13405 13406 13407 13408 13409 13410 13411 13412 13413 13414 13415 13416 13417 13418 13419 13420 13421 13422 13423 13424 13425 13426 13427 13428 13429 13430 13431 13432 13433 13434 13435 13436 13437 13438 13439 13440 13441 13442 13443 13444 13445 13446 13447 13448 13449 13450 13451 13452 13453 13454 13455 13456 13457 13458 13459 13460 13461 13462 13463 13464 13465 13466 13467 13468 13469 13470 13471 13472 13473 13474 13475 13476 13477 13478 13479 13480 13481 13482 13483 13484 13485 13486 13487 13488 13489 13490 13491 13492 13493 13494 13495 13496 13497 13498 13499 13500 13501 13502 13503 13504 13505 13506 13507 13508 13509 13510 13511 13512 13513 13514 13515 13516 13517 13518 13519 13520 13521 13522 13523 13524 13525 13526 13527 13528 13529 13530 13531 13532 13533 13534 13535 13536 13537 13538 13539 13540 13541 13542 13543 13544 13545 13546 13547 13548 13549 13550 13551 13552 13553 13554 13555 13556 13557 13558 13559 13560 13561 13562 13563 13564 13565 13566 13567 13568 13569 13570 13571 13572 13573 13574 13575 13576 13577 13578 13579 13580 13581 13582 13583 13584 13585 13586 13587 13588 13589 13590 13591 13592 13593 13594 13595 13596 13597 13598 13599 13600 13601 13602 13603 13604 13605 13606 13607 13608 13609 13610 13611 13612 13613 13614 13615 13616 13617 13618 13619 13620 13621 13622 13623 13624 13625 13626 13627 13628 13629 13630 13631 13632 13633 13634 13635 13636 13637 13638 13639 13640 13641 13642 13643 13644 13645 13646 13647 13648 13649 13650 13651 13652 13653 13654 13655 13656 13657 13658 13659 13660 13661 13662 13663 13664 13665 13666 13667 13668 13669 13670 13671 13672 13673 13674 13675 13676 13677 13678 13679 13680 13681 13682 13683 13684 13685 13686 13687 13688 13689 13690 13691 13692 13693 13694 13695 13696 13697 13698 13699 13700 13701 13702 13703 13704 13705 13706 13707 13708 13709 13710 13711 13712 13713 13714 13715 13716 13717 13718 13719 13720 13721 13722 13723 13724 13725 13726 13727 13728 13729 13730 13731 13732 13733 13734 13735 13736 13737 13738 13739 13740 13741 13742 13743 13744 13745 13746 13747 13748 13749 13750 13751 13752 13753 13754 13755 13756 13757 13758 13759 13760 13761 13762 13763 13764 13765 13766 13767 13768 13769 13770 13771 13772 13773 13774 13775 13776 13777 13778 13779 13780 13781 13782 13783 13784 13785 13786 13787 13788 13789 13790 13791 13792 13793 13794 13795 13796 13797 13798 13799 13800 13801 13802 13803 13804 13805 13806 13807 13808 13809 13810 13811 13812 13813 13814 13815 13816 13817 13818 13819 13820 13821 13822 13823 13824 13825 13826 13827 13828 13829 13830 13831 13832 13833 13834 13835 13836 13837 13838 13839 13840 13841 13842 13843 13844 13845 13846 13847 13848 13849 13850 13851 13852 13853 13854 13855 13856 13857 13858 13859 13860 13861 13862 13863 13864 13865 13866 13867 13868 13869 13870 13871 13872 13873 13874 13875 13876 13877 13878 13879 13880 13881 13882 13883 13884 13885 13886 13887 13888 13889 13890 13891 13892 13893 13894 13895 13896 13897 13898 13899 13900 13901 13902 13903 13904 13905 13906 13907 13908 13909 13910 13911 13912 13913 13914 13915 13916 13917 13918 13919 13920 13921 13922 13923 13924 13925 13926 13927 13928 13929 13930 13931 13932 13933 13934 13935 13936 13937 13938 13939 13940 13941 13942 13943 13944 13945 13946 13947 13948 13949 13950 13951 13952 13953 13954 13955 13956 13957 13958 13959 13960 13961 13962 13963 13964 13965 13966 13967 13968 13969 13970 13971 13972 13973 13974 13975 13976 13977 13978 13979 13980 13981 13982 13983 13984 13985 13986 13987 13988 13989 13990 13991 13992 13993 13994 13995 13996 13997 13998 13999 14000 14001 14002 14003 14004 14005 14006 14007 14008 14009 14010 14011 14012 14013 14014 14015 14016 14017 14018 14019 14020 14021 14022 14023 14024 14025 14026 14027 14028 14029 14030 14031 14032 14033 14034 14035 14036 14037 14038 14039 14040 14041 14042 14043 14044 14045 14046 14047 14048 14049 14050 14051 14052 14053 14054 14055 14056 14057 14058 14059 14060 14061 14062 14063 14064 14065 14066 14067 14068 14069 14070 14071 14072 14073 14074 14075 14076 14077 14078 14079 14080 14081 14082 14083 14084 14085 14086 14087 14088 14089 14090 14091 14092 14093 14094 14095 14096 14097 14098 14099 14100 14101 14102 14103 14104 14105 14106 14107 14108 14109 14110 14111 14112 14113 14114 14115 14116 14117 14118 14119 14120 14121 14122 14123 14124 14125 14126 14127 14128 14129 14130 14131 14132 14133 14134 14135 14136 14137 14138 14139 14140 14141 14142 14143 14144 14145 14146 14147 14148 14149 14150 14151 14152 14153 14154 14155 14156 14157 14158 14159 14160 14161 14162 14163 14164 14165 14166 14167 14168 14169 14170 14171 14172 14173 14174 14175 14176 14177 14178 14179 14180 14181 14182 14183 14184 14185 14186 14187 14188 14189 14190 14191 14192 14193 14194 14195 14196 14197 14198 14199 14200 14201 14202 14203 14204 14205 14206 14207 14208 14209 14210 14211 14212 14213 14214 14215 14216 14217 14218 14219 14220 14221 14222 14223 14224 14225 14226 14227 14228 14229 14230 14231 14232 14233 14234 14235 14236 14237 14238 14239 14240 14241 14242 14243 14244 14245 14246 14247 14248 14249 14250 14251 14252 14253 14254 14255 14256 14257 14258 14259 14260 14261 14262 14263 14264 14265 14266 14267 14268 14269 14270 14271 14272 14273 14274 14275 14276 14277 14278 14279 14280 14281 14282 14283 14284 14285 14286 14287 14288 14289 14290 14291 14292 14293 14294 14295 14296 14297 14298 14299 14300 14301 14302 14303 14304 14305 14306 14307 14308 14309 14310 14311 14312 14313 14314 14315 14316 14317 14318 14319 14320 14321 14322 14323 14324 14325 14326 14327 14328 14329 14330 14331 14332 14333 14334 14335 14336 14337 14338 14339 14340 14341 14342 14343 14344 14345 14346 14347 14348 14349 14350 14351 14352 14353 14354 14355 14356 14357 14358 14359 14360 14361 14362 14363 14364 14365 14366 14367 14368 14369 14370 14371 14372 14373 14374 14375 14376 14377 14378 14379 14380 14381 14382 14383 14384 14385 14386 14387 14388 14389 14390 14391 14392 14393 14394 14395 14396 14397 14398 14399 14400 14401 14402 14403 14404 14405 14406 14407 14408 14409 14410 14411 14412 14413 14414 14415 14416 14417 14418 14419 14420 14421 14422 14423 14424 14425 14426 14427 14428 14429 14430 14431 14432 14433 14434 14435 14436 14437 14438 14439 14440 14441 14442 14443 14444 14445 14446 14447 14448 14449 14450 14451 14452 14453 14454 14455 14456 14457 14458 14459 14460 14461 14462 14463 14464 14465 14466 14467 14468 14469 14470 14471 14472 14473 14474 14475 14476 14477 14478 14479 14480 14481 14482 14483 14484 14485 14486 14487 14488 14489 14490 14491 14492 14493 14494 14495 14496 14497 14498 14499 14500 14501 14502 14503 14504 14505 14506 14507 14508 14509 14510 14511 14512 14513 14514 14515 14516 14517 14518 14519 14520 14521 14522 14523 14524 14525 14526 14527 14528 14529 14530 14531 14532 14533 14534 14535 14536 14537 14538 14539 14540 14541 14542 14543 14544 14545 14546 14547 14548 14549 14550 14551 14552 14553 14554 14555 14556 14557 14558 14559 14560 14561 14562 14563 14564 14565 14566 14567 14568 14569 14570 14571 14572 14573 14574 14575 14576 14577 14578 14579 14580 14581 14582 14583 14584 14585 14586 14587 14588 14589 14590 14591 14592 14593 14594 14595 14596 14597 14598 14599 14600 14601 14602 14603 14604 14605 14606 14607 14608 14609 14610 14611 14612 14613 14614 14615 14616 14617 14618 14619 14620 14621 14622 14623 14624 14625 14626 14627 14628 14629 14630 14631 14632 14633 14634 14635 14636 14637 14638 14639 14640 14641 14642 14643 14644 14645 14646 14647 14648 14649 14650 14651 14652 14653 14654 14655 14656 14657 14658 14659 14660 14661 14662 14663 14664 14665 14666 14667 14668 14669 14670 14671 14672 14673 14674 14675 14676 14677 14678 14679 14680 14681 14682 14683 14684 14685 14686 14687 14688 14689 14690 14691 14692 14693 14694 14695 14696 14697 14698 14699 14700 14701 14702 14703 14704 14705 14706 14707 14708 14709 14710 14711 14712 14713 14714 14715 14716 14717 14718 14719 14720 14721 14722 14723 14724 14725 14726 14727 14728 14729 14730 14731 14732 14733 14734 14735 14736 14737 14738 14739 14740 14741 14742 14743 14744 14745 14746 14747 14748 14749 14750 14751 14752 14753 14754 14755 14756 14757 14758 14759 14760 14761 14762 14763 14764 14765 14766 14767 14768 14769 14770 14771 14772 14773 14774 14775 14776 14777 14778 14779 14780 14781 14782 14783 14784 14785 14786 14787 14788 14789 14790 14791 14792 14793 14794 14795 14796 14797 14798 14799 14800 14801 14802 14803 14804 14805 14806 14807 14808 14809 14810 14811 14812 14813 14814 14815 14816 14817 14818 14819 14820 14821 14822 14823 14824 14825 14826 14827 14828 14829 14830 14831 14832 14833 14834 14835 14836 14837 14838 14839 14840 14841 14842 14843 14844 14845 14846 14847 14848 14849 14850 14851 14852 14853 14854 14855 14856 14857 14858 14859 14860 14861 14862 14863 14864 14865 14866 14867 14868 14869 14870 14871 14872 14873 14874 14875 14876 14877 14878 14879 14880 14881 14882 14883 14884 14885 14886 14887 14888 14889 14890 14891 14892 14893 14894 14895 14896 14897 14898 14899 14900 14901 14902 14903 14904 14905 14906 14907 14908 14909 14910 14911 14912 14913 14914 14915 // SPDX-License-Identifier: GPL-2.0 /* * Performance events core code: * * Copyright (C) 2008 Thomas Gleixner <tglx@linutronix.de> * Copyright (C) 2008-2011 Red Hat, Inc., Ingo Molnar * Copyright (C) 2008-2011 Red Hat, Inc., Peter Zijlstra * Copyright © 2009 Paul Mackerras, IBM Corp. <paulus@au1.ibm.com> */ #include <linux/fs.h> #include <linux/mm.h> #include <linux/cpu.h> #include <linux/smp.h> #include <linux/idr.h> #include <linux/file.h> #include <linux/poll.h> #include <linux/slab.h> #include <linux/hash.h> #include <linux/tick.h> #include <linux/sysfs.h> #include <linux/dcache.h> #include <linux/percpu.h> #include <linux/ptrace.h> #include <linux/reboot.h> #include <linux/vmstat.h> #include <linux/device.h> #include <linux/export.h> #include <linux/vmalloc.h> #include <linux/hardirq.h> #include <linux/hugetlb.h> #include <linux/rculist.h> #include <linux/uaccess.h> #include <linux/syscalls.h> #include <linux/anon_inodes.h> #include <linux/kernel_stat.h> #include <linux/cgroup.h> #include <linux/perf_event.h> #include <linux/trace_events.h> #include <linux/hw_breakpoint.h> #include <linux/mm_types.h> #include <linux/module.h> #include <linux/mman.h> #include <linux/compat.h> #include <linux/bpf.h> #include <linux/filter.h> #include <linux/namei.h> #include <linux/parser.h> #include <linux/sched/clock.h> #include <linux/sched/mm.h> #include <linux/proc_ns.h> #include <linux/mount.h> #include <linux/min_heap.h> #include <linux/highmem.h> #include <linux/pgtable.h> #include <linux/buildid.h> #include <linux/task_work.h> #include <linux/percpu-rwsem.h> #include "internal.h" #include <asm/irq_regs.h> typedef int (*remote_function_f)(void *); struct remote_function_call { struct task_struct *p; remote_function_f func; void *info; int ret; }; static void remote_function(void *data) { struct remote_function_call *tfc = data; struct task_struct *p = tfc->p; if (p) { /* -EAGAIN */ if (task_cpu(p) != smp_processor_id()) return; /* * Now that we're on right CPU with IRQs disabled, we can test * if we hit the right task without races. */ tfc->ret = -ESRCH; /* No such (running) process */ if (p != current) return; } tfc->ret = tfc->func(tfc->info); } /** * task_function_call - call a function on the cpu on which a task runs * @p: the task to evaluate * @func: the function to be called * @info: the function call argument * * Calls the function @func when the task is currently running. This might * be on the current CPU, which just calls the function directly. This will * retry due to any failures in smp_call_function_single(), such as if the * task_cpu() goes offline concurrently. * * returns @func return value or -ESRCH or -ENXIO when the process isn't running */ static int task_function_call(struct task_struct *p, remote_function_f func, void *info) { struct remote_function_call data = { .p = p, .func = func, .info = info, .ret = -EAGAIN, }; int ret; for (;;) { ret = smp_call_function_single(task_cpu(p), remote_function, &data, 1); if (!ret) ret = data.ret; if (ret != -EAGAIN) break; cond_resched(); } return ret; } /** * cpu_function_call - call a function on the cpu * @cpu: target cpu to queue this function * @func: the function to be called * @info: the function call argument * * Calls the function @func on the remote cpu. * * returns: @func return value or -ENXIO when the cpu is offline */ static int cpu_function_call(int cpu, remote_function_f func, void *info) { struct remote_function_call data = { .p = NULL, .func = func, .info = info, .ret = -ENXIO, /* No such CPU */ }; smp_call_function_single(cpu, remote_function, &data, 1); return data.ret; } enum event_type_t { EVENT_FLEXIBLE = 0x01, EVENT_PINNED = 0x02, EVENT_TIME = 0x04, EVENT_FROZEN = 0x08, /* see ctx_resched() for details */ EVENT_CPU = 0x10, EVENT_CGROUP = 0x20, /* compound helpers */ EVENT_ALL = EVENT_FLEXIBLE | EVENT_PINNED, EVENT_TIME_FROZEN = EVENT_TIME | EVENT_FROZEN, }; static inline void __perf_ctx_lock(struct perf_event_context *ctx) { raw_spin_lock(&ctx->lock); WARN_ON_ONCE(ctx->is_active & EVENT_FROZEN); } static void perf_ctx_lock(struct perf_cpu_context *cpuctx, struct perf_event_context *ctx) { __perf_ctx_lock(&cpuctx->ctx); if (ctx) __perf_ctx_lock(ctx); } static inline void __perf_ctx_unlock(struct perf_event_context *ctx) { /* * If ctx_sched_in() didn't again set any ALL flags, clean up * after ctx_sched_out() by clearing is_active. */ if (ctx->is_active & EVENT_FROZEN) { if (!(ctx->is_active & EVENT_ALL)) ctx->is_active = 0; else ctx->is_active &= ~EVENT_FROZEN; } raw_spin_unlock(&ctx->lock); } static void perf_ctx_unlock(struct perf_cpu_context *cpuctx, struct perf_event_context *ctx) { if (ctx) __perf_ctx_unlock(ctx); __perf_ctx_unlock(&cpuctx->ctx); } typedef struct { struct perf_cpu_context *cpuctx; struct perf_event_context *ctx; } class_perf_ctx_lock_t; static inline void class_perf_ctx_lock_destructor(class_perf_ctx_lock_t *_T) { perf_ctx_unlock(_T->cpuctx, _T->ctx); } static inline class_perf_ctx_lock_t class_perf_ctx_lock_constructor(struct perf_cpu_context *cpuctx, struct perf_event_context *ctx) { perf_ctx_lock(cpuctx, ctx); return (class_perf_ctx_lock_t){ cpuctx, ctx }; } #define TASK_TOMBSTONE ((void *)-1L) static bool is_kernel_event(struct perf_event *event) { return READ_ONCE(event->owner) == TASK_TOMBSTONE; } static DEFINE_PER_CPU(struct perf_cpu_context, perf_cpu_context); struct perf_event_context *perf_cpu_task_ctx(void) { lockdep_assert_irqs_disabled(); return this_cpu_ptr(&perf_cpu_context)->task_ctx; } /* * On task ctx scheduling... * * When !ctx->nr_events a task context will not be scheduled. This means * we can disable the scheduler hooks (for performance) without leaving * pending task ctx state. * * This however results in two special cases: * * - removing the last event from a task ctx; this is relatively straight * forward and is done in __perf_remove_from_context. * * - adding the first event to a task ctx; this is tricky because we cannot * rely on ctx->is_active and therefore cannot use event_function_call(). * See perf_install_in_context(). * * If ctx->nr_events, then ctx->is_active and cpuctx->task_ctx are set. */ typedef void (*event_f)(struct perf_event *, struct perf_cpu_context *, struct perf_event_context *, void *); struct event_function_struct { struct perf_event *event; event_f func; void *data; }; static int event_function(void *info) { struct event_function_struct *efs = info; struct perf_event *event = efs->event; struct perf_event_context *ctx = event->ctx; struct perf_cpu_context *cpuctx = this_cpu_ptr(&perf_cpu_context); struct perf_event_context *task_ctx = cpuctx->task_ctx; int ret = 0; lockdep_assert_irqs_disabled(); perf_ctx_lock(cpuctx, task_ctx); /* * Since we do the IPI call without holding ctx->lock things can have * changed, double check we hit the task we set out to hit. */ if (ctx->task) { if (ctx->task != current) { ret = -ESRCH; goto unlock; } /* * We only use event_function_call() on established contexts, * and event_function() is only ever called when active (or * rather, we'll have bailed in task_function_call() or the * above ctx->task != current test), therefore we must have * ctx->is_active here. */ WARN_ON_ONCE(!ctx->is_active); /* * And since we have ctx->is_active, cpuctx->task_ctx must * match. */ WARN_ON_ONCE(task_ctx != ctx); } else { WARN_ON_ONCE(&cpuctx->ctx != ctx); } efs->func(event, cpuctx, ctx, efs->data); unlock: perf_ctx_unlock(cpuctx, task_ctx); return ret; } static void event_function_call(struct perf_event *event, event_f func, void *data) { struct perf_event_context *ctx = event->ctx; struct task_struct *task = READ_ONCE(ctx->task); /* verified in event_function */ struct perf_cpu_context *cpuctx; struct event_function_struct efs = { .event = event, .func = func, .data = data, }; if (!event->parent) { /* * If this is a !child event, we must hold ctx::mutex to * stabilize the event->ctx relation. See * perf_event_ctx_lock(). */ lockdep_assert_held(&ctx->mutex); } if (!task) { cpu_function_call(event->cpu, event_function, &efs); return; } if (task == TASK_TOMBSTONE) return; again: if (!task_function_call(task, event_function, &efs)) return; local_irq_disable(); cpuctx = this_cpu_ptr(&perf_cpu_context); perf_ctx_lock(cpuctx, ctx); /* * Reload the task pointer, it might have been changed by * a concurrent perf_event_context_sched_out(). */ task = ctx->task; if (task == TASK_TOMBSTONE) goto unlock; if (ctx->is_active) { perf_ctx_unlock(cpuctx, ctx); local_irq_enable(); goto again; } func(event, NULL, ctx, data); unlock: perf_ctx_unlock(cpuctx, ctx); local_irq_enable(); } /* * Similar to event_function_call() + event_function(), but hard assumes IRQs * are already disabled and we're on the right CPU. */ static void event_function_local(struct perf_event *event, event_f func, void *data) { struct perf_event_context *ctx = event->ctx; struct perf_cpu_context *cpuctx = this_cpu_ptr(&perf_cpu_context); struct task_struct *task = READ_ONCE(ctx->task); struct perf_event_context *task_ctx = NULL; lockdep_assert_irqs_disabled(); if (task) { if (task == TASK_TOMBSTONE) return; task_ctx = ctx; } perf_ctx_lock(cpuctx, task_ctx); task = ctx->task; if (task == TASK_TOMBSTONE) goto unlock; if (task) { /* * We must be either inactive or active and the right task, * otherwise we're screwed, since we cannot IPI to somewhere * else. */ if (ctx->is_active) { if (WARN_ON_ONCE(task != current)) goto unlock; if (WARN_ON_ONCE(cpuctx->task_ctx != ctx)) goto unlock; } } else { WARN_ON_ONCE(&cpuctx->ctx != ctx); } func(event, cpuctx, ctx, data); unlock: perf_ctx_unlock(cpuctx, task_ctx); } #define PERF_FLAG_ALL (PERF_FLAG_FD_NO_GROUP |\ PERF_FLAG_FD_OUTPUT |\ PERF_FLAG_PID_CGROUP |\ PERF_FLAG_FD_CLOEXEC) /* * branch priv levels that need permission checks */ #define PERF_SAMPLE_BRANCH_PERM_PLM \ (PERF_SAMPLE_BRANCH_KERNEL |\ PERF_SAMPLE_BRANCH_HV) /* * perf_sched_events : >0 events exist */ static void perf_sched_delayed(struct work_struct *work); DEFINE_STATIC_KEY_FALSE(perf_sched_events); static DECLARE_DELAYED_WORK(perf_sched_work, perf_sched_delayed); static DEFINE_MUTEX(perf_sched_mutex); static atomic_t perf_sched_count; static DEFINE_PER_CPU(struct pmu_event_list, pmu_sb_events); static atomic_t nr_mmap_events __read_mostly; static atomic_t nr_comm_events __read_mostly; static atomic_t nr_namespaces_events __read_mostly; static atomic_t nr_task_events __read_mostly; static atomic_t nr_freq_events __read_mostly; static atomic_t nr_switch_events __read_mostly; static atomic_t nr_ksymbol_events __read_mostly; static atomic_t nr_bpf_events __read_mostly; static atomic_t nr_cgroup_events __read_mostly; static atomic_t nr_text_poke_events __read_mostly; static atomic_t nr_build_id_events __read_mostly; static LIST_HEAD(pmus); static DEFINE_MUTEX(pmus_lock); static struct srcu_struct pmus_srcu; static cpumask_var_t perf_online_mask; static cpumask_var_t perf_online_core_mask; static cpumask_var_t perf_online_die_mask; static cpumask_var_t perf_online_cluster_mask; static cpumask_var_t perf_online_pkg_mask; static cpumask_var_t perf_online_sys_mask; static struct kmem_cache *perf_event_cache; /* * perf event paranoia level: * -1 - not paranoid at all * 0 - disallow raw tracepoint access for unpriv * 1 - disallow cpu events for unpriv * 2 - disallow kernel profiling for unpriv */ int sysctl_perf_event_paranoid __read_mostly = 2; /* Minimum for 512 kiB + 1 user control page. 'free' kiB per user. */ static int sysctl_perf_event_mlock __read_mostly = 512 + (PAGE_SIZE / 1024); /* * max perf event sample rate */ #define DEFAULT_MAX_SAMPLE_RATE 100000 #define DEFAULT_SAMPLE_PERIOD_NS (NSEC_PER_SEC / DEFAULT_MAX_SAMPLE_RATE) #define DEFAULT_CPU_TIME_MAX_PERCENT 25 int sysctl_perf_event_sample_rate __read_mostly = DEFAULT_MAX_SAMPLE_RATE; static int sysctl_perf_cpu_time_max_percent __read_mostly = DEFAULT_CPU_TIME_MAX_PERCENT; static int max_samples_per_tick __read_mostly = DIV_ROUND_UP(DEFAULT_MAX_SAMPLE_RATE, HZ); static int perf_sample_period_ns __read_mostly = DEFAULT_SAMPLE_PERIOD_NS; static int perf_sample_allowed_ns __read_mostly = DEFAULT_SAMPLE_PERIOD_NS * DEFAULT_CPU_TIME_MAX_PERCENT / 100; static void update_perf_cpu_limits(void) { u64 tmp = perf_sample_period_ns; tmp *= sysctl_perf_cpu_time_max_percent; tmp = div_u64(tmp, 100); if (!tmp) tmp = 1; WRITE_ONCE(perf_sample_allowed_ns, tmp); } static bool perf_rotate_context(struct perf_cpu_pmu_context *cpc); static int perf_event_max_sample_rate_handler(const struct ctl_table *table, int write, void *buffer, size_t *lenp, loff_t *ppos) { int ret; int perf_cpu = sysctl_perf_cpu_time_max_percent; /* * If throttling is disabled don't allow the write: */ if (write && (perf_cpu == 100 || perf_cpu == 0)) return -EINVAL; ret = proc_dointvec_minmax(table, write, buffer, lenp, ppos); if (ret || !write) return ret; max_samples_per_tick = DIV_ROUND_UP(sysctl_perf_event_sample_rate, HZ); perf_sample_period_ns = NSEC_PER_SEC / sysctl_perf_event_sample_rate; update_perf_cpu_limits(); return 0; } static int perf_cpu_time_max_percent_handler(const struct ctl_table *table, int write, void *buffer, size_t *lenp, loff_t *ppos) { int ret = proc_dointvec_minmax(table, write, buffer, lenp, ppos); if (ret || !write) return ret; if (sysctl_perf_cpu_time_max_percent == 100 || sysctl_perf_cpu_time_max_percent == 0) { printk(KERN_WARNING "perf: Dynamic interrupt throttling disabled, can hang your system!\n"); WRITE_ONCE(perf_sample_allowed_ns, 0); } else { update_perf_cpu_limits(); } return 0; } static const struct ctl_table events_core_sysctl_table[] = { /* * User-space relies on this file as a feature check for * perf_events being enabled. It's an ABI, do not remove! */ { .procname = "perf_event_paranoid", .data = &sysctl_perf_event_paranoid, .maxlen = sizeof(sysctl_perf_event_paranoid), .mode = 0644, .proc_handler = proc_dointvec, }, { .procname = "perf_event_mlock_kb", .data = &sysctl_perf_event_mlock, .maxlen = sizeof(sysctl_perf_event_mlock), .mode = 0644, .proc_handler = proc_dointvec, }, { .procname = "perf_event_max_sample_rate", .data = &sysctl_perf_event_sample_rate, .maxlen = sizeof(sysctl_perf_event_sample_rate), .mode = 0644, .proc_handler = perf_event_max_sample_rate_handler, .extra1 = SYSCTL_ONE, }, { .procname = "perf_cpu_time_max_percent", .data = &sysctl_perf_cpu_time_max_percent, .maxlen = sizeof(sysctl_perf_cpu_time_max_percent), .mode = 0644, .proc_handler = perf_cpu_time_max_percent_handler, .extra1 = SYSCTL_ZERO, .extra2 = SYSCTL_ONE_HUNDRED, }, }; static int __init init_events_core_sysctls(void) { register_sysctl_init("kernel", events_core_sysctl_table); return 0; } core_initcall(init_events_core_sysctls); /* * perf samples are done in some very critical code paths (NMIs). * If they take too much CPU time, the system can lock up and not * get any real work done. This will drop the sample rate when * we detect that events are taking too long. */ #define NR_ACCUMULATED_SAMPLES 128 static DEFINE_PER_CPU(u64, running_sample_length); static u64 __report_avg; static u64 __report_allowed; static void perf_duration_warn(struct irq_work *w) { printk_ratelimited(KERN_INFO "perf: interrupt took too long (%lld > %lld), lowering " "kernel.perf_event_max_sample_rate to %d\n", __report_avg, __report_allowed, sysctl_perf_event_sample_rate); } static DEFINE_IRQ_WORK(perf_duration_work, perf_duration_warn); void perf_sample_event_took(u64 sample_len_ns) { u64 max_len = READ_ONCE(perf_sample_allowed_ns); u64 running_len; u64 avg_len; u32 max; if (max_len == 0) return; /* Decay the counter by 1 average sample. */ running_len = __this_cpu_read(running_sample_length); running_len -= running_len/NR_ACCUMULATED_SAMPLES; running_len += sample_len_ns; __this_cpu_write(running_sample_length, running_len); /* * Note: this will be biased artificially low until we have * seen NR_ACCUMULATED_SAMPLES. Doing it this way keeps us * from having to maintain a count. */ avg_len = running_len/NR_ACCUMULATED_SAMPLES; if (avg_len <= max_len) return; __report_avg = avg_len; __report_allowed = max_len; /* * Compute a throttle threshold 25% below the current duration. */ avg_len += avg_len / 4; max = (TICK_NSEC / 100) * sysctl_perf_cpu_time_max_percent; if (avg_len < max) max /= (u32)avg_len; else max = 1; WRITE_ONCE(perf_sample_allowed_ns, avg_len); WRITE_ONCE(max_samples_per_tick, max); sysctl_perf_event_sample_rate = max * HZ; perf_sample_period_ns = NSEC_PER_SEC / sysctl_perf_event_sample_rate; if (!irq_work_queue(&perf_duration_work)) { early_printk("perf: interrupt took too long (%lld > %lld), lowering " "kernel.perf_event_max_sample_rate to %d\n", __report_avg, __report_allowed, sysctl_perf_event_sample_rate); } } static atomic64_t perf_event_id; static void update_context_time(struct perf_event_context *ctx); static u64 perf_event_time(struct perf_event *event); void __weak perf_event_print_debug(void) { } static inline u64 perf_clock(void) { return local_clock(); } static inline u64 perf_event_clock(struct perf_event *event) { return event->clock(); } /* * State based event timekeeping... * * The basic idea is to use event->state to determine which (if any) time * fields to increment with the current delta. This means we only need to * update timestamps when we change state or when they are explicitly requested * (read). * * Event groups make things a little more complicated, but not terribly so. The * rules for a group are that if the group leader is OFF the entire group is * OFF, irrespective of what the group member states are. This results in * __perf_effective_state(). * * A further ramification is that when a group leader flips between OFF and * !OFF, we need to update all group member times. * * * NOTE: perf_event_time() is based on the (cgroup) context time, and thus we * need to make sure the relevant context time is updated before we try and * update our timestamps. */ static __always_inline enum perf_event_state __perf_effective_state(struct perf_event *event) { struct perf_event *leader = event->group_leader; if (leader->state <= PERF_EVENT_STATE_OFF) return leader->state; return event->state; } static __always_inline void __perf_update_times(struct perf_event *event, u64 now, u64 *enabled, u64 *running) { enum perf_event_state state = __perf_effective_state(event); u64 delta = now - event->tstamp; *enabled = event->total_time_enabled; if (state >= PERF_EVENT_STATE_INACTIVE) *enabled += delta; *running = event->total_time_running; if (state >= PERF_EVENT_STATE_ACTIVE) *running += delta; } static void perf_event_update_time(struct perf_event *event) { u64 now = perf_event_time(event); __perf_update_times(event, now, &event->total_time_enabled, &event->total_time_running); event->tstamp = now; } static void perf_event_update_sibling_time(struct perf_event *leader) { struct perf_event *sibling; for_each_sibling_event(sibling, leader) perf_event_update_time(sibling); } static void perf_event_set_state(struct perf_event *event, enum perf_event_state state) { if (event->state == state) return; perf_event_update_time(event); /* * If a group leader gets enabled/disabled all its siblings * are affected too. */ if ((event->state < 0) ^ (state < 0)) perf_event_update_sibling_time(event); WRITE_ONCE(event->state, state); } /* * UP store-release, load-acquire */ #define __store_release(ptr, val) \ do { \ barrier(); \ WRITE_ONCE(*(ptr), (val)); \ } while (0) #define __load_acquire(ptr) \ ({ \ __unqual_scalar_typeof(*(ptr)) ___p = READ_ONCE(*(ptr)); \ barrier(); \ ___p; \ }) #define for_each_epc(_epc, _ctx, _pmu, _cgroup) \ list_for_each_entry(_epc, &((_ctx)->pmu_ctx_list), pmu_ctx_entry) \ if (_cgroup && !_epc->nr_cgroups) \ continue; \ else if (_pmu && _epc->pmu != _pmu) \ continue; \ else static void perf_ctx_disable(struct perf_event_context *ctx, bool cgroup) { struct perf_event_pmu_context *pmu_ctx; for_each_epc(pmu_ctx, ctx, NULL, cgroup) perf_pmu_disable(pmu_ctx->pmu); } static void perf_ctx_enable(struct perf_event_context *ctx, bool cgroup) { struct perf_event_pmu_context *pmu_ctx; for_each_epc(pmu_ctx, ctx, NULL, cgroup) perf_pmu_enable(pmu_ctx->pmu); } static void ctx_sched_out(struct perf_event_context *ctx, struct pmu *pmu, enum event_type_t event_type); static void ctx_sched_in(struct perf_event_context *ctx, struct pmu *pmu, enum event_type_t event_type); #ifdef CONFIG_CGROUP_PERF static inline bool perf_cgroup_match(struct perf_event *event) { struct perf_cpu_context *cpuctx = this_cpu_ptr(&perf_cpu_context); /* @event doesn't care about cgroup */ if (!event->cgrp) return true; /* wants specific cgroup scope but @cpuctx isn't associated with any */ if (!cpuctx->cgrp) return false; /* * Cgroup scoping is recursive. An event enabled for a cgroup is * also enabled for all its descendant cgroups. If @cpuctx's * cgroup is a descendant of @event's (the test covers identity * case), it's a match. */ return cgroup_is_descendant(cpuctx->cgrp->css.cgroup, event->cgrp->css.cgroup); } static inline void perf_detach_cgroup(struct perf_event *event) { css_put(&event->cgrp->css); event->cgrp = NULL; } static inline int is_cgroup_event(struct perf_event *event) { return event->cgrp != NULL; } static inline u64 perf_cgroup_event_time(struct perf_event *event) { struct perf_cgroup_info *t; t = per_cpu_ptr(event->cgrp->info, event->cpu); return t->time; } static inline u64 perf_cgroup_event_time_now(struct perf_event *event, u64 now) { struct perf_cgroup_info *t; t = per_cpu_ptr(event->cgrp->info, event->cpu); if (!__load_acquire(&t->active)) return t->time; now += READ_ONCE(t->timeoffset); return now; } static inline void __update_cgrp_time(struct perf_cgroup_info *info, u64 now, bool adv) { if (adv) info->time += now - info->timestamp; info->timestamp = now; /* * see update_context_time() */ WRITE_ONCE(info->timeoffset, info->time - info->timestamp); } static inline void update_cgrp_time_from_cpuctx(struct perf_cpu_context *cpuctx, bool final) { struct perf_cgroup *cgrp = cpuctx->cgrp; struct cgroup_subsys_state *css; struct perf_cgroup_info *info; if (cgrp) { u64 now = perf_clock(); for (css = &cgrp->css; css; css = css->parent) { cgrp = container_of(css, struct perf_cgroup, css); info = this_cpu_ptr(cgrp->info); __update_cgrp_time(info, now, true); if (final) __store_release(&info->active, 0); } } } static inline void update_cgrp_time_from_event(struct perf_event *event) { struct perf_cgroup_info *info; /* * ensure we access cgroup data only when needed and * when we know the cgroup is pinned (css_get) */ if (!is_cgroup_event(event)) return; info = this_cpu_ptr(event->cgrp->info); /* * Do not update time when cgroup is not active */ if (info->active) __update_cgrp_time(info, perf_clock(), true); } static inline void perf_cgroup_set_timestamp(struct perf_cpu_context *cpuctx) { struct perf_event_context *ctx = &cpuctx->ctx; struct perf_cgroup *cgrp = cpuctx->cgrp; struct perf_cgroup_info *info; struct cgroup_subsys_state *css; /* * ctx->lock held by caller * ensure we do not access cgroup data * unless we have the cgroup pinned (css_get) */ if (!cgrp) return; WARN_ON_ONCE(!ctx->nr_cgroups); for (css = &cgrp->css; css; css = css->parent) { cgrp = container_of(css, struct perf_cgroup, css); info = this_cpu_ptr(cgrp->info); __update_cgrp_time(info, ctx->timestamp, false); __store_release(&info->active, 1); } } /* * reschedule events based on the cgroup constraint of task. */ static void perf_cgroup_switch(struct task_struct *task) { struct perf_cpu_context *cpuctx = this_cpu_ptr(&perf_cpu_context); struct perf_cgroup *cgrp; /* * cpuctx->cgrp is set when the first cgroup event enabled, * and is cleared when the last cgroup event disabled. */ if (READ_ONCE(cpuctx->cgrp) == NULL) return; cgrp = perf_cgroup_from_task(task, NULL); if (READ_ONCE(cpuctx->cgrp) == cgrp) return; guard(perf_ctx_lock)(cpuctx, cpuctx->task_ctx); /* * Re-check, could've raced vs perf_remove_from_context(). */ if (READ_ONCE(cpuctx->cgrp) == NULL) return; WARN_ON_ONCE(cpuctx->ctx.nr_cgroups == 0); perf_ctx_disable(&cpuctx->ctx, true); ctx_sched_out(&cpuctx->ctx, NULL, EVENT_ALL|EVENT_CGROUP); /* * must not be done before ctxswout due * to update_cgrp_time_from_cpuctx() in * ctx_sched_out() */ cpuctx->cgrp = cgrp; /* * set cgrp before ctxsw in to allow * perf_cgroup_set_timestamp() in ctx_sched_in() * to not have to pass task around */ ctx_sched_in(&cpuctx->ctx, NULL, EVENT_ALL|EVENT_CGROUP); perf_ctx_enable(&cpuctx->ctx, true); } static int perf_cgroup_ensure_storage(struct perf_event *event, struct cgroup_subsys_state *css) { struct perf_cpu_context *cpuctx; struct perf_event **storage; int cpu, heap_size, ret = 0; /* * Allow storage to have sufficient space for an iterator for each * possibly nested cgroup plus an iterator for events with no cgroup. */ for (heap_size = 1; css; css = css->parent) heap_size++; for_each_possible_cpu(cpu) { cpuctx = per_cpu_ptr(&perf_cpu_context, cpu); if (heap_size <= cpuctx->heap_size) continue; storage = kmalloc_node(heap_size * sizeof(struct perf_event *), GFP_KERNEL, cpu_to_node(cpu)); if (!storage) { ret = -ENOMEM; break; } raw_spin_lock_irq(&cpuctx->ctx.lock); if (cpuctx->heap_size < heap_size) { swap(cpuctx->heap, storage); if (storage == cpuctx->heap_default) storage = NULL; cpuctx->heap_size = heap_size; } raw_spin_unlock_irq(&cpuctx->ctx.lock); kfree(storage); } return ret; } static inline int perf_cgroup_connect(int fd, struct perf_event *event, struct perf_event_attr *attr, struct perf_event *group_leader) { struct perf_cgroup *cgrp; struct cgroup_subsys_state *css; CLASS(fd, f)(fd); int ret = 0; if (fd_empty(f)) return -EBADF; css = css_tryget_online_from_dir(fd_file(f)->f_path.dentry, &perf_event_cgrp_subsys); if (IS_ERR(css)) return PTR_ERR(css); ret = perf_cgroup_ensure_storage(event, css); if (ret) return ret; cgrp = container_of(css, struct perf_cgroup, css); event->cgrp = cgrp; /* * all events in a group must monitor * the same cgroup because a task belongs * to only one perf cgroup at a time */ if (group_leader && group_leader->cgrp != cgrp) { perf_detach_cgroup(event); ret = -EINVAL; } return ret; } static inline void perf_cgroup_event_enable(struct perf_event *event, struct perf_event_context *ctx) { struct perf_cpu_context *cpuctx; if (!is_cgroup_event(event)) return; event->pmu_ctx->nr_cgroups++; /* * Because cgroup events are always per-cpu events, * @ctx == &cpuctx->ctx. */ cpuctx = container_of(ctx, struct perf_cpu_context, ctx); if (ctx->nr_cgroups++) return; cpuctx->cgrp = perf_cgroup_from_task(current, ctx); } static inline void perf_cgroup_event_disable(struct perf_event *event, struct perf_event_context *ctx) { struct perf_cpu_context *cpuctx; if (!is_cgroup_event(event)) return; event->pmu_ctx->nr_cgroups--; /* * Because cgroup events are always per-cpu events, * @ctx == &cpuctx->ctx. */ cpuctx = container_of(ctx, struct perf_cpu_context, ctx); if (--ctx->nr_cgroups) return; cpuctx->cgrp = NULL; } #else /* !CONFIG_CGROUP_PERF */ static inline bool perf_cgroup_match(struct perf_event *event) { return true; } static inline void perf_detach_cgroup(struct perf_event *event) {} static inline int is_cgroup_event(struct perf_event *event) { return 0; } static inline void update_cgrp_time_from_event(struct perf_event *event) { } static inline void update_cgrp_time_from_cpuctx(struct perf_cpu_context *cpuctx, bool final) { } static inline int perf_cgroup_connect(pid_t pid, struct perf_event *event, struct perf_event_attr *attr, struct perf_event *group_leader) { return -EINVAL; } static inline void perf_cgroup_set_timestamp(struct perf_cpu_context *cpuctx) { } static inline u64 perf_cgroup_event_time(struct perf_event *event) { return 0; } static inline u64 perf_cgroup_event_time_now(struct perf_event *event, u64 now) { return 0; } static inline void perf_cgroup_event_enable(struct perf_event *event, struct perf_event_context *ctx) { } static inline void perf_cgroup_event_disable(struct perf_event *event, struct perf_event_context *ctx) { } static void perf_cgroup_switch(struct task_struct *task) { } #endif /* * set default to be dependent on timer tick just * like original code */ #define PERF_CPU_HRTIMER (1000 / HZ) /* * function must be called with interrupts disabled */ static enum hrtimer_restart perf_mux_hrtimer_handler(struct hrtimer *hr) { struct perf_cpu_pmu_context *cpc; bool rotations; lockdep_assert_irqs_disabled(); cpc = container_of(hr, struct perf_cpu_pmu_context, hrtimer); rotations = perf_rotate_context(cpc); raw_spin_lock(&cpc->hrtimer_lock); if (rotations) hrtimer_forward_now(hr, cpc->hrtimer_interval); else cpc->hrtimer_active = 0; raw_spin_unlock(&cpc->hrtimer_lock); return rotations ? HRTIMER_RESTART : HRTIMER_NORESTART; } static void __perf_mux_hrtimer_init(struct perf_cpu_pmu_context *cpc, int cpu) { struct hrtimer *timer = &cpc->hrtimer; struct pmu *pmu = cpc->epc.pmu; u64 interval; /* * check default is sane, if not set then force to * default interval (1/tick) */ interval = pmu->hrtimer_interval_ms; if (interval < 1) interval = pmu->hrtimer_interval_ms = PERF_CPU_HRTIMER; cpc->hrtimer_interval = ns_to_ktime(NSEC_PER_MSEC * interval); raw_spin_lock_init(&cpc->hrtimer_lock); hrtimer_setup(timer, perf_mux_hrtimer_handler, CLOCK_MONOTONIC, HRTIMER_MODE_ABS_PINNED_HARD); } static int perf_mux_hrtimer_restart(struct perf_cpu_pmu_context *cpc) { struct hrtimer *timer = &cpc->hrtimer; unsigned long flags; raw_spin_lock_irqsave(&cpc->hrtimer_lock, flags); if (!cpc->hrtimer_active) { cpc->hrtimer_active = 1; hrtimer_forward_now(timer, cpc->hrtimer_interval); hrtimer_start_expires(timer, HRTIMER_MODE_ABS_PINNED_HARD); } raw_spin_unlock_irqrestore(&cpc->hrtimer_lock, flags); return 0; } static int perf_mux_hrtimer_restart_ipi(void *arg) { return perf_mux_hrtimer_restart(arg); } static __always_inline struct perf_cpu_pmu_context *this_cpc(struct pmu *pmu) { return *this_cpu_ptr(pmu->cpu_pmu_context); } void perf_pmu_disable(struct pmu *pmu) { int *count = &this_cpc(pmu)->pmu_disable_count; if (!(*count)++) pmu->pmu_disable(pmu); } void perf_pmu_enable(struct pmu *pmu) { int *count = &this_cpc(pmu)->pmu_disable_count; if (!--(*count)) pmu->pmu_enable(pmu); } static void perf_assert_pmu_disabled(struct pmu *pmu) { int *count = &this_cpc(pmu)->pmu_disable_count; WARN_ON_ONCE(*count == 0); } static inline void perf_pmu_read(struct perf_event *event) { if (event->state == PERF_EVENT_STATE_ACTIVE) event->pmu->read(event); } static void get_ctx(struct perf_event_context *ctx) { refcount_inc(&ctx->refcount); } static void free_ctx(struct rcu_head *head) { struct perf_event_context *ctx; ctx = container_of(head, struct perf_event_context, rcu_head); kfree(ctx); } static void put_ctx(struct perf_event_context *ctx) { if (refcount_dec_and_test(&ctx->refcount)) { if (ctx->parent_ctx) put_ctx(ctx->parent_ctx); if (ctx->task && ctx->task != TASK_TOMBSTONE) put_task_struct(ctx->task); call_rcu(&ctx->rcu_head, free_ctx); } else { smp_mb__after_atomic(); /* pairs with wait_var_event() */ if (ctx->task == TASK_TOMBSTONE) wake_up_var(&ctx->refcount); } } /* * Because of perf_event::ctx migration in sys_perf_event_open::move_group and * perf_pmu_migrate_context() we need some magic. * * Those places that change perf_event::ctx will hold both * perf_event_ctx::mutex of the 'old' and 'new' ctx value. * * Lock ordering is by mutex address. There are two other sites where * perf_event_context::mutex nests and those are: * * - perf_event_exit_task_context() [ child , 0 ] * perf_event_exit_event() * put_event() [ parent, 1 ] * * - perf_event_init_context() [ parent, 0 ] * inherit_task_group() * inherit_group() * inherit_event() * perf_event_alloc() * perf_init_event() * perf_try_init_event() [ child , 1 ] * * While it appears there is an obvious deadlock here -- the parent and child * nesting levels are inverted between the two. This is in fact safe because * life-time rules separate them. That is an exiting task cannot fork, and a * spawning task cannot (yet) exit. * * But remember that these are parent<->child context relations, and * migration does not affect children, therefore these two orderings should not * interact. * * The change in perf_event::ctx does not affect children (as claimed above) * because the sys_perf_event_open() case will install a new event and break * the ctx parent<->child relation, and perf_pmu_migrate_context() is only * concerned with cpuctx and that doesn't have children. * * The places that change perf_event::ctx will issue: * * perf_remove_from_context(); * synchronize_rcu(); * perf_install_in_context(); * * to affect the change. The remove_from_context() + synchronize_rcu() should * quiesce the event, after which we can install it in the new location. This * means that only external vectors (perf_fops, prctl) can perturb the event * while in transit. Therefore all such accessors should also acquire * perf_event_context::mutex to serialize against this. * * However; because event->ctx can change while we're waiting to acquire * ctx->mutex we must be careful and use the below perf_event_ctx_lock() * function. * * Lock order: * exec_update_lock * task_struct::perf_event_mutex * perf_event_context::mutex * perf_event::child_mutex; * perf_event_context::lock * mmap_lock * perf_event::mmap_mutex * perf_buffer::aux_mutex * perf_addr_filters_head::lock * * cpu_hotplug_lock * pmus_lock * cpuctx->mutex / perf_event_context::mutex */ static struct perf_event_context * perf_event_ctx_lock_nested(struct perf_event *event, int nesting) { struct perf_event_context *ctx; again: rcu_read_lock(); ctx = READ_ONCE(event->ctx); if (!refcount_inc_not_zero(&ctx->refcount)) { rcu_read_unlock(); goto again; } rcu_read_unlock(); mutex_lock_nested(&ctx->mutex, nesting); if (event->ctx != ctx) { mutex_unlock(&ctx->mutex); put_ctx(ctx); goto again; } return ctx; } static inline struct perf_event_context * perf_event_ctx_lock(struct perf_event *event) { return perf_event_ctx_lock_nested(event, 0); } static void perf_event_ctx_unlock(struct perf_event *event, struct perf_event_context *ctx) { mutex_unlock(&ctx->mutex); put_ctx(ctx); } /* * This must be done under the ctx->lock, such as to serialize against * context_equiv(), therefore we cannot call put_ctx() since that might end up * calling scheduler related locks and ctx->lock nests inside those. */ static __must_check struct perf_event_context * unclone_ctx(struct perf_event_context *ctx) { struct perf_event_context *parent_ctx = ctx->parent_ctx; lockdep_assert_held(&ctx->lock); if (parent_ctx) ctx->parent_ctx = NULL; ctx->generation++; return parent_ctx; } static u32 perf_event_pid_type(struct perf_event *event, struct task_struct *p, enum pid_type type) { u32 nr; /* * only top level events have the pid namespace they were created in */ if (event->parent) event = event->parent; nr = __task_pid_nr_ns(p, type, event->ns); /* avoid -1 if it is idle thread or runs in another ns */ if (!nr && !pid_alive(p)) nr = -1; return nr; } static u32 perf_event_pid(struct perf_event *event, struct task_struct *p) { return perf_event_pid_type(event, p, PIDTYPE_TGID); } static u32 perf_event_tid(struct perf_event *event, struct task_struct *p) { return perf_event_pid_type(event, p, PIDTYPE_PID); } /* * If we inherit events we want to return the parent event id * to userspace. */ static u64 primary_event_id(struct perf_event *event) { u64 id = event->id; if (event->parent) id = event->parent->id; return id; } /* * Get the perf_event_context for a task and lock it. * * This has to cope with the fact that until it is locked, * the context could get moved to another task. */ static struct perf_event_context * perf_lock_task_context(struct task_struct *task, unsigned long *flags) { struct perf_event_context *ctx; retry: /* * One of the few rules of preemptible RCU is that one cannot do * rcu_read_unlock() while holding a scheduler (or nested) lock when * part of the read side critical section was irqs-enabled -- see * rcu_read_unlock_special(). * * Since ctx->lock nests under rq->lock we must ensure the entire read * side critical section has interrupts disabled. */ local_irq_save(*flags); rcu_read_lock(); ctx = rcu_dereference(task->perf_event_ctxp); if (ctx) { /* * If this context is a clone of another, it might * get swapped for another underneath us by * perf_event_task_sched_out, though the * rcu_read_lock() protects us from any context * getting freed. Lock the context and check if it * got swapped before we could get the lock, and retry * if so. If we locked the right context, then it * can't get swapped on us any more. */ raw_spin_lock(&ctx->lock); if (ctx != rcu_dereference(task->perf_event_ctxp)) { raw_spin_unlock(&ctx->lock); rcu_read_unlock(); local_irq_restore(*flags); goto retry; } if (ctx->task == TASK_TOMBSTONE || !refcount_inc_not_zero(&ctx->refcount)) { raw_spin_unlock(&ctx->lock); ctx = NULL; } else { WARN_ON_ONCE(ctx->task != task); } } rcu_read_unlock(); if (!ctx) local_irq_restore(*flags); return ctx; } /* * Get the context for a task and increment its pin_count so it * can't get swapped to another task. This also increments its * reference count so that the context can't get freed. */ static struct perf_event_context * perf_pin_task_context(struct task_struct *task) { struct perf_event_context *ctx; unsigned long flags; ctx = perf_lock_task_context(task, &flags); if (ctx) { ++ctx->pin_count; raw_spin_unlock_irqrestore(&ctx->lock, flags); } return ctx; } static void perf_unpin_context(struct perf_event_context *ctx) { unsigned long flags; raw_spin_lock_irqsave(&ctx->lock, flags); --ctx->pin_count; raw_spin_unlock_irqrestore(&ctx->lock, flags); } /* * Update the record of the current time in a context. */ static void __update_context_time(struct perf_event_context *ctx, bool adv) { u64 now = perf_clock(); lockdep_assert_held(&ctx->lock); if (adv) ctx->time += now - ctx->timestamp; ctx->timestamp = now; /* * The above: time' = time + (now - timestamp), can be re-arranged * into: time` = now + (time - timestamp), which gives a single value * offset to compute future time without locks on. * * See perf_event_time_now(), which can be used from NMI context where * it's (obviously) not possible to acquire ctx->lock in order to read * both the above values in a consistent manner. */ WRITE_ONCE(ctx->timeoffset, ctx->time - ctx->timestamp); } static void update_context_time(struct perf_event_context *ctx) { __update_context_time(ctx, true); } static u64 perf_event_time(struct perf_event *event) { struct perf_event_context *ctx = event->ctx; if (unlikely(!ctx)) return 0; if (is_cgroup_event(event)) return perf_cgroup_event_time(event); return ctx->time; } static u64 perf_event_time_now(struct perf_event *event, u64 now) { struct perf_event_context *ctx = event->ctx; if (unlikely(!ctx)) return 0; if (is_cgroup_event(event)) return perf_cgroup_event_time_now(event, now); if (!(__load_acquire(&ctx->is_active) & EVENT_TIME)) return ctx->time; now += READ_ONCE(ctx->timeoffset); return now; } static enum event_type_t get_event_type(struct perf_event *event) { struct perf_event_context *ctx = event->ctx; enum event_type_t event_type; lockdep_assert_held(&ctx->lock); /* * It's 'group type', really, because if our group leader is * pinned, so are we. */ if (event->group_leader != event) event = event->group_leader; event_type = event->attr.pinned ? EVENT_PINNED : EVENT_FLEXIBLE; if (!ctx->task) event_type |= EVENT_CPU; return event_type; } /* * Helper function to initialize event group nodes. */ static void init_event_group(struct perf_event *event) { RB_CLEAR_NODE(&event->group_node); event->group_index = 0; } /* * Extract pinned or flexible groups from the context * based on event attrs bits. */ static struct perf_event_groups * get_event_groups(struct perf_event *event, struct perf_event_context *ctx) { if (event->attr.pinned) return &ctx->pinned_groups; else return &ctx->flexible_groups; } /* * Helper function to initializes perf_event_group trees. */ static void perf_event_groups_init(struct perf_event_groups *groups) { groups->tree = RB_ROOT; groups->index = 0; } static inline struct cgroup *event_cgroup(const struct perf_event *event) { struct cgroup *cgroup = NULL; #ifdef CONFIG_CGROUP_PERF if (event->cgrp) cgroup = event->cgrp->css.cgroup; #endif return cgroup; } /* * Compare function for event groups; * * Implements complex key that first sorts by CPU and then by virtual index * which provides ordering when rotating groups for the same CPU. */ static __always_inline int perf_event_groups_cmp(const int left_cpu, const struct pmu *left_pmu, const struct cgroup *left_cgroup, const u64 left_group_index, const struct perf_event *right) { if (left_cpu < right->cpu) return -1; if (left_cpu > right->cpu) return 1; if (left_pmu) { if (left_pmu < right->pmu_ctx->pmu) return -1; if (left_pmu > right->pmu_ctx->pmu) return 1; } #ifdef CONFIG_CGROUP_PERF { const struct cgroup *right_cgroup = event_cgroup(right); if (left_cgroup != right_cgroup) { if (!left_cgroup) { /* * Left has no cgroup but right does, no * cgroups come first. */ return -1; } if (!right_cgroup) { /* * Right has no cgroup but left does, no * cgroups come first. */ return 1; } /* Two dissimilar cgroups, order by id. */ if (cgroup_id(left_cgroup) < cgroup_id(right_cgroup)) return -1; return 1; } } #endif if (left_group_index < right->group_index) return -1; if (left_group_index > right->group_index) return 1; return 0; } #define __node_2_pe(node) \ rb_entry((node), struct perf_event, group_node) static inline bool __group_less(struct rb_node *a, const struct rb_node *b) { struct perf_event *e = __node_2_pe(a); return perf_event_groups_cmp(e->cpu, e->pmu_ctx->pmu, event_cgroup(e), e->group_index, __node_2_pe(b)) < 0; } struct __group_key { int cpu; struct pmu *pmu; struct cgroup *cgroup; }; static inline int __group_cmp(const void *key, const struct rb_node *node) { const struct __group_key *a = key; const struct perf_event *b = __node_2_pe(node); /* partial/subtree match: @cpu, @pmu, @cgroup; ignore: @group_index */ return perf_event_groups_cmp(a->cpu, a->pmu, a->cgroup, b->group_index, b); } static inline int __group_cmp_ignore_cgroup(const void *key, const struct rb_node *node) { const struct __group_key *a = key; const struct perf_event *b = __node_2_pe(node); /* partial/subtree match: @cpu, @pmu, ignore: @cgroup, @group_index */ return perf_event_groups_cmp(a->cpu, a->pmu, event_cgroup(b), b->group_index, b); } /* * Insert @event into @groups' tree; using * {@event->cpu, @event->pmu_ctx->pmu, event_cgroup(@event), ++@groups->index} * as key. This places it last inside the {cpu,pmu,cgroup} subtree. */ static void perf_event_groups_insert(struct perf_event_groups *groups, struct perf_event *event) { event->group_index = ++groups->index; rb_add(&event->group_node, &groups->tree, __group_less); } /* * Helper function to insert event into the pinned or flexible groups. */ static void add_event_to_groups(struct perf_event *event, struct perf_event_context *ctx) { struct perf_event_groups *groups; groups = get_event_groups(event, ctx); perf_event_groups_insert(groups, event); } /* * Delete a group from a tree. */ static void perf_event_groups_delete(struct perf_event_groups *groups, struct perf_event *event) { WARN_ON_ONCE(RB_EMPTY_NODE(&event->group_node) || RB_EMPTY_ROOT(&groups->tree)); rb_erase(&event->group_node, &groups->tree); init_event_group(event); } /* * Helper function to delete event from its groups. */ static void del_event_from_groups(struct perf_event *event, struct perf_event_context *ctx) { struct perf_event_groups *groups; groups = get_event_groups(event, ctx); perf_event_groups_delete(groups, event); } /* * Get the leftmost event in the {cpu,pmu,cgroup} subtree. */ static struct perf_event * perf_event_groups_first(struct perf_event_groups *groups, int cpu, struct pmu *pmu, struct cgroup *cgrp) { struct __group_key key = { .cpu = cpu, .pmu = pmu, .cgroup = cgrp, }; struct rb_node *node; node = rb_find_first(&key, &groups->tree, __group_cmp); if (node) return __node_2_pe(node); return NULL; } static struct perf_event * perf_event_groups_next(struct perf_event *event, struct pmu *pmu) { struct __group_key key = { .cpu = event->cpu, .pmu = pmu, .cgroup = event_cgroup(event), }; struct rb_node *next; next = rb_next_match(&key, &event->group_node, __group_cmp); if (next) return __node_2_pe(next); return NULL; } #define perf_event_groups_for_cpu_pmu(event, groups, cpu, pmu) \ for (event = perf_event_groups_first(groups, cpu, pmu, NULL); \ event; event = perf_event_groups_next(event, pmu)) /* * Iterate through the whole groups tree. */ #define perf_event_groups_for_each(event, groups) \ for (event = rb_entry_safe(rb_first(&((groups)->tree)), \ typeof(*event), group_node); event; \ event = rb_entry_safe(rb_next(&event->group_node), \ typeof(*event), group_node)) /* * Does the event attribute request inherit with PERF_SAMPLE_READ */ static inline bool has_inherit_and_sample_read(struct perf_event_attr *attr) { return attr->inherit && (attr->sample_type & PERF_SAMPLE_READ); } /* * Add an event from the lists for its context. * Must be called with ctx->mutex and ctx->lock held. */ static void list_add_event(struct perf_event *event, struct perf_event_context *ctx) { lockdep_assert_held(&ctx->lock); WARN_ON_ONCE(event->attach_state & PERF_ATTACH_CONTEXT); event->attach_state |= PERF_ATTACH_CONTEXT; event->tstamp = perf_event_time(event); /* * If we're a stand alone event or group leader, we go to the context * list, group events are kept attached to the group so that * perf_group_detach can, at all times, locate all siblings. */ if (event->group_leader == event) { event->group_caps = event->event_caps; add_event_to_groups(event, ctx); } list_add_rcu(&event->event_entry, &ctx->event_list); ctx->nr_events++; if (event->hw.flags & PERF_EVENT_FLAG_USER_READ_CNT) ctx->nr_user++; if (event->attr.inherit_stat) ctx->nr_stat++; if (has_inherit_and_sample_read(&event->attr)) local_inc(&ctx->nr_no_switch_fast); if (event->state > PERF_EVENT_STATE_OFF) perf_cgroup_event_enable(event, ctx); ctx->generation++; event->pmu_ctx->nr_events++; } /* * Initialize event state based on the perf_event_attr::disabled. */ static inline void perf_event__state_init(struct perf_event *event) { event->state = event->attr.disabled ? PERF_EVENT_STATE_OFF : PERF_EVENT_STATE_INACTIVE; } static int __perf_event_read_size(u64 read_format, int nr_siblings) { int entry = sizeof(u64); /* value */ int size = 0; int nr = 1; if (read_format & PERF_FORMAT_TOTAL_TIME_ENABLED) size += sizeof(u64); if (read_format & PERF_FORMAT_TOTAL_TIME_RUNNING) size += sizeof(u64); if (read_format & PERF_FORMAT_ID) entry += sizeof(u64); if (read_format & PERF_FORMAT_LOST) entry += sizeof(u64); if (read_format & PERF_FORMAT_GROUP) { nr += nr_siblings; size += sizeof(u64); } /* * Since perf_event_validate_size() limits this to 16k and inhibits * adding more siblings, this will never overflow. */ return size + nr * entry; } static void __perf_event_header_size(struct perf_event *event, u64 sample_type) { struct perf_sample_data *data; u16 size = 0; if (sample_type & PERF_SAMPLE_IP) size += sizeof(data->ip); if (sample_type & PERF_SAMPLE_ADDR) size += sizeof(data->addr); if (sample_type & PERF_SAMPLE_PERIOD) size += sizeof(data->period); if (sample_type & PERF_SAMPLE_WEIGHT_TYPE) size += sizeof(data->weight.full); if (sample_type & PERF_SAMPLE_READ) size += event->read_size; if (sample_type & PERF_SAMPLE_DATA_SRC) size += sizeof(data->data_src.val); if (sample_type & PERF_SAMPLE_TRANSACTION) size += sizeof(data->txn); if (sample_type & PERF_SAMPLE_PHYS_ADDR) size += sizeof(data->phys_addr); if (sample_type & PERF_SAMPLE_CGROUP) size += sizeof(data->cgroup); if (sample_type & PERF_SAMPLE_DATA_PAGE_SIZE) size += sizeof(data->data_page_size); if (sample_type & PERF_SAMPLE_CODE_PAGE_SIZE) size += sizeof(data->code_page_size); event->header_size = size; } /* * Called at perf_event creation and when events are attached/detached from a * group. */ static void perf_event__header_size(struct perf_event *event) { event->read_size = __perf_event_read_size(event->attr.read_format, event->group_leader->nr_siblings); __perf_event_header_size(event, event->attr.sample_type); } static void perf_event__id_header_size(struct perf_event *event) { struct perf_sample_data *data; u64 sample_type = event->attr.sample_type; u16 size = 0; if (sample_type & PERF_SAMPLE_TID) size += sizeof(data->tid_entry); if (sample_type & PERF_SAMPLE_TIME) size += sizeof(data->time); if (sample_type & PERF_SAMPLE_IDENTIFIER) size += sizeof(data->id); if (sample_type & PERF_SAMPLE_ID) size += sizeof(data->id); if (sample_type & PERF_SAMPLE_STREAM_ID) size += sizeof(data->stream_id); if (sample_type & PERF_SAMPLE_CPU) size += sizeof(data->cpu_entry); event->id_header_size = size; } /* * Check that adding an event to the group does not result in anybody * overflowing the 64k event limit imposed by the output buffer. * * Specifically, check that the read_size for the event does not exceed 16k, * read_size being the one term that grows with groups size. Since read_size * depends on per-event read_format, also (re)check the existing events. * * This leaves 48k for the constant size fields and things like callchains, * branch stacks and register sets. */ static bool perf_event_validate_size(struct perf_event *event) { struct perf_event *sibling, *group_leader = event->group_leader; if (__perf_event_read_size(event->attr.read_format, group_leader->nr_siblings + 1) > 16*1024) return false; if (__perf_event_read_size(group_leader->attr.read_format, group_leader->nr_siblings + 1) > 16*1024) return false; /* * When creating a new group leader, group_leader->ctx is initialized * after the size has been validated, but we cannot safely use * for_each_sibling_event() until group_leader->ctx is set. A new group * leader cannot have any siblings yet, so we can safely skip checking * the non-existent siblings. */ if (event == group_leader) return true; for_each_sibling_event(sibling, group_leader) { if (__perf_event_read_size(sibling->attr.read_format, group_leader->nr_siblings + 1) > 16*1024) return false; } return true; } static void perf_group_attach(struct perf_event *event) { struct perf_event *group_leader = event->group_leader, *pos; lockdep_assert_held(&event->ctx->lock); /* * We can have double attach due to group movement (move_group) in * perf_event_open(). */ if (event->attach_state & PERF_ATTACH_GROUP) return; event->attach_state |= PERF_ATTACH_GROUP; if (group_leader == event) return; WARN_ON_ONCE(group_leader->ctx != event->ctx); group_leader->group_caps &= event->event_caps; list_add_tail(&event->sibling_list, &group_leader->sibling_list); group_leader->nr_siblings++; group_leader->group_generation++; perf_event__header_size(group_leader); for_each_sibling_event(pos, group_leader) perf_event__header_size(pos); } /* * Remove an event from the lists for its context. * Must be called with ctx->mutex and ctx->lock held. */ static void list_del_event(struct perf_event *event, struct perf_event_context *ctx) { WARN_ON_ONCE(event->ctx != ctx); lockdep_assert_held(&ctx->lock); /* * We can have double detach due to exit/hot-unplug + close. */ if (!(event->attach_state & PERF_ATTACH_CONTEXT)) return; event->attach_state &= ~PERF_ATTACH_CONTEXT; ctx->nr_events--; if (event->hw.flags & PERF_EVENT_FLAG_USER_READ_CNT) ctx->nr_user--; if (event->attr.inherit_stat) ctx->nr_stat--; if (has_inherit_and_sample_read(&event->attr)) local_dec(&ctx->nr_no_switch_fast); list_del_rcu(&event->event_entry); if (event->group_leader == event) del_event_from_groups(event, ctx); ctx->generation++; event->pmu_ctx->nr_events--; } static int perf_aux_output_match(struct perf_event *event, struct perf_event *aux_event) { if (!has_aux(aux_event)) return 0; if (!event->pmu->aux_output_match) return 0; return event->pmu->aux_output_match(aux_event); } static void put_event(struct perf_event *event); static void __event_disable(struct perf_event *event, struct perf_event_context *ctx, enum perf_event_state state); static void perf_put_aux_event(struct perf_event *event) { struct perf_event_context *ctx = event->ctx; struct perf_event *iter; /* * If event uses aux_event tear down the link */ if (event->aux_event) { iter = event->aux_event; event->aux_event = NULL; put_event(iter); return; } /* * If the event is an aux_event, tear down all links to * it from other events. */ for_each_sibling_event(iter, event) { if (iter->aux_event != event) continue; iter->aux_event = NULL; put_event(event); /* * If it's ACTIVE, schedule it out and put it into ERROR * state so that we don't try to schedule it again. Note * that perf_event_enable() will clear the ERROR status. */ __event_disable(iter, ctx, PERF_EVENT_STATE_ERROR); } } static bool perf_need_aux_event(struct perf_event *event) { return event->attr.aux_output || has_aux_action(event); } static int perf_get_aux_event(struct perf_event *event, struct perf_event *group_leader) { /* * Our group leader must be an aux event if we want to be * an aux_output. This way, the aux event will precede its * aux_output events in the group, and therefore will always * schedule first. */ if (!group_leader) return 0; /* * aux_output and aux_sample_size are mutually exclusive. */ if (event->attr.aux_output && event->attr.aux_sample_size) return 0; if (event->attr.aux_output && !perf_aux_output_match(event, group_leader)) return 0; if ((event->attr.aux_pause || event->attr.aux_resume) && !(group_leader->pmu->capabilities & PERF_PMU_CAP_AUX_PAUSE)) return 0; if (event->attr.aux_sample_size && !group_leader->pmu->snapshot_aux) return 0; if (!atomic_long_inc_not_zero(&group_leader->refcount)) return 0; /* * Link aux_outputs to their aux event; this is undone in * perf_group_detach() by perf_put_aux_event(). When the * group in torn down, the aux_output events loose their * link to the aux_event and can't schedule any more. */ event->aux_event = group_leader; return 1; } static inline struct list_head *get_event_list(struct perf_event *event) { return event->attr.pinned ? &event->pmu_ctx->pinned_active : &event->pmu_ctx->flexible_active; } static void perf_group_detach(struct perf_event *event) { struct perf_event *leader = event->group_leader; struct perf_event *sibling, *tmp; struct perf_event_context *ctx = event->ctx; lockdep_assert_held(&ctx->lock); /* * We can have double detach due to exit/hot-unplug + close. */ if (!(event->attach_state & PERF_ATTACH_GROUP)) return; event->attach_state &= ~PERF_ATTACH_GROUP; perf_put_aux_event(event); /* * If this is a sibling, remove it from its group. */ if (leader != event) { list_del_init(&event->sibling_list); event->group_leader->nr_siblings--; event->group_leader->group_generation++; goto out; } /* * If this was a group event with sibling events then * upgrade the siblings to singleton events by adding them * to whatever list we are on. */ list_for_each_entry_safe(sibling, tmp, &event->sibling_list, sibling_list) { /* * Events that have PERF_EV_CAP_SIBLING require being part of * a group and cannot exist on their own, schedule them out * and move them into the ERROR state. Also see * _perf_event_enable(), it will not be able to recover this * ERROR state. */ if (sibling->event_caps & PERF_EV_CAP_SIBLING) __event_disable(sibling, ctx, PERF_EVENT_STATE_ERROR); sibling->group_leader = sibling; list_del_init(&sibling->sibling_list); /* Inherit group flags from the previous leader */ sibling->group_caps = event->group_caps; if (sibling->attach_state & PERF_ATTACH_CONTEXT) { add_event_to_groups(sibling, event->ctx); if (sibling->state == PERF_EVENT_STATE_ACTIVE) list_add_tail(&sibling->active_list, get_event_list(sibling)); } WARN_ON_ONCE(sibling->ctx != event->ctx); } out: for_each_sibling_event(tmp, leader) perf_event__header_size(tmp); perf_event__header_size(leader); } static void sync_child_event(struct perf_event *child_event); static void perf_child_detach(struct perf_event *event) { struct perf_event *parent_event = event->parent; if (!(event->attach_state & PERF_ATTACH_CHILD)) return; event->attach_state &= ~PERF_ATTACH_CHILD; if (WARN_ON_ONCE(!parent_event)) return; /* * Can't check this from an IPI, the holder is likey another CPU. * lockdep_assert_held(&parent_event->child_mutex); */ sync_child_event(event); list_del_init(&event->child_list); } static bool is_orphaned_event(struct perf_event *event) { return event->state == PERF_EVENT_STATE_DEAD; } static inline int event_filter_match(struct perf_event *event) { return (event->cpu == -1 || event->cpu == smp_processor_id()) && perf_cgroup_match(event); } static inline bool is_event_in_freq_mode(struct perf_event *event) { return event->attr.freq && event->attr.sample_freq; } static void event_sched_out(struct perf_event *event, struct perf_event_context *ctx) { struct perf_event_pmu_context *epc = event->pmu_ctx; struct perf_cpu_pmu_context *cpc = this_cpc(epc->pmu); enum perf_event_state state = PERF_EVENT_STATE_INACTIVE; // XXX cpc serialization, probably per-cpu IRQ disabled WARN_ON_ONCE(event->ctx != ctx); lockdep_assert_held(&ctx->lock); if (event->state != PERF_EVENT_STATE_ACTIVE) return; /* * Asymmetry; we only schedule events _IN_ through ctx_sched_in(), but * we can schedule events _OUT_ individually through things like * __perf_remove_from_context(). */ list_del_init(&event->active_list); perf_pmu_disable(event->pmu); event->pmu->del(event, 0); event->oncpu = -1; if (event->pending_disable) { event->pending_disable = 0; perf_cgroup_event_disable(event, ctx); state = PERF_EVENT_STATE_OFF; } perf_event_set_state(event, state); if (!is_software_event(event)) cpc->active_oncpu--; if (is_event_in_freq_mode(event)) { ctx->nr_freq--; epc->nr_freq--; } if (event->attr.exclusive || !cpc->active_oncpu) cpc->exclusive = 0; perf_pmu_enable(event->pmu); } static void group_sched_out(struct perf_event *group_event, struct perf_event_context *ctx) { struct perf_event *event; if (group_event->state != PERF_EVENT_STATE_ACTIVE) return; perf_assert_pmu_disabled(group_event->pmu_ctx->pmu); event_sched_out(group_event, ctx); /* * Schedule out siblings (if any): */ for_each_sibling_event(event, group_event) event_sched_out(event, ctx); } static inline void __ctx_time_update(struct perf_cpu_context *cpuctx, struct perf_event_context *ctx, bool final) { if (ctx->is_active & EVENT_TIME) { if (ctx->is_active & EVENT_FROZEN) return; update_context_time(ctx); update_cgrp_time_from_cpuctx(cpuctx, final); } } static inline void ctx_time_update(struct perf_cpu_context *cpuctx, struct perf_event_context *ctx) { __ctx_time_update(cpuctx, ctx, false); } /* * To be used inside perf_ctx_lock() / perf_ctx_unlock(). Lasts until perf_ctx_unlock(). */ static inline void ctx_time_freeze(struct perf_cpu_context *cpuctx, struct perf_event_context *ctx) { ctx_time_update(cpuctx, ctx); if (ctx->is_active & EVENT_TIME) ctx->is_active |= EVENT_FROZEN; } static inline void ctx_time_update_event(struct perf_event_context *ctx, struct perf_event *event) { if (ctx->is_active & EVENT_TIME) { if (ctx->is_active & EVENT_FROZEN) return; update_context_time(ctx); update_cgrp_time_from_event(event); } } #define DETACH_GROUP 0x01UL #define DETACH_CHILD 0x02UL #define DETACH_EXIT 0x04UL #define DETACH_REVOKE 0x08UL #define DETACH_DEAD 0x10UL /* * Cross CPU call to remove a performance event * * We disable the event on the hardware level first. After that we * remove it from the context list. */ static void __perf_remove_from_context(struct perf_event *event, struct perf_cpu_context *cpuctx, struct perf_event_context *ctx, void *info) { struct perf_event_pmu_context *pmu_ctx = event->pmu_ctx; enum perf_event_state state = PERF_EVENT_STATE_OFF; unsigned long flags = (unsigned long)info; ctx_time_update(cpuctx, ctx); /* * Ensure event_sched_out() switches to OFF, at the very least * this avoids raising perf_pending_task() at this time. */ if (flags & DETACH_EXIT) state = PERF_EVENT_STATE_EXIT; if (flags & DETACH_REVOKE) state = PERF_EVENT_STATE_REVOKED; if (flags & DETACH_DEAD) state = PERF_EVENT_STATE_DEAD; event_sched_out(event, ctx); if (event->state > PERF_EVENT_STATE_OFF) perf_cgroup_event_disable(event, ctx); perf_event_set_state(event, min(event->state, state)); if (flags & DETACH_GROUP) perf_group_detach(event); if (flags & DETACH_CHILD) perf_child_detach(event); list_del_event(event, ctx); if (!pmu_ctx->nr_events) { pmu_ctx->rotate_necessary = 0; if (ctx->task && ctx->is_active) { struct perf_cpu_pmu_context *cpc = this_cpc(pmu_ctx->pmu); WARN_ON_ONCE(cpc->task_epc && cpc->task_epc != pmu_ctx); cpc->task_epc = NULL; } } if (!ctx->nr_events && ctx->is_active) { if (ctx == &cpuctx->ctx) update_cgrp_time_from_cpuctx(cpuctx, true); ctx->is_active = 0; if (ctx->task) { WARN_ON_ONCE(cpuctx->task_ctx != ctx); cpuctx->task_ctx = NULL; } } } /* * Remove the event from a task's (or a CPU's) list of events. * * If event->ctx is a cloned context, callers must make sure that * every task struct that event->ctx->task could possibly point to * remains valid. This is OK when called from perf_release since * that only calls us on the top-level context, which can't be a clone. * When called from perf_event_exit_task, it's OK because the * context has been detached from its task. */ static void perf_remove_from_context(struct perf_event *event, unsigned long flags) { struct perf_event_context *ctx = event->ctx; lockdep_assert_held(&ctx->mutex); /* * Because of perf_event_exit_task(), perf_remove_from_context() ought * to work in the face of TASK_TOMBSTONE, unlike every other * event_function_call() user. */ raw_spin_lock_irq(&ctx->lock); if (!ctx->is_active) { __perf_remove_from_context(event, this_cpu_ptr(&perf_cpu_context), ctx, (void *)flags); raw_spin_unlock_irq(&ctx->lock); return; } raw_spin_unlock_irq(&ctx->lock); event_function_call(event, __perf_remove_from_context, (void *)flags); } static void __event_disable(struct perf_event *event, struct perf_event_context *ctx, enum perf_event_state state) { event_sched_out(event, ctx); perf_cgroup_event_disable(event, ctx); perf_event_set_state(event, state); } /* * Cross CPU call to disable a performance event */ static void __perf_event_disable(struct perf_event *event, struct perf_cpu_context *cpuctx, struct perf_event_context *ctx, void *info) { if (event->state < PERF_EVENT_STATE_INACTIVE) return; perf_pmu_disable(event->pmu_ctx->pmu); ctx_time_update_event(ctx, event); /* * When disabling a group leader, the whole group becomes ineligible * to run, so schedule out the full group. */ if (event == event->group_leader) group_sched_out(event, ctx); /* * But only mark the leader OFF; the siblings will remain * INACTIVE. */ __event_disable(event, ctx, PERF_EVENT_STATE_OFF); perf_pmu_enable(event->pmu_ctx->pmu); } /* * Disable an event. * * If event->ctx is a cloned context, callers must make sure that * every task struct that event->ctx->task could possibly point to * remains valid. This condition is satisfied when called through * perf_event_for_each_child or perf_event_for_each because they * hold the top-level event's child_mutex, so any descendant that * goes to exit will block in perf_event_exit_event(). * * When called from perf_pending_disable it's OK because event->ctx * is the current context on this CPU and preemption is disabled, * hence we can't get into perf_event_task_sched_out for this context. */ static void _perf_event_disable(struct perf_event *event) { struct perf_event_context *ctx = event->ctx; raw_spin_lock_irq(&ctx->lock); if (event->state <= PERF_EVENT_STATE_OFF) { raw_spin_unlock_irq(&ctx->lock); return; } raw_spin_unlock_irq(&ctx->lock); event_function_call(event, __perf_event_disable, NULL); } void perf_event_disable_local(struct perf_event *event) { event_function_local(event, __perf_event_disable, NULL); } /* * Strictly speaking kernel users cannot create groups and therefore this * interface does not need the perf_event_ctx_lock() magic. */ void perf_event_disable(struct perf_event *event) { struct perf_event_context *ctx; ctx = perf_event_ctx_lock(event); _perf_event_disable(event); perf_event_ctx_unlock(event, ctx); } EXPORT_SYMBOL_GPL(perf_event_disable); void perf_event_disable_inatomic(struct perf_event *event) { event->pending_disable = 1; irq_work_queue(&event->pending_disable_irq); } #define MAX_INTERRUPTS (~0ULL) static void perf_log_throttle(struct perf_event *event, int enable); static void perf_log_itrace_start(struct perf_event *event); static void perf_event_unthrottle(struct perf_event *event, bool start) { if (event->state != PERF_EVENT_STATE_ACTIVE) return; event->hw.interrupts = 0; if (start) event->pmu->start(event, 0); if (event == event->group_leader) perf_log_throttle(event, 1); } static void perf_event_throttle(struct perf_event *event) { if (event->state != PERF_EVENT_STATE_ACTIVE) return; event->hw.interrupts = MAX_INTERRUPTS; event->pmu->stop(event, 0); if (event == event->group_leader) perf_log_throttle(event, 0); } static void perf_event_unthrottle_group(struct perf_event *event, bool skip_start_event) { struct perf_event *sibling, *leader = event->group_leader; perf_event_unthrottle(leader, skip_start_event ? leader != event : true); for_each_sibling_event(sibling, leader) perf_event_unthrottle(sibling, skip_start_event ? sibling != event : true); } static void perf_event_throttle_group(struct perf_event *event) { struct perf_event *sibling, *leader = event->group_leader; perf_event_throttle(leader); for_each_sibling_event(sibling, leader) perf_event_throttle(sibling); } static int event_sched_in(struct perf_event *event, struct perf_event_context *ctx) { struct perf_event_pmu_context *epc = event->pmu_ctx; struct perf_cpu_pmu_context *cpc = this_cpc(epc->pmu); int ret = 0; WARN_ON_ONCE(event->ctx != ctx); lockdep_assert_held(&ctx->lock); if (event->state <= PERF_EVENT_STATE_OFF) return 0; WRITE_ONCE(event->oncpu, smp_processor_id()); /* * Order event::oncpu write to happen before the ACTIVE state is * visible. This allows perf_event_{stop,read}() to observe the correct * ->oncpu if it sees ACTIVE. */ smp_wmb(); perf_event_set_state(event, PERF_EVENT_STATE_ACTIVE); /* * Unthrottle events, since we scheduled we might have missed several * ticks already, also for a heavily scheduling task there is little * guarantee it'll get a tick in a timely manner. */ if (unlikely(event->hw.interrupts == MAX_INTERRUPTS)) perf_event_unthrottle(event, false); perf_pmu_disable(event->pmu); perf_log_itrace_start(event); if (event->pmu->add(event, PERF_EF_START)) { perf_event_set_state(event, PERF_EVENT_STATE_INACTIVE); event->oncpu = -1; ret = -EAGAIN; goto out; } if (!is_software_event(event)) cpc->active_oncpu++; if (is_event_in_freq_mode(event)) { ctx->nr_freq++; epc->nr_freq++; } if (event->attr.exclusive) cpc->exclusive = 1; out: perf_pmu_enable(event->pmu); return ret; } static int group_sched_in(struct perf_event *group_event, struct perf_event_context *ctx) { struct perf_event *event, *partial_group = NULL; struct pmu *pmu = group_event->pmu_ctx->pmu; if (group_event->state == PERF_EVENT_STATE_OFF) return 0; pmu->start_txn(pmu, PERF_PMU_TXN_ADD); if (event_sched_in(group_event, ctx)) goto error; /* * Schedule in siblings as one group (if any): */ for_each_sibling_event(event, group_event) { if (event_sched_in(event, ctx)) { partial_group = event; goto group_error; } } if (!pmu->commit_txn(pmu)) return 0; group_error: /* * Groups can be scheduled in as one unit only, so undo any * partial group before returning: * The events up to the failed event are scheduled out normally. */ for_each_sibling_event(event, group_event) { if (event == partial_group) break; event_sched_out(event, ctx); } event_sched_out(group_event, ctx); error: pmu->cancel_txn(pmu); return -EAGAIN; } /* * Work out whether we can put this event group on the CPU now. */ static int group_can_go_on(struct perf_event *event, int can_add_hw) { struct perf_event_pmu_context *epc = event->pmu_ctx; struct perf_cpu_pmu_context *cpc = this_cpc(epc->pmu); /* * Groups consisting entirely of software events can always go on. */ if (event->group_caps & PERF_EV_CAP_SOFTWARE) return 1; /* * If an exclusive group is already on, no other hardware * events can go on. */ if (cpc->exclusive) return 0; /* * If this group is exclusive and there are already * events on the CPU, it can't go on. */ if (event->attr.exclusive && !list_empty(get_event_list(event))) return 0; /* * Otherwise, try to add it if all previous groups were able * to go on. */ return can_add_hw; } static void add_event_to_ctx(struct perf_event *event, struct perf_event_context *ctx) { list_add_event(event, ctx); perf_group_attach(event); } static void task_ctx_sched_out(struct perf_event_context *ctx, struct pmu *pmu, enum event_type_t event_type) { struct perf_cpu_context *cpuctx = this_cpu_ptr(&perf_cpu_context); if (!cpuctx->task_ctx) return; if (WARN_ON_ONCE(ctx != cpuctx->task_ctx)) return; ctx_sched_out(ctx, pmu, event_type); } static void perf_event_sched_in(struct perf_cpu_context *cpuctx, struct perf_event_context *ctx, struct pmu *pmu) { ctx_sched_in(&cpuctx->ctx, pmu, EVENT_PINNED); if (ctx) ctx_sched_in(ctx, pmu, EVENT_PINNED); ctx_sched_in(&cpuctx->ctx, pmu, EVENT_FLEXIBLE); if (ctx) ctx_sched_in(ctx, pmu, EVENT_FLEXIBLE); } /* * We want to maintain the following priority of scheduling: * - CPU pinned (EVENT_CPU | EVENT_PINNED) * - task pinned (EVENT_PINNED) * - CPU flexible (EVENT_CPU | EVENT_FLEXIBLE) * - task flexible (EVENT_FLEXIBLE). * * In order to avoid unscheduling and scheduling back in everything every * time an event is added, only do it for the groups of equal priority and * below. * * This can be called after a batch operation on task events, in which case * event_type is a bit mask of the types of events involved. For CPU events, * event_type is only either EVENT_PINNED or EVENT_FLEXIBLE. */ static void ctx_resched(struct perf_cpu_context *cpuctx, struct perf_event_context *task_ctx, struct pmu *pmu, enum event_type_t event_type) { bool cpu_event = !!(event_type & EVENT_CPU); struct perf_event_pmu_context *epc; /* * If pinned groups are involved, flexible groups also need to be * scheduled out. */ if (event_type & EVENT_PINNED) event_type |= EVENT_FLEXIBLE; event_type &= EVENT_ALL; for_each_epc(epc, &cpuctx->ctx, pmu, false) perf_pmu_disable(epc->pmu); if (task_ctx) { for_each_epc(epc, task_ctx, pmu, false) perf_pmu_disable(epc->pmu); task_ctx_sched_out(task_ctx, pmu, event_type); } /* * Decide which cpu ctx groups to schedule out based on the types * of events that caused rescheduling: * - EVENT_CPU: schedule out corresponding groups; * - EVENT_PINNED task events: schedule out EVENT_FLEXIBLE groups; * - otherwise, do nothing more. */ if (cpu_event) ctx_sched_out(&cpuctx->ctx, pmu, event_type); else if (event_type & EVENT_PINNED) ctx_sched_out(&cpuctx->ctx, pmu, EVENT_FLEXIBLE); perf_event_sched_in(cpuctx, task_ctx, pmu); for_each_epc(epc, &cpuctx->ctx, pmu, false) perf_pmu_enable(epc->pmu); if (task_ctx) { for_each_epc(epc, task_ctx, pmu, false) perf_pmu_enable(epc->pmu); } } void perf_pmu_resched(struct pmu *pmu) { struct perf_cpu_context *cpuctx = this_cpu_ptr(&perf_cpu_context); struct perf_event_context *task_ctx = cpuctx->task_ctx; perf_ctx_lock(cpuctx, task_ctx); ctx_resched(cpuctx, task_ctx, pmu, EVENT_ALL|EVENT_CPU); perf_ctx_unlock(cpuctx, task_ctx); } /* * Cross CPU call to install and enable a performance event * * Very similar to remote_function() + event_function() but cannot assume that * things like ctx->is_active and cpuctx->task_ctx are set. */ static int __perf_install_in_context(void *info) { struct perf_event *event = info; struct perf_event_context *ctx = event->ctx; struct perf_cpu_context *cpuctx = this_cpu_ptr(&perf_cpu_context); struct perf_event_context *task_ctx = cpuctx->task_ctx; bool reprogram = true; int ret = 0; raw_spin_lock(&cpuctx->ctx.lock); if (ctx->task) { raw_spin_lock(&ctx->lock); task_ctx = ctx; reprogram = (ctx->task == current); /* * If the task is running, it must be running on this CPU, * otherwise we cannot reprogram things. * * If its not running, we don't care, ctx->lock will * serialize against it becoming runnable. */ if (task_curr(ctx->task) && !reprogram) { ret = -ESRCH; goto unlock; } WARN_ON_ONCE(reprogram && cpuctx->task_ctx && cpuctx->task_ctx != ctx); } else if (task_ctx) { raw_spin_lock(&task_ctx->lock); } #ifdef CONFIG_CGROUP_PERF if (event->state > PERF_EVENT_STATE_OFF && is_cgroup_event(event)) { /* * If the current cgroup doesn't match the event's * cgroup, we should not try to schedule it. */ struct perf_cgroup *cgrp = perf_cgroup_from_task(current, ctx); reprogram = cgroup_is_descendant(cgrp->css.cgroup, event->cgrp->css.cgroup); } #endif if (reprogram) { ctx_time_freeze(cpuctx, ctx); add_event_to_ctx(event, ctx); ctx_resched(cpuctx, task_ctx, event->pmu_ctx->pmu, get_event_type(event)); } else { add_event_to_ctx(event, ctx); } unlock: perf_ctx_unlock(cpuctx, task_ctx); return ret; } static bool exclusive_event_installable(struct perf_event *event, struct perf_event_context *ctx); /* * Attach a performance event to a context. * * Very similar to event_function_call, see comment there. */ static void perf_install_in_context(struct perf_event_context *ctx, struct perf_event *event, int cpu) { struct task_struct *task = READ_ONCE(ctx->task); lockdep_assert_held(&ctx->mutex); WARN_ON_ONCE(!exclusive_event_installable(event, ctx)); if (event->cpu != -1) WARN_ON_ONCE(event->cpu != cpu); /* * Ensures that if we can observe event->ctx, both the event and ctx * will be 'complete'. See perf_iterate_sb_cpu(). */ smp_store_release(&event->ctx, ctx); /* * perf_event_attr::disabled events will not run and can be initialized * without IPI. Except when this is the first event for the context, in * that case we need the magic of the IPI to set ctx->is_active. * * The IOC_ENABLE that is sure to follow the creation of a disabled * event will issue the IPI and reprogram the hardware. */ if (__perf_effective_state(event) == PERF_EVENT_STATE_OFF && ctx->nr_events && !is_cgroup_event(event)) { raw_spin_lock_irq(&ctx->lock); if (ctx->task == TASK_TOMBSTONE) { raw_spin_unlock_irq(&ctx->lock); return; } add_event_to_ctx(event, ctx); raw_spin_unlock_irq(&ctx->lock); return; } if (!task) { cpu_function_call(cpu, __perf_install_in_context, event); return; } /* * Should not happen, we validate the ctx is still alive before calling. */ if (WARN_ON_ONCE(task == TASK_TOMBSTONE)) return; /* * Installing events is tricky because we cannot rely on ctx->is_active * to be set in case this is the nr_events 0 -> 1 transition. * * Instead we use task_curr(), which tells us if the task is running. * However, since we use task_curr() outside of rq::lock, we can race * against the actual state. This means the result can be wrong. * * If we get a false positive, we retry, this is harmless. * * If we get a false negative, things are complicated. If we are after * perf_event_context_sched_in() ctx::lock will serialize us, and the * value must be correct. If we're before, it doesn't matter since * perf_event_context_sched_in() will program the counter. * * However, this hinges on the remote context switch having observed * our task->perf_event_ctxp[] store, such that it will in fact take * ctx::lock in perf_event_context_sched_in(). * * We do this by task_function_call(), if the IPI fails to hit the task * we know any future context switch of task must see the * perf_event_ctpx[] store. */ /* * This smp_mb() orders the task->perf_event_ctxp[] store with the * task_cpu() load, such that if the IPI then does not find the task * running, a future context switch of that task must observe the * store. */ smp_mb(); again: if (!task_function_call(task, __perf_install_in_context, event)) return; raw_spin_lock_irq(&ctx->lock); task = ctx->task; if (WARN_ON_ONCE(task == TASK_TOMBSTONE)) { /* * Cannot happen because we already checked above (which also * cannot happen), and we hold ctx->mutex, which serializes us * against perf_event_exit_task_context(). */ raw_spin_unlock_irq(&ctx->lock); return; } /* * If the task is not running, ctx->lock will avoid it becoming so, * thus we can safely install the event. */ if (task_curr(task)) { raw_spin_unlock_irq(&ctx->lock); goto again; } add_event_to_ctx(event, ctx); raw_spin_unlock_irq(&ctx->lock); } /* * Cross CPU call to enable a performance event */ static void __perf_event_enable(struct perf_event *event, struct perf_cpu_context *cpuctx, struct perf_event_context *ctx, void *info) { struct perf_event *leader = event->group_leader; struct perf_event_context *task_ctx; if (event->state >= PERF_EVENT_STATE_INACTIVE || event->state <= PERF_EVENT_STATE_ERROR) return; ctx_time_freeze(cpuctx, ctx); perf_event_set_state(event, PERF_EVENT_STATE_INACTIVE); perf_cgroup_event_enable(event, ctx); if (!ctx->is_active) return; if (!event_filter_match(event)) return; /* * If the event is in a group and isn't the group leader, * then don't put it on unless the group is on. */ if (leader != event && leader->state != PERF_EVENT_STATE_ACTIVE) return; task_ctx = cpuctx->task_ctx; if (ctx->task) WARN_ON_ONCE(task_ctx != ctx); ctx_resched(cpuctx, task_ctx, event->pmu_ctx->pmu, get_event_type(event)); } /* * Enable an event. * * If event->ctx is a cloned context, callers must make sure that * every task struct that event->ctx->task could possibly point to * remains valid. This condition is satisfied when called through * perf_event_for_each_child or perf_event_for_each as described * for perf_event_disable. */ static void _perf_event_enable(struct perf_event *event) { struct perf_event_context *ctx = event->ctx; raw_spin_lock_irq(&ctx->lock); if (event->state >= PERF_EVENT_STATE_INACTIVE || event->state < PERF_EVENT_STATE_ERROR) { out: raw_spin_unlock_irq(&ctx->lock); return; } /* * If the event is in error state, clear that first. * * That way, if we see the event in error state below, we know that it * has gone back into error state, as distinct from the task having * been scheduled away before the cross-call arrived. */ if (event->state == PERF_EVENT_STATE_ERROR) { /* * Detached SIBLING events cannot leave ERROR state. */ if (event->event_caps & PERF_EV_CAP_SIBLING && event->group_leader == event) goto out; event->state = PERF_EVENT_STATE_OFF; } raw_spin_unlock_irq(&ctx->lock); event_function_call(event, __perf_event_enable, NULL); } /* * See perf_event_disable(); */ void perf_event_enable(struct perf_event *event) { struct perf_event_context *ctx; ctx = perf_event_ctx_lock(event); _perf_event_enable(event); perf_event_ctx_unlock(event, ctx); } EXPORT_SYMBOL_GPL(perf_event_enable); struct stop_event_data { struct perf_event *event; unsigned int restart; }; static int __perf_event_stop(void *info) { struct stop_event_data *sd = info; struct perf_event *event = sd->event; /* if it's already INACTIVE, do nothing */ if (READ_ONCE(event->state) != PERF_EVENT_STATE_ACTIVE) return 0; /* matches smp_wmb() in event_sched_in() */ smp_rmb(); /* * There is a window with interrupts enabled before we get here, * so we need to check again lest we try to stop another CPU's event. */ if (READ_ONCE(event->oncpu) != smp_processor_id()) return -EAGAIN; event->pmu->stop(event, PERF_EF_UPDATE); /* * May race with the actual stop (through perf_pmu_output_stop()), * but it is only used for events with AUX ring buffer, and such * events will refuse to restart because of rb::aux_mmap_count==0, * see comments in perf_aux_output_begin(). * * Since this is happening on an event-local CPU, no trace is lost * while restarting. */ if (sd->restart) event->pmu->start(event, 0); return 0; } static int perf_event_stop(struct perf_event *event, int restart) { struct stop_event_data sd = { .event = event, .restart = restart, }; int ret = 0; do { if (READ_ONCE(event->state) != PERF_EVENT_STATE_ACTIVE) return 0; /* matches smp_wmb() in event_sched_in() */ smp_rmb(); /* * We only want to restart ACTIVE events, so if the event goes * inactive here (event->oncpu==-1), there's nothing more to do; * fall through with ret==-ENXIO. */ ret = cpu_function_call(READ_ONCE(event->oncpu), __perf_event_stop, &sd); } while (ret == -EAGAIN); return ret; } /* * In order to contain the amount of racy and tricky in the address filter * configuration management, it is a two part process: * * (p1) when userspace mappings change as a result of (1) or (2) or (3) below, * we update the addresses of corresponding vmas in * event::addr_filter_ranges array and bump the event::addr_filters_gen; * (p2) when an event is scheduled in (pmu::add), it calls * perf_event_addr_filters_sync() which calls pmu::addr_filters_sync() * if the generation has changed since the previous call. * * If (p1) happens while the event is active, we restart it to force (p2). * * (1) perf_addr_filters_apply(): adjusting filters' offsets based on * pre-existing mappings, called once when new filters arrive via SET_FILTER * ioctl; * (2) perf_addr_filters_adjust(): adjusting filters' offsets based on newly * registered mapping, called for every new mmap(), with mm::mmap_lock down * for reading; * (3) perf_event_addr_filters_exec(): clearing filters' offsets in the process * of exec. */ void perf_event_addr_filters_sync(struct perf_event *event) { struct perf_addr_filters_head *ifh = perf_event_addr_filters(event); if (!has_addr_filter(event)) return; raw_spin_lock(&ifh->lock); if (event->addr_filters_gen != event->hw.addr_filters_gen) { event->pmu->addr_filters_sync(event); event->hw.addr_filters_gen = event->addr_filters_gen; } raw_spin_unlock(&ifh->lock); } EXPORT_SYMBOL_GPL(perf_event_addr_filters_sync); static int _perf_event_refresh(struct perf_event *event, int refresh) { /* * not supported on inherited events */ if (event->attr.inherit || !is_sampling_event(event)) return -EINVAL; atomic_add(refresh, &event->event_limit); _perf_event_enable(event); return 0; } /* * See perf_event_disable() */ int perf_ev