Total coverage: 15706 (2%)of 1192777
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 // SPDX-License-Identifier: GPL-2.0 /* * fs/ext4/extents_status.h * * Written by Yongqiang Yang <xiaoqiangnk@gmail.com> * Modified by * Allison Henderson <achender@linux.vnet.ibm.com> * Zheng Liu <wenqing.lz@taobao.com> * */ #ifndef _EXT4_EXTENTS_STATUS_H #define _EXT4_EXTENTS_STATUS_H /* * Turn on ES_DEBUG__ to get lots of info about extent status operations. */ #ifdef ES_DEBUG__ #define es_debug(fmt, ...) printk(fmt, ##__VA_ARGS__) #else #define es_debug(fmt, ...) no_printk(fmt, ##__VA_ARGS__) #endif /* * With ES_AGGRESSIVE_TEST defined, the result of es caching will be * checked with old map_block's result. */ #define ES_AGGRESSIVE_TEST__ /* * These flags live in the high bits of extent_status.es_pblk */ enum { ES_WRITTEN_B, ES_UNWRITTEN_B, ES_DELAYED_B, ES_HOLE_B, ES_REFERENCED_B, ES_FLAGS }; #define ES_SHIFT (sizeof(ext4_fsblk_t)*8 - ES_FLAGS) #define ES_MASK (~((ext4_fsblk_t)0) << ES_SHIFT) /* * Besides EXTENT_STATUS_REFERENCED, all these extent type masks * are exclusive, only one type can be set at a time. */ #define EXTENT_STATUS_WRITTEN (1 << ES_WRITTEN_B) #define EXTENT_STATUS_UNWRITTEN (1 << ES_UNWRITTEN_B) #define EXTENT_STATUS_DELAYED (1 << ES_DELAYED_B) #define EXTENT_STATUS_HOLE (1 << ES_HOLE_B) #define EXTENT_STATUS_REFERENCED (1 << ES_REFERENCED_B) #define ES_TYPE_MASK ((ext4_fsblk_t)(EXTENT_STATUS_WRITTEN | \ EXTENT_STATUS_UNWRITTEN | \ EXTENT_STATUS_DELAYED | \ EXTENT_STATUS_HOLE)) #define ES_TYPE_VALID(type) ((type) && !((type) & ((type) - 1))) struct ext4_sb_info; struct ext4_extent; struct extent_status { struct rb_node rb_node; ext4_lblk_t es_lblk; /* first logical block extent covers */ ext4_lblk_t es_len; /* length of extent in block */ ext4_fsblk_t es_pblk; /* first physical block */ }; struct ext4_es_tree { struct rb_root root; struct extent_status *cache_es; /* recently accessed extent */ }; struct ext4_es_stats { unsigned long es_stats_shrunk; struct percpu_counter es_stats_cache_hits; struct percpu_counter es_stats_cache_misses; u64 es_stats_scan_time; u64 es_stats_max_scan_time; struct percpu_counter es_stats_all_cnt; struct percpu_counter es_stats_shk_cnt; }; /* * Pending cluster reservations for bigalloc file systems * * A cluster with a pending reservation is a logical cluster shared by at * least one extent in the extents status tree with delayed and unwritten * status and at least one other written or unwritten extent. The * reservation is said to be pending because a cluster reservation would * have to be taken in the event all blocks in the cluster shared with * written or unwritten extents were deleted while the delayed and * unwritten blocks remained. * * The set of pending cluster reservations is an auxiliary data structure * used with the extents status tree to implement reserved cluster/block * accounting for bigalloc file systems. The set is kept in memory and * records all pending cluster reservations. * * Its primary function is to avoid the need to read extents from the * disk when invalidating pages as a result of a truncate, punch hole, or * collapse range operation. Page invalidation requires a decrease in the * reserved cluster count if it results in the removal of all delayed * and unwritten extents (blocks) from a cluster that is not shared with a * written or unwritten extent, and no decrease otherwise. Determining * whether the cluster is shared can be done by searching for a pending * reservation on it. * * Secondarily, it provides a potentially faster method for determining * whether the reserved cluster count should be increased when a physical * cluster is deallocated as a result of a truncate, punch hole, or * collapse range operation. The necessary information is also present * in the extents status tree, but might be more rapidly accessed in * the pending reservation set in many cases due to smaller size. * * The pending cluster reservation set is implemented as a red-black tree * with the goal of minimizing per page search time overhead. */ struct pending_reservation { struct rb_node rb_node; ext4_lblk_t lclu; }; struct ext4_pending_tree { struct rb_root root; }; extern int __init ext4_init_es(void); extern void ext4_exit_es(void); extern void ext4_es_init_tree(struct ext4_es_tree *tree); extern void ext4_es_insert_extent(struct inode *inode, ext4_lblk_t lblk, ext4_lblk_t len, ext4_fsblk_t pblk, unsigned int status, bool delalloc_reserve_used); extern void ext4_es_cache_extent(struct inode *inode, ext4_lblk_t lblk, ext4_lblk_t len, ext4_fsblk_t pblk, unsigned int status); extern void ext4_es_remove_extent(struct inode *inode, ext4_lblk_t lblk, ext4_lblk_t len); extern void ext4_es_find_extent_range(struct inode *inode, int (*match_fn)(struct extent_status *es), ext4_lblk_t lblk, ext4_lblk_t end, struct extent_status *es); extern int ext4_es_lookup_extent(struct inode *inode, ext4_lblk_t lblk, ext4_lblk_t *next_lblk, struct extent_status *es); extern bool ext4_es_scan_range(struct inode *inode, int (*matching_fn)(struct extent_status *es), ext4_lblk_t lblk, ext4_lblk_t end); extern bool ext4_es_scan_clu(struct inode *inode, int (*matching_fn)(struct extent_status *es), ext4_lblk_t lblk); static inline unsigned int ext4_es_status(struct extent_status *es) { return es->es_pblk >> ES_SHIFT; } static inline unsigned int ext4_es_type(struct extent_status *es) { return (es->es_pblk >> ES_SHIFT) & ES_TYPE_MASK; } static inline int ext4_es_is_written(struct extent_status *es) { return (ext4_es_type(es) & EXTENT_STATUS_WRITTEN) != 0; } static inline int ext4_es_is_unwritten(struct extent_status *es) { return (ext4_es_type(es) & EXTENT_STATUS_UNWRITTEN) != 0; } static inline int ext4_es_is_delayed(struct extent_status *es) { return (ext4_es_type(es) & EXTENT_STATUS_DELAYED) != 0; } static inline int ext4_es_is_hole(struct extent_status *es) { return (ext4_es_type(es) & EXTENT_STATUS_HOLE) != 0; } static inline int ext4_es_is_mapped(struct extent_status *es) { return (ext4_es_is_written(es) || ext4_es_is_unwritten(es)); } static inline void ext4_es_set_referenced(struct extent_status *es) { es->es_pblk |= ((ext4_fsblk_t)EXTENT_STATUS_REFERENCED) << ES_SHIFT; } static inline void ext4_es_clear_referenced(struct extent_status *es) { es->es_pblk &= ~(((ext4_fsblk_t)EXTENT_STATUS_REFERENCED) << ES_SHIFT); } static inline int ext4_es_is_referenced(struct extent_status *es) { return (ext4_es_status(es) & EXTENT_STATUS_REFERENCED) != 0; } static inline ext4_fsblk_t ext4_es_pblock(struct extent_status *es) { return es->es_pblk & ~ES_MASK; } static inline ext4_fsblk_t ext4_es_show_pblock(struct extent_status *es) { ext4_fsblk_t pblock = ext4_es_pblock(es); return pblock == ~ES_MASK ? 0 : pblock; } static inline void ext4_es_store_pblock(struct extent_status *es, ext4_fsblk_t pb) { ext4_fsblk_t block; block = (pb & ~ES_MASK) | (es->es_pblk & ES_MASK); es->es_pblk = block; } static inline void ext4_es_store_pblock_status(struct extent_status *es, ext4_fsblk_t pb, unsigned int status) { WARN_ON_ONCE(!ES_TYPE_VALID(status & ES_TYPE_MASK)); es->es_pblk = (((ext4_fsblk_t)status << ES_SHIFT) & ES_MASK) | (pb & ~ES_MASK); } extern int ext4_es_register_shrinker(struct ext4_sb_info *sbi); extern void ext4_es_unregister_shrinker(struct ext4_sb_info *sbi); extern int ext4_seq_es_shrinker_info_show(struct seq_file *seq, void *v); extern int __init ext4_init_pending(void); extern void ext4_exit_pending(void); extern void ext4_init_pending_tree(struct ext4_pending_tree *tree); extern void ext4_remove_pending(struct inode *inode, ext4_lblk_t lblk); extern bool ext4_is_pending(struct inode *inode, ext4_lblk_t lblk); extern void ext4_es_insert_delayed_extent(struct inode *inode, ext4_lblk_t lblk, ext4_lblk_t len, bool lclu_allocated, bool end_allocated); extern void ext4_clear_inode_es(struct inode *inode); #endif /* _EXT4_EXTENTS_STATUS_H */
299 302 302 302 299 302 302 301 302 301 300 1 302 302 311 311 311 250 250 250 310 311 311 250 250 61 61 61 61 61 61 61 61 61 61 302 302 302 302 302 302 302 302 302 302 302 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 // SPDX-License-Identifier: GPL-2.0-only /* * Copyright (C) 2015 - ARM Ltd * Author: Marc Zyngier <marc.zyngier@arm.com> */ #include <hyp/switch.h> #include <linux/arm-smccc.h> #include <linux/kvm_host.h> #include <linux/types.h> #include <linux/jump_label.h> #include <linux/percpu.h> #include <uapi/linux/psci.h> #include <kvm/arm_psci.h> #include <asm/barrier.h> #include <asm/cpufeature.h> #include <asm/kprobes.h> #include <asm/kvm_asm.h> #include <asm/kvm_emulate.h> #include <asm/kvm_hyp.h> #include <asm/kvm_mmu.h> #include <asm/fpsimd.h> #include <asm/debug-monitors.h> #include <asm/processor.h> #include <asm/thread_info.h> #include <asm/vectors.h> /* VHE specific context */ DEFINE_PER_CPU(struct kvm_host_data, kvm_host_data); DEFINE_PER_CPU(struct kvm_cpu_context, kvm_hyp_ctxt); DEFINE_PER_CPU(unsigned long, kvm_hyp_vector); /* * HCR_EL2 bits that the NV guest can freely change (no RES0/RES1 * semantics, irrespective of the configuration), but that cannot be * applied to the actual HW as things would otherwise break badly. * * - TGE: we want the guest to use EL1, which is incompatible with * this bit being set * * - API/APK: they are already accounted for by vcpu_load(), and can * only take effect across a load/put cycle (such as ERET) */ #define NV_HCR_GUEST_EXCLUDE (HCR_TGE | HCR_API | HCR_APK) static u64 __compute_hcr(struct kvm_vcpu *vcpu) { u64 guest_hcr, hcr = vcpu->arch.hcr_el2; if (!vcpu_has_nv(vcpu)) return hcr; /* * We rely on the invariant that a vcpu entered from HYP * context must also exit in the same context, as only an ERET * instruction can kick us out of it, and we obviously trap * that sucker. PSTATE.M will get fixed-up on exit. */ if (is_hyp_ctxt(vcpu)) { host_data_set_flag(VCPU_IN_HYP_CONTEXT); hcr |= HCR_NV | HCR_NV2 | HCR_AT | HCR_TTLB; if (!vcpu_el2_e2h_is_set(vcpu)) hcr |= HCR_NV1; /* * Nothing in HCR_EL2 should impact running in hypervisor * context, apart from bits we have defined as RESx (E2H, * HCD and co), or that cannot be set directly (the EXCLUDE * bits). Given that we OR the guest's view with the host's, * we can use the 0 value as the starting point, and only * use the config-driven RES1 bits. */ guest_hcr = kvm_vcpu_apply_reg_masks(vcpu, HCR_EL2, 0); write_sysreg_s(vcpu->arch.ctxt.vncr_array, SYS_VNCR_EL2); } else { host_data_clear_flag(VCPU_IN_HYP_CONTEXT); guest_hcr = __vcpu_sys_reg(vcpu, HCR_EL2); if (guest_hcr & HCR_NV) { u64 va = __fix_to_virt(vncr_fixmap(smp_processor_id())); /* Inherit the low bits from the actual register */ va |= __vcpu_sys_reg(vcpu, VNCR_EL2) & GENMASK(PAGE_SHIFT - 1, 0); write_sysreg_s(va, SYS_VNCR_EL2); /* Force NV2 in case the guest is forgetful... */ guest_hcr |= HCR_NV2; } } BUG_ON(host_data_test_flag(VCPU_IN_HYP_CONTEXT) && host_data_test_flag(L1_VNCR_MAPPED)); return hcr | (guest_hcr & ~NV_HCR_GUEST_EXCLUDE); } static void __activate_traps(struct kvm_vcpu *vcpu) { u64 val; ___activate_traps(vcpu, __compute_hcr(vcpu)); if (has_cntpoff()) { struct timer_map map; get_timer_map(vcpu, &map); /* * We're entrering the guest. Reload the correct * values from memory now that TGE is clear. */ if (map.direct_ptimer == vcpu_ptimer(vcpu)) val = __vcpu_sys_reg(vcpu, CNTP_CVAL_EL0); if (map.direct_ptimer == vcpu_hptimer(vcpu)) val = __vcpu_sys_reg(vcpu, CNTHP_CVAL_EL2); if (map.direct_ptimer) { write_sysreg_el0(val, SYS_CNTP_CVAL); isb(); } } __activate_cptr_traps(vcpu); write_sysreg(__this_cpu_read(kvm_hyp_vector), vbar_el1); } NOKPROBE_SYMBOL(__activate_traps); static void __deactivate_traps(struct kvm_vcpu *vcpu) { const char *host_vectors = vectors; ___deactivate_traps(vcpu); write_sysreg_hcr(HCR_HOST_VHE_FLAGS); if (has_cntpoff()) { struct timer_map map; u64 val, offset; get_timer_map(vcpu, &map); /* * We're exiting the guest. Save the latest CVAL value * to memory and apply the offset now that TGE is set. */ val = read_sysreg_el0(SYS_CNTP_CVAL); if (map.direct_ptimer == vcpu_ptimer(vcpu)) __vcpu_assign_sys_reg(vcpu, CNTP_CVAL_EL0, val); if (map.direct_ptimer == vcpu_hptimer(vcpu)) __vcpu_assign_sys_reg(vcpu, CNTHP_CVAL_EL2, val); offset = read_sysreg_s(SYS_CNTPOFF_EL2); if (map.direct_ptimer && offset) { write_sysreg_el0(val + offset, SYS_CNTP_CVAL); isb(); } } /* * ARM errata 1165522 and 1530923 require the actual execution of the * above before we can switch to the EL2/EL0 translation regime used by * the host. */ asm(ALTERNATIVE("nop", "isb", ARM64_WORKAROUND_SPECULATIVE_AT)); __deactivate_cptr_traps(vcpu); if (!arm64_kernel_unmapped_at_el0()) host_vectors = __this_cpu_read(this_cpu_vector); write_sysreg(host_vectors, vbar_el1); } NOKPROBE_SYMBOL(__deactivate_traps); /* * Disable IRQs in __vcpu_{load,put}_{activate,deactivate}_traps() to * prevent a race condition between context switching of PMUSERENR_EL0 * in __{activate,deactivate}_traps_common() and IPIs that attempts to * update PMUSERENR_EL0. See also kvm_set_pmuserenr(). */ static void __vcpu_load_activate_traps(struct kvm_vcpu *vcpu) { unsigned long flags; local_irq_save(flags); __activate_traps_common(vcpu); local_irq_restore(flags); } static void __vcpu_put_deactivate_traps(struct kvm_vcpu *vcpu) { unsigned long flags; local_irq_save(flags); __deactivate_traps_common(vcpu); local_irq_restore(flags); } void kvm_vcpu_load_vhe(struct kvm_vcpu *vcpu) { host_data_ptr(host_ctxt)->__hyp_running_vcpu = vcpu; __vcpu_load_switch_sysregs(vcpu); __vcpu_load_activate_traps(vcpu); __load_stage2(vcpu->arch.hw_mmu, vcpu->arch.hw_mmu->arch); } void kvm_vcpu_put_vhe(struct kvm_vcpu *vcpu) { __vcpu_put_deactivate_traps(vcpu); __vcpu_put_switch_sysregs(vcpu); host_data_ptr(host_ctxt)->__hyp_running_vcpu = NULL; } static u64 compute_emulated_cntx_ctl_el0(struct kvm_vcpu *vcpu, enum vcpu_sysreg reg) { unsigned long ctl; u64 cval, cnt; bool stat; switch (reg) { case CNTP_CTL_EL0: cval = __vcpu_sys_reg(vcpu, CNTP_CVAL_EL0); ctl = __vcpu_sys_reg(vcpu, CNTP_CTL_EL0); cnt = compute_counter_value(vcpu_ptimer(vcpu)); break; case CNTV_CTL_EL0: cval = __vcpu_sys_reg(vcpu, CNTV_CVAL_EL0); ctl = __vcpu_sys_reg(vcpu, CNTV_CTL_EL0); cnt = compute_counter_value(vcpu_vtimer(vcpu)); break; default: BUG(); } stat = cval <= cnt; __assign_bit(__ffs(ARCH_TIMER_CTRL_IT_STAT), &ctl, stat); return ctl; } static bool kvm_hyp_handle_timer(struct kvm_vcpu *vcpu, u64 *exit_code) { u64 esr, val; /* * Having FEAT_ECV allows for a better quality of timer emulation. * However, this comes at a huge cost in terms of traps. Try and * satisfy the reads from guest's hypervisor context without * returning to the kernel if we can. */ if (!is_hyp_ctxt(vcpu)) return false; esr = kvm_vcpu_get_esr(vcpu); if ((esr & ESR_ELx_SYS64_ISS_DIR_MASK) != ESR_ELx_SYS64_ISS_DIR_READ) return false; switch (esr_sys64_to_sysreg(esr)) { case SYS_CNTP_CTL_EL02: val = compute_emulated_cntx_ctl_el0(vcpu, CNTP_CTL_EL0); break; case SYS_CNTP_CTL_EL0: if (vcpu_el2_e2h_is_set(vcpu)) val = read_sysreg_el0(SYS_CNTP_CTL); else val = compute_emulated_cntx_ctl_el0(vcpu, CNTP_CTL_EL0); break; case SYS_CNTP_CVAL_EL02: val = __vcpu_sys_reg(vcpu, CNTP_CVAL_EL0); break; case SYS_CNTP_CVAL_EL0: if (vcpu_el2_e2h_is_set(vcpu)) { val = read_sysreg_el0(SYS_CNTP_CVAL); if (!has_cntpoff()) val -= timer_get_offset(vcpu_hptimer(vcpu)); } else { val = __vcpu_sys_reg(vcpu, CNTP_CVAL_EL0); } break; case SYS_CNTPCT_EL0: case SYS_CNTPCTSS_EL0: val = compute_counter_value(vcpu_hptimer(vcpu)); break; case SYS_CNTV_CTL_EL02: val = compute_emulated_cntx_ctl_el0(vcpu, CNTV_CTL_EL0); break; case SYS_CNTV_CTL_EL0: if (vcpu_el2_e2h_is_set(vcpu)) val = read_sysreg_el0(SYS_CNTV_CTL); else val = compute_emulated_cntx_ctl_el0(vcpu, CNTV_CTL_EL0); break; case SYS_CNTV_CVAL_EL02: val = __vcpu_sys_reg(vcpu, CNTV_CVAL_EL0); break; case SYS_CNTV_CVAL_EL0: if (vcpu_el2_e2h_is_set(vcpu)) val = read_sysreg_el0(SYS_CNTV_CVAL); else val = __vcpu_sys_reg(vcpu, CNTV_CVAL_EL0); break; case SYS_CNTVCT_EL0: case SYS_CNTVCTSS_EL0: val = compute_counter_value(vcpu_hvtimer(vcpu)); break; default: return false; } vcpu_set_reg(vcpu, kvm_vcpu_sys_get_rt(vcpu), val); __kvm_skip_instr(vcpu); return true; } static bool kvm_hyp_handle_eret(struct kvm_vcpu *vcpu, u64 *exit_code) { u64 esr = kvm_vcpu_get_esr(vcpu); u64 spsr, elr, mode; /* * Going through the whole put/load motions is a waste of time * if this is a VHE guest hypervisor returning to its own * userspace, or the hypervisor performing a local exception * return. No need to save/restore registers, no need to * switch S2 MMU. Just do the canonical ERET. * * Unless the trap has to be forwarded further down the line, * of course... */ if ((__vcpu_sys_reg(vcpu, HCR_EL2) & HCR_NV) || (__vcpu_sys_reg(vcpu, HFGITR_EL2) & HFGITR_EL2_ERET)) return false; spsr = read_sysreg_el1(SYS_SPSR); mode = spsr & (PSR_MODE_MASK | PSR_MODE32_BIT); switch (mode) { case PSR_MODE_EL0t: if (!(vcpu_el2_e2h_is_set(vcpu) && vcpu_el2_tge_is_set(vcpu))) return false; break; case PSR_MODE_EL2t: mode = PSR_MODE_EL1t; break; case PSR_MODE_EL2h: mode = PSR_MODE_EL1h; break; default: return false; } /* If ERETAx fails, take the slow path */ if (esr_iss_is_eretax(esr)) { if (!(vcpu_has_ptrauth(vcpu) && kvm_auth_eretax(vcpu, &elr))) return false; } else { elr = read_sysreg_el1(SYS_ELR); } spsr = (spsr & ~(PSR_MODE_MASK | PSR_MODE32_BIT)) | mode; write_sysreg_el2(spsr, SYS_SPSR); write_sysreg_el2(elr, SYS_ELR); return true; } static bool kvm_hyp_handle_tlbi_el2(struct kvm_vcpu *vcpu, u64 *exit_code) { int ret = -EINVAL; u32 instr; u64 val; /* * Ideally, we would never trap on EL2 S1 TLB invalidations using * the EL1 instructions when the guest's HCR_EL2.{E2H,TGE}=={1,1}. * But "thanks" to FEAT_NV2, we don't trap writes to HCR_EL2, * meaning that we can't track changes to the virtual TGE bit. So we * have to leave HCR_EL2.TTLB set on the host. Oopsie... * * Try and handle these invalidation as quickly as possible, without * fully exiting. Note that we don't need to consider any forwarding * here, as having E2H+TGE set is the very definition of being * InHost. * * For the lesser hypervisors out there that have failed to get on * with the VHE program, we can also handle the nVHE style of EL2 * invalidation. */ if (!(is_hyp_ctxt(vcpu))) return false; instr = esr_sys64_to_sysreg(kvm_vcpu_get_esr(vcpu)); val = vcpu_get_reg(vcpu, kvm_vcpu_sys_get_rt(vcpu)); if ((kvm_supported_tlbi_s1e1_op(vcpu, instr) && vcpu_el2_e2h_is_set(vcpu) && vcpu_el2_tge_is_set(vcpu)) || kvm_supported_tlbi_s1e2_op (vcpu, instr)) ret = __kvm_tlbi_s1e2(NULL, val, instr); if (ret) return false; /* * If we have to check for any VNCR mapping being invalidated, * go back to the slow path for further processing. */ if (vcpu_el2_e2h_is_set(vcpu) && vcpu_el2_tge_is_set(vcpu) && atomic_read(&vcpu->kvm->arch.vncr_map_count)) return false; __kvm_skip_instr(vcpu); return true; } static bool kvm_hyp_handle_cpacr_el1(struct kvm_vcpu *vcpu, u64 *exit_code) { u64 esr = kvm_vcpu_get_esr(vcpu); int rt; if (!is_hyp_ctxt(vcpu) || esr_sys64_to_sysreg(esr) != SYS_CPACR_EL1) return false; rt = kvm_vcpu_sys_get_rt(vcpu); if ((esr & ESR_ELx_SYS64_ISS_DIR_MASK) == ESR_ELx_SYS64_ISS_DIR_READ) { vcpu_set_reg(vcpu, rt, __vcpu_sys_reg(vcpu, CPTR_EL2)); } else { vcpu_write_sys_reg(vcpu, vcpu_get_reg(vcpu, rt), CPTR_EL2); __activate_cptr_traps(vcpu); } __kvm_skip_instr(vcpu); return true; } static bool kvm_hyp_handle_zcr_el2(struct kvm_vcpu *vcpu, u64 *exit_code) { u32 sysreg = esr_sys64_to_sysreg(kvm_vcpu_get_esr(vcpu)); if (!vcpu_has_nv(vcpu)) return false; if (sysreg != SYS_ZCR_EL2) return false; if (guest_owns_fp_regs()) return false; /* * ZCR_EL2 traps are handled in the slow path, with the expectation * that the guest's FP context has already been loaded onto the CPU. * * Load the guest's FP context and unconditionally forward to the * slow path for handling (i.e. return false). */ kvm_hyp_handle_fpsimd(vcpu, exit_code); return false; } static bool kvm_hyp_handle_sysreg_vhe(struct kvm_vcpu *vcpu, u64 *exit_code) { if (kvm_hyp_handle_tlbi_el2(vcpu, exit_code)) return true; if (kvm_hyp_handle_timer(vcpu, exit_code)) return true; if (kvm_hyp_handle_cpacr_el1(vcpu, exit_code)) return true; if (kvm_hyp_handle_zcr_el2(vcpu, exit_code)) return true; return kvm_hyp_handle_sysreg(vcpu, exit_code); } static bool kvm_hyp_handle_impdef(struct kvm_vcpu *vcpu, u64 *exit_code) { u64 iss; if (!cpus_have_final_cap(ARM64_WORKAROUND_PMUV3_IMPDEF_TRAPS)) return false; /* * Compute a synthetic ESR for a sysreg trap. Conveniently, AFSR1_EL2 * is populated with a correct ISS for a sysreg trap. These fruity * parts are 64bit only, so unconditionally set IL. */ iss = ESR_ELx_ISS(read_sysreg_s(SYS_AFSR1_EL2)); vcpu->arch.fault.esr_el2 = FIELD_PREP(ESR_ELx_EC_MASK, ESR_ELx_EC_SYS64) | FIELD_PREP(ESR_ELx_ISS_MASK, iss) | ESR_ELx_IL; return false; } static const exit_handler_fn hyp_exit_handlers[] = { [0 ... ESR_ELx_EC_MAX] = NULL, [ESR_ELx_EC_CP15_32] = kvm_hyp_handle_cp15_32, [ESR_ELx_EC_SYS64] = kvm_hyp_handle_sysreg_vhe, [ESR_ELx_EC_SVE] = kvm_hyp_handle_fpsimd, [ESR_ELx_EC_FP_ASIMD] = kvm_hyp_handle_fpsimd, [ESR_ELx_EC_IABT_LOW] = kvm_hyp_handle_iabt_low, [ESR_ELx_EC_DABT_LOW] = kvm_hyp_handle_dabt_low, [ESR_ELx_EC_WATCHPT_LOW] = kvm_hyp_handle_watchpt_low, [ESR_ELx_EC_ERET] = kvm_hyp_handle_eret, [ESR_ELx_EC_MOPS] = kvm_hyp_handle_mops, /* Apple shenanigans */ [0x3F] = kvm_hyp_handle_impdef, }; static inline bool fixup_guest_exit(struct kvm_vcpu *vcpu, u64 *exit_code) { synchronize_vcpu_pstate(vcpu, exit_code); /* * If we were in HYP context on entry, adjust the PSTATE view * so that the usual helpers work correctly. This enforces our * invariant that the guest's HYP context status is preserved * across a run. */ if (vcpu_has_nv(vcpu) && unlikely(host_data_test_flag(VCPU_IN_HYP_CONTEXT))) { u64 mode = *vcpu_cpsr(vcpu) & (PSR_MODE_MASK | PSR_MODE32_BIT); switch (mode) { case PSR_MODE_EL1t: mode = PSR_MODE_EL2t; break; case PSR_MODE_EL1h: mode = PSR_MODE_EL2h; break; } *vcpu_cpsr(vcpu) &= ~(PSR_MODE_MASK | PSR_MODE32_BIT); *vcpu_cpsr(vcpu) |= mode; } /* Apply extreme paranoia! */ BUG_ON(vcpu_has_nv(vcpu) && !!host_data_test_flag(VCPU_IN_HYP_CONTEXT) != is_hyp_ctxt(vcpu)); return __fixup_guest_exit(vcpu, exit_code, hyp_exit_handlers); } /* Switch to the guest for VHE systems running in EL2 */ static int __kvm_vcpu_run_vhe(struct kvm_vcpu *vcpu) { struct kvm_cpu_context *host_ctxt; struct kvm_cpu_context *guest_ctxt; u64 exit_code; host_ctxt = host_data_ptr(host_ctxt); guest_ctxt = &vcpu->arch.ctxt; fpsimd_lazy_switch_to_guest(vcpu); sysreg_save_host_state_vhe(host_ctxt); /* * Note that ARM erratum 1165522 requires us to configure both stage 1 * and stage 2 translation for the guest context before we clear * HCR_EL2.TGE. The stage 1 and stage 2 guest context has already been * loaded on the CPU in kvm_vcpu_load_vhe(). */ __activate_traps(vcpu); __kvm_adjust_pc(vcpu); sysreg_restore_guest_state_vhe(guest_ctxt); __debug_switch_to_guest(vcpu); do { /* Jump in the fire! */ exit_code = __guest_enter(vcpu); /* And we're baaack! */ } while (fixup_guest_exit(vcpu, &exit_code)); sysreg_save_guest_state_vhe(guest_ctxt); __deactivate_traps(vcpu); sysreg_restore_host_state_vhe(host_ctxt); __debug_switch_to_host(vcpu); /* * Ensure that all system register writes above have taken effect * before returning to the host. In VHE mode, CPTR traps for * FPSIMD/SVE/SME also apply to EL2, so FPSIMD/SVE/SME state must be * manipulated after the ISB. */ isb(); fpsimd_lazy_switch_to_host(vcpu); if (guest_owns_fp_regs()) __fpsimd_save_fpexc32(vcpu); return exit_code; } NOKPROBE_SYMBOL(__kvm_vcpu_run_vhe); int __kvm_vcpu_run(struct kvm_vcpu *vcpu) { int ret; local_daif_mask(); /* * Having IRQs masked via PMR when entering the guest means the GIC * will not signal the CPU of interrupts of lower priority, and the * only way to get out will be via guest exceptions. * Naturally, we want to avoid this. * * local_daif_mask() already sets GIC_PRIO_PSR_I_SET, we just need a * dsb to ensure the redistributor is forwards EL2 IRQs to the CPU. */ pmr_sync(); ret = __kvm_vcpu_run_vhe(vcpu); /* * local_daif_restore() takes care to properly restore PSTATE.DAIF * and the GIC PMR if the host is using IRQ priorities. */ local_daif_restore(DAIF_PROCCTX_NOIRQ); return ret; } static void __noreturn __hyp_call_panic(u64 spsr, u64 elr, u64 par) { struct kvm_cpu_context *host_ctxt; struct kvm_vcpu *vcpu; host_ctxt = host_data_ptr(host_ctxt); vcpu = host_ctxt->__hyp_running_vcpu; __deactivate_traps(vcpu); sysreg_restore_host_state_vhe(host_ctxt); panic("HYP panic:\nPS:%08llx PC:%016llx ESR:%08llx\nFAR:%016llx HPFAR:%016llx PAR:%016llx\nVCPU:%p\n", spsr, elr, read_sysreg_el2(SYS_ESR), read_sysreg_el2(SYS_FAR), read_sysreg(hpfar_el2), par, vcpu); } NOKPROBE_SYMBOL(__hyp_call_panic); void __noreturn hyp_panic(void) { u64 spsr = read_sysreg_el2(SYS_SPSR); u64 elr = read_sysreg_el2(SYS_ELR); u64 par = read_sysreg_par(); __hyp_call_panic(spsr, elr, par); } asmlinkage void kvm_unexpected_el2_exception(void) { __kvm_unexpected_el2_exception(); }
628 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 /* SPDX-License-Identifier: GPL-2.0-only */ /* * Based on arch/arm/include/asm/mmu_context.h * * Copyright (C) 1996 Russell King. * Copyright (C) 2012 ARM Ltd. */ #ifndef __ASM_MMU_CONTEXT_H #define __ASM_MMU_CONTEXT_H #ifndef __ASSEMBLY__ #include <linux/compiler.h> #include <linux/sched.h> #include <linux/sched/hotplug.h> #include <linux/mm_types.h> #include <linux/pgtable.h> #include <linux/pkeys.h> #include <asm/cacheflush.h> #include <asm/cpufeature.h> #include <asm/daifflags.h> #include <asm/gcs.h> #include <asm/proc-fns.h> #include <asm/cputype.h> #include <asm/sysreg.h> #include <asm/tlbflush.h> extern bool rodata_full; static inline void contextidr_thread_switch(struct task_struct *next) { if (!IS_ENABLED(CONFIG_PID_IN_CONTEXTIDR)) return; write_sysreg(task_pid_nr(next), contextidr_el1); isb(); } /* * Set TTBR0 to reserved_pg_dir. No translations will be possible via TTBR0. */ static inline void cpu_set_reserved_ttbr0_nosync(void) { unsigned long ttbr = phys_to_ttbr(__pa_symbol(reserved_pg_dir)); write_sysreg(ttbr, ttbr0_el1); } static inline void cpu_set_reserved_ttbr0(void) { cpu_set_reserved_ttbr0_nosync(); isb(); } void cpu_do_switch_mm(phys_addr_t pgd_phys, struct mm_struct *mm); static inline void cpu_switch_mm(pgd_t *pgd, struct mm_struct *mm) { BUG_ON(pgd == swapper_pg_dir); cpu_do_switch_mm(virt_to_phys(pgd),mm); } /* * TCR.T0SZ value to use when the ID map is active. */ #define idmap_t0sz TCR_T0SZ(IDMAP_VA_BITS) /* * Ensure TCR.T0SZ is set to the provided value. */ static inline void __cpu_set_tcr_t0sz(unsigned long t0sz) { unsigned long tcr = read_sysreg(tcr_el1); if ((tcr & TCR_T0SZ_MASK) == t0sz) return; tcr &= ~TCR_T0SZ_MASK; tcr |= t0sz; write_sysreg(tcr, tcr_el1); isb(); } #define cpu_set_default_tcr_t0sz() __cpu_set_tcr_t0sz(TCR_T0SZ(vabits_actual)) #define cpu_set_idmap_tcr_t0sz() __cpu_set_tcr_t0sz(idmap_t0sz) /* * Remove the idmap from TTBR0_EL1 and install the pgd of the active mm. * * The idmap lives in the same VA range as userspace, but uses global entries * and may use a different TCR_EL1.T0SZ. To avoid issues resulting from * speculative TLB fetches, we must temporarily install the reserved page * tables while we invalidate the TLBs and set up the correct TCR_EL1.T0SZ. * * If current is a not a user task, the mm covers the TTBR1_EL1 page tables, * which should not be installed in TTBR0_EL1. In this case we can leave the * reserved page tables in place. */ static inline void cpu_uninstall_idmap(void) { struct mm_struct *mm = current->active_mm; cpu_set_reserved_ttbr0(); local_flush_tlb_all(); cpu_set_default_tcr_t0sz(); if (mm != &init_mm && !system_uses_ttbr0_pan()) cpu_switch_mm(mm->pgd, mm); } static inline void cpu_install_idmap(void) { cpu_set_reserved_ttbr0(); local_flush_tlb_all(); cpu_set_idmap_tcr_t0sz(); cpu_switch_mm(lm_alias(idmap_pg_dir), &init_mm); } /* * Load our new page tables. A strict BBM approach requires that we ensure that * TLBs are free of any entries that may overlap with the global mappings we are * about to install. * * For a real hibernate/resume/kexec cycle TTBR0 currently points to a zero * page, but TLBs may contain stale ASID-tagged entries (e.g. for EFI runtime * services), while for a userspace-driven test_resume cycle it points to * userspace page tables (and we must point it at a zero page ourselves). * * We change T0SZ as part of installing the idmap. This is undone by * cpu_uninstall_idmap() in __cpu_suspend_exit(). */ static inline void cpu_install_ttbr0(phys_addr_t ttbr0, unsigned long t0sz) { cpu_set_reserved_ttbr0(); local_flush_tlb_all(); __cpu_set_tcr_t0sz(t0sz); /* avoid cpu_switch_mm() and its SW-PAN and CNP interactions */ write_sysreg(ttbr0, ttbr0_el1); isb(); } void __cpu_replace_ttbr1(pgd_t *pgdp, bool cnp); static inline void cpu_enable_swapper_cnp(void) { __cpu_replace_ttbr1(lm_alias(swapper_pg_dir), true); } static inline void cpu_replace_ttbr1(pgd_t *pgdp) { /* * Only for early TTBR1 replacement before cpucaps are finalized and * before we've decided whether to use CNP. */ WARN_ON(system_capabilities_finalized()); __cpu_replace_ttbr1(pgdp, false); } /* * It would be nice to return ASIDs back to the allocator, but unfortunately * that introduces a race with a generation rollover where we could erroneously * free an ASID allocated in a future generation. We could workaround this by * freeing the ASID from the context of the dying mm (e.g. in arch_exit_mmap), * but we'd then need to make sure that we didn't dirty any TLBs afterwards. * Setting a reserved TTBR0 or EPD0 would work, but it all gets ugly when you * take CPU migration into account. */ void check_and_switch_context(struct mm_struct *mm); #define init_new_context(tsk, mm) init_new_context(tsk, mm) static inline int init_new_context(struct task_struct *tsk, struct mm_struct *mm) { atomic64_set(&mm->context.id, 0); refcount_set(&mm->context.pinned, 0); /* pkey 0 is the default, so always reserve it. */ mm->context.pkey_allocation_map = BIT(0); return 0; } static inline void arch_dup_pkeys(struct mm_struct *oldmm, struct mm_struct *mm) { /* Duplicate the oldmm pkey state in mm: */ mm->context.pkey_allocation_map = oldmm->context.pkey_allocation_map; } static inline int arch_dup_mmap(struct mm_struct *oldmm, struct mm_struct *mm) { arch_dup_pkeys(oldmm, mm); return 0; } static inline void arch_exit_mmap(struct mm_struct *mm) { } static inline void arch_unmap(struct mm_struct *mm, unsigned long start, unsigned long end) { } #ifdef CONFIG_ARM64_SW_TTBR0_PAN static inline void update_saved_ttbr0(struct task_struct *tsk, struct mm_struct *mm) { u64 ttbr; if (!system_uses_ttbr0_pan()) return; if (mm == &init_mm) ttbr = phys_to_ttbr(__pa_symbol(reserved_pg_dir)); else ttbr = phys_to_ttbr(virt_to_phys(mm->pgd)) | ASID(mm) << 48; WRITE_ONCE(task_thread_info(tsk)->ttbr0, ttbr); } #else static inline void update_saved_ttbr0(struct task_struct *tsk, struct mm_struct *mm) { } #endif #define enter_lazy_tlb enter_lazy_tlb static inline void enter_lazy_tlb(struct mm_struct *mm, struct task_struct *tsk) { /* * We don't actually care about the ttbr0 mapping, so point it at the * zero page. */ update_saved_ttbr0(tsk, &init_mm); } static inline void __switch_mm(struct mm_struct *next) { /* * init_mm.pgd does not contain any user mappings and it is always * active for kernel addresses in TTBR1. Just set the reserved TTBR0. */ if (next == &init_mm) { cpu_set_reserved_ttbr0(); return; } check_and_switch_context(next); } static inline void switch_mm(struct mm_struct *prev, struct mm_struct *next, struct task_struct *tsk) { if (prev != next) __switch_mm(next); /* * Update the saved TTBR0_EL1 of the scheduled-in task as the previous * value may have not been initialised yet (activate_mm caller) or the * ASID has changed since the last run (following the context switch * of another thread of the same process). */ update_saved_ttbr0(tsk, next); } static inline const struct cpumask * __task_cpu_possible_mask(struct task_struct *p, const struct cpumask *mask) { if (!static_branch_unlikely(&arm64_mismatched_32bit_el0)) return mask; if (!is_compat_thread(task_thread_info(p))) return mask; return system_32bit_el0_cpumask(); } static inline const struct cpumask * task_cpu_possible_mask(struct task_struct *p) { return __task_cpu_possible_mask(p, cpu_possible_mask); } #define task_cpu_possible_mask task_cpu_possible_mask const struct cpumask *task_cpu_fallback_mask(struct task_struct *p); void verify_cpu_asid_bits(void); void post_ttbr_update_workaround(void); unsigned long arm64_mm_context_get(struct mm_struct *mm); void arm64_mm_context_put(struct mm_struct *mm); #define mm_untag_mask mm_untag_mask static inline unsigned long mm_untag_mask(struct mm_struct *mm) { return -1UL >> 8; } /* * Only enforce protection keys on the current process, because there is no * user context to access POR_EL0 for another address space. */ static inline bool arch_vma_access_permitted(struct vm_area_struct *vma, bool write, bool execute, bool foreign) { if (!system_supports_poe()) return true; /* allow access if the VMA is not one from this process */ if (foreign || vma_is_foreign(vma)) return true; return por_el0_allows_pkey(vma_pkey(vma), write, execute); } #define deactivate_mm deactivate_mm static inline void deactivate_mm(struct task_struct *tsk, struct mm_struct *mm) { gcs_free(tsk); } #include <asm-generic/mmu_context.h> #endif /* !__ASSEMBLY__ */ #endif /* !__ASM_MMU_CONTEXT_H */
45 45 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 // SPDX-License-Identifier: GPL-2.0 /* * security/tomoyo/condition.c * * Copyright (C) 2005-2011 NTT DATA CORPORATION */ #include "common.h" #include <linux/slab.h> /* List of "struct tomoyo_condition". */ LIST_HEAD(tomoyo_condition_list); /** * tomoyo_argv - Check argv[] in "struct linux_binbrm". * * @index: Index number of @arg_ptr. * @arg_ptr: Contents of argv[@index]. * @argc: Length of @argv. * @argv: Pointer to "struct tomoyo_argv". * @checked: Set to true if @argv[@index] was found. * * Returns true on success, false otherwise. */ static bool tomoyo_argv(const unsigned int index, const char *arg_ptr, const int argc, const struct tomoyo_argv *argv, u8 *checked) { int i; struct tomoyo_path_info arg; arg.name = arg_ptr; for (i = 0; i < argc; argv++, checked++, i++) { bool result; if (index != argv->index) continue; *checked = 1; tomoyo_fill_path_info(&arg); result = tomoyo_path_matches_pattern(&arg, argv->value); if (argv->is_not) result = !result; if (!result) return false; } return true; } /** * tomoyo_envp - Check envp[] in "struct linux_binbrm". * * @env_name: The name of environment variable. * @env_value: The value of environment variable. * @envc: Length of @envp. * @envp: Pointer to "struct tomoyo_envp". * @checked: Set to true if @envp[@env_name] was found. * * Returns true on success, false otherwise. */ static bool tomoyo_envp(const char *env_name, const char *env_value, const int envc, const struct tomoyo_envp *envp, u8 *checked) { int i; struct tomoyo_path_info name; struct tomoyo_path_info value; name.name = env_name; tomoyo_fill_path_info(&name); value.name = env_value; tomoyo_fill_path_info(&value); for (i = 0; i < envc; envp++, checked++, i++) { bool result; if (!tomoyo_path_matches_pattern(&name, envp->name)) continue; *checked = 1; if (envp->value) { result = tomoyo_path_matches_pattern(&value, envp->value); if (envp->is_not) result = !result; } else { result = true; if (!envp->is_not) result = !result; } if (!result) return false; } return true; } /** * tomoyo_scan_bprm - Scan "struct linux_binprm". * * @ee: Pointer to "struct tomoyo_execve". * @argc: Length of @argc. * @argv: Pointer to "struct tomoyo_argv". * @envc: Length of @envp. * @envp: Pointer to "struct tomoyo_envp". * * Returns true on success, false otherwise. */ static bool tomoyo_scan_bprm(struct tomoyo_execve *ee, const u16 argc, const struct tomoyo_argv *argv, const u16 envc, const struct tomoyo_envp *envp) { struct linux_binprm *bprm = ee->bprm; struct tomoyo_page_dump *dump = &ee->dump; char *arg_ptr = ee->tmp; int arg_len = 0; unsigned long pos = bprm->p; int offset = pos % PAGE_SIZE; int argv_count = bprm->argc; int envp_count = bprm->envc; bool result = true; u8 local_checked[32]; u8 *checked; if (argc + envc <= sizeof(local_checked)) { checked = local_checked; memset(local_checked, 0, sizeof(local_checked)); } else { checked = kzalloc(argc + envc, GFP_NOFS); if (!checked) return false; } while (argv_count || envp_count) { if (!tomoyo_dump_page(bprm, pos, dump)) { result = false; goto out; } pos += PAGE_SIZE - offset; while (offset < PAGE_SIZE) { /* Read. */ const char *kaddr = dump->data; const unsigned char c = kaddr[offset++]; if (c && arg_len < TOMOYO_EXEC_TMPSIZE - 10) { if (c == '\\') { arg_ptr[arg_len++] = '\\'; arg_ptr[arg_len++] = '\\'; } else if (c > ' ' && c < 127) { arg_ptr[arg_len++] = c; } else { arg_ptr[arg_len++] = '\\'; arg_ptr[arg_len++] = (c >> 6) + '0'; arg_ptr[arg_len++] = ((c >> 3) & 7) + '0'; arg_ptr[arg_len++] = (c & 7) + '0'; } } else { arg_ptr[arg_len] = '\0'; } if (c) continue; /* Check. */ if (argv_count) { if (!tomoyo_argv(bprm->argc - argv_count, arg_ptr, argc, argv, checked)) { result = false; break; } argv_count--; } else if (envp_count) { char *cp = strchr(arg_ptr, '='); if (cp) { *cp = '\0'; if (!tomoyo_envp(arg_ptr, cp + 1, envc, envp, checked + argc)) { result = false; break; } } envp_count--; } else { break; } arg_len = 0; } offset = 0; if (!result) break; } out: if (result) { int i; /* Check not-yet-checked entries. */ for (i = 0; i < argc; i++) { if (checked[i]) continue; /* * Return true only if all unchecked indexes in * bprm->argv[] are not matched. */ if (argv[i].is_not) continue; result = false; break; } for (i = 0; i < envc; envp++, i++) { if (checked[argc + i]) continue; /* * Return true only if all unchecked environ variables * in bprm->envp[] are either undefined or not matched. */ if ((!envp->value && !envp->is_not) || (envp->value && envp->is_not)) continue; result = false; break; } } if (checked != local_checked) kfree(checked); return result; } /** * tomoyo_scan_exec_realpath - Check "exec.realpath" parameter of "struct tomoyo_condition". * * @file: Pointer to "struct file". * @ptr: Pointer to "struct tomoyo_name_union". * @match: True if "exec.realpath=", false if "exec.realpath!=". * * Returns true on success, false otherwise. */ static bool tomoyo_scan_exec_realpath(struct file *file, const struct tomoyo_name_union *ptr, const bool match) { bool result; struct tomoyo_path_info exe; if (!file) return false; exe.name = tomoyo_realpath_from_path(&file->f_path); if (!exe.name) return false; tomoyo_fill_path_info(&exe); result = tomoyo_compare_name_union(&exe, ptr); kfree(exe.name); return result == match; } /** * tomoyo_get_dqword - tomoyo_get_name() for a quoted string. * * @start: String to save. * * Returns pointer to "struct tomoyo_path_info" on success, NULL otherwise. */ static const struct tomoyo_path_info *tomoyo_get_dqword(char *start) { char *cp = start + strlen(start) - 1; if (cp == start || *start++ != '"' || *cp != '"') return NULL; *cp = '\0'; if (*start && !tomoyo_correct_word(start)) return NULL; return tomoyo_get_name(start); } /** * tomoyo_parse_name_union_quoted - Parse a quoted word. * * @param: Pointer to "struct tomoyo_acl_param". * @ptr: Pointer to "struct tomoyo_name_union". * * Returns true on success, false otherwise. */ static bool tomoyo_parse_name_union_quoted(struct tomoyo_acl_param *param, struct tomoyo_name_union *ptr) { char *filename = param->data; if (*filename == '@') return tomoyo_parse_name_union(param, ptr); ptr->filename = tomoyo_get_dqword(filename); return ptr->filename != NULL; } /** * tomoyo_parse_argv - Parse an argv[] condition part. * * @left: Lefthand value. * @right: Righthand value. * @argv: Pointer to "struct tomoyo_argv". * * Returns true on success, false otherwise. */ static bool tomoyo_parse_argv(char *left, char *right, struct tomoyo_argv *argv) { if (tomoyo_parse_ulong(&argv->index, &left) != TOMOYO_VALUE_TYPE_DECIMAL || *left++ != ']' || *left) return false; argv->value = tomoyo_get_dqword(right); return argv->value != NULL; } /** * tomoyo_parse_envp - Parse an envp[] condition part. * * @left: Lefthand value. * @right: Righthand value. * @envp: Pointer to "struct tomoyo_envp". * * Returns true on success, false otherwise. */ static bool tomoyo_parse_envp(char *left, char *right, struct tomoyo_envp *envp) { const struct tomoyo_path_info *name; const struct tomoyo_path_info *value; char *cp = left + strlen(left) - 1; if (*cp-- != ']' || *cp != '"') goto out; *cp = '\0'; if (!tomoyo_correct_word(left)) goto out; name = tomoyo_get_name(left); if (!name) goto out; if (!strcmp(right, "NULL")) { value = NULL; } else { value = tomoyo_get_dqword(right); if (!value) { tomoyo_put_name(name); goto out; } } envp->name = name; envp->value = value; return true; out: return false; } /** * tomoyo_same_condition - Check for duplicated "struct tomoyo_condition" entry. * * @a: Pointer to "struct tomoyo_condition". * @b: Pointer to "struct tomoyo_condition". * * Returns true if @a == @b, false otherwise. */ static inline bool tomoyo_same_condition(const struct tomoyo_condition *a, const struct tomoyo_condition *b) { return a->size == b->size && a->condc == b->condc && a->numbers_count == b->numbers_count && a->names_count == b->names_count && a->argc == b->argc && a->envc == b->envc && a->grant_log == b->grant_log && a->transit == b->transit && !memcmp(a + 1, b + 1, a->size - sizeof(*a)); } /** * tomoyo_condition_type - Get condition type. * * @word: Keyword string. * * Returns one of values in "enum tomoyo_conditions_index" on success, * TOMOYO_MAX_CONDITION_KEYWORD otherwise. */ static u8 tomoyo_condition_type(const char *word) { u8 i; for (i = 0; i < TOMOYO_MAX_CONDITION_KEYWORD; i++) { if (!strcmp(word, tomoyo_condition_keyword[i])) break; } return i; } /* Define this to enable debug mode. */ /* #define DEBUG_CONDITION */ #ifdef DEBUG_CONDITION #define dprintk printk #else #define dprintk(...) do { } while (0) #endif /** * tomoyo_commit_condition - Commit "struct tomoyo_condition". * * @entry: Pointer to "struct tomoyo_condition". * * Returns pointer to "struct tomoyo_condition" on success, NULL otherwise. * * This function merges duplicated entries. This function returns NULL if * @entry is not duplicated but memory quota for policy has exceeded. */ static struct tomoyo_condition *tomoyo_commit_condition (struct tomoyo_condition *entry) { struct tomoyo_condition *ptr; bool found = false; if (mutex_lock_interruptible(&tomoyo_policy_lock)) { dprintk(KERN_WARNING "%u: %s failed\n", __LINE__, __func__); ptr = NULL; found = true; goto out; } list_for_each_entry(ptr, &tomoyo_condition_list, head.list) { if (!tomoyo_same_condition(ptr, entry) || atomic_read(&ptr->head.users) == TOMOYO_GC_IN_PROGRESS) continue; /* Same entry found. Share this entry. */ atomic_inc(&ptr->head.users); found = true; break; } if (!found) { if (tomoyo_memory_ok(entry)) { atomic_set(&entry->head.users, 1); list_add(&entry->head.list, &tomoyo_condition_list); } else { found = true; ptr = NULL; } } mutex_unlock(&tomoyo_policy_lock); out: if (found) { tomoyo_del_condition(&entry->head.list); kfree(entry); entry = ptr; } return entry; } /** * tomoyo_get_transit_preference - Parse domain transition preference for execve(). * * @param: Pointer to "struct tomoyo_acl_param". * @e: Pointer to "struct tomoyo_condition". * * Returns the condition string part. */ static char *tomoyo_get_transit_preference(struct tomoyo_acl_param *param, struct tomoyo_condition *e) { char * const pos = param->data; bool flag; if (*pos == '<') { e->transit = tomoyo_get_domainname(param); goto done; } { char *cp = strchr(pos, ' '); if (cp) *cp = '\0'; flag = tomoyo_correct_path(pos) || !strcmp(pos, "keep") || !strcmp(pos, "initialize") || !strcmp(pos, "reset") || !strcmp(pos, "child") || !strcmp(pos, "parent"); if (cp) *cp = ' '; } if (!flag) return pos; e->transit = tomoyo_get_name(tomoyo_read_token(param)); done: if (e->transit) return param->data; /* * Return a bad read-only condition string that will let * tomoyo_get_condition() return NULL. */ return "/"; } /** * tomoyo_get_condition - Parse condition part. * * @param: Pointer to "struct tomoyo_acl_param". * * Returns pointer to "struct tomoyo_condition" on success, NULL otherwise. */ struct tomoyo_condition *tomoyo_get_condition(struct tomoyo_acl_param *param) { struct tomoyo_condition *entry = NULL; struct tomoyo_condition_element *condp = NULL; struct tomoyo_number_union *numbers_p = NULL; struct tomoyo_name_union *names_p = NULL; struct tomoyo_argv *argv = NULL; struct tomoyo_envp *envp = NULL; struct tomoyo_condition e = { }; char * const start_of_string = tomoyo_get_transit_preference(param, &e); char * const end_of_string = start_of_string + strlen(start_of_string); char *pos; rerun: pos = start_of_string; while (1) { u8 left = -1; u8 right = -1; char *left_word = pos; char *cp; char *right_word; bool is_not; if (!*left_word) break; /* * Since left-hand condition does not allow use of "path_group" * or "number_group" and environment variable's names do not * accept '=', it is guaranteed that the original line consists * of one or more repetition of $left$operator$right blocks * where "$left is free from '=' and ' '" and "$operator is * either '=' or '!='" and "$right is free from ' '". * Therefore, we can reconstruct the original line at the end * of dry run even if we overwrite $operator with '\0'. */ cp = strchr(pos, ' '); if (cp) { *cp = '\0'; /* Will restore later. */ pos = cp + 1; } else { pos = ""; } right_word = strchr(left_word, '='); if (!right_word || right_word == left_word) goto out; is_not = *(right_word - 1) == '!'; if (is_not) *(right_word++ - 1) = '\0'; /* Will restore later. */ else if (*(right_word + 1) != '=') *right_word++ = '\0'; /* Will restore later. */ else goto out; dprintk(KERN_WARNING "%u: <%s>%s=<%s>\n", __LINE__, left_word, is_not ? "!" : "", right_word); if (!strcmp(left_word, "grant_log")) { if (entry) { if (is_not || entry->grant_log != TOMOYO_GRANTLOG_AUTO) goto out; else if (!strcmp(right_word, "yes")) entry->grant_log = TOMOYO_GRANTLOG_YES; else if (!strcmp(right_word, "no")) entry->grant_log = TOMOYO_GRANTLOG_NO; else goto out; } continue; } if (!strncmp(left_word, "exec.argv[", 10)) { if (!argv) { e.argc++; e.condc++; } else { e.argc--; e.condc--; left = TOMOYO_ARGV_ENTRY; argv->is_not = is_not; if (!tomoyo_parse_argv(left_word + 10, right_word, argv++)) goto out; } goto store_value; } if (!strncmp(left_word, "exec.envp[\"", 11)) { if (!envp) { e.envc++; e.condc++; } else { e.envc--; e.condc--; left = TOMOYO_ENVP_ENTRY; envp->is_not = is_not; if (!tomoyo_parse_envp(left_word + 11, right_word, envp++)) goto out; } goto store_value; } left = tomoyo_condition_type(left_word); dprintk(KERN_WARNING "%u: <%s> left=%u\n", __LINE__, left_word, left); if (left == TOMOYO_MAX_CONDITION_KEYWORD) { if (!numbers_p) { e.numbers_count++; } else { e.numbers_count--; left = TOMOYO_NUMBER_UNION; param->data = left_word; if (*left_word == '@' || !tomoyo_parse_number_union(param, numbers_p++)) goto out; } } if (!condp) e.condc++; else e.condc--; if (left == TOMOYO_EXEC_REALPATH || left == TOMOYO_SYMLINK_TARGET) { if (!names_p) { e.names_count++; } else { e.names_count--; right = TOMOYO_NAME_UNION; param->data = right_word; if (!tomoyo_parse_name_union_quoted(param, names_p++)) goto out; } goto store_value; } right = tomoyo_condition_type(right_word); if (right == TOMOYO_MAX_CONDITION_KEYWORD) { if (!numbers_p) { e.numbers_count++; } else { e.numbers_count--; right = TOMOYO_NUMBER_UNION; param->data = right_word; if (!tomoyo_parse_number_union(param, numbers_p++)) goto out; } } store_value: if (!condp) { dprintk(KERN_WARNING "%u: dry_run left=%u right=%u match=%u\n", __LINE__, left, right, !is_not); continue; } condp->left = left; condp->right = right; condp->equals = !is_not; dprintk(KERN_WARNING "%u: left=%u right=%u match=%u\n", __LINE__, condp->left, condp->right, condp->equals); condp++; } dprintk(KERN_INFO "%u: cond=%u numbers=%u names=%u ac=%u ec=%u\n", __LINE__, e.condc, e.numbers_count, e.names_count, e.argc, e.envc); if (entry) { BUG_ON(e.names_count | e.numbers_count | e.argc | e.envc | e.condc); return tomoyo_commit_condition(entry); } e.size = sizeof(*entry) + e.condc * sizeof(struct tomoyo_condition_element) + e.numbers_count * sizeof(struct tomoyo_number_union) + e.names_count * sizeof(struct tomoyo_name_union) + e.argc * sizeof(struct tomoyo_argv) + e.envc * sizeof(struct tomoyo_envp); entry = kzalloc(e.size, GFP_NOFS); if (!entry) goto out2; *entry = e; e.transit = NULL; condp = (struct tomoyo_condition_element *) (entry + 1); numbers_p = (struct tomoyo_number_union *) (condp + e.condc); names_p = (struct tomoyo_name_union *) (numbers_p + e.numbers_count); argv = (struct tomoyo_argv *) (names_p + e.names_count); envp = (struct tomoyo_envp *) (argv + e.argc); { bool flag = false; for (pos = start_of_string; pos < end_of_string; pos++) { if (*pos) continue; if (flag) /* Restore " ". */ *pos = ' '; else if (*(pos + 1) == '=') /* Restore "!=". */ *pos = '!'; else /* Restore "=". */ *pos = '='; flag = !flag; } } goto rerun; out: dprintk(KERN_WARNING "%u: %s failed\n", __LINE__, __func__); if (entry) { tomoyo_del_condition(&entry->head.list); kfree(entry); } out2: tomoyo_put_name(e.transit); return NULL; } /** * tomoyo_get_attributes - Revalidate "struct inode". * * @obj: Pointer to "struct tomoyo_obj_info". * * Returns nothing. */ void tomoyo_get_attributes(struct tomoyo_obj_info *obj) { u8 i; struct dentry *dentry = NULL; for (i = 0; i < TOMOYO_MAX_PATH_STAT; i++) { struct inode *inode; switch (i) { case TOMOYO_PATH1: dentry = obj->path1.dentry; if (!dentry) continue; break; case TOMOYO_PATH2: dentry = obj->path2.dentry; if (!dentry) continue; break; default: if (!dentry) continue; dentry = dget_parent(dentry); break; } inode = d_backing_inode(dentry); if (inode) { struct tomoyo_mini_stat *stat = &obj->stat[i]; stat->uid = inode->i_uid; stat->gid = inode->i_gid; stat->ino = inode->i_ino; stat->mode = inode->i_mode; stat->dev = inode->i_sb->s_dev; stat->rdev = inode->i_rdev; obj->stat_valid[i] = true; } if (i & 1) /* TOMOYO_PATH1_PARENT or TOMOYO_PATH2_PARENT */ dput(dentry); } } /** * tomoyo_condition - Check condition part. * * @r: Pointer to "struct tomoyo_request_info". * @cond: Pointer to "struct tomoyo_condition". Maybe NULL. * * Returns true on success, false otherwise. * * Caller holds tomoyo_read_lock(). */ bool tomoyo_condition(struct tomoyo_request_info *r, const struct tomoyo_condition *cond) { u32 i; unsigned long min_v[2] = { 0, 0 }; unsigned long max_v[2] = { 0, 0 }; const struct tomoyo_condition_element *condp; const struct tomoyo_number_union *numbers_p; const struct tomoyo_name_union *names_p; const struct tomoyo_argv *argv; const struct tomoyo_envp *envp; struct tomoyo_obj_info *obj; u16 condc; u16 argc; u16 envc; struct linux_binprm *bprm = NULL; if (!cond) return true; condc = cond->condc; argc = cond->argc; envc = cond->envc; obj = r->obj; if (r->ee) bprm = r->ee->bprm; if (!bprm && (argc || envc)) return false; condp = (struct tomoyo_condition_element *) (cond + 1); numbers_p = (const struct tomoyo_number_union *) (condp + condc); names_p = (const struct tomoyo_name_union *) (numbers_p + cond->numbers_count); argv = (const struct tomoyo_argv *) (names_p + cond->names_count); envp = (const struct tomoyo_envp *) (argv + argc); for (i = 0; i < condc; i++) { const bool match = condp->equals; const u8 left = condp->left; const u8 right = condp->right; bool is_bitop[2] = { false, false }; u8 j; condp++; /* Check argv[] and envp[] later. */ if (left == TOMOYO_ARGV_ENTRY || left == TOMOYO_ENVP_ENTRY) continue; /* Check string expressions. */ if (right == TOMOYO_NAME_UNION) { const struct tomoyo_name_union *ptr = names_p++; struct tomoyo_path_info *symlink; struct tomoyo_execve *ee; struct file *file; switch (left) { case TOMOYO_SYMLINK_TARGET: symlink = obj ? obj->symlink_target : NULL; if (!symlink || !tomoyo_compare_name_union(symlink, ptr) == match) goto out; break; case TOMOYO_EXEC_REALPATH: ee = r->ee; file = ee ? ee->bprm->file : NULL; if (!tomoyo_scan_exec_realpath(file, ptr, match)) goto out; break; } continue; } /* Check numeric or bit-op expressions. */ for (j = 0; j < 2; j++) { const u8 index = j ? right : left; unsigned long value = 0; switch (index) { case TOMOYO_TASK_UID: value = from_kuid(&init_user_ns, current_uid()); break; case TOMOYO_TASK_EUID: value = from_kuid(&init_user_ns, current_euid()); break; case TOMOYO_TASK_SUID: value = from_kuid(&init_user_ns, current_suid()); break; case TOMOYO_TASK_FSUID: value = from_kuid(&init_user_ns, current_fsuid()); break; case TOMOYO_TASK_GID: value = from_kgid(&init_user_ns, current_gid()); break; case TOMOYO_TASK_EGID: value = from_kgid(&init_user_ns, current_egid()); break; case TOMOYO_TASK_SGID: value = from_kgid(&init_user_ns, current_sgid()); break; case TOMOYO_TASK_FSGID: value = from_kgid(&init_user_ns, current_fsgid()); break; case TOMOYO_TASK_PID: value = tomoyo_sys_getpid(); break; case TOMOYO_TASK_PPID: value = tomoyo_sys_getppid(); break; case TOMOYO_TYPE_IS_SOCKET: value = S_IFSOCK; break; case TOMOYO_TYPE_IS_SYMLINK: value = S_IFLNK; break; case TOMOYO_TYPE_IS_FILE: value = S_IFREG; break; case TOMOYO_TYPE_IS_BLOCK_DEV: value = S_IFBLK; break; case TOMOYO_TYPE_IS_DIRECTORY: value = S_IFDIR; break; case TOMOYO_TYPE_IS_CHAR_DEV: value = S_IFCHR; break; case TOMOYO_TYPE_IS_FIFO: value = S_IFIFO; break; case TOMOYO_MODE_SETUID: value = S_ISUID; break; case TOMOYO_MODE_SETGID: value = S_ISGID; break; case TOMOYO_MODE_STICKY: value = S_ISVTX; break; case TOMOYO_MODE_OWNER_READ: value = 0400; break; case TOMOYO_MODE_OWNER_WRITE: value = 0200; break; case TOMOYO_MODE_OWNER_EXECUTE: value = 0100; break; case TOMOYO_MODE_GROUP_READ: value = 0040; break; case TOMOYO_MODE_GROUP_WRITE: value = 0020; break; case TOMOYO_MODE_GROUP_EXECUTE: value = 0010; break; case TOMOYO_MODE_OTHERS_READ: value = 0004; break; case TOMOYO_MODE_OTHERS_WRITE: value = 0002; break; case TOMOYO_MODE_OTHERS_EXECUTE: value = 0001; break; case TOMOYO_EXEC_ARGC: if (!bprm) goto out; value = bprm->argc; break; case TOMOYO_EXEC_ENVC: if (!bprm) goto out; value = bprm->envc; break; case TOMOYO_NUMBER_UNION: /* Fetch values later. */ break; default: if (!obj) goto out; if (!obj->validate_done) { tomoyo_get_attributes(obj); obj->validate_done = true; } { u8 stat_index; struct tomoyo_mini_stat *stat; switch (index) { case TOMOYO_PATH1_UID: case TOMOYO_PATH1_GID: case TOMOYO_PATH1_INO: case TOMOYO_PATH1_MAJOR: case TOMOYO_PATH1_MINOR: case TOMOYO_PATH1_TYPE: case TOMOYO_PATH1_DEV_MAJOR: case TOMOYO_PATH1_DEV_MINOR: case TOMOYO_PATH1_PERM: stat_index = TOMOYO_PATH1; break; case TOMOYO_PATH2_UID: case TOMOYO_PATH2_GID: case TOMOYO_PATH2_INO: case TOMOYO_PATH2_MAJOR: case TOMOYO_PATH2_MINOR: case TOMOYO_PATH2_TYPE: case TOMOYO_PATH2_DEV_MAJOR: case TOMOYO_PATH2_DEV_MINOR: case TOMOYO_PATH2_PERM: stat_index = TOMOYO_PATH2; break; case TOMOYO_PATH1_PARENT_UID: case TOMOYO_PATH1_PARENT_GID: case TOMOYO_PATH1_PARENT_INO: case TOMOYO_PATH1_PARENT_PERM: stat_index = TOMOYO_PATH1_PARENT; break; case TOMOYO_PATH2_PARENT_UID: case TOMOYO_PATH2_PARENT_GID: case TOMOYO_PATH2_PARENT_INO: case TOMOYO_PATH2_PARENT_PERM: stat_index = TOMOYO_PATH2_PARENT; break; default: goto out; } if (!obj->stat_valid[stat_index]) goto out; stat = &obj->stat[stat_index]; switch (index) { case TOMOYO_PATH1_UID: case TOMOYO_PATH2_UID: case TOMOYO_PATH1_PARENT_UID: case TOMOYO_PATH2_PARENT_UID: value = from_kuid(&init_user_ns, stat->uid); break; case TOMOYO_PATH1_GID: case TOMOYO_PATH2_GID: case TOMOYO_PATH1_PARENT_GID: case TOMOYO_PATH2_PARENT_GID: value = from_kgid(&init_user_ns, stat->gid); break; case TOMOYO_PATH1_INO: case TOMOYO_PATH2_INO: case TOMOYO_PATH1_PARENT_INO: case TOMOYO_PATH2_PARENT_INO: value = stat->ino; break; case TOMOYO_PATH1_MAJOR: case TOMOYO_PATH2_MAJOR: value = MAJOR(stat->dev); break; case TOMOYO_PATH1_MINOR: case TOMOYO_PATH2_MINOR: value = MINOR(stat->dev); break; case TOMOYO_PATH1_TYPE: case TOMOYO_PATH2_TYPE: value = stat->mode & S_IFMT; break; case TOMOYO_PATH1_DEV_MAJOR: case TOMOYO_PATH2_DEV_MAJOR: value = MAJOR(stat->rdev); break; case TOMOYO_PATH1_DEV_MINOR: case TOMOYO_PATH2_DEV_MINOR: value = MINOR(stat->rdev); break; case TOMOYO_PATH1_PERM: case TOMOYO_PATH2_PERM: case TOMOYO_PATH1_PARENT_PERM: case TOMOYO_PATH2_PARENT_PERM: value = stat->mode & S_IALLUGO; break; } } break; } max_v[j] = value; min_v[j] = value; switch (index) { case TOMOYO_MODE_SETUID: case TOMOYO_MODE_SETGID: case TOMOYO_MODE_STICKY: case TOMOYO_MODE_OWNER_READ: case TOMOYO_MODE_OWNER_WRITE: case TOMOYO_MODE_OWNER_EXECUTE: case TOMOYO_MODE_GROUP_READ: case TOMOYO_MODE_GROUP_WRITE: case TOMOYO_MODE_GROUP_EXECUTE: case TOMOYO_MODE_OTHERS_READ: case TOMOYO_MODE_OTHERS_WRITE: case TOMOYO_MODE_OTHERS_EXECUTE: is_bitop[j] = true; } } if (left == TOMOYO_NUMBER_UNION) { /* Fetch values now. */ const struct tomoyo_number_union *ptr = numbers_p++; min_v[0] = ptr->values[0]; max_v[0] = ptr->values[1]; } if (right == TOMOYO_NUMBER_UNION) { /* Fetch values now. */ const struct tomoyo_number_union *ptr = numbers_p++; if (ptr->group) { if (tomoyo_number_matches_group(min_v[0], max_v[0], ptr->group) == match) continue; } else { if ((min_v[0] <= ptr->values[1] && max_v[0] >= ptr->values[0]) == match) continue; } goto out; } /* * Bit operation is valid only when counterpart value * represents permission. */ if (is_bitop[0] && is_bitop[1]) { goto out; } else if (is_bitop[0]) { switch (right) { case TOMOYO_PATH1_PERM: case TOMOYO_PATH1_PARENT_PERM: case TOMOYO_PATH2_PERM: case TOMOYO_PATH2_PARENT_PERM: if (!(max_v[0] & max_v[1]) == !match) continue; } goto out; } else if (is_bitop[1]) { switch (left) { case TOMOYO_PATH1_PERM: case TOMOYO_PATH1_PARENT_PERM: case TOMOYO_PATH2_PERM: case TOMOYO_PATH2_PARENT_PERM: if (!(max_v[0] & max_v[1]) == !match) continue; } goto out; } /* Normal value range comparison. */ if ((min_v[0] <= max_v[1] && max_v[0] >= min_v[1]) == match) continue; out: return false; } /* Check argv[] and envp[] now. */ if (r->ee && (argc || envc)) return tomoyo_scan_bprm(r->ee, argc, argv, envc, envp); return true; }
12 11 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 // SPDX-License-Identifier: GPL-2.0 /* * Supplementary group IDs */ #include <linux/cred.h> #include <linux/export.h> #include <linux/slab.h> #include <linux/security.h> #include <linux/sort.h> #include <linux/syscalls.h> #include <linux/user_namespace.h> #include <linux/vmalloc.h> #include <linux/uaccess.h> struct group_info *groups_alloc(int gidsetsize) { struct group_info *gi; gi = kvmalloc(struct_size(gi, gid, gidsetsize), GFP_KERNEL_ACCOUNT); if (!gi) return NULL; refcount_set(&gi->usage, 1); gi->ngroups = gidsetsize; return gi; } EXPORT_SYMBOL(groups_alloc); void groups_free(struct group_info *group_info) { kvfree(group_info); } EXPORT_SYMBOL(groups_free); /* export the group_info to a user-space array */ static int groups_to_user(gid_t __user *grouplist, const struct group_info *group_info) { struct user_namespace *user_ns = current_user_ns(); int i; unsigned int count = group_info->ngroups; for (i = 0; i < count; i++) { gid_t gid; gid = from_kgid_munged(user_ns, group_info->gid[i]); if (put_user(gid, grouplist+i)) return -EFAULT; } return 0; } /* fill a group_info from a user-space array - it must be allocated already */ static int groups_from_user(struct group_info *group_info, gid_t __user *grouplist) { struct user_namespace *user_ns = current_user_ns(); int i; unsigned int count = group_info->ngroups; for (i = 0; i < count; i++) { gid_t gid; kgid_t kgid; if (get_user(gid, grouplist+i)) return -EFAULT; kgid = make_kgid(user_ns, gid); if (!gid_valid(kgid)) return -EINVAL; group_info->gid[i] = kgid; } return 0; } static int gid_cmp(const void *_a, const void *_b) { kgid_t a = *(kgid_t *)_a; kgid_t b = *(kgid_t *)_b; return gid_gt(a, b) - gid_lt(a, b); } void groups_sort(struct group_info *group_info) { sort(group_info->gid, group_info->ngroups, sizeof(*group_info->gid), gid_cmp, NULL); } EXPORT_SYMBOL(groups_sort); /* a simple bsearch */ int groups_search(const struct group_info *group_info, kgid_t grp) { unsigned int left, right; if (!group_info) return 0; left = 0; right = group_info->ngroups; while (left < right) { unsigned int mid = (left+right)/2; if (gid_gt(grp, group_info->gid[mid])) left = mid + 1; else if (gid_lt(grp, group_info->gid[mid])) right = mid; else return 1; } return 0; } /** * set_groups - Change a group subscription in a set of credentials * @new: The newly prepared set of credentials to alter * @group_info: The group list to install */ void set_groups(struct cred *new, struct group_info *group_info) { put_group_info(new->group_info); get_group_info(group_info); new->group_info = group_info; } EXPORT_SYMBOL(set_groups); /** * set_current_groups - Change current's group subscription * @group_info: The group list to impose * * Validate a group subscription and, if valid, impose it upon current's task * security record. */ int set_current_groups(struct group_info *group_info) { struct cred *new; const struct cred *old; int retval; new = prepare_creds(); if (!new) return -ENOMEM; old = current_cred(); set_groups(new, group_info); retval = security_task_fix_setgroups(new, old); if (retval < 0) goto error; return commit_creds(new); error: abort_creds(new); return retval; } EXPORT_SYMBOL(set_current_groups); SYSCALL_DEFINE2(getgroups, int, gidsetsize, gid_t __user *, grouplist) { const struct cred *cred = current_cred(); int i; if (gidsetsize < 0) return -EINVAL; /* no need to grab task_lock here; it cannot change */ i = cred->group_info->ngroups; if (gidsetsize) { if (i > gidsetsize) { i = -EINVAL; goto out; } if (groups_to_user(grouplist, cred->group_info)) { i = -EFAULT; goto out; } } out: return i; } bool may_setgroups(void) { struct user_namespace *user_ns = current_user_ns(); return ns_capable_setid(user_ns, CAP_SETGID) && userns_may_setgroups(user_ns); } /* * SMP: Our groups are copy-on-write. We can set them safely * without another task interfering. */ SYSCALL_DEFINE2(setgroups, int, gidsetsize, gid_t __user *, grouplist) { struct group_info *group_info; int retval; if (!may_setgroups()) return -EPERM; if ((unsigned)gidsetsize > NGROUPS_MAX) return -EINVAL; group_info = groups_alloc(gidsetsize); if (!group_info) return -ENOMEM; retval = groups_from_user(group_info, grouplist); if (retval) { put_group_info(group_info); return retval; } groups_sort(group_info); retval = set_current_groups(group_info); put_group_info(group_info); return retval; } /* * Check whether we're fsgid/egid or in the supplemental group.. */ int in_group_p(kgid_t grp) { const struct cred *cred = current_cred(); int retval = 1; if (!gid_eq(grp, cred->fsgid)) retval = groups_search(cred->group_info, grp); return retval; } EXPORT_SYMBOL(in_group_p); int in_egroup_p(kgid_t grp) { const struct cred *cred = current_cred(); int retval = 1; if (!gid_eq(grp, cred->egid)) retval = groups_search(cred->group_info, grp); return retval; } EXPORT_SYMBOL(in_egroup_p);
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 /* SPDX-License-Identifier: GPL-2.0 */ /* * Copyright (c) 2001-2003 Patrick Mochel <mochel@osdl.org> * Copyright (c) 2004-2009 Greg Kroah-Hartman <gregkh@suse.de> * Copyright (c) 2008-2012 Novell Inc. * Copyright (c) 2012-2019 Greg Kroah-Hartman <gregkh@linuxfoundation.org> * Copyright (c) 2012-2019 Linux Foundation * * Core driver model functions and structures that should not be * shared outside of the drivers/base/ directory. * */ #include <linux/notifier.h> /** * struct subsys_private - structure to hold the private to the driver core portions of the bus_type/class structure. * * @subsys - the struct kset that defines this subsystem * @devices_kset - the subsystem's 'devices' directory * @interfaces - list of subsystem interfaces associated * @mutex - protect the devices, and interfaces lists. * * @drivers_kset - the list of drivers associated * @klist_devices - the klist to iterate over the @devices_kset * @klist_drivers - the klist to iterate over the @drivers_kset * @bus_notifier - the bus notifier list for anything that cares about things * on this bus. * @bus - pointer back to the struct bus_type that this structure is associated * with. * @dev_root: Default device to use as the parent. * * @glue_dirs - "glue" directory to put in-between the parent device to * avoid namespace conflicts * @class - pointer back to the struct class that this structure is associated * with. * @lock_key: Lock class key for use by the lock validator * * This structure is the one that is the actual kobject allowing struct * bus_type/class to be statically allocated safely. Nothing outside of the * driver core should ever touch these fields. */ struct subsys_private { struct kset subsys; struct kset *devices_kset; struct list_head interfaces; struct mutex mutex; struct kset *drivers_kset; struct klist klist_devices; struct klist klist_drivers; struct blocking_notifier_head bus_notifier; unsigned int drivers_autoprobe:1; const struct bus_type *bus; struct device *dev_root; struct kset glue_dirs; const struct class *class; struct lock_class_key lock_key; }; #define to_subsys_private(obj) container_of_const(obj, struct subsys_private, subsys.kobj) static inline struct subsys_private *subsys_get(struct subsys_private *sp) { if (sp) kset_get(&sp->subsys); return sp; } static inline void subsys_put(struct subsys_private *sp) { if (sp) kset_put(&sp->subsys); } struct subsys_private *bus_to_subsys(const struct bus_type *bus); struct subsys_private *class_to_subsys(const struct class *class); struct driver_private { struct kobject kobj; struct klist klist_devices; struct klist_node knode_bus; struct module_kobject *mkobj; struct device_driver *driver; }; #define to_driver(obj) container_of(obj, struct driver_private, kobj) /** * struct device_private - structure to hold the private to the driver core portions of the device structure. * * @klist_children - klist containing all children of this device * @knode_parent - node in sibling list * @knode_driver - node in driver list * @knode_bus - node in bus list * @knode_class - node in class list * @deferred_probe - entry in deferred_probe_list which is used to retry the * binding of drivers which were unable to get all the resources needed by * the device; typically because it depends on another driver getting * probed first. * @async_driver - pointer to device driver awaiting probe via async_probe * @device - pointer back to the struct device that this structure is * associated with. * @dead - This device is currently either in the process of or has been * removed from the system. Any asynchronous events scheduled for this * device should exit without taking any action. * * Nothing outside of the driver core should ever touch these fields. */ struct device_private { struct klist klist_children; struct klist_node knode_parent; struct klist_node knode_driver; struct klist_node knode_bus; struct klist_node knode_class; struct list_head deferred_probe; const struct device_driver *async_driver; char *deferred_probe_reason; struct device *device; u8 dead:1; }; #define to_device_private_parent(obj) \ container_of(obj, struct device_private, knode_parent) #define to_device_private_driver(obj) \ container_of(obj, struct device_private, knode_driver) #define to_device_private_bus(obj) \ container_of(obj, struct device_private, knode_bus) #define to_device_private_class(obj) \ container_of(obj, struct device_private, knode_class) /* initialisation functions */ int devices_init(void); int buses_init(void); int classes_init(void); int firmware_init(void); #ifdef CONFIG_SYS_HYPERVISOR int hypervisor_init(void); #else static inline int hypervisor_init(void) { return 0; } #endif int platform_bus_init(void); int faux_bus_init(void); void cpu_dev_init(void); void container_dev_init(void); #ifdef CONFIG_AUXILIARY_BUS void auxiliary_bus_init(void); #else static inline void auxiliary_bus_init(void) { } #endif struct kobject *virtual_device_parent(void); int bus_add_device(struct device *dev); void bus_probe_device(struct device *dev); void bus_remove_device(struct device *dev); void bus_notify(struct device *dev, enum bus_notifier_event value); bool bus_is_registered(const struct bus_type *bus); int bus_add_driver(struct device_driver *drv); void bus_remove_driver(struct device_driver *drv); void device_release_driver_internal(struct device *dev, const struct device_driver *drv, struct device *parent); void driver_detach(const struct device_driver *drv); void driver_deferred_probe_del(struct device *dev); void device_set_deferred_probe_reason(const struct device *dev, struct va_format *vaf); static inline int driver_match_device(const struct device_driver *drv, struct device *dev) { return drv->bus->match ? drv->bus->match(dev, drv) : 1; } static inline void dev_sync_state(struct device *dev) { if (dev->bus->sync_state) dev->bus->sync_state(dev); else if (dev->driver && dev->driver->sync_state) dev->driver->sync_state(dev); } int driver_add_groups(const struct device_driver *drv, const struct attribute_group **groups); void driver_remove_groups(const struct device_driver *drv, const struct attribute_group **groups); void device_driver_detach(struct device *dev); static inline void device_set_driver(struct device *dev, const struct device_driver *drv) { /* * Majority (all?) read accesses to dev->driver happens either * while holding device lock or in bus/driver code that is only * invoked when the device is bound to a driver and there is no * concern of the pointer being changed while it is being read. * However when reading device's uevent file we read driver pointer * without taking device lock (so we do not block there for * arbitrary amount of time). We use WRITE_ONCE() here to prevent * tearing so that READ_ONCE() can safely be used in uevent code. */ // FIXME - this cast should not be needed "soon" WRITE_ONCE(dev->driver, (struct device_driver *)drv); } int devres_release_all(struct device *dev); void device_block_probing(void); void device_unblock_probing(void); void deferred_probe_extend_timeout(void); void driver_deferred_probe_trigger(void); const char *device_get_devnode(const struct device *dev, umode_t *mode, kuid_t *uid, kgid_t *gid, const char **tmp); /* /sys/devices directory */ extern struct kset *devices_kset; void devices_kset_move_last(struct device *dev); #if defined(CONFIG_MODULES) && defined(CONFIG_SYSFS) int module_add_driver(struct module *mod, const struct device_driver *drv); void module_remove_driver(const struct device_driver *drv); #else static inline int module_add_driver(struct module *mod, struct device_driver *drv) { return 0; } static inline void module_remove_driver(struct device_driver *drv) { } #endif #ifdef CONFIG_DEVTMPFS int devtmpfs_init(void); #else static inline int devtmpfs_init(void) { return 0; } #endif #ifdef CONFIG_BLOCK extern const struct class block_class; static inline bool is_blockdev(struct device *dev) { return dev->class == &block_class; } #else static inline bool is_blockdev(struct device *dev) { return false; } #endif /* Device links support */ int device_links_read_lock(void); void device_links_read_unlock(int idx); int device_links_read_lock_held(void); int device_links_check_suppliers(struct device *dev); void device_links_force_bind(struct device *dev); void device_links_driver_bound(struct device *dev); void device_links_driver_cleanup(struct device *dev); void device_links_no_driver(struct device *dev); bool device_links_busy(struct device *dev); void device_links_unbind_consumers(struct device *dev); void fw_devlink_drivers_done(void); void fw_devlink_probing_done(void); /* device pm support */ void device_pm_move_to_tail(struct device *dev); #ifdef CONFIG_DEVTMPFS int devtmpfs_create_node(struct device *dev); int devtmpfs_delete_node(struct device *dev); #else static inline int devtmpfs_create_node(struct device *dev) { return 0; } static inline int devtmpfs_delete_node(struct device *dev) { return 0; } #endif void software_node_notify(struct device *dev); void software_node_notify_remove(struct device *dev);
298 32 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 /* SPDX-License-Identifier: GPL-2.0 */ #undef TRACE_SYSTEM #define TRACE_SYSTEM percpu #if !defined(_TRACE_PERCPU_H) || defined(TRACE_HEADER_MULTI_READ) #define _TRACE_PERCPU_H #include <linux/tracepoint.h> #include <trace/events/mmflags.h> TRACE_EVENT(percpu_alloc_percpu, TP_PROTO(unsigned long call_site, bool reserved, bool is_atomic, size_t size, size_t align, void *base_addr, int off, void __percpu *ptr, size_t bytes_alloc, gfp_t gfp_flags), TP_ARGS(call_site, reserved, is_atomic, size, align, base_addr, off, ptr, bytes_alloc, gfp_flags), TP_STRUCT__entry( __field( unsigned long, call_site ) __field( bool, reserved ) __field( bool, is_atomic ) __field( size_t, size ) __field( size_t, align ) __field( void *, base_addr ) __field( int, off ) __field( void __percpu *, ptr ) __field( size_t, bytes_alloc ) __field( unsigned long, gfp_flags ) ), TP_fast_assign( __entry->call_site = call_site; __entry->reserved = reserved; __entry->is_atomic = is_atomic; __entry->size = size; __entry->align = align; __entry->base_addr = base_addr; __entry->off = off; __entry->ptr = ptr; __entry->bytes_alloc = bytes_alloc; __entry->gfp_flags = (__force unsigned long)gfp_flags; ), TP_printk("call_site=%pS reserved=%d is_atomic=%d size=%zu align=%zu base_addr=%p off=%d ptr=%p bytes_alloc=%zu gfp_flags=%s", (void *)__entry->call_site, __entry->reserved, __entry->is_atomic, __entry->size, __entry->align, __entry->base_addr, __entry->off, __entry->ptr, __entry->bytes_alloc, show_gfp_flags(__entry->gfp_flags)) ); TRACE_EVENT(percpu_free_percpu, TP_PROTO(void *base_addr, int off, void __percpu *ptr), TP_ARGS(base_addr, off, ptr), TP_STRUCT__entry( __field( void *, base_addr ) __field( int, off ) __field( void __percpu *, ptr ) ), TP_fast_assign( __entry->base_addr = base_addr; __entry->off = off; __entry->ptr = ptr; ), TP_printk("base_addr=%p off=%d ptr=%p", __entry->base_addr, __entry->off, __entry->ptr) ); TRACE_EVENT(percpu_alloc_percpu_fail, TP_PROTO(bool reserved, bool is_atomic, size_t size, size_t align), TP_ARGS(reserved, is_atomic, size, align), TP_STRUCT__entry( __field( bool, reserved ) __field( bool, is_atomic ) __field( size_t, size ) __field( size_t, align ) ), TP_fast_assign( __entry->reserved = reserved; __entry->is_atomic = is_atomic; __entry->size = size; __entry->align = align; ), TP_printk("reserved=%d is_atomic=%d size=%zu align=%zu", __entry->reserved, __entry->is_atomic, __entry->size, __entry->align) ); TRACE_EVENT(percpu_create_chunk, TP_PROTO(void *base_addr), TP_ARGS(base_addr), TP_STRUCT__entry( __field( void *, base_addr ) ), TP_fast_assign( __entry->base_addr = base_addr; ), TP_printk("base_addr=%p", __entry->base_addr) ); TRACE_EVENT(percpu_destroy_chunk, TP_PROTO(void *base_addr), TP_ARGS(base_addr), TP_STRUCT__entry( __field( void *, base_addr ) ), TP_fast_assign( __entry->base_addr = base_addr; ), TP_printk("base_addr=%p", __entry->base_addr) ); #endif /* _TRACE_PERCPU_H */ #include <trace/define_trace.h>
25 25 25 25 25 25 25 25 5 25 25 24 25 25 25 24 25 25 25 25 8 25 25 25 8 25 25 25 25 25 25 25 5 5 5 25 25 25 25 25 24 25 25 25 25 25 25 25 25 25 25 25 25 24 25 25 25 25 25 25 5 25 25 25 5 5 25 25 25 25 25 5 25 25 25 25 25 24 25 25 25 25 25 25 25 25 25 25 25 25 24 24 25 25 24 25 25 25 25 25 25 25 25 25 25 6 25 25 5 25 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 1517 1518 1519 1520 1521 1522 1523 1524 1525 1526 1527 1528 1529 1530 1531 1532 1533 1534 1535 1536 1537 1538 1539 1540 1541 1542 1543 1544 1545 1546 1547 1548 1549 1550 1551 1552 1553 1554 1555 1556 1557 1558 1559 1560 1561 1562 1563 1564 1565 1566 1567 1568 1569 1570 1571 1572 1573 1574 1575 1576 1577 1578 1579 1580 1581 1582 1583 1584 1585 1586 1587 1588 1589 1590 1591 1592 1593 1594 1595 1596 1597 1598 1599 1600 1601 1602 1603 1604 1605 1606 1607 1608 1609 1610 1611 1612 1613 1614 1615 1616 1617 1618 1619 1620 1621 1622 1623 1624 1625 1626 1627 1628 1629 1630 1631 1632 1633 1634 1635 1636 1637 1638 1639 1640 1641 1642 1643 1644 1645 1646 1647 1648 1649 1650 1651 1652 1653 1654 1655 1656 1657 1658 1659 1660 1661 1662 1663 1664 1665 1666 1667 1668 1669 1670 1671 1672 1673 1674 1675 1676 1677 1678 1679 1680 1681 1682 1683 1684 1685 1686 1687 1688 1689 1690 1691 1692 1693 1694 1695 1696 1697 1698 1699 1700 1701 1702 1703 1704 1705 1706 1707 1708 1709 1710 1711 1712 1713 1714 1715 1716 1717 1718 1719 1720 1721 1722 1723 1724 1725 1726 1727 1728 1729 1730 1731 1732 1733 1734 1735 1736 1737 1738 1739 1740 1741 1742 1743 1744 1745 1746 1747 1748 1749 1750 1751 1752 1753 1754 1755 1756 1757 1758 1759 1760 1761 1762 1763 1764 1765 1766 1767 1768 1769 1770 1771 1772 1773 1774 1775 1776 1777 1778 1779 1780 1781 1782 1783 1784 1785 1786 1787 1788 1789 1790 1791 1792 1793 1794 1795 1796 1797 1798 1799 1800 1801 1802 1803 1804 1805 1806 1807 1808 1809 1810 1811 1812 1813 1814 1815 1816 1817 1818 1819 1820 1821 1822 1823 1824 1825 1826 1827 1828 1829 1830 1831 1832 1833 1834 1835 1836 1837 1838 1839 1840 1841 1842 1843 1844 1845 1846 1847 1848 1849 1850 1851 1852 1853 1854 1855 1856 1857 1858 1859 1860 1861 1862 1863 1864 1865 1866 1867 1868 1869 1870 1871 1872 1873 1874 1875 1876 1877 1878 1879 1880 1881 1882 1883 1884 1885 1886 1887 1888 1889 1890 1891 1892 1893 1894 1895 1896 1897 1898 1899 1900 1901 1902 1903 1904 1905 1906 1907 1908 1909 1910 1911 1912 1913 1914 1915 1916 1917 1918 1919 1920 1921 1922 1923 1924 1925 1926 1927 1928 1929 1930 1931 1932 1933 1934 1935 1936 1937 1938 1939 1940 1941 1942 1943 1944 1945 1946 1947 1948 1949 1950 1951 1952 1953 1954 1955 1956 1957 1958 1959 1960 1961 1962 1963 1964 1965 1966 1967 1968 1969 1970 1971 1972 1973 1974 1975 1976 1977 1978 1979 1980 1981 1982 1983 1984 1985 1986 1987 1988 1989 1990 1991 1992 1993 1994 1995 1996 1997 1998 1999 2000 2001 2002 2003 2004 2005 2006 2007 2008 2009 2010 2011 2012 2013 2014 2015 2016 2017 2018 2019 2020 2021 2022 2023 2024 2025 2026 2027 2028 2029 2030 2031 2032 2033 2034 2035 2036 2037 2038 2039 2040 2041 2042 2043 2044 2045 2046 2047 2048 2049 2050 2051 2052 2053 2054 2055 2056 2057 2058 2059 2060 2061 2062 2063 2064 2065 2066 2067 2068 2069 2070 2071 2072 2073 2074 2075 2076 2077 2078 2079 2080 2081 2082 2083 2084 2085 2086 2087 2088 2089 2090 2091 2092 2093 2094 2095 2096 2097 2098 2099 2100 2101 2102 2103 2104 2105 2106 2107 2108 2109 2110 2111 2112 2113 2114 2115 2116 2117 2118 2119 2120 2121 2122 2123 2124 2125 2126 2127 2128 2129 2130 2131 2132 2133 2134 2135 2136 2137 2138 2139 2140 2141 2142 2143 2144 2145 2146 2147 2148 2149 2150 2151 2152 2153 2154 2155 2156 2157 2158 2159 2160 2161 2162 2163 2164 2165 2166 2167 2168 2169 2170 2171 2172 2173 2174 2175 2176 2177 2178 2179 2180 2181 2182 2183 2184 2185 2186 2187 2188 2189 2190 2191 2192 2193 2194 2195 2196 2197 2198 2199 2200 2201 2202 2203 2204 2205 2206 2207 2208 2209 2210 2211 2212 2213 2214 2215 2216 2217 2218 2219 2220 2221 2222 2223 2224 2225 2226 2227 2228 2229 2230 2231 2232 2233 2234 2235 2236 2237 2238 2239 2240 2241 2242 2243 2244 2245 2246 2247 2248 2249 2250 2251 2252 2253 2254 2255 2256 2257 2258 2259 2260 2261 2262 2263 2264 2265 2266 2267 2268 2269 2270 2271 2272 2273 2274 2275 2276 2277 2278 2279 2280 2281 2282 2283 2284 2285 2286 2287 2288 2289 2290 2291 2292 2293 2294 2295 2296 2297 2298 2299 2300 2301 2302 2303 2304 2305 2306 2307 2308 2309 2310 2311 2312 2313 2314 2315 2316 2317 2318 2319 2320 2321 2322 2323 2324 2325 2326 2327 2328 2329 2330 2331 2332 2333 2334 2335 2336 2337 2338 2339 2340 2341 2342 2343 2344 2345 2346 2347 2348 2349 2350 2351 2352 2353 2354 2355 2356 2357 2358 2359 2360 2361 2362 2363 2364 2365 2366 2367 2368 2369 2370 2371 2372 2373 2374 2375 2376 2377 2378 2379 2380 2381 2382 2383 2384 2385 2386 2387 2388 2389 2390 2391 2392 2393 2394 2395 2396 2397 2398 2399 2400 2401 2402 2403 2404 2405 2406 2407 2408 2409 2410 2411 2412 2413 2414 2415 2416 2417 2418 2419 2420 2421 2422 2423 2424 2425 2426 2427 2428 2429 2430 2431 2432 2433 2434 2435 2436 2437 2438 2439 2440 2441 2442 2443 2444 2445 2446 2447 2448 2449 2450 2451 2452 2453 2454 2455 2456 2457 2458 2459 2460 2461 2462 2463 2464 2465 2466 2467 2468 2469 2470 2471 2472 2473 2474 2475 2476 2477 2478 2479 2480 2481 2482 2483 2484 2485 2486 2487 2488 2489 2490 2491 2492 2493 2494 2495 2496 2497 2498 2499 2500 2501 2502 2503 2504 2505 2506 2507 2508 2509 2510 2511 2512 2513 2514 2515 2516 2517 2518 2519 2520 2521 2522 2523 2524 2525 2526 2527 2528 2529 2530 2531 2532 2533 2534 2535 2536 2537 2538 2539 2540 2541 2542 2543 2544 2545 2546 2547 2548 2549 2550 2551 2552 2553 2554 2555 2556 2557 2558 2559 2560 2561 2562 2563 2564 2565 2566 2567 2568 2569 2570 2571 2572 2573 2574 2575 2576 2577 2578 2579 2580 2581 2582 2583 2584 2585 2586 2587 2588 2589 2590 2591 2592 2593 2594 2595 2596 2597 2598 2599 2600 2601 2602 2603 2604 2605 2606 2607 2608 2609 2610 2611 2612 2613 2614 2615 2616 2617 2618 2619 2620 2621 2622 2623 2624 2625 2626 2627 2628 2629 2630 2631 2632 2633 2634 2635 2636 2637 2638 2639 2640 2641 2642 2643 2644 2645 2646 2647 2648 2649 2650 2651 2652 2653 2654 2655 2656 2657 2658 2659 2660 2661 2662 2663 2664 2665 2666 2667 2668 2669 2670 2671 2672 2673 2674 2675 2676 2677 2678 2679 2680 2681 2682 2683 2684 2685 2686 2687 2688 2689 2690 2691 2692 2693 2694 2695 2696 2697 2698 2699 2700 2701 2702 2703 2704 2705 2706 2707 2708 2709 2710 2711 2712 2713 2714 2715 2716 2717 2718 2719 2720 2721 2722 2723 2724 2725 2726 2727 2728 2729 2730 2731 2732 2733 2734 2735 2736 2737 2738 2739 2740 2741 2742 2743 2744 2745 2746 2747 2748 2749 2750 2751 2752 2753 2754 2755 2756 2757 2758 2759 2760 2761 2762 2763 2764 2765 2766 2767 2768 2769 2770 2771 2772 2773 2774 2775 2776 2777 2778 2779 2780 2781 2782 2783 2784 2785 2786 2787 2788 2789 2790 2791 2792 2793 2794 2795 2796 2797 2798 2799 2800 2801 2802 2803 2804 2805 2806 2807 2808 2809 2810 2811 2812 2813 2814 2815 2816 2817 2818 2819 2820 2821 2822 2823 2824 2825 2826 2827 2828 2829 2830 2831 2832 2833 2834 2835 2836 2837 2838 2839 2840 2841 2842 2843 2844 2845 2846 2847 2848 2849 2850 2851 2852 2853 2854 2855 2856 2857 2858 2859 2860 2861 2862 2863 2864 2865 2866 2867 2868 2869 2870 2871 2872 2873 2874 2875 2876 2877 2878 2879 2880 2881 2882 2883 2884 2885 2886 2887 2888 2889 2890 2891 2892 2893 2894 2895 2896 2897 2898 2899 2900 2901 2902 2903 2904 2905 2906 2907 2908 2909 2910 2911 2912 2913 2914 2915 2916 2917 2918 2919 2920 2921 2922 2923 2924 2925 2926 2927 2928 2929 2930 2931 2932 2933 2934 2935 2936 2937 2938 2939 2940 2941 2942 2943 2944 2945 2946 2947 2948 2949 2950 2951 2952 2953 2954 2955 2956 2957 2958 2959 2960 2961 2962 2963 2964 2965 2966 2967 2968 2969 2970 2971 2972 2973 2974 2975 2976 2977 2978 2979 2980 2981 2982 2983 2984 2985 2986 2987 2988 2989 2990 2991 2992 2993 2994 2995 2996 2997 2998 2999 3000 3001 3002 3003 3004 3005 3006 3007 3008 3009 3010 3011 3012 3013 3014 3015 3016 3017 3018 3019 3020 3021 3022 3023 3024 3025 3026 3027 3028 3029 3030 3031 3032 3033 3034 3035 3036 3037 3038 3039 3040 3041 3042 3043 3044 3045 3046 3047 3048 3049 3050 3051 3052 3053 3054 3055 3056 3057 3058 3059 3060 3061 3062 3063 3064 3065 3066 3067 3068 3069 3070 3071 3072 3073 3074 3075 3076 3077 3078 3079 3080 3081 3082 3083 3084 3085 3086 3087 3088 3089 3090 3091 3092 3093 3094 3095 3096 3097 3098 3099 3100 3101 3102 3103 3104 3105 3106 3107 3108 3109 3110 3111 3112 3113 3114 3115 3116 3117 3118 3119 3120 3121 3122 3123 3124 3125 3126 3127 3128 3129 3130 3131 3132 3133 3134 3135 3136 3137 3138 3139 3140 3141 3142 3143 3144 3145 3146 3147 3148 3149 3150 3151 3152 3153 3154 3155 3156 3157 3158 3159 3160 3161 3162 3163 3164 3165 3166 3167 3168 3169 3170 3171 3172 3173 3174 3175 3176 3177 3178 3179 3180 3181 3182 3183 3184 3185 3186 3187 3188 3189 3190 3191 3192 3193 3194 3195 3196 3197 3198 3199 3200 3201 3202 3203 3204 3205 3206 3207 3208 3209 3210 3211 3212 3213 3214 3215 3216 3217 3218 3219 3220 3221 3222 3223 3224 3225 3226 3227 3228 3229 3230 3231 3232 3233 3234 3235 3236 3237 3238 3239 3240 3241 3242 3243 3244 3245 3246 3247 3248 3249 3250 3251 3252 3253 3254 3255 3256 3257 3258 3259 3260 3261 3262 3263 3264 3265 3266 3267 3268 3269 3270 3271 3272 3273 3274 3275 3276 3277 3278 3279 3280 3281 3282 3283 3284 3285 3286 3287 3288 3289 3290 3291 3292 3293 3294 3295 3296 3297 3298 3299 3300 3301 3302 3303 3304 3305 3306 3307 3308 3309 3310 3311 3312 3313 3314 3315 3316 3317 3318 3319 3320 3321 3322 3323 3324 3325 3326 3327 3328 3329 3330 3331 3332 3333 3334 3335 3336 3337 3338 3339 3340 3341 3342 3343 3344 3345 3346 3347 3348 3349 3350 3351 3352 3353 3354 3355 3356 3357 3358 3359 3360 3361 3362 3363 3364 3365 3366 3367 3368 3369 3370 3371 3372 3373 3374 3375 3376 3377 3378 3379 3380 3381 3382 3383 3384 3385 3386 3387 3388 3389 3390 3391 3392 3393 3394 3395 3396 3397 3398 3399 3400 3401 3402 3403 3404 3405 3406 3407 3408 3409 3410 3411 3412 3413 3414 3415 3416 3417 3418 3419 3420 3421 3422 3423 3424 3425 3426 3427 3428 3429 3430 3431 3432 3433 3434 3435 3436 3437 3438 3439 3440 3441 3442 3443 3444 3445 3446 3447 3448 3449 3450 3451 3452 3453 3454 3455 3456 3457 3458 3459 3460 3461 3462 3463 3464 3465 3466 3467 3468 3469 3470 3471 3472 3473 3474 3475 3476 3477 3478 3479 3480 3481 3482 3483 3484 3485 3486 3487 3488 3489 3490 3491 3492 3493 3494 3495 3496 3497 3498 3499 3500 3501 3502 3503 3504 3505 3506 3507 3508 3509 3510 3511 3512 3513 3514 3515 3516 3517 3518 3519 3520 3521 3522 3523 3524 3525 3526 3527 3528 3529 3530 3531 3532 3533 3534 3535 3536 3537 3538 3539 3540 3541 3542 3543 3544 3545 3546 3547 3548 3549 3550 3551 3552 3553 3554 3555 3556 3557 3558 3559 3560 3561 3562 3563 3564 3565 3566 3567 3568 3569 3570 3571 3572 3573 3574 3575 3576 3577 3578 3579 3580 3581 3582 3583 3584 3585 3586 3587 3588 3589 3590 3591 3592 3593 3594 3595 3596 3597 3598 3599 3600 3601 3602 3603 3604 3605 3606 3607 3608 3609 3610 3611 3612 3613 3614 3615 3616 3617 3618 3619 3620 3621 3622 3623 3624 3625 3626 3627 3628 3629 3630 3631 3632 3633 3634 3635 3636 3637 3638 3639 3640 3641 3642 3643 3644 3645 3646 3647 3648 3649 3650 3651 3652 3653 3654 3655 3656 3657 3658 3659 3660 3661 3662 3663 3664 3665 3666 3667 3668 3669 3670 3671 3672 3673 3674 3675 3676 3677 3678 3679 3680 3681 3682 3683 3684 3685 3686 3687 3688 3689 3690 3691 3692 3693 3694 3695 3696 3697 3698 3699 3700 3701 3702 3703 3704 3705 3706 3707 3708 3709 3710 3711 3712 3713 3714 3715 3716 3717 3718 3719 3720 3721 3722 3723 3724 3725 3726 3727 3728 3729 3730 3731 3732 3733 3734 3735 3736 3737 3738 3739 3740 3741 3742 3743 3744 3745 3746 3747 3748 3749 3750 3751 3752 3753 3754 3755 3756 3757 3758 3759 3760 3761 3762 3763 3764 3765 3766 3767 3768 3769 3770 3771 3772 3773 3774 3775 3776 3777 3778 3779 3780 3781 3782 3783 3784 3785 3786 3787 3788 3789 3790 3791 3792 3793 3794 3795 3796 3797 3798 3799 3800 3801 3802 3803 3804 3805 3806 3807 3808 3809 3810 3811 3812 3813 3814 3815 3816 3817 3818 3819 3820 3821 3822 3823 3824 3825 3826 3827 3828 3829 3830 3831 3832 3833 3834 3835 3836 3837 3838 3839 3840 3841 3842 3843 3844 3845 3846 3847 3848 3849 3850 3851 3852 3853 3854 3855 3856 3857 3858 3859 3860 3861 3862 3863 3864 3865 3866 3867 3868 3869 3870 3871 3872 3873 3874 3875 3876 3877 3878 3879 3880 3881 3882 3883 3884 3885 3886 3887 3888 3889 3890 3891 3892 3893 3894 3895 3896 3897 3898 3899 3900 3901 3902 3903 3904 3905 3906 3907 3908 3909 3910 3911 3912 3913 3914 3915 3916 3917 3918 3919 3920 3921 3922 3923 3924 3925 3926 3927 3928 3929 3930 3931 3932 3933 3934 3935 3936 3937 3938 3939 3940 3941 3942 3943 3944 3945 3946 3947 3948 3949 3950 3951 3952 3953 3954 3955 3956 3957 3958 3959 3960 3961 3962 3963 3964 3965 3966 3967 3968 3969 3970 3971 3972 3973 3974 3975 3976 3977 3978 3979 3980 3981 3982 3983 3984 3985 3986 3987 3988 3989 3990 3991 3992 3993 3994 3995 3996 3997 3998 3999 4000 4001 4002 4003 4004 4005 4006 4007 4008 4009 4010 4011 4012 4013 4014 4015 4016 4017 4018 4019 4020 4021 4022 4023 4024 4025 4026 4027 4028 4029 4030 4031 4032 4033 4034 4035 4036 4037 4038 4039 4040 4041 4042 4043 4044 4045 4046 4047 4048 4049 4050 4051 4052 4053 4054 4055 4056 4057 4058 4059 4060 4061 4062 4063 4064 4065 4066 4067 4068 4069 4070 4071 4072 4073 4074 4075 4076 4077 4078 4079 4080 4081 4082 4083 4084 4085 4086 4087 4088 4089 4090 4091 4092 4093 4094 4095 4096 4097 4098 4099 4100 4101 4102 4103 4104 4105 4106 4107 4108 4109 4110 4111 4112 4113 4114 4115 4116 4117 4118 4119 4120 4121 4122 4123 4124 4125 4126 4127 4128 4129 4130 4131 4132 4133 4134 4135 4136 4137 4138 4139 4140 4141 4142 4143 4144 4145 4146 4147 4148 4149 4150 4151 4152 4153 4154 4155 4156 4157 4158 4159 4160 4161 4162 4163 4164 4165 4166 4167 4168 4169 4170 4171 4172 4173 4174 4175 4176 4177 4178 4179 4180 4181 4182 4183 4184 4185 4186 4187 4188 4189 4190 4191 4192 4193 4194 4195 4196 4197 4198 4199 4200 4201 4202 4203 4204 4205 4206 4207 4208 4209 4210 4211 4212 4213 4214 4215 4216 4217 4218 4219 4220 4221 4222 4223 4224 4225 4226 4227 4228 4229 4230 4231 4232 4233 4234 4235 4236 4237 4238 4239 4240 4241 4242 4243 4244 4245 4246 4247 4248 4249 4250 4251 4252 4253 4254 4255 4256 4257 4258 4259 4260 4261 4262 4263 4264 4265 4266 4267 4268 4269 4270 4271 4272 4273 4274 4275 4276 4277 4278 4279 4280 4281 4282 4283 4284 4285 4286 4287 4288 4289 4290 4291 4292 4293 4294 4295 4296 4297 4298 4299 4300 4301 4302 4303 4304 4305 4306 4307 4308 4309 4310 4311 4312 4313 4314 4315 4316 4317 4318 4319 4320 4321 4322 4323 4324 4325 4326 4327 4328 4329 4330 4331 4332 4333 4334 4335 4336 4337 4338 4339 4340 4341 4342 4343 4344 4345 4346 4347 4348 4349 4350 4351 4352 4353 4354 4355 4356 4357 4358 4359 4360 4361 4362 4363 4364 4365 4366 4367 4368 4369 4370 4371 4372 4373 4374 4375 4376 4377 4378 4379 4380 4381 4382 4383 4384 4385 4386 4387 4388 4389 4390 4391 4392 4393 4394 4395 4396 4397 4398 4399 4400 4401 4402 4403 4404 4405 4406 4407 4408 4409 4410 4411 4412 4413 4414 4415 4416 4417 4418 4419 4420 4421 4422 4423 4424 4425 4426 4427 4428 4429 4430 4431 4432 4433 4434 4435 4436 4437 4438 4439 4440 4441 4442 4443 4444 4445 4446 4447 4448 4449 4450 4451 4452 4453 4454 4455 4456 4457 4458 4459 4460 4461 4462 4463 4464 4465 4466 4467 4468 4469 4470 4471 4472 4473 4474 4475 4476 4477 4478 4479 4480 4481 4482 4483 4484 4485 4486 4487 4488 4489 4490 4491 4492 4493 4494 4495 4496 4497 4498 4499 4500 4501 4502 4503 4504 4505 4506 4507 4508 4509 4510 4511 4512 4513 4514 4515 4516 4517 4518 4519 4520 4521 4522 4523 4524 4525 4526 4527 4528 4529 4530 4531 4532 4533 4534 4535 4536 4537 4538 4539 4540 4541 4542 4543 4544 4545 4546 4547 4548 4549 4550 4551 4552 4553 4554 4555 4556 4557 4558 4559 4560 4561 4562 4563 4564 4565 4566 4567 4568 4569 4570 4571 4572 4573 4574 4575 4576 4577 4578 4579 4580 4581 4582 4583 4584 4585 4586 4587 4588 4589 4590 4591 4592 4593 4594 4595 4596 4597 4598 4599 4600 4601 4602 4603 4604 4605 4606 4607 4608 4609 4610 4611 4612 4613 4614 4615 4616 4617 4618 4619 4620 4621 4622 4623 4624 4625 4626 4627 4628 4629 4630 4631 4632 4633 4634 4635 4636 4637 4638 4639 4640 4641 4642 4643 4644 4645 4646 4647 4648 4649 4650 4651 4652 4653 4654 4655 4656 4657 4658 4659 4660 4661 4662 4663 4664 4665 4666 4667 4668 4669 4670 4671 4672 4673 4674 4675 4676 4677 4678 4679 4680 4681 4682 4683 4684 4685 4686 4687 4688 4689 4690 4691 4692 4693 4694 4695 4696 4697 4698 4699 4700 4701 4702 4703 4704 4705 4706 4707 4708 4709 4710 4711 4712 4713 4714 4715 4716 4717 4718 4719 4720 4721 4722 4723 4724 4725 4726 4727 4728 4729 4730 4731 4732 4733 4734 4735 4736 4737 4738 4739 4740 4741 4742 4743 4744 4745 4746 4747 4748 4749 4750 4751 4752 4753 4754 4755 4756 4757 4758 4759 4760 4761 4762 4763 4764 4765 4766 4767 4768 4769 4770 4771 4772 4773 4774 4775 4776 4777 4778 4779 4780 4781 4782 4783 4784 4785 4786 4787 4788 4789 4790 4791 4792 4793 4794 4795 4796 4797 4798 4799 4800 4801 4802 4803 4804 4805 4806 4807 4808 4809 4810 4811 4812 4813 4814 4815 4816 4817 4818 4819 4820 4821 4822 4823 4824 4825 4826 4827 4828 4829 4830 4831 4832 4833 4834 4835 4836 4837 4838 4839 4840 4841 4842 4843 4844 4845 4846 4847 4848 4849 4850 4851 4852 4853 4854 4855 4856 4857 4858 4859 4860 4861 4862 4863 4864 4865 4866 4867 4868 4869 4870 4871 4872 4873 4874 4875 4876 4877 4878 4879 4880 4881 4882 4883 4884 4885 4886 4887 4888 4889 4890 4891 4892 4893 4894 4895 4896 4897 4898 4899 4900 4901 4902 4903 4904 4905 4906 4907 4908 4909 4910 4911 4912 4913 4914 4915 4916 4917 4918 4919 4920 4921 4922 4923 4924 4925 4926 4927 4928 4929 4930 4931 4932 4933 4934 4935 4936 4937 4938 4939 4940 4941 4942 4943 4944 4945 4946 4947 4948 4949 4950 4951 4952 4953 4954 4955 4956 4957 4958 4959 4960 4961 4962 4963 4964 4965 4966 4967 4968 4969 4970 4971 4972 4973 4974 4975 4976 4977 4978 4979 4980 4981 4982 4983 4984 4985 4986 4987 4988 4989 4990 4991 4992 4993 4994 4995 4996 4997 4998 4999 5000 5001 5002 5003 5004 5005 5006 5007 5008 5009 5010 5011 5012 5013 5014 5015 5016 5017 5018 5019 5020 5021 5022 5023 5024 5025 5026 5027 5028 5029 5030 5031 5032 5033 5034 5035 5036 5037 5038 5039 5040 5041 5042 5043 5044 5045 5046 5047 5048 5049 5050 5051 5052 5053 5054 5055 5056 5057 5058 5059 5060 5061 5062 5063 5064 5065 5066 5067 5068 5069 5070 5071 5072 // SPDX-License-Identifier: GPL-2.0-only /* * linux/kernel/printk.c * * Copyright (C) 1991, 1992 Linus Torvalds * * Modified to make sys_syslog() more flexible: added commands to * return the last 4k of kernel messages, regardless of whether * they've been read or not. Added option to suppress kernel printk's * to the console. Added hook for sending the console messages * elsewhere, in preparation for a serial line console (someday). * Ted Ts'o, 2/11/93. * Modified for sysctl support, 1/8/97, Chris Horn. * Fixed SMP synchronization, 08/08/99, Manfred Spraul * manfred@colorfullife.com * Rewrote bits to get rid of console_lock * 01Mar01 Andrew Morton */ #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt #include <linux/kernel.h> #include <linux/mm.h> #include <linux/tty.h> #include <linux/tty_driver.h> #include <linux/console.h> #include <linux/init.h> #include <linux/jiffies.h> #include <linux/nmi.h> #include <linux/module.h> #include <linux/moduleparam.h> #include <linux/delay.h> #include <linux/smp.h> #include <linux/security.h> #include <linux/memblock.h> #include <linux/syscalls.h> #include <linux/syscore_ops.h> #include <linux/vmcore_info.h> #include <linux/ratelimit.h> #include <linux/kmsg_dump.h> #include <linux/syslog.h> #include <linux/cpu.h> #include <linux/rculist.h> #include <linux/poll.h> #include <linux/irq_work.h> #include <linux/ctype.h> #include <linux/uio.h> #include <linux/sched/clock.h> #include <linux/sched/debug.h> #include <linux/sched/task_stack.h> #include <linux/uaccess.h> #include <asm/sections.h> #include <trace/events/initcall.h> #define CREATE_TRACE_POINTS #include <trace/events/printk.h> #include "printk_ringbuffer.h" #include "console_cmdline.h" #include "braille.h" #include "internal.h" int console_printk[4] = { CONSOLE_LOGLEVEL_DEFAULT, /* console_loglevel */ MESSAGE_LOGLEVEL_DEFAULT, /* default_message_loglevel */ CONSOLE_LOGLEVEL_MIN, /* minimum_console_loglevel */ CONSOLE_LOGLEVEL_DEFAULT, /* default_console_loglevel */ }; EXPORT_SYMBOL_GPL(console_printk); atomic_t ignore_console_lock_warning __read_mostly = ATOMIC_INIT(0); EXPORT_SYMBOL(ignore_console_lock_warning); EXPORT_TRACEPOINT_SYMBOL_GPL(console); /* * Low level drivers may need that to know if they can schedule in * their unblank() callback or not. So let's export it. */ int oops_in_progress; EXPORT_SYMBOL(oops_in_progress); /* * console_mutex protects console_list updates and console->flags updates. * The flags are synchronized only for consoles that are registered, i.e. * accessible via the console list. */ static DEFINE_MUTEX(console_mutex); /* * console_sem protects updates to console->seq * and also provides serialization for console printing. */ static DEFINE_SEMAPHORE(console_sem, 1); HLIST_HEAD(console_list); EXPORT_SYMBOL_GPL(console_list); DEFINE_STATIC_SRCU(console_srcu); /* * System may need to suppress printk message under certain * circumstances, like after kernel panic happens. */ int __read_mostly suppress_printk; #ifdef CONFIG_LOCKDEP static struct lockdep_map console_lock_dep_map = { .name = "console_lock" }; void lockdep_assert_console_list_lock_held(void) { lockdep_assert_held(&console_mutex); } EXPORT_SYMBOL(lockdep_assert_console_list_lock_held); #endif #ifdef CONFIG_DEBUG_LOCK_ALLOC bool console_srcu_read_lock_is_held(void) { return srcu_read_lock_held(&console_srcu); } EXPORT_SYMBOL(console_srcu_read_lock_is_held); #endif enum devkmsg_log_bits { __DEVKMSG_LOG_BIT_ON = 0, __DEVKMSG_LOG_BIT_OFF, __DEVKMSG_LOG_BIT_LOCK, }; enum devkmsg_log_masks { DEVKMSG_LOG_MASK_ON = BIT(__DEVKMSG_LOG_BIT_ON), DEVKMSG_LOG_MASK_OFF = BIT(__DEVKMSG_LOG_BIT_OFF), DEVKMSG_LOG_MASK_LOCK = BIT(__DEVKMSG_LOG_BIT_LOCK), }; /* Keep both the 'on' and 'off' bits clear, i.e. ratelimit by default: */ #define DEVKMSG_LOG_MASK_DEFAULT 0 static unsigned int __read_mostly devkmsg_log = DEVKMSG_LOG_MASK_DEFAULT; static int __control_devkmsg(char *str) { size_t len; if (!str) return -EINVAL; len = str_has_prefix(str, "on"); if (len) { devkmsg_log = DEVKMSG_LOG_MASK_ON; return len; } len = str_has_prefix(str, "off"); if (len) { devkmsg_log = DEVKMSG_LOG_MASK_OFF; return len; } len = str_has_prefix(str, "ratelimit"); if (len) { devkmsg_log = DEVKMSG_LOG_MASK_DEFAULT; return len; } return -EINVAL; } static int __init control_devkmsg(char *str) { if (__control_devkmsg(str) < 0) { pr_warn("printk.devkmsg: bad option string '%s'\n", str); return 1; } /* * Set sysctl string accordingly: */ if (devkmsg_log == DEVKMSG_LOG_MASK_ON) strscpy(devkmsg_log_str, "on"); else if (devkmsg_log == DEVKMSG_LOG_MASK_OFF) strscpy(devkmsg_log_str, "off"); /* else "ratelimit" which is set by default. */ /* * Sysctl cannot change it anymore. The kernel command line setting of * this parameter is to force the setting to be permanent throughout the * runtime of the system. This is a precation measure against userspace * trying to be a smarta** and attempting to change it up on us. */ devkmsg_log |= DEVKMSG_LOG_MASK_LOCK; return 1; } __setup("printk.devkmsg=", control_devkmsg); char devkmsg_log_str[DEVKMSG_STR_MAX_SIZE] = "ratelimit"; #if defined(CONFIG_PRINTK) && defined(CONFIG_SYSCTL) int devkmsg_sysctl_set_loglvl(const struct ctl_table *table, int write, void *buffer, size_t *lenp, loff_t *ppos) { char old_str[DEVKMSG_STR_MAX_SIZE]; unsigned int old; int err; if (write) { if (devkmsg_log & DEVKMSG_LOG_MASK_LOCK) return -EINVAL; old = devkmsg_log; strscpy(old_str, devkmsg_log_str); } err = proc_dostring(table, write, buffer, lenp, ppos); if (err) return err; if (write) { err = __control_devkmsg(devkmsg_log_str); /* * Do not accept an unknown string OR a known string with * trailing crap... */ if (err < 0 || (err + 1 != *lenp)) { /* ... and restore old setting. */ devkmsg_log = old; strscpy(devkmsg_log_str, old_str); return -EINVAL; } } return 0; } #endif /* CONFIG_PRINTK && CONFIG_SYSCTL */ /** * console_list_lock - Lock the console list * * For console list or console->flags updates */ void console_list_lock(void) { /* * In unregister_console() and console_force_preferred_locked(), * synchronize_srcu() is called with the console_list_lock held. * Therefore it is not allowed that the console_list_lock is taken * with the srcu_lock held. * * Detecting if this context is really in the read-side critical * section is only possible if the appropriate debug options are * enabled. */ WARN_ON_ONCE(debug_lockdep_rcu_enabled() && srcu_read_lock_held(&console_srcu)); mutex_lock(&console_mutex); } EXPORT_SYMBOL(console_list_lock); /** * console_list_unlock - Unlock the console list * * Counterpart to console_list_lock() */ void console_list_unlock(void) { mutex_unlock(&console_mutex); } EXPORT_SYMBOL(console_list_unlock); /** * console_srcu_read_lock - Register a new reader for the * SRCU-protected console list * * Use for_each_console_srcu() to iterate the console list * * Context: Any context. * Return: A cookie to pass to console_srcu_read_unlock(). */ int console_srcu_read_lock(void) __acquires(&console_srcu) { return srcu_read_lock_nmisafe(&console_srcu); } EXPORT_SYMBOL(console_srcu_read_lock); /** * console_srcu_read_unlock - Unregister an old reader from * the SRCU-protected console list * @cookie: cookie returned from console_srcu_read_lock() * * Counterpart to console_srcu_read_lock() */ void console_srcu_read_unlock(int cookie) __releases(&console_srcu) { srcu_read_unlock_nmisafe(&console_srcu, cookie); } EXPORT_SYMBOL(console_srcu_read_unlock); /* * Helper macros to handle lockdep when locking/unlocking console_sem. We use * macros instead of functions so that _RET_IP_ contains useful information. */ #define down_console_sem() do { \ down(&console_sem);\ mutex_acquire(&console_lock_dep_map, 0, 0, _RET_IP_);\ } while (0) static int __down_trylock_console_sem(unsigned long ip) { int lock_failed; unsigned long flags; /* * Here and in __up_console_sem() we need to be in safe mode, * because spindump/WARN/etc from under console ->lock will * deadlock in printk()->down_trylock_console_sem() otherwise. */ printk_safe_enter_irqsave(flags); lock_failed = down_trylock(&console_sem); printk_safe_exit_irqrestore(flags); if (lock_failed) return 1; mutex_acquire(&console_lock_dep_map, 0, 1, ip); return 0; } #define down_trylock_console_sem() __down_trylock_console_sem(_RET_IP_) static void __up_console_sem(unsigned long ip) { unsigned long flags; mutex_release(&console_lock_dep_map, ip); printk_safe_enter_irqsave(flags); up(&console_sem); printk_safe_exit_irqrestore(flags); } #define up_console_sem() __up_console_sem(_RET_IP_) static bool panic_in_progress(void) { return unlikely(atomic_read(&panic_cpu) != PANIC_CPU_INVALID); } /* Return true if a panic is in progress on the current CPU. */ bool this_cpu_in_panic(void) { /* * We can use raw_smp_processor_id() here because it is impossible for * the task to be migrated to the panic_cpu, or away from it. If * panic_cpu has already been set, and we're not currently executing on * that CPU, then we never will be. */ return unlikely(atomic_read(&panic_cpu) == raw_smp_processor_id()); } /* * Return true if a panic is in progress on a remote CPU. * * On true, the local CPU should immediately release any printing resources * that may be needed by the panic CPU. */ bool other_cpu_in_panic(void) { return (panic_in_progress() && !this_cpu_in_panic()); } /* * This is used for debugging the mess that is the VT code by * keeping track if we have the console semaphore held. It's * definitely not the perfect debug tool (we don't know if _WE_ * hold it and are racing, but it helps tracking those weird code * paths in the console code where we end up in places I want * locked without the console semaphore held). */ static int console_locked; /* * Array of consoles built from command line options (console=) */ #define MAX_CMDLINECONSOLES 8 static struct console_cmdline console_cmdline[MAX_CMDLINECONSOLES]; static int preferred_console = -1; int console_set_on_cmdline; EXPORT_SYMBOL(console_set_on_cmdline); /* Flag: console code may call schedule() */ static int console_may_schedule; enum con_msg_format_flags { MSG_FORMAT_DEFAULT = 0, MSG_FORMAT_SYSLOG = (1 << 0), }; static int console_msg_format = MSG_FORMAT_DEFAULT; /* * The printk log buffer consists of a sequenced collection of records, each * containing variable length message text. Every record also contains its * own meta-data (@info). * * Every record meta-data carries the timestamp in microseconds, as well as * the standard userspace syslog level and syslog facility. The usual kernel * messages use LOG_KERN; userspace-injected messages always carry a matching * syslog facility, by default LOG_USER. The origin of every message can be * reliably determined that way. * * The human readable log message of a record is available in @text, the * length of the message text in @text_len. The stored message is not * terminated. * * Optionally, a record can carry a dictionary of properties (key/value * pairs), to provide userspace with a machine-readable message context. * * Examples for well-defined, commonly used property names are: * DEVICE=b12:8 device identifier * b12:8 block dev_t * c127:3 char dev_t * n8 netdev ifindex * +sound:card0 subsystem:devname * SUBSYSTEM=pci driver-core subsystem name * * Valid characters in property names are [a-zA-Z0-9.-_]. Property names * and values are terminated by a '\0' character. * * Example of record values: * record.text_buf = "it's a line" (unterminated) * record.info.seq = 56 * record.info.ts_nsec = 36863 * record.info.text_len = 11 * record.info.facility = 0 (LOG_KERN) * record.info.flags = 0 * record.info.level = 3 (LOG_ERR) * record.info.caller_id = 299 (task 299) * record.info.dev_info.subsystem = "pci" (terminated) * record.info.dev_info.device = "+pci:0000:00:01.0" (terminated) * * The 'struct printk_info' buffer must never be directly exported to * userspace, it is a kernel-private implementation detail that might * need to be changed in the future, when the requirements change. * * /dev/kmsg exports the structured data in the following line format: * "<level>,<sequnum>,<timestamp>,<contflag>[,additional_values, ... ];<message text>\n" * * Users of the export format should ignore possible additional values * separated by ',', and find the message after the ';' character. * * The optional key/value pairs are attached as continuation lines starting * with a space character and terminated by a newline. All possible * non-prinatable characters are escaped in the "\xff" notation. */ /* syslog_lock protects syslog_* variables and write access to clear_seq. */ static DEFINE_MUTEX(syslog_lock); /* * Specifies if a legacy console is registered. If legacy consoles are * present, it is necessary to perform the console lock/unlock dance * whenever console flushing should occur. */ bool have_legacy_console; /* * Specifies if an nbcon console is registered. If nbcon consoles are present, * synchronous printing of legacy consoles will not occur during panic until * the backtrace has been stored to the ringbuffer. */ bool have_nbcon_console; /* * Specifies if a boot console is registered. If boot consoles are present, * nbcon consoles cannot print simultaneously and must be synchronized by * the console lock. This is because boot consoles and nbcon consoles may * have mapped the same hardware. */ bool have_boot_console; /* See printk_legacy_allow_panic_sync() for details. */ bool legacy_allow_panic_sync; #ifdef CONFIG_PRINTK DECLARE_WAIT_QUEUE_HEAD(log_wait); static DECLARE_WAIT_QUEUE_HEAD(legacy_wait); /* All 3 protected by @syslog_lock. */ /* the next printk record to read by syslog(READ) or /proc/kmsg */ static u64 syslog_seq; static size_t syslog_partial; static bool syslog_time; /* True when _all_ printer threads are available for printing. */ bool printk_kthreads_running; struct latched_seq { seqcount_latch_t latch; u64 val[2]; }; /* * The next printk record to read after the last 'clear' command. There are * two copies (updated with seqcount_latch) so that reads can locklessly * access a valid value. Writers are synchronized by @syslog_lock. */ static struct latched_seq clear_seq = { .latch = SEQCNT_LATCH_ZERO(clear_seq.latch), .val[0] = 0, .val[1] = 0, }; #define LOG_LEVEL(v) ((v) & 0x07) #define LOG_FACILITY(v) ((v) >> 3 & 0xff) /* record buffer */ #define LOG_ALIGN __alignof__(unsigned long) #define __LOG_BUF_LEN (1 << CONFIG_LOG_BUF_SHIFT) #define LOG_BUF_LEN_MAX ((u32)1 << 31) static char __log_buf[__LOG_BUF_LEN] __aligned(LOG_ALIGN); static char *log_buf = __log_buf; static u32 log_buf_len = __LOG_BUF_LEN; /* * Define the average message size. This only affects the number of * descriptors that will be available. Underestimating is better than * overestimating (too many available descriptors is better than not enough). */ #define PRB_AVGBITS 5 /* 32 character average length */ #if CONFIG_LOG_BUF_SHIFT <= PRB_AVGBITS #error CONFIG_LOG_BUF_SHIFT value too small. #endif _DEFINE_PRINTKRB(printk_rb_static, CONFIG_LOG_BUF_SHIFT - PRB_AVGBITS, PRB_AVGBITS, &__log_buf[0]); static struct printk_ringbuffer printk_rb_dynamic; struct printk_ringbuffer *prb = &printk_rb_static; /* * We cannot access per-CPU data (e.g. per-CPU flush irq_work) before * per_cpu_areas are initialised. This variable is set to true when * it's safe to access per-CPU data. */ static bool __printk_percpu_data_ready __ro_after_init; bool printk_percpu_data_ready(void) { return __printk_percpu_data_ready; } /* Must be called under syslog_lock. */ static void latched_seq_write(struct latched_seq *ls, u64 val) { write_seqcount_latch_begin(&ls->latch); ls->val[0] = val; write_seqcount_latch(&ls->latch); ls->val[1] = val; write_seqcount_latch_end(&ls->latch); } /* Can be called from any context. */ static u64 latched_seq_read_nolock(struct latched_seq *ls) { unsigned int seq; unsigned int idx; u64 val; do { seq = read_seqcount_latch(&ls->latch); idx = seq & 0x1; val = ls->val[idx]; } while (read_seqcount_latch_retry(&ls->latch, seq)); return val; } /* Return log buffer address */ char *log_buf_addr_get(void) { return log_buf; } /* Return log buffer size */ u32 log_buf_len_get(void) { return log_buf_len; } /* * Define how much of the log buffer we could take at maximum. The value * must be greater than two. Note that only half of the buffer is available * when the index points to the middle. */ #define MAX_LOG_TAKE_PART 4 static const char trunc_msg[] = "<truncated>"; static void truncate_msg(u16 *text_len, u16 *trunc_msg_len) { /* * The message should not take the whole buffer. Otherwise, it might * get removed too soon. */ u32 max_text_len = log_buf_len / MAX_LOG_TAKE_PART; if (*text_len > max_text_len) *text_len = max_text_len; /* enable the warning message (if there is room) */ *trunc_msg_len = strlen(trunc_msg); if (*text_len >= *trunc_msg_len) *text_len -= *trunc_msg_len; else *trunc_msg_len = 0; } int dmesg_restrict = IS_ENABLED(CONFIG_SECURITY_DMESG_RESTRICT); static int syslog_action_restricted(int type) { if (dmesg_restrict) return 1; /* * Unless restricted, we allow "read all" and "get buffer size" * for everybody. */ return type != SYSLOG_ACTION_READ_ALL && type != SYSLOG_ACTION_SIZE_BUFFER; } static int check_syslog_permissions(int type, int source) { /* * If this is from /proc/kmsg and we've already opened it, then we've * already done the capabilities checks at open time. */ if (source == SYSLOG_FROM_PROC && type != SYSLOG_ACTION_OPEN) goto ok; if (syslog_action_restricted(type)) { if (capable(CAP_SYSLOG)) goto ok; return -EPERM; } ok: return security_syslog(type); } static void append_char(char **pp, char *e, char c) { if (*pp < e) *(*pp)++ = c; } static ssize_t info_print_ext_header(char *buf, size_t size, struct printk_info *info) { u64 ts_usec = info->ts_nsec; char caller[20]; #ifdef CONFIG_PRINTK_CALLER u32 id = info->caller_id; snprintf(caller, sizeof(caller), ",caller=%c%u", id & 0x80000000 ? 'C' : 'T', id & ~0x80000000); #else caller[0] = '\0'; #endif do_div(ts_usec, 1000); return scnprintf(buf, size, "%u,%llu,%llu,%c%s;", (info->facility << 3) | info->level, info->seq, ts_usec, info->flags & LOG_CONT ? 'c' : '-', caller); } static ssize_t msg_add_ext_text(char *buf, size_t size, const char *text, size_t text_len, unsigned char endc) { char *p = buf, *e = buf + size; size_t i; /* escape non-printable characters */ for (i = 0; i < text_len; i++) { unsigned char c = text[i]; if (c < ' ' || c >= 127 || c == '\\') p += scnprintf(p, e - p, "\\x%02x", c); else append_char(&p, e, c); } append_char(&p, e, endc); return p - buf; } static ssize_t msg_add_dict_text(char *buf, size_t size, const char *key, const char *val) { size_t val_len = strlen(val); ssize_t len; if (!val_len) return 0; len = msg_add_ext_text(buf, size, "", 0, ' '); /* dict prefix */ len += msg_add_ext_text(buf + len, size - len, key, strlen(key), '='); len += msg_add_ext_text(buf + len, size - len, val, val_len, '\n'); return len; } static ssize_t msg_print_ext_body(char *buf, size_t size, char *text, size_t text_len, struct dev_printk_info *dev_info) { ssize_t len; len = msg_add_ext_text(buf, size, text, text_len, '\n'); if (!dev_info) goto out; len += msg_add_dict_text(buf + len, size - len, "SUBSYSTEM", dev_info->subsystem); len += msg_add_dict_text(buf + len, size - len, "DEVICE", dev_info->device); out: return len; } /* /dev/kmsg - userspace message inject/listen interface */ struct devkmsg_user { atomic64_t seq; struct ratelimit_state rs; struct mutex lock; struct printk_buffers pbufs; }; static __printf(3, 4) __cold int devkmsg_emit(int facility, int level, const char *fmt, ...) { va_list args; int r; va_start(args, fmt); r = vprintk_emit(facility, level, NULL, fmt, args); va_end(args); return r; } static ssize_t devkmsg_write(struct kiocb *iocb, struct iov_iter *from) { char *buf, *line; int level = default_message_loglevel; int facility = 1; /* LOG_USER */ struct file *file = iocb->ki_filp; struct devkmsg_user *user = file->private_data; size_t len = iov_iter_count(from); ssize_t ret = len; if (len > PRINTKRB_RECORD_MAX) return -EINVAL; /* Ignore when user logging is disabled. */ if (devkmsg_log & DEVKMSG_LOG_MASK_OFF) return len; /* Ratelimit when not explicitly enabled. */ if (!(devkmsg_log & DEVKMSG_LOG_MASK_ON)) { if (!___ratelimit(&user->rs, current->comm)) return ret; } buf = kmalloc(len+1, GFP_KERNEL); if (buf == NULL) return -ENOMEM; buf[len] = '\0'; if (!copy_from_iter_full(buf, len, from)) { kfree(buf); return -EFAULT; } /* * Extract and skip the syslog prefix <[0-9]*>. Coming from userspace * the decimal value represents 32bit, the lower 3 bit are the log * level, the rest are the log facility. * * If no prefix or no userspace facility is specified, we * enforce LOG_USER, to be able to reliably distinguish * kernel-generated messages from userspace-injected ones. */ line = buf; if (line[0] == '<') { char *endp = NULL; unsigned int u; u = simple_strtoul(line + 1, &endp, 10); if (endp && endp[0] == '>') { level = LOG_LEVEL(u); if (LOG_FACILITY(u) != 0) facility = LOG_FACILITY(u); endp++; line = endp; } } devkmsg_emit(facility, level, "%s", line); kfree(buf); return ret; } static ssize_t devkmsg_read(struct file *file, char __user *buf, size_t count, loff_t *ppos) { struct devkmsg_user *user = file->private_data; char *outbuf = &user->pbufs.outbuf[0]; struct printk_message pmsg = { .pbufs = &user->pbufs, }; ssize_t ret; ret = mutex_lock_interruptible(&user->lock); if (ret) return ret; if (!printk_get_next_message(&pmsg, atomic64_read(&user->seq), true, false)) { if (file->f_flags & O_NONBLOCK) { ret = -EAGAIN; goto out; } /* * Guarantee this task is visible on the waitqueue before * checking the wake condition. * * The full memory barrier within set_current_state() of * prepare_to_wait_event() pairs with the full memory barrier * within wq_has_sleeper(). * * This pairs with __wake_up_klogd:A. */ ret = wait_event_interruptible(log_wait, printk_get_next_message(&pmsg, atomic64_read(&user->seq), true, false)); /* LMM(devkmsg_read:A) */ if (ret) goto out; } if (pmsg.dropped) { /* our last seen message is gone, return error and reset */ atomic64_set(&user->seq, pmsg.seq); ret = -EPIPE; goto out; } atomic64_set(&user->seq, pmsg.seq + 1); if (pmsg.outbuf_len > count) { ret = -EINVAL; goto out; } if (copy_to_user(buf, outbuf, pmsg.outbuf_len)) { ret = -EFAULT; goto out; } ret = pmsg.outbuf_len; out: mutex_unlock(&user->lock); return ret; } /* * Be careful when modifying this function!!! * * Only few operations are supported because the device works only with the * entire variable length messages (records). Non-standard values are * returned in the other cases and has been this way for quite some time. * User space applications might depend on this behavior. */ static loff_t devkmsg_llseek(struct file *file, loff_t offset, int whence) { struct devkmsg_user *user = file->private_data; loff_t ret = 0; if (offset) return -ESPIPE; switch (whence) { case SEEK_SET: /* the first record */ atomic64_set(&user->seq, prb_first_valid_seq(prb)); break; case SEEK_DATA: /* * The first record after the last SYSLOG_ACTION_CLEAR, * like issued by 'dmesg -c'. Reading /dev/kmsg itself * changes no global state, and does not clear anything. */ atomic64_set(&user->seq, latched_seq_read_nolock(&clear_seq)); break; case SEEK_END: /* after the last record */ atomic64_set(&user->seq, prb_next_seq(prb)); break; default: ret = -EINVAL; } return ret; } static __poll_t devkmsg_poll(struct file *file, poll_table *wait) { struct devkmsg_user *user = file->private_data; struct printk_info info; __poll_t ret = 0; poll_wait(file, &log_wait, wait); if (prb_read_valid_info(prb, atomic64_read(&user->seq), &info, NULL)) { /* return error when data has vanished underneath us */ if (info.seq != atomic64_read(&user->seq)) ret = EPOLLIN|EPOLLRDNORM|EPOLLERR|EPOLLPRI; else ret = EPOLLIN|EPOLLRDNORM; } return ret; } static int devkmsg_open(struct inode *inode, struct file *file) { struct devkmsg_user *user; int err; if (devkmsg_log & DEVKMSG_LOG_MASK_OFF) return -EPERM; /* write-only does not need any file context */ if ((file->f_flags & O_ACCMODE) != O_WRONLY) { err = check_syslog_permissions(SYSLOG_ACTION_READ_ALL, SYSLOG_FROM_READER); if (err) return err; } user = kvmalloc(sizeof(struct devkmsg_user), GFP_KERNEL); if (!user) return -ENOMEM; ratelimit_default_init(&user->rs); ratelimit_set_flags(&user->rs, RATELIMIT_MSG_ON_RELEASE); mutex_init(&user->lock); atomic64_set(&user->seq, prb_first_valid_seq(prb)); file->private_data = user; return 0; } static int devkmsg_release(struct inode *inode, struct file *file) { struct devkmsg_user *user = file->private_data; ratelimit_state_exit(&user->rs); mutex_destroy(&user->lock); kvfree(user); return 0; } const struct file_operations kmsg_fops = { .open = devkmsg_open, .read = devkmsg_read, .write_iter = devkmsg_write, .llseek = devkmsg_llseek, .poll = devkmsg_poll, .release = devkmsg_release, }; #ifdef CONFIG_VMCORE_INFO /* * This appends the listed symbols to /proc/vmcore * * /proc/vmcore is used by various utilities, like crash and makedumpfile to * obtain access to symbols that are otherwise very difficult to locate. These * symbols are specifically used so that utilities can access and extract the * dmesg log from a vmcore file after a crash. */ void log_buf_vmcoreinfo_setup(void) { struct dev_printk_info *dev_info = NULL; VMCOREINFO_SYMBOL(prb); VMCOREINFO_SYMBOL(printk_rb_static); VMCOREINFO_SYMBOL(clear_seq); /* * Export struct size and field offsets. User space tools can * parse it and detect any changes to structure down the line. */ VMCOREINFO_STRUCT_SIZE(printk_ringbuffer); VMCOREINFO_OFFSET(printk_ringbuffer, desc_ring); VMCOREINFO_OFFSET(printk_ringbuffer, text_data_ring); VMCOREINFO_OFFSET(printk_ringbuffer, fail); VMCOREINFO_STRUCT_SIZE(prb_desc_ring); VMCOREINFO_OFFSET(prb_desc_ring, count_bits); VMCOREINFO_OFFSET(prb_desc_ring, descs); VMCOREINFO_OFFSET(prb_desc_ring, infos); VMCOREINFO_OFFSET(prb_desc_ring, head_id); VMCOREINFO_OFFSET(prb_desc_ring, tail_id); VMCOREINFO_STRUCT_SIZE(prb_desc); VMCOREINFO_OFFSET(prb_desc, state_var); VMCOREINFO_OFFSET(prb_desc, text_blk_lpos); VMCOREINFO_STRUCT_SIZE(prb_data_blk_lpos); VMCOREINFO_OFFSET(prb_data_blk_lpos, begin); VMCOREINFO_OFFSET(prb_data_blk_lpos, next); VMCOREINFO_STRUCT_SIZE(printk_info); VMCOREINFO_OFFSET(printk_info, seq); VMCOREINFO_OFFSET(printk_info, ts_nsec); VMCOREINFO_OFFSET(printk_info, text_len); VMCOREINFO_OFFSET(printk_info, caller_id); VMCOREINFO_OFFSET(printk_info, dev_info); VMCOREINFO_STRUCT_SIZE(dev_printk_info); VMCOREINFO_OFFSET(dev_printk_info, subsystem); VMCOREINFO_LENGTH(printk_info_subsystem, sizeof(dev_info->subsystem)); VMCOREINFO_OFFSET(dev_printk_info, device); VMCOREINFO_LENGTH(printk_info_device, sizeof(dev_info->device)); VMCOREINFO_STRUCT_SIZE(prb_data_ring); VMCOREINFO_OFFSET(prb_data_ring, size_bits); VMCOREINFO_OFFSET(prb_data_ring, data); VMCOREINFO_OFFSET(prb_data_ring, head_lpos); VMCOREINFO_OFFSET(prb_data_ring, tail_lpos); VMCOREINFO_SIZE(atomic_long_t); VMCOREINFO_TYPE_OFFSET(atomic_long_t, counter); VMCOREINFO_STRUCT_SIZE(latched_seq); VMCOREINFO_OFFSET(latched_seq, val); } #endif /* requested log_buf_len from kernel cmdline */ static unsigned long __initdata new_log_buf_len; /* we practice scaling the ring buffer by powers of 2 */ static void __init log_buf_len_update(u64 size) { if (size > (u64)LOG_BUF_LEN_MAX) { size = (u64)LOG_BUF_LEN_MAX; pr_err("log_buf over 2G is not supported.\n"); } if (size) size = roundup_pow_of_two(size); if (size > log_buf_len) new_log_buf_len = (unsigned long)size; } /* save requested log_buf_len since it's too early to process it */ static int __init log_buf_len_setup(char *str) { u64 size; if (!str) return -EINVAL; size = memparse(str, &str); log_buf_len_update(size); return 0; } early_param("log_buf_len", log_buf_len_setup); #ifdef CONFIG_SMP #define __LOG_CPU_MAX_BUF_LEN (1 << CONFIG_LOG_CPU_MAX_BUF_SHIFT) static void __init log_buf_add_cpu(void) { unsigned int cpu_extra; /* * archs should set up cpu_possible_bits properly with * set_cpu_possible() after setup_arch() but just in * case lets ensure this is valid. */ if (num_possible_cpus() == 1) return; cpu_extra = (num_possible_cpus() - 1) * __LOG_CPU_MAX_BUF_LEN; /* by default this will only continue through for large > 64 CPUs */ if (cpu_extra <= __LOG_BUF_LEN / 2) return; pr_info("log_buf_len individual max cpu contribution: %d bytes\n", __LOG_CPU_MAX_BUF_LEN); pr_info("log_buf_len total cpu_extra contributions: %d bytes\n", cpu_extra); pr_info("log_buf_len min size: %d bytes\n", __LOG_BUF_LEN); log_buf_len_update(cpu_extra + __LOG_BUF_LEN); } #else /* !CONFIG_SMP */ static inline void log_buf_add_cpu(void) {} #endif /* CONFIG_SMP */ static void __init set_percpu_data_ready(void) { __printk_percpu_data_ready = true; } static unsigned int __init add_to_rb(struct printk_ringbuffer *rb, struct printk_record *r) { struct prb_reserved_entry e; struct printk_record dest_r; prb_rec_init_wr(&dest_r, r->info->text_len); if (!prb_reserve(&e, rb, &dest_r)) return 0; memcpy(&dest_r.text_buf[0], &r->text_buf[0], r->info->text_len); dest_r.info->text_len = r->info->text_len; dest_r.info->facility = r->info->facility; dest_r.info->level = r->info->level; dest_r.info->flags = r->info->flags; dest_r.info->ts_nsec = r->info->ts_nsec; dest_r.info->caller_id = r->info->caller_id; memcpy(&dest_r.info->dev_info, &r->info->dev_info, sizeof(dest_r.info->dev_info)); prb_final_commit(&e); return prb_record_text_space(&e); } static char setup_text_buf[PRINTKRB_RECORD_MAX] __initdata; static void print_log_buf_usage_stats(void) { unsigned int descs_count = log_buf_len >> PRB_AVGBITS; size_t meta_data_size; meta_data_size = descs_count * (sizeof(struct prb_desc) + sizeof(struct printk_info)); pr_info("log buffer data + meta data: %u + %zu = %zu bytes\n", log_buf_len, meta_data_size, log_buf_len + meta_data_size); } void __init setup_log_buf(int early) { struct printk_info *new_infos; unsigned int new_descs_count; struct prb_desc *new_descs; struct printk_info info; struct printk_record r; unsigned int text_size; size_t new_descs_size; size_t new_infos_size; unsigned long flags; char *new_log_buf; unsigned int free; u64 seq; /* * Some archs call setup_log_buf() multiple times - first is very * early, e.g. from setup_arch(), and second - when percpu_areas * are initialised. */ if (!early) set_percpu_data_ready(); if (log_buf != __log_buf) return; if (!early && !new_log_buf_len) log_buf_add_cpu(); if (!new_log_buf_len) { /* Show the memory stats only once. */ if (!early) goto out; return; } new_descs_count = new_log_buf_len >> PRB_AVGBITS; if (new_descs_count == 0) { pr_err("new_log_buf_len: %lu too small\n", new_log_buf_len); goto out; } new_log_buf = memblock_alloc(new_log_buf_len, LOG_ALIGN); if (unlikely(!new_log_buf)) { pr_err("log_buf_len: %lu text bytes not available\n", new_log_buf_len); goto out; } new_descs_size = new_descs_count * sizeof(struct prb_desc); new_descs = memblock_alloc(new_descs_size, LOG_ALIGN); if (unlikely(!new_descs)) { pr_err("log_buf_len: %zu desc bytes not available\n", new_descs_size); goto err_free_log_buf; } new_infos_size = new_descs_count * sizeof(struct printk_info); new_infos = memblock_alloc(new_infos_size, LOG_ALIGN); if (unlikely(!new_infos)) { pr_err("log_buf_len: %zu info bytes not available\n", new_infos_size); goto err_free_descs; } prb_rec_init_rd(&r, &info, &setup_text_buf[0], sizeof(setup_text_buf)); prb_init(&printk_rb_dynamic, new_log_buf, ilog2(new_log_buf_len), new_descs, ilog2(new_descs_count), new_infos); local_irq_save(flags); log_buf_len = new_log_buf_len; log_buf = new_log_buf; new_log_buf_len = 0; free = __LOG_BUF_LEN; prb_for_each_record(0, &printk_rb_static, seq, &r) { text_size = add_to_rb(&printk_rb_dynamic, &r); if (text_size > free) free = 0; else free -= text_size; } prb = &printk_rb_dynamic; local_irq_restore(flags); /* * Copy any remaining messages that might have appeared from * NMI context after copying but before switching to the * dynamic buffer. */ prb_for_each_record(seq, &printk_rb_static, seq, &r) { text_size = add_to_rb(&printk_rb_dynamic, &r); if (text_size > free) free = 0; else free -= text_size; } if (seq != prb_next_seq(&printk_rb_static)) { pr_err("dropped %llu messages\n", prb_next_seq(&printk_rb_static) - seq); } print_log_buf_usage_stats(); pr_info("early log buf free: %u(%u%%)\n", free, (free * 100) / __LOG_BUF_LEN); return; err_free_descs: memblock_free(new_descs, new_descs_size); err_free_log_buf: memblock_free(new_log_buf, new_log_buf_len); out: print_log_buf_usage_stats(); } static bool __read_mostly ignore_loglevel; static int __init ignore_loglevel_setup(char *str) { ignore_loglevel = true; pr_info("debug: ignoring loglevel setting.\n"); return 0; } early_param("ignore_loglevel", ignore_loglevel_setup); module_param(ignore_loglevel, bool, S_IRUGO | S_IWUSR); MODULE_PARM_DESC(ignore_loglevel, "ignore loglevel setting (prints all kernel messages to the console)"); static bool suppress_message_printing(int level) { return (level >= console_loglevel && !ignore_loglevel); } #ifdef CONFIG_BOOT_PRINTK_DELAY static int boot_delay; /* msecs delay after each printk during bootup */ static unsigned long long loops_per_msec; /* based on boot_delay */ static int __init boot_delay_setup(char *str) { unsigned long lpj; lpj = preset_lpj ? preset_lpj : 1000000; /* some guess */ loops_per_msec = (unsigned long long)lpj / 1000 * HZ; get_option(&str, &boot_delay); if (boot_delay > 10 * 1000) boot_delay = 0; pr_debug("boot_delay: %u, preset_lpj: %ld, lpj: %lu, " "HZ: %d, loops_per_msec: %llu\n", boot_delay, preset_lpj, lpj, HZ, loops_per_msec); return 0; } early_param("boot_delay", boot_delay_setup); static void boot_delay_msec(int level) { unsigned long long k; unsigned long timeout; bool suppress = !is_printk_force_console() && suppress_message_printing(level); if ((boot_delay == 0 || system_state >= SYSTEM_RUNNING) || suppress) return; k = (unsigned long long)loops_per_msec * boot_delay; timeout = jiffies + msecs_to_jiffies(boot_delay); while (k) { k--; cpu_relax(); /* * use (volatile) jiffies to prevent * compiler reduction; loop termination via jiffies * is secondary and may or may not happen. */ if (time_after(jiffies, timeout)) break; touch_nmi_watchdog(); } } #else static inline void boot_delay_msec(int level) { } #endif static bool printk_time = IS_ENABLED(CONFIG_PRINTK_TIME); module_param_named(time, printk_time, bool, S_IRUGO | S_IWUSR); static size_t print_syslog(unsigned int level, char *buf) { return sprintf(buf, "<%u>", level); } static size_t print_time(u64 ts, char *buf) { unsigned long rem_nsec = do_div(ts, 1000000000); return sprintf(buf, "[%5lu.%06lu]", (unsigned long)ts, rem_nsec / 1000); } #ifdef CONFIG_PRINTK_CALLER static size_t print_caller(u32 id, char *buf) { char caller[12]; snprintf(caller, sizeof(caller), "%c%u", id & 0x80000000 ? 'C' : 'T', id & ~0x80000000); return sprintf(buf, "[%6s]", caller); } #else #define print_caller(id, buf) 0 #endif static size_t info_print_prefix(const struct printk_info *info, bool syslog, bool time, char *buf) { size_t len = 0; if (syslog) len = print_syslog((info->facility << 3) | info->level, buf); if (time) len += print_time(info->ts_nsec, buf + len); len += print_caller(info->caller_id, buf + len); if (IS_ENABLED(CONFIG_PRINTK_CALLER) || time) { buf[len++] = ' '; buf[len] = '\0'; } return len; } /* * Prepare the record for printing. The text is shifted within the given * buffer to avoid a need for another one. The following operations are * done: * * - Add prefix for each line. * - Drop truncated lines that no longer fit into the buffer. * - Add the trailing newline that has been removed in vprintk_store(). * - Add a string terminator. * * Since the produced string is always terminated, the maximum possible * return value is @r->text_buf_size - 1; * * Return: The length of the updated/prepared text, including the added * prefixes and the newline. The terminator is not counted. The dropped * line(s) are not counted. */ static size_t record_print_text(struct printk_record *r, bool syslog, bool time) { size_t text_len = r->info->text_len; size_t buf_size = r->text_buf_size; char *text = r->text_buf; char prefix[PRINTK_PREFIX_MAX]; bool truncated = false; size_t prefix_len; size_t line_len; size_t len = 0; char *next; /* * If the message was truncated because the buffer was not large * enough, treat the available text as if it were the full text. */ if (text_len > buf_size) text_len = buf_size; prefix_len = info_print_prefix(r->info, syslog, time, prefix); /* * @text_len: bytes of unprocessed text * @line_len: bytes of current line _without_ newline * @text: pointer to beginning of current line * @len: number of bytes prepared in r->text_buf */ for (;;) { next = memchr(text, '\n', text_len); if (next) { line_len = next - text; } else { /* Drop truncated line(s). */ if (truncated) break; line_len = text_len; } /* * Truncate the text if there is not enough space to add the * prefix and a trailing newline and a terminator. */ if (len + prefix_len + text_len + 1 + 1 > buf_size) { /* Drop even the current line if no space. */ if (len + prefix_len + line_len + 1 + 1 > buf_size) break; text_len = buf_size - len - prefix_len - 1 - 1; truncated = true; } memmove(text + prefix_len, text, text_len); memcpy(text, prefix, prefix_len); /* * Increment the prepared length to include the text and * prefix that were just moved+copied. Also increment for the * newline at the end of this line. If this is the last line, * there is no newline, but it will be added immediately below. */ len += prefix_len + line_len + 1; if (text_len == line_len) { /* * This is the last line. Add the trailing newline * removed in vprintk_store(). */ text[prefix_len + line_len] = '\n'; break; } /* * Advance beyond the added prefix and the related line with * its newline. */ text += prefix_len + line_len + 1; /* * The remaining text has only decreased by the line with its * newline. * * Note that @text_len can become zero. It happens when @text * ended with a newline (either due to truncation or the * original string ending with "\n\n"). The loop is correctly * repeated and (if not truncated) an empty line with a prefix * will be prepared. */ text_len -= line_len + 1; } /* * If a buffer was provided, it will be terminated. Space for the * string terminator is guaranteed to be available. The terminator is * not counted in the return value. */ if (buf_size > 0) r->text_buf[len] = 0; return len; } static size_t get_record_print_text_size(struct printk_info *info, unsigned int line_count, bool syslog, bool time) { char prefix[PRINTK_PREFIX_MAX]; size_t prefix_len; prefix_len = info_print_prefix(info, syslog, time, prefix); /* * Each line will be preceded with a prefix. The intermediate * newlines are already within the text, but a final trailing * newline will be added. */ return ((prefix_len * line_count) + info->text_len + 1); } /* * Beginning with @start_seq, find the first record where it and all following * records up to (but not including) @max_seq fit into @size. * * @max_seq is simply an upper bound and does not need to exist. If the caller * does not require an upper bound, -1 can be used for @max_seq. */ static u64 find_first_fitting_seq(u64 start_seq, u64 max_seq, size_t size, bool syslog, bool time) { struct printk_info info; unsigned int line_count; size_t len = 0; u64 seq; /* Determine the size of the records up to @max_seq. */ prb_for_each_info(start_seq, prb, seq, &info, &line_count) { if (info.seq >= max_seq) break; len += get_record_print_text_size(&info, line_count, syslog, time); } /* * Adjust the upper bound for the next loop to avoid subtracting * lengths that were never added. */ if (seq < max_seq) max_seq = seq; /* * Move first record forward until length fits into the buffer. Ignore * newest messages that were not counted in the above cycle. Messages * might appear and get lost in the meantime. This is a best effort * that prevents an infinite loop that could occur with a retry. */ prb_for_each_info(start_seq, prb, seq, &info, &line_count) { if (len <= size || info.seq >= max_seq) break; len -= get_record_print_text_size(&info, line_count, syslog, time); } return seq; } /* The caller is responsible for making sure @size is greater than 0. */ static int syslog_print(char __user *buf, int size) { struct printk_info info; struct printk_record r; char *text; int len = 0; u64 seq; text = kmalloc(PRINTK_MESSAGE_MAX, GFP_KERNEL); if (!text) return -ENOMEM; prb_rec_init_rd(&r, &info, text, PRINTK_MESSAGE_MAX); mutex_lock(&syslog_lock); /* * Wait for the @syslog_seq record to be available. @syslog_seq may * change while waiting. */ do { seq = syslog_seq; mutex_unlock(&syslog_lock); /* * Guarantee this task is visible on the waitqueue before * checking the wake condition. * * The full memory barrier within set_current_state() of * prepare_to_wait_event() pairs with the full memory barrier * within wq_has_sleeper(). * * This pairs with __wake_up_klogd:A. */ len = wait_event_interruptible(log_wait, prb_read_valid(prb, seq, NULL)); /* LMM(syslog_print:A) */ mutex_lock(&syslog_lock); if (len) goto out; } while (syslog_seq != seq); /* * Copy records that fit into the buffer. The above cycle makes sure * that the first record is always available. */ do { size_t n; size_t skip; int err; if (!prb_read_valid(prb, syslog_seq, &r)) break; if (r.info->seq != syslog_seq) { /* message is gone, move to next valid one */ syslog_seq = r.info->seq; syslog_partial = 0; } /* * To keep reading/counting partial line consistent, * use printk_time value as of the beginning of a line. */ if (!syslog_partial) syslog_time = printk_time; skip = syslog_partial; n = record_print_text(&r, true, syslog_time); if (n - syslog_partial <= size) { /* message fits into buffer, move forward */ syslog_seq = r.info->seq + 1; n -= syslog_partial; syslog_partial = 0; } else if (!len){ /* partial read(), remember position */ n = size; syslog_partial += n; } else n = 0; if (!n) break; mutex_unlock(&syslog_lock); err = copy_to_user(buf, text + skip, n); mutex_lock(&syslog_lock); if (err) { if (!len) len = -EFAULT; break; } len += n; size -= n; buf += n; } while (size); out: mutex_unlock(&syslog_lock); kfree(text); return len; } static int syslog_print_all(char __user *buf, int size, bool clear) { struct printk_info info; struct printk_record r; char *text; int len = 0; u64 seq; bool time; text = kmalloc(PRINTK_MESSAGE_MAX, GFP_KERNEL); if (!text) return -ENOMEM; time = printk_time; /* * Find first record that fits, including all following records, * into the user-provided buffer for this dump. */ seq = find_first_fitting_seq(latched_seq_read_nolock(&clear_seq), -1, size, true, time); prb_rec_init_rd(&r, &info, text, PRINTK_MESSAGE_MAX); prb_for_each_record(seq, prb, seq, &r) { int textlen; textlen = record_print_text(&r, true, time); if (len + textlen > size) { seq--; break; } if (copy_to_user(buf + len, text, textlen)) len = -EFAULT; else len += textlen; if (len < 0) break; } if (clear) { mutex_lock(&syslog_lock); latched_seq_write(&clear_seq, seq); mutex_unlock(&syslog_lock); } kfree(text); return len; } static void syslog_clear(void) { mutex_lock(&syslog_lock); latched_seq_write(&clear_seq, prb_next_seq(prb)); mutex_unlock(&syslog_lock); } int do_syslog(int type, char __user *buf, int len, int source) { struct printk_info info; bool clear = false; static int saved_console_loglevel = LOGLEVEL_DEFAULT; int error; error = check_syslog_permissions(type, source); if (error) return error; switch (type) { case SYSLOG_ACTION_CLOSE: /* Close log */ break; case SYSLOG_ACTION_OPEN: /* Open log */ break; case SYSLOG_ACTION_READ: /* Read from log */ if (!buf || len < 0) return -EINVAL; if (!len) return 0; if (!access_ok(buf, len)) return -EFAULT; error = syslog_print(buf, len); break; /* Read/clear last kernel messages */ case SYSLOG_ACTION_READ_CLEAR: clear = true; fallthrough; /* Read last kernel messages */ case SYSLOG_ACTION_READ_ALL: if (!buf || len < 0) return -EINVAL; if (!len) return 0; if (!access_ok(buf, len)) return -EFAULT; error = syslog_print_all(buf, len, clear); break; /* Clear ring buffer */ case SYSLOG_ACTION_CLEAR: syslog_clear(); break; /* Disable logging to console */ case SYSLOG_ACTION_CONSOLE_OFF: if (saved_console_loglevel == LOGLEVEL_DEFAULT) saved_console_loglevel = console_loglevel; console_loglevel = minimum_console_loglevel; break; /* Enable logging to console */ case SYSLOG_ACTION_CONSOLE_ON: if (saved_console_loglevel != LOGLEVEL_DEFAULT) { console_loglevel = saved_console_loglevel; saved_console_loglevel = LOGLEVEL_DEFAULT; } break; /* Set level of messages printed to console */ case SYSLOG_ACTION_CONSOLE_LEVEL: if (len < 1 || len > 8) return -EINVAL; if (len < minimum_console_loglevel) len = minimum_console_loglevel; console_loglevel = len; /* Implicitly re-enable logging to console */ saved_console_loglevel = LOGLEVEL_DEFAULT; break; /* Number of chars in the log buffer */ case SYSLOG_ACTION_SIZE_UNREAD: mutex_lock(&syslog_lock); if (!prb_read_valid_info(prb, syslog_seq, &info, NULL)) { /* No unread messages. */ mutex_unlock(&syslog_lock); return 0; } if (info.seq != syslog_seq) { /* messages are gone, move to first one */ syslog_seq = info.seq; syslog_partial = 0; } if (source == SYSLOG_FROM_PROC) { /* * Short-cut for poll(/"proc/kmsg") which simply checks * for pending data, not the size; return the count of * records, not the length. */ error = prb_next_seq(prb) - syslog_seq; } else { bool time = syslog_partial ? syslog_time : printk_time; unsigned int line_count; u64 seq; prb_for_each_info(syslog_seq, prb, seq, &info, &line_count) { error += get_record_print_text_size(&info, line_count, true, time); time = printk_time; } error -= syslog_partial; } mutex_unlock(&syslog_lock); break; /* Size of the log buffer */ case SYSLOG_ACTION_SIZE_BUFFER: error = log_buf_len; break; default: error = -EINVAL; break; } return error; } SYSCALL_DEFINE3(syslog, int, type, char __user *, buf, int, len) { return do_syslog(type, buf, len, SYSLOG_FROM_READER); } /* * Special console_lock variants that help to reduce the risk of soft-lockups. * They allow to pass console_lock to another printk() call using a busy wait. */ #ifdef CONFIG_LOCKDEP static struct lockdep_map console_owner_dep_map = { .name = "console_owner" }; #endif static DEFINE_RAW_SPINLOCK(console_owner_lock); static struct task_struct *console_owner; static bool console_waiter; /** * console_lock_spinning_enable - mark beginning of code where another * thread might safely busy wait * * This basically converts console_lock into a spinlock. This marks * the section where the console_lock owner can not sleep, because * there may be a waiter spinning (like a spinlock). Also it must be * ready to hand over the lock at the end of the section. */ void console_lock_spinning_enable(void) { /* * Do not use spinning in panic(). The panic CPU wants to keep the lock. * Non-panic CPUs abandon the flush anyway. * * Just keep the lockdep annotation. The panic-CPU should avoid * taking console_owner_lock because it might cause a deadlock. * This looks like the easiest way how to prevent false lockdep * reports without handling races a lockless way. */ if (panic_in_progress()) goto lockdep; raw_spin_lock(&console_owner_lock); console_owner = current; raw_spin_unlock(&console_owner_lock); lockdep: /* The waiter may spin on us after setting console_owner */ spin_acquire(&console_owner_dep_map, 0, 0, _THIS_IP_); } /** * console_lock_spinning_disable_and_check - mark end of code where another * thread was able to busy wait and check if there is a waiter * @cookie: cookie returned from console_srcu_read_lock() * * This is called at the end of the section where spinning is allowed. * It has two functions. First, it is a signal that it is no longer * safe to start busy waiting for the lock. Second, it checks if * there is a busy waiter and passes the lock rights to her. * * Important: Callers lose both the console_lock and the SRCU read lock if * there was a busy waiter. They must not touch items synchronized by * console_lock or SRCU read lock in this case. * * Return: 1 if the lock rights were passed, 0 otherwise. */ int console_lock_spinning_disable_and_check(int cookie) { int waiter; /* * Ignore spinning waiters during panic() because they might get stopped * or blocked at any time, * * It is safe because nobody is allowed to start spinning during panic * in the first place. If there has been a waiter then non panic CPUs * might stay spinning. They would get stopped anyway. The panic context * will never start spinning and an interrupted spin on panic CPU will * never continue. */ if (panic_in_progress()) { /* Keep lockdep happy. */ spin_release(&console_owner_dep_map, _THIS_IP_); return 0; } raw_spin_lock(&console_owner_lock); waiter = READ_ONCE(console_waiter); console_owner = NULL; raw_spin_unlock(&console_owner_lock); if (!waiter) { spin_release(&console_owner_dep_map, _THIS_IP_); return 0; } /* The waiter is now free to continue */ WRITE_ONCE(console_waiter, false); spin_release(&console_owner_dep_map, _THIS_IP_); /* * Preserve lockdep lock ordering. Release the SRCU read lock before * releasing the console_lock. */ console_srcu_read_unlock(cookie); /* * Hand off console_lock to waiter. The waiter will perform * the up(). After this, the waiter is the console_lock owner. */ mutex_release(&console_lock_dep_map, _THIS_IP_); return 1; } /** * console_trylock_spinning - try to get console_lock by busy waiting * * This allows to busy wait for the console_lock when the current * owner is running in specially marked sections. It means that * the current owner is running and cannot reschedule until it * is ready to lose the lock. * * Return: 1 if we got the lock, 0 othrewise */ static int console_trylock_spinning(void) { struct task_struct *owner = NULL; bool waiter; bool spin = false; unsigned long flags; if (console_trylock()) return 1; /* * It's unsafe to spin once a panic has begun. If we are the * panic CPU, we may have already halted the owner of the * console_sem. If we are not the panic CPU, then we should * avoid taking console_sem, so the panic CPU has a better * chance of cleanly acquiring it later. */ if (panic_in_progress()) return 0; printk_safe_enter_irqsave(flags); raw_spin_lock(&console_owner_lock); owner = READ_ONCE(console_owner); waiter = READ_ONCE(console_waiter); if (!waiter && owner && owner != current) { WRITE_ONCE(console_waiter, true); spin = true; } raw_spin_unlock(&console_owner_lock); /* * If there is an active printk() writing to the * consoles, instead of having it write our data too, * see if we can offload that load from the active * printer, and do some printing ourselves. * Go into a spin only if there isn't already a waiter * spinning, and there is an active printer, and * that active printer isn't us (recursive printk?). */ if (!spin) { printk_safe_exit_irqrestore(flags); return 0; } /* We spin waiting for the owner to release us */ spin_acquire(&console_owner_dep_map, 0, 0, _THIS_IP_); /* Owner will clear console_waiter on hand off */ while (READ_ONCE(console_waiter)) cpu_relax(); spin_release(&console_owner_dep_map, _THIS_IP_); printk_safe_exit_irqrestore(flags); /* * The owner passed the console lock to us. * Since we did not spin on console lock, annotate * this as a trylock. Otherwise lockdep will * complain. */ mutex_acquire(&console_lock_dep_map, 0, 1, _THIS_IP_); /* * Update @console_may_schedule for trylock because the previous * owner may have been schedulable. */ console_may_schedule = 0; return 1; } /* * Recursion is tracked separately on each CPU. If NMIs are supported, an * additional NMI context per CPU is also separately tracked. Until per-CPU * is available, a separate "early tracking" is performed. */ static DEFINE_PER_CPU(u8, printk_count); static u8 printk_count_early; #ifdef CONFIG_HAVE_NMI static DEFINE_PER_CPU(u8, printk_count_nmi); static u8 printk_count_nmi_early; #endif /* * Recursion is limited to keep the output sane. printk() should not require * more than 1 level of recursion (allowing, for example, printk() to trigger * a WARN), but a higher value is used in case some printk-internal errors * exist, such as the ringbuffer validation checks failing. */ #define PRINTK_MAX_RECURSION 3 /* * Return a pointer to the dedicated counter for the CPU+context of the * caller. */ static u8 *__printk_recursion_counter(void) { #ifdef CONFIG_HAVE_NMI if (in_nmi()) { if (printk_percpu_data_ready()) return this_cpu_ptr(&printk_count_nmi); return &printk_count_nmi_early; } #endif if (printk_percpu_data_ready()) return this_cpu_ptr(&printk_count); return &printk_count_early; } /* * Enter recursion tracking. Interrupts are disabled to simplify tracking. * The caller must check the boolean return value to see if the recursion is * allowed. On failure, interrupts are not disabled. * * @recursion_ptr must be a variable of type (u8 *) and is the same variable * that is passed to printk_exit_irqrestore(). */ #define printk_enter_irqsave(recursion_ptr, flags) \ ({ \ bool success = true; \ \ typecheck(u8 *, recursion_ptr); \ local_irq_save(flags); \ (recursion_ptr) = __printk_recursion_counter(); \ if (*(recursion_ptr) > PRINTK_MAX_RECURSION) { \ local_irq_restore(flags); \ success = false; \ } else { \ (*(recursion_ptr))++; \ } \ success; \ }) /* Exit recursion tracking, restoring interrupts. */ #define printk_exit_irqrestore(recursion_ptr, flags) \ do { \ typecheck(u8 *, recursion_ptr); \ (*(recursion_ptr))--; \ local_irq_restore(flags); \ } while (0) int printk_delay_msec __read_mostly; static inline void printk_delay(int level) { boot_delay_msec(level); if (unlikely(printk_delay_msec)) { int m = printk_delay_msec; while (m--) { mdelay(1); touch_nmi_watchdog(); } } } static inline u32 printk_caller_id(void) { return in_task() ? task_pid_nr(current) : 0x80000000 + smp_processor_id(); } /** * printk_parse_prefix - Parse level and control flags. * * @text: The terminated text message. * @level: A pointer to the current level value, will be updated. * @flags: A pointer to the current printk_info flags, will be updated. * * @level may be NULL if the caller is not interested in the parsed value. * Otherwise the variable pointed to by @level must be set to * LOGLEVEL_DEFAULT in order to be updated with the parsed value. * * @flags may be NULL if the caller is not interested in the parsed value. * Otherwise the variable pointed to by @flags will be OR'd with the parsed * value. * * Return: The length of the parsed level and control flags. */ u16 printk_parse_prefix(const char *text, int *level, enum printk_info_flags *flags) { u16 prefix_len = 0; int kern_level; while (*text) { kern_level = printk_get_level(text); if (!kern_level) break; switch (kern_level) { case '0' ... '7': if (level && *level == LOGLEVEL_DEFAULT) *level = kern_level - '0'; break; case 'c': /* KERN_CONT */ if (flags) *flags |= LOG_CONT; } prefix_len += 2; text += 2; } return prefix_len; } __printf(5, 0) static u16 printk_sprint(char *text, u16 size, int facility, enum printk_info_flags *flags, const char *fmt, va_list args) { u16 text_len; text_len = vscnprintf(text, size, fmt, args); /* Mark and strip a trailing newline. */ if (text_len && text[text_len - 1] == '\n') { text_len--; *flags |= LOG_NEWLINE; } /* Strip log level and control flags. */ if (facility == 0) { u16 prefix_len; prefix_len = printk_parse_prefix(text, NULL, NULL); if (prefix_len) { text_len -= prefix_len; memmove(text, text + prefix_len, text_len); } } trace_console(text, text_len); return text_len; } __printf(4, 0) int vprintk_store(int facility, int level, const struct dev_printk_info *dev_info, const char *fmt, va_list args) { struct prb_reserved_entry e; enum printk_info_flags flags = 0; struct printk_record r; unsigned long irqflags; u16 trunc_msg_len = 0; char prefix_buf[8]; u8 *recursion_ptr; u16 reserve_size; va_list args2; u32 caller_id; u16 text_len; int ret = 0; u64 ts_nsec; if (!printk_enter_irqsave(recursion_ptr, irqflags)) return 0; /* * Since the duration of printk() can vary depending on the message * and state of the ringbuffer, grab the timestamp now so that it is * close to the call of printk(). This provides a more deterministic * timestamp with respect to the caller. */ ts_nsec = local_clock(); caller_id = printk_caller_id(); /* * The sprintf needs to come first since the syslog prefix might be * passed in as a parameter. An extra byte must be reserved so that * later the vscnprintf() into the reserved buffer has room for the * terminating '\0', which is not counted by vsnprintf(). */ va_copy(args2, args); reserve_size = vsnprintf(&prefix_buf[0], sizeof(prefix_buf), fmt, args2) + 1; va_end(args2); if (reserve_size > PRINTKRB_RECORD_MAX) reserve_size = PRINTKRB_RECORD_MAX; /* Extract log level or control flags. */ if (facility == 0) printk_parse_prefix(&prefix_buf[0], &level, &flags); if (level == LOGLEVEL_DEFAULT) level = default_message_loglevel; if (dev_info) flags |= LOG_NEWLINE; if (is_printk_force_console()) flags |= LOG_FORCE_CON; if (flags & LOG_CONT) { prb_rec_init_wr(&r, reserve_size); if (prb_reserve_in_last(&e, prb, &r, caller_id, PRINTKRB_RECORD_MAX)) { text_len = printk_sprint(&r.text_buf[r.info->text_len], reserve_size, facility, &flags, fmt, args); r.info->text_len += text_len; if (flags & LOG_FORCE_CON) r.info->flags |= LOG_FORCE_CON; if (flags & LOG_NEWLINE) { r.info->flags |= LOG_NEWLINE; prb_final_commit(&e); } else { prb_commit(&e); } ret = text_len; goto out; } } /* * Explicitly initialize the record before every prb_reserve() call. * prb_reserve_in_last() and prb_reserve() purposely invalidate the * structure when they fail. */ prb_rec_init_wr(&r, reserve_size); if (!prb_reserve(&e, prb, &r)) { /* truncate the message if it is too long for empty buffer */ truncate_msg(&reserve_size, &trunc_msg_len); prb_rec_init_wr(&r, reserve_size + trunc_msg_len); if (!prb_reserve(&e, prb, &r)) goto out; } /* fill message */ text_len = printk_sprint(&r.text_buf[0], reserve_size, facility, &flags, fmt, args); if (trunc_msg_len) memcpy(&r.text_buf[text_len], trunc_msg, trunc_msg_len); r.info->text_len = text_len + trunc_msg_len; r.info->facility = facility; r.info->level = level & 7; r.info->flags = flags & 0x1f; r.info->ts_nsec = ts_nsec; r.info->caller_id = caller_id; if (dev_info) memcpy(&r.info->dev_info, dev_info, sizeof(r.info->dev_info)); /* A message without a trailing newline can be continued. */ if (!(flags & LOG_NEWLINE)) prb_commit(&e); else prb_final_commit(&e); ret = text_len + trunc_msg_len; out: printk_exit_irqrestore(recursion_ptr, irqflags); return ret; } /* * This acts as a one-way switch to allow legacy consoles to print from * the printk() caller context on a panic CPU. It also attempts to flush * the legacy consoles in this context. */ void printk_legacy_allow_panic_sync(void) { struct console_flush_type ft; legacy_allow_panic_sync = true; printk_get_console_flush_type(&ft); if (ft.legacy_direct) { if (console_trylock()) console_unlock(); } } bool __read_mostly debug_non_panic_cpus; #ifdef CONFIG_PRINTK_CALLER static int __init debug_non_panic_cpus_setup(char *str) { debug_non_panic_cpus = true; pr_info("allow messages from non-panic CPUs in panic()\n"); return 0; } early_param("debug_non_panic_cpus", debug_non_panic_cpus_setup); module_param(debug_non_panic_cpus, bool, 0644); MODULE_PARM_DESC(debug_non_panic_cpus, "allow messages from non-panic CPUs in panic()"); #endif asmlinkage int vprintk_emit(int facility, int level, const struct dev_printk_info *dev_info, const char *fmt, va_list args) { struct console_flush_type ft; int printed_len; /* Suppress unimportant messages after panic happens */ if (unlikely(suppress_printk)) return 0; /* * The messages on the panic CPU are the most important. If * non-panic CPUs are generating any messages, they will be * silently dropped. */ if (other_cpu_in_panic() && !debug_non_panic_cpus && !panic_triggering_all_cpu_backtrace) return 0; printk_get_console_flush_type(&ft); /* If called from the scheduler, we can not call up(). */ if (level == LOGLEVEL_SCHED) { level = LOGLEVEL_DEFAULT; ft.legacy_offload |= ft.legacy_direct; ft.legacy_direct = false; } printk_delay(level); printed_len = vprintk_store(facility, level, dev_info, fmt, args); if (ft.nbcon_atomic) nbcon_atomic_flush_pending(); if (ft.nbcon_offload) nbcon_kthreads_wake(); if (ft.legacy_direct) { /* * The caller may be holding system-critical or * timing-sensitive locks. Disable preemption during * printing of all remaining records to all consoles so that * this context can return as soon as possible. Hopefully * another printk() caller will take over the printing. */ preempt_disable(); /* * Try to acquire and then immediately release the console * semaphore. The release will print out buffers. With the * spinning variant, this context tries to take over the * printing from another printing context. */ if (console_trylock_spinning()) console_unlock(); preempt_enable(); } if (ft.legacy_offload) defer_console_output(); else wake_up_klogd(); return printed_len; } EXPORT_SYMBOL(vprintk_emit); int vprintk_default(const char *fmt, va_list args) { return vprintk_emit(0, LOGLEVEL_DEFAULT, NULL, fmt, args); } EXPORT_SYMBOL_GPL(vprintk_default); asmlinkage __visible int _printk(const char *fmt, ...) { va_list args; int r; va_start(args, fmt); r = vprintk(fmt, args); va_end(args); return r; } EXPORT_SYMBOL(_printk); static bool __pr_flush(struct console *con, int timeout_ms, bool reset_on_progress); #else /* CONFIG_PRINTK */ #define printk_time false #define prb_read_valid(rb, seq, r) false #define prb_first_valid_seq(rb) 0 #define prb_next_seq(rb) 0 static u64 syslog_seq; static bool __pr_flush(struct console *con, int timeout_ms, bool reset_on_progress) { return true; } #endif /* CONFIG_PRINTK */ #ifdef CONFIG_EARLY_PRINTK struct console *early_console; asmlinkage __visible void early_printk(const char *fmt, ...) { va_list ap; char buf[512]; int n; if (!early_console) return; va_start(ap, fmt); n = vscnprintf(buf, sizeof(buf), fmt, ap); va_end(ap); early_console->write(early_console, buf, n); } #endif static void set_user_specified(struct console_cmdline *c, bool user_specified) { if (!user_specified) return; /* * @c console was defined by the user on the command line. * Do not clear when added twice also by SPCR or the device tree. */ c->user_specified = true; /* At least one console defined by the user on the command line. */ console_set_on_cmdline = 1; } static int __add_preferred_console(const char *name, const short idx, const char *devname, char *options, char *brl_options, bool user_specified) { struct console_cmdline *c; int i; if (!name && !devname) return -EINVAL; /* * We use a signed short index for struct console for device drivers to * indicate a not yet assigned index or port. However, a negative index * value is not valid when the console name and index are defined on * the command line. */ if (name && idx < 0) return -EINVAL; /* * See if this tty is not yet registered, and * if we have a slot free. */ for (i = 0, c = console_cmdline; i < MAX_CMDLINECONSOLES && (c->name[0] || c->devname[0]); i++, c++) { if ((name && strcmp(c->name, name) == 0 && c->index == idx) || (devname && strcmp(c->devname, devname) == 0)) { if (!brl_options) preferred_console = i; set_user_specified(c, user_specified); return 0; } } if (i == MAX_CMDLINECONSOLES) return -E2BIG; if (!brl_options) preferred_console = i; if (name) strscpy(c->name, name); if (devname) strscpy(c->devname, devname); c->options = options; set_user_specified(c, user_specified); braille_set_options(c, brl_options); c->index = idx; return 0; } static int __init console_msg_format_setup(char *str) { if (!strcmp(str, "syslog")) console_msg_format = MSG_FORMAT_SYSLOG; if (!strcmp(str, "default")) console_msg_format = MSG_FORMAT_DEFAULT; return 1; } __setup("console_msg_format=", console_msg_format_setup); /* * Set up a console. Called via do_early_param() in init/main.c * for each "console=" parameter in the boot command line. */ static int __init console_setup(char *str) { static_assert(sizeof(console_cmdline[0].devname) >= sizeof(console_cmdline[0].name) + 4); char buf[sizeof(console_cmdline[0].devname)]; char *brl_options = NULL; char *ttyname = NULL; char *devname = NULL; char *options; char *s; int idx; /* * console="" or console=null have been suggested as a way to * disable console output. Use ttynull that has been created * for exactly this purpose. */ if (str[0] == 0 || strcmp(str, "null") == 0) { __add_preferred_console("ttynull", 0, NULL, NULL, NULL, true); return 1; } if (_braille_console_setup(&str, &brl_options)) return 1; /* For a DEVNAME:0.0 style console the character device is unknown early */ if (strchr(str, ':')) devname = buf; else ttyname = buf; /* * Decode str into name, index, options. */ if (ttyname && isdigit(str[0])) scnprintf(buf, sizeof(buf), "ttyS%s", str); else strscpy(buf, str); options = strchr(str, ','); if (options) *(options++) = 0; #ifdef __sparc__ if (!strcmp(str, "ttya")) strscpy(buf, "ttyS0"); if (!strcmp(str, "ttyb")) strscpy(buf, "ttyS1"); #endif for (s = buf; *s; s++) if ((ttyname && isdigit(*s)) || *s == ',') break; /* @idx will get defined when devname matches. */ if (devname) idx = -1; else idx = simple_strtoul(s, NULL, 10); *s = 0; __add_preferred_console(ttyname, idx, devname, options, brl_options, true); return 1; } __setup("console=", console_setup); /** * add_preferred_console - add a device to the list of preferred consoles. * @name: device name * @idx: device index * @options: options for this console * * The last preferred console added will be used for kernel messages * and stdin/out/err for init. Normally this is used by console_setup * above to handle user-supplied console arguments; however it can also * be used by arch-specific code either to override the user or more * commonly to provide a default console (ie from PROM variables) when * the user has not supplied one. */ int add_preferred_console(const char *name, const short idx, char *options) { return __add_preferred_console(name, idx, NULL, options, NULL, false); } /** * match_devname_and_update_preferred_console - Update a preferred console * when matching devname is found. * @devname: DEVNAME:0.0 style device name * @name: Name of the corresponding console driver, e.g. "ttyS" * @idx: Console index, e.g. port number. * * The function checks whether a device with the given @devname is * preferred via the console=DEVNAME:0.0 command line option. * It fills the missing console driver name and console index * so that a later register_console() call could find (match) * and enable this device. * * It might be used when a driver subsystem initializes particular * devices with already known DEVNAME:0.0 style names. And it * could predict which console driver name and index this device * would later get associated with. * * Return: 0 on success, negative error code on failure. */ int match_devname_and_update_preferred_console(const char *devname, const char *name, const short idx) { struct console_cmdline *c = console_cmdline; int i; if (!devname || !strlen(devname) || !name || !strlen(name) || idx < 0) return -EINVAL; for (i = 0; i < MAX_CMDLINECONSOLES && (c->name[0] || c->devname[0]); i++, c++) { if (!strcmp(devname, c->devname)) { pr_info("associate the preferred console \"%s\" with \"%s%d\"\n", devname, name, idx); strscpy(c->name, name); c->index = idx; return 0; } } return -ENOENT; } EXPORT_SYMBOL_GPL(match_devname_and_update_preferred_console); bool console_suspend_enabled = true; EXPORT_SYMBOL(console_suspend_enabled); static int __init console_suspend_disable(char *str) { console_suspend_enabled = false; return 1; } __setup("no_console_suspend", console_suspend_disable); module_param_named(console_suspend, console_suspend_enabled, bool, S_IRUGO | S_IWUSR); MODULE_PARM_DESC(console_suspend, "suspend console during suspend" " and hibernate operations"); static bool printk_console_no_auto_verbose; void console_verbose(void) { if (console_loglevel && !printk_console_no_auto_verbose) console_loglevel = CONSOLE_LOGLEVEL_MOTORMOUTH; } EXPORT_SYMBOL_GPL(console_verbose); module_param_named(console_no_auto_verbose, printk_console_no_auto_verbose, bool, 0644); MODULE_PARM_DESC(console_no_auto_verbose, "Disable console loglevel raise to highest on oops/panic/etc"); /** * console_suspend_all - suspend the console subsystem * * This disables printk() while we go into suspend states */ void console_suspend_all(void) { struct console *con; if (!console_suspend_enabled) return; pr_info("Suspending console(s) (use no_console_suspend to debug)\n"); pr_flush(1000, true); console_list_lock(); for_each_console(con) console_srcu_write_flags(con, con->flags | CON_SUSPENDED); console_list_unlock(); /* * Ensure that all SRCU list walks have completed. All printing * contexts must be able to see that they are suspended so that it * is guaranteed that all printing has stopped when this function * completes. */ synchronize_srcu(&console_srcu); } void console_resume_all(void) { struct console_flush_type ft; struct console *con; if (!console_suspend_enabled) return; console_list_lock(); for_each_console(con) console_srcu_write_flags(con, con->flags & ~CON_SUSPENDED); console_list_unlock(); /* * Ensure that all SRCU list walks have completed. All printing * contexts must be able to see they are no longer suspended so * that they are guaranteed to wake up and resume printing. */ synchronize_srcu(&console_srcu); printk_get_console_flush_type(&ft); if (ft.nbcon_offload) nbcon_kthreads_wake(); if (ft.legacy_offload) defer_console_output(); pr_flush(1000, true); } /** * console_cpu_notify - print deferred console messages after CPU hotplug * @cpu: unused * * If printk() is called from a CPU that is not online yet, the messages * will be printed on the console only if there are CON_ANYTIME consoles. * This function is called when a new CPU comes online (or fails to come * up) or goes offline. */ static int console_cpu_notify(unsigned int cpu) { struct console_flush_type ft; if (!cpuhp_tasks_frozen) { printk_get_console_flush_type(&ft); if (ft.nbcon_atomic) nbcon_atomic_flush_pending(); if (ft.legacy_direct) { if (console_trylock()) console_unlock(); } } return 0; } /** * console_lock - block the console subsystem from printing * * Acquires a lock which guarantees that no consoles will * be in or enter their write() callback. * * Can sleep, returns nothing. */ void console_lock(void) { might_sleep(); /* On panic, the console_lock must be left to the panic cpu. */ while (other_cpu_in_panic()) msleep(1000); down_console_sem(); console_locked = 1; console_may_schedule = 1; } EXPORT_SYMBOL(console_lock); /** * console_trylock - try to block the console subsystem from printing * * Try to acquire a lock which guarantees that no consoles will * be in or enter their write() callback. * * returns 1 on success, and 0 on failure to acquire the lock. */ int console_trylock(void) { /* On panic, the console_lock must be left to the panic cpu. */ if (other_cpu_in_panic()) return 0; if (down_trylock_console_sem()) return 0; console_locked = 1; console_may_schedule = 0; return 1; } EXPORT_SYMBOL(console_trylock); int is_console_locked(void) { return console_locked; } EXPORT_SYMBOL(is_console_locked); static void __console_unlock(void) { console_locked = 0; up_console_sem(); } #ifdef CONFIG_PRINTK /* * Prepend the message in @pmsg->pbufs->outbuf. This is achieved by shifting * the existing message over and inserting the scratchbuf message. * * @pmsg is the original printk message. * @fmt is the printf format of the message which will prepend the existing one. * * If there is not enough space in @pmsg->pbufs->outbuf, the existing * message text will be sufficiently truncated. * * If @pmsg->pbufs->outbuf is modified, @pmsg->outbuf_len is updated. */ __printf(2, 3) static void console_prepend_message(struct printk_message *pmsg, const char *fmt, ...) { struct printk_buffers *pbufs = pmsg->pbufs; const size_t scratchbuf_sz = sizeof(pbufs->scratchbuf); const size_t outbuf_sz = sizeof(pbufs->outbuf); char *scratchbuf = &pbufs->scratchbuf[0]; char *outbuf = &pbufs->outbuf[0]; va_list args; size_t len; va_start(args, fmt); len = vscnprintf(scratchbuf, scratchbuf_sz, fmt, args); va_end(args); /* * Make sure outbuf is sufficiently large before prepending. * Keep at least the prefix when the message must be truncated. * It is a rather theoretical problem when someone tries to * use a minimalist buffer. */ if (WARN_ON_ONCE(len + PRINTK_PREFIX_MAX >= outbuf_sz)) return; if (pmsg->outbuf_len + len >= outbuf_sz) { /* Truncate the message, but keep it terminated. */ pmsg->outbuf_len = outbuf_sz - (len + 1); outbuf[pmsg->outbuf_len] = 0; } memmove(outbuf + len, outbuf, pmsg->outbuf_len + 1); memcpy(outbuf, scratchbuf, len); pmsg->outbuf_len += len; } /* * Prepend the message in @pmsg->pbufs->outbuf with a "dropped message". * @pmsg->outbuf_len is updated appropriately. * * @pmsg is the printk message to prepend. * * @dropped is the dropped count to report in the dropped message. */ void console_prepend_dropped(struct printk_message *pmsg, unsigned long dropped) { console_prepend_message(pmsg, "** %lu printk messages dropped **\n", dropped); } /* * Prepend the message in @pmsg->pbufs->outbuf with a "replay message". * @pmsg->outbuf_len is updated appropriately. * * @pmsg is the printk message to prepend. */ void console_prepend_replay(struct printk_message *pmsg) { console_prepend_message(pmsg, "** replaying previous printk message **\n"); } /* * Read and format the specified record (or a later record if the specified * record is not available). * * @pmsg will contain the formatted result. @pmsg->pbufs must point to a * struct printk_buffers. * * @seq is the record to read and format. If it is not available, the next * valid record is read. * * @is_extended specifies if the message should be formatted for extended * console output. * * @may_supress specifies if records may be skipped based on loglevel. * * Returns false if no record is available. Otherwise true and all fields * of @pmsg are valid. (See the documentation of struct printk_message * for information about the @pmsg fields.) */ bool printk_get_next_message(struct printk_message *pmsg, u64 seq, bool is_extended, bool may_suppress) { struct printk_buffers *pbufs = pmsg->pbufs; const size_t scratchbuf_sz = sizeof(pbufs->scratchbuf); const size_t outbuf_sz = sizeof(pbufs->outbuf); char *scratchbuf = &pbufs->scratchbuf[0]; char *outbuf = &pbufs->outbuf[0]; struct printk_info info; struct printk_record r; size_t len = 0; bool force_con; /* * Formatting extended messages requires a separate buffer, so use the * scratch buffer to read in the ringbuffer text. * * Formatting normal messages is done in-place, so read the ringbuffer * text directly into the output buffer. */ if (is_extended) prb_rec_init_rd(&r, &info, scratchbuf, scratchbuf_sz); else prb_rec_init_rd(&r, &info, outbuf, outbuf_sz); if (!prb_read_valid(prb, seq, &r)) return false; pmsg->seq = r.info->seq; pmsg->dropped = r.info->seq - seq; force_con = r.info->flags & LOG_FORCE_CON; /* * Skip records that are not forced to be printed on consoles and that * has level above the console loglevel. */ if (!force_con && may_suppress && suppress_message_printing(r.info->level)) goto out; if (is_extended) { len = info_print_ext_header(outbuf, outbuf_sz, r.info); len += msg_print_ext_body(outbuf + len, outbuf_sz - len, &r.text_buf[0], r.info->text_len, &r.info->dev_info); } else { len = record_print_text(&r, console_msg_format & MSG_FORMAT_SYSLOG, printk_time); } out: pmsg->outbuf_len = len; return true; } /* * Legacy console printing from printk() caller context does not respect * raw_spinlock/spinlock nesting. For !PREEMPT_RT the lockdep warning is a * false positive. For PREEMPT_RT the false positive condition does not * occur. * * This map is used to temporarily establish LD_WAIT_SLEEP context for the * console write() callback when legacy printing to avoid false positive * lockdep complaints, thus allowing lockdep to continue to function for * real issues. */ #ifdef CONFIG_PREEMPT_RT static inline void printk_legacy_allow_spinlock_enter(void) { } static inline void printk_legacy_allow_spinlock_exit(void) { } #else static DEFINE_WAIT_OVERRIDE_MAP(printk_legacy_map, LD_WAIT_SLEEP); static inline void printk_legacy_allow_spinlock_enter(void) { lock_map_acquire_try(&printk_legacy_map); } static inline void printk_legacy_allow_spinlock_exit(void) { lock_map_release(&printk_legacy_map); } #endif /* CONFIG_PREEMPT_RT */ /* * Used as the printk buffers for non-panic, serialized console printing. * This is for legacy (!CON_NBCON) as well as all boot (CON_BOOT) consoles. * Its usage requires the console_lock held. */ struct printk_buffers printk_shared_pbufs; /* * Print one record for the given console. The record printed is whatever * record is the next available record for the given console. * * @handover will be set to true if a printk waiter has taken over the * console_lock, in which case the caller is no longer holding both the * console_lock and the SRCU read lock. Otherwise it is set to false. * * @cookie is the cookie from the SRCU read lock. * * Returns false if the given console has no next record to print, otherwise * true. * * Requires the console_lock and the SRCU read lock. */ static bool console_emit_next_record(struct console *con, bool *handover, int cookie) { bool is_extended = console_srcu_read_flags(con) & CON_EXTENDED; char *outbuf = &printk_shared_pbufs.outbuf[0]; struct printk_message pmsg = { .pbufs = &printk_shared_pbufs, }; unsigned long flags; *handover = false; if (!printk_get_next_message(&pmsg, con->seq, is_extended, true)) return false; con->dropped += pmsg.dropped; /* Skip messages of formatted length 0. */ if (pmsg.outbuf_len == 0) { con->seq = pmsg.seq + 1; goto skip; } if (con->dropped && !is_extended) { console_prepend_dropped(&pmsg, con->dropped); con->dropped = 0; } /* Write everything out to the hardware. */ if (force_legacy_kthread() && !panic_in_progress()) { /* * With forced threading this function is in a task context * (either legacy kthread or get_init_console_seq()). There * is no need for concern about printk reentrance, handovers, * or lockdep complaints. */ con->write(con, outbuf, pmsg.outbuf_len); con->seq = pmsg.seq + 1; } else { /* * While actively printing out messages, if another printk() * were to occur on another CPU, it may wait for this one to * finish. This task can not be preempted if there is a * waiter waiting to take over. * * Interrupts are disabled because the hand over to a waiter * must not be interrupted until the hand over is completed * (@console_waiter is cleared). */ printk_safe_enter_irqsave(flags); console_lock_spinning_enable(); /* Do not trace print latency. */ stop_critical_timings(); printk_legacy_allow_spinlock_enter(); con->write(con, outbuf, pmsg.outbuf_len); printk_legacy_allow_spinlock_exit(); start_critical_timings(); con->seq = pmsg.seq + 1; *handover = console_lock_spinning_disable_and_check(cookie); printk_safe_exit_irqrestore(flags); } skip: return true; } #else static bool console_emit_next_record(struct console *con, bool *handover, int cookie) { *handover = false; return false; } static inline void printk_kthreads_check_locked(void) { } #endif /* CONFIG_PRINTK */ /* * Print out all remaining records to all consoles. * * @do_cond_resched is set by the caller. It can be true only in schedulable * context. * * @next_seq is set to the sequence number after the last available record. * The value is valid only when this function returns true. It means that all * usable consoles are completely flushed. * * @handover will be set to true if a printk waiter has taken over the * console_lock, in which case the caller is no longer holding the * console_lock. Otherwise it is set to false. * * Returns true when there was at least one usable console and all messages * were flushed to all usable consoles. A returned false informs the caller * that everything was not flushed (either there were no usable consoles or * another context has taken over printing or it is a panic situation and this * is not the panic CPU). Regardless the reason, the caller should assume it * is not useful to immediately try again. * * Requires the console_lock. */ static bool console_flush_all(bool do_cond_resched, u64 *next_seq, bool *handover) { struct console_flush_type ft; bool any_usable = false; struct console *con; bool any_progress; int cookie; *next_seq = 0; *handover = false; do { any_progress = false; printk_get_console_flush_type(&ft); cookie = console_srcu_read_lock(); for_each_console_srcu(con) { short flags = console_srcu_read_flags(con); u64 printk_seq; bool progress; /* * console_flush_all() is only responsible for nbcon * consoles when the nbcon consoles cannot print via * their atomic or threaded flushing. */ if ((flags & CON_NBCON) && (ft.nbcon_atomic || ft.nbcon_offload)) continue; if (!console_is_usable(con, flags, !do_cond_resched)) continue; any_usable = true; if (flags & CON_NBCON) { progress = nbcon_legacy_emit_next_record(con, handover, cookie, !do_cond_resched); printk_seq = nbcon_seq_read(con); } else { progress = console_emit_next_record(con, handover, cookie); printk_seq = con->seq; } /* * If a handover has occurred, the SRCU read lock * is already released. */ if (*handover) return false; /* Track the next of the highest seq flushed. */ if (printk_seq > *next_seq) *next_seq = printk_seq; if (!progress) continue; any_progress = true; /* Allow panic_cpu to take over the consoles safely. */ if (other_cpu_in_panic()) goto abandon; if (do_cond_resched) cond_resched(); } console_srcu_read_unlock(cookie); } while (any_progress); return any_usable; abandon: console_srcu_read_unlock(cookie); return false; } static void __console_flush_and_unlock(void) { bool do_cond_resched; bool handover; bool flushed; u64 next_seq; /* * Console drivers are called with interrupts disabled, so * @console_may_schedule should be cleared before; however, we may * end up dumping a lot of lines, for example, if called from * console registration path, and should invoke cond_resched() * between lines if allowable. Not doing so can cause a very long * scheduling stall on a slow console leading to RCU stall and * softlockup warnings which exacerbate the issue with more * messages practically incapacitating the system. Therefore, create * a local to use for the printing loop. */ do_cond_resched = console_may_schedule; do { console_may_schedule = 0; flushed = console_flush_all(do_cond_resched, &next_seq, &handover); if (!handover) __console_unlock(); /* * Abort if there was a failure to flush all messages to all * usable consoles. Either it is not possible to flush (in * which case it would be an infinite loop of retrying) or * another context has taken over printing. */ if (!flushed) break; /* * Some context may have added new records after * console_flush_all() but before unlocking the console. * Re-check if there is a new record to flush. If the trylock * fails, another context is already handling the printing. */ } while (prb_read_valid(prb, next_seq, NULL) && console_trylock()); } /** * console_unlock - unblock the legacy console subsystem from printing * * Releases the console_lock which the caller holds to block printing of * the legacy console subsystem. * * While the console_lock was held, console output may have been buffered * by printk(). If this is the case, console_unlock() emits the output on * legacy consoles prior to releasing the lock. * * console_unlock(); may be called from any context. */ void console_unlock(void) { struct console_flush_type ft; printk_get_console_flush_type(&ft); if (ft.legacy_direct) __console_flush_and_unlock(); else __console_unlock(); } EXPORT_SYMBOL(console_unlock); /** * console_conditional_schedule - yield the CPU if required * * If the console code is currently allowed to sleep, and * if this CPU should yield the CPU to another task, do * so here. * * Must be called within console_lock();. */ void __sched console_conditional_schedule(void) { if (console_may_schedule) cond_resched(); } EXPORT_SYMBOL(console_conditional_schedule); void console_unblank(void) { bool found_unblank = false; struct console *c; int cookie; /* * First check if there are any consoles implementing the unblank() * callback. If not, there is no reason to continue and take the * console lock, which in particular can be dangerous if * @oops_in_progress is set. */ cookie = console_srcu_read_lock(); for_each_console_srcu(c) { short flags = console_srcu_read_flags(c); if (flags & CON_SUSPENDED) continue; if ((flags & CON_ENABLED) && c->unblank) { found_unblank = true; break; } } console_srcu_read_unlock(cookie); if (!found_unblank) return; /* * Stop console printing because the unblank() callback may * assume the console is not within its write() callback. * * If @oops_in_progress is set, this may be an atomic context. * In that case, attempt a trylock as best-effort. */ if (oops_in_progress) { /* Semaphores are not NMI-safe. */ if (in_nmi()) return; /* * Attempting to trylock the console lock can deadlock * if another CPU was stopped while modifying the * semaphore. "Hope and pray" that this is not the * current situation. */ if (down_trylock_console_sem() != 0) return; } else console_lock(); console_locked = 1; console_may_schedule = 0; cookie = console_srcu_read_lock(); for_each_console_srcu(c) { short flags = console_srcu_read_flags(c); if (flags & CON_SUSPENDED) continue; if ((flags & CON_ENABLED) && c->unblank) c->unblank(); } console_srcu_read_unlock(cookie); console_unlock(); if (!oops_in_progress) pr_flush(1000, true); } /* * Rewind all consoles to the oldest available record. * * IMPORTANT: The function is safe only when called under * console_lock(). It is not enforced because * it is used as a best effort in panic(). */ static void __console_rewind_all(void) { struct console *c; short flags; int cookie; u64 seq; seq = prb_first_valid_seq(prb); cookie = console_srcu_read_lock(); for_each_console_srcu(c) { flags = console_srcu_read_flags(c); if (flags & CON_NBCON) { nbcon_seq_force(c, seq); } else { /* * This assignment is safe only when called under * console_lock(). On panic, legacy consoles are * only best effort. */ c->seq = seq; } } console_srcu_read_unlock(cookie); } /** * console_flush_on_panic - flush console content on panic * @mode: flush all messages in buffer or just the pending ones * * Immediately output all pending messages no matter what. */ void console_flush_on_panic(enum con_flush_mode mode) { struct console_flush_type ft; bool handover; u64 next_seq; /* * Ignore the console lock and flush out the messages. Attempting a * trylock would not be useful because: * * - if it is contended, it must be ignored anyway * - console_lock() and console_trylock() block and fail * respectively in panic for non-panic CPUs * - semaphores are not NMI-safe */ /* * If another context is holding the console lock, * @console_may_schedule might be set. Clear it so that * this context does not call cond_resched() while flushing. */ console_may_schedule = 0; if (mode == CONSOLE_REPLAY_ALL) __console_rewind_all(); printk_get_console_flush_type(&ft); if (ft.nbcon_atomic) nbcon_atomic_flush_pending(); /* Flush legacy consoles once allowed, even when dangerous. */ if (legacy_allow_panic_sync) console_flush_all(false, &next_seq, &handover); } /* * Return the console tty driver structure and its associated index */ struct tty_driver *console_device(int *index) { struct console *c; struct tty_driver *driver = NULL; int cookie; /* * Take console_lock to serialize device() callback with * other console operations. For example, fg_console is * modified under console_lock when switching vt. */ console_lock(); cookie = console_srcu_read_lock(); for_each_console_srcu(c) { if (!c->device) continue; driver = c->device(c, index); if (driver) break; } console_srcu_read_unlock(cookie); console_unlock(); return driver; } /* * Prevent further output on the passed console device so that (for example) * serial drivers can suspend console output before suspending a port, and can * re-enable output afterwards. */ void console_suspend(struct console *console) { __pr_flush(console, 1000, true); console_list_lock(); console_srcu_write_flags(console, console->flags & ~CON_ENABLED); console_list_unlock(); /* * Ensure that all SRCU list walks have completed. All contexts must * be able to see that this console is disabled so that (for example) * the caller can suspend the port without risk of another context * using the port. */ synchronize_srcu(&console_srcu); } EXPORT_SYMBOL(console_suspend); void console_resume(struct console *console) { struct console_flush_type ft; bool is_nbcon; console_list_lock(); console_srcu_write_flags(console, console->flags | CON_ENABLED); is_nbcon = console->flags & CON_NBCON; console_list_unlock(); /* * Ensure that all SRCU list walks have completed. The related * printing context must be able to see it is enabled so that * it is guaranteed to wake up and resume printing. */ synchronize_srcu(&console_srcu); printk_get_console_flush_type(&ft); if (is_nbcon && ft.nbcon_offload) nbcon_kthread_wake(console); else if (ft.legacy_offload) defer_console_output(); __pr_flush(console, 1000, true); } EXPORT_SYMBOL(console_resume); #ifdef CONFIG_PRINTK static int unregister_console_locked(struct console *console); /* True when system boot is far enough to create printer threads. */ static bool printk_kthreads_ready __ro_after_init; static struct task_struct *printk_legacy_kthread; static bool legacy_kthread_should_wakeup(void) { struct console_flush_type ft; struct console *con; bool ret = false; int cookie; if (kthread_should_stop()) return true; printk_get_console_flush_type(&ft); cookie = console_srcu_read_lock(); for_each_console_srcu(con) { short flags = console_srcu_read_flags(con); u64 printk_seq; /* * The legacy printer thread is only responsible for nbcon * consoles when the nbcon consoles cannot print via their * atomic or threaded flushing. */ if ((flags & CON_NBCON) && (ft.nbcon_atomic || ft.nbcon_offload)) continue; if (!console_is_usable(con, flags, false)) continue; if (flags & CON_NBCON) { printk_seq = nbcon_seq_read(con); } else { /* * It is safe to read @seq because only this * thread context updates @seq. */ printk_seq = con->seq; } if (prb_read_valid(prb, printk_seq, NULL)) { ret = true; break; } } console_srcu_read_unlock(cookie); return ret; } static int legacy_kthread_func(void *unused) { for (;;) { wait_event_interruptible(legacy_wait, legacy_kthread_should_wakeup()); if (kthread_should_stop()) break; console_lock(); __console_flush_and_unlock(); } return 0; } static bool legacy_kthread_create(void) { struct task_struct *kt; lockdep_assert_console_list_lock_held(); kt = kthread_run(legacy_kthread_func, NULL, "pr/legacy"); if (WARN_ON(IS_ERR(kt))) { pr_err("failed to start legacy printing thread\n"); return false; } printk_legacy_kthread = kt; /* * It is important that console printing threads are scheduled * shortly after a printk call and with generous runtime budgets. */ sched_set_normal(printk_legacy_kthread, -20); return true; } /** * printk_kthreads_shutdown - shutdown all threaded printers * * On system shutdown all threaded printers are stopped. This allows printk * to transition back to atomic printing, thus providing a robust mechanism * for the final shutdown/reboot messages to be output. */ static void printk_kthreads_shutdown(void) { struct console *con; console_list_lock(); if (printk_kthreads_running) { printk_kthreads_running = false; for_each_console(con) { if (con->flags & CON_NBCON) nbcon_kthread_stop(con); } /* * The threads may have been stopped while printing a * backlog. Flush any records left over. */ nbcon_atomic_flush_pending(); } console_list_unlock(); } static struct syscore_ops printk_syscore_ops = { .shutdown = printk_kthreads_shutdown, }; /* * If appropriate, start nbcon kthreads and set @printk_kthreads_running. * If any kthreads fail to start, those consoles are unregistered. * * Must be called under console_list_lock(). */ static void printk_kthreads_check_locked(void) { struct hlist_node *tmp; struct console *con; lockdep_assert_console_list_lock_held(); if (!printk_kthreads_ready) return; if (have_legacy_console || have_boot_console) { if (!printk_legacy_kthread && force_legacy_kthread() && !legacy_kthread_create()) { /* * All legacy consoles must be unregistered. If there * are any nbcon consoles, they will set up their own * kthread. */ hlist_for_each_entry_safe(con, tmp, &console_list, node) { if (con->flags & CON_NBCON) continue; unregister_console_locked(con); } } } else if (printk_legacy_kthread) { kthread_stop(printk_legacy_kthread); printk_legacy_kthread = NULL; } /* * Printer threads cannot be started as long as any boot console is * registered because there is no way to synchronize the hardware * registers between boot console code and regular console code. * It can only be known that there will be no new boot consoles when * an nbcon console is registered. */ if (have_boot_console || !have_nbcon_console) { /* Clear flag in case all nbcon consoles unregistered. */ printk_kthreads_running = false; return; } if (printk_kthreads_running) return; hlist_for_each_entry_safe(con, tmp, &console_list, node) { if (!(con->flags & CON_NBCON)) continue; if (!nbcon_kthread_create(con)) unregister_console_locked(con); } printk_kthreads_running = true; } static int __init printk_set_kthreads_ready(void) { register_syscore_ops(&printk_syscore_ops); console_list_lock(); printk_kthreads_ready = true; printk_kthreads_check_locked(); console_list_unlock(); return 0; } early_initcall(printk_set_kthreads_ready); #endif /* CONFIG_PRINTK */ static int __read_mostly keep_bootcon; static int __init keep_bootcon_setup(char *str) { keep_bootcon = 1; pr_info("debug: skip boot console de-registration.\n"); return 0; } early_param("keep_bootcon", keep_bootcon_setup); static int console_call_setup(struct console *newcon, char *options) { int err; if (!newcon->setup) return 0; /* Synchronize with possible boot console. */ console_lock(); err = newcon->setup(newcon, options); console_unlock(); return err; } /* * This is called by register_console() to try to match * the newly registered console with any of the ones selected * by either the command line or add_preferred_console() and * setup/enable it. * * Care need to be taken with consoles that are statically * enabled such as netconsole */ static int try_enable_preferred_console(struct console *newcon, bool user_specified) { struct console_cmdline *c; int i, err; for (i = 0, c = console_cmdline; i < MAX_CMDLINECONSOLES && (c->name[0] || c->devname[0]); i++, c++) { /* Console not yet initialized? */ if (!c->name[0]) continue; if (c->user_specified != user_specified) continue; if (!newcon->match || newcon->match(newcon, c->name, c->index, c->options) != 0) { /* default matching */ BUILD_BUG_ON(sizeof(c->name) != sizeof(newcon->name)); if (strcmp(c->name, newcon->name) != 0) continue; if (newcon->index >= 0 && newcon->index != c->index) continue; if (newcon->index < 0) newcon->index = c->index; if (_braille_register_console(newcon, c)) return 0; err = console_call_setup(newcon, c->options); if (err) return err; } newcon->flags |= CON_ENABLED; if (i == preferred_console) newcon->flags |= CON_CONSDEV; return 0; } /* * Some consoles, such as pstore and netconsole, can be enabled even * without matching. Accept the pre-enabled consoles only when match() * and setup() had a chance to be called. */ if (newcon->flags & CON_ENABLED && c->user_specified == user_specified) return 0; return -ENOENT; } /* Try to enable the console unconditionally */ static void try_enable_default_console(struct console *newcon) { if (newcon->index < 0) newcon->index = 0; if (console_call_setup(newcon, NULL) != 0) return; newcon->flags |= CON_ENABLED; if (newcon->device) newcon->flags |= CON_CONSDEV; } /* Return the starting sequence number for a newly registered console. */ static u64 get_init_console_seq(struct console *newcon, bool bootcon_registered) { struct console *con; bool handover; u64 init_seq; if (newcon->flags & (CON_PRINTBUFFER | CON_BOOT)) { /* Get a consistent copy of @syslog_seq. */ mutex_lock(&syslog_lock); init_seq = syslog_seq; mutex_unlock(&syslog_lock); } else { /* Begin with next message added to ringbuffer. */ init_seq = prb_next_seq(prb); /* * If any enabled boot consoles are due to be unregistered * shortly, some may not be caught up and may be the same * device as @newcon. Since it is not known which boot console * is the same device, flush all consoles and, if necessary, * start with the message of the enabled boot console that is * the furthest behind. */ if (bootcon_registered && !keep_bootcon) { /* * Hold the console_lock to stop console printing and * guarantee safe access to console->seq. */ console_lock(); /* * Flush all consoles and set the console to start at * the next unprinted sequence number. */ if (!console_flush_all(true, &init_seq, &handover)) { /* * Flushing failed. Just choose the lowest * sequence of the enabled boot consoles. */ /* * If there was a handover, this context no * longer holds the console_lock. */ if (handover) console_lock(); init_seq = prb_next_seq(prb); for_each_console(con) { u64 seq; if (!(con->flags & CON_BOOT) || !(con->flags & CON_ENABLED)) { continue; } if (con->flags & CON_NBCON) seq = nbcon_seq_read(con); else seq = con->seq; if (seq < init_seq) init_seq = seq; } } console_unlock(); } } return init_seq; } #define console_first() \ hlist_entry(console_list.first, struct console, node) static int unregister_console_locked(struct console *console); /* * The console driver calls this routine during kernel initialization * to register the console printing procedure with printk() and to * print any messages that were printed by the kernel before the * console driver was initialized. * * This can happen pretty early during the boot process (because of * early_printk) - sometimes before setup_arch() completes - be careful * of what kernel features are used - they may not be initialised yet. * * There are two types of consoles - bootconsoles (early_printk) and * "real" consoles (everything which is not a bootconsole) which are * handled differently. * - Any number of bootconsoles can be registered at any time. * - As soon as a "real" console is registered, all bootconsoles * will be unregistered automatically. * - Once a "real" console is registered, any attempt to register a * bootconsoles will be rejected */ void register_console(struct console *newcon) { bool use_device_lock = (newcon->flags & CON_NBCON) && newcon->write_atomic; bool bootcon_registered = false; bool realcon_registered = false; struct console *con; unsigned long flags; u64 init_seq; int err; console_list_lock(); for_each_console(con) { if (WARN(con == newcon, "console '%s%d' already registered\n", con->name, con->index)) { goto unlock; } if (con->flags & CON_BOOT) bootcon_registered = true; else realcon_registered = true; } /* Do not register boot consoles when there already is a real one. */ if ((newcon->flags & CON_BOOT) && realcon_registered) { pr_info("Too late to register bootconsole %s%d\n", newcon->name, newcon->index); goto unlock; } if (newcon->flags & CON_NBCON) { /* * Ensure the nbcon console buffers can be allocated * before modifying any global data. */ if (!nbcon_alloc(newcon)) goto unlock; } /* * See if we want to enable this console driver by default. * * Nope when a console is preferred by the command line, device * tree, or SPCR. * * The first real console with tty binding (driver) wins. More * consoles might get enabled before the right one is found. * * Note that a console with tty binding will have CON_CONSDEV * flag set and will be first in the list. */ if (preferred_console < 0) { if (hlist_empty(&console_list) || !console_first()->device || console_first()->flags & CON_BOOT) { try_enable_default_console(newcon); } } /* See if this console matches one we selected on the command line */ err = try_enable_preferred_console(newcon, true); /* If not, try to match against the platform default(s) */ if (err == -ENOENT) err = try_enable_preferred_console(newcon, false); /* printk() messages are not printed to the Braille console. */ if (err || newcon->flags & CON_BRL) { if (newcon->flags & CON_NBCON) nbcon_free(newcon); goto unlock; } /* * If we have a bootconsole, and are switching to a real console, * don't print everything out again, since when the boot console, and * the real console are the same physical device, it's annoying to * see the beginning boot messages twice */ if (bootcon_registered && ((newcon->flags & (CON_CONSDEV | CON_BOOT)) == CON_CONSDEV)) { newcon->flags &= ~CON_PRINTBUFFER; } newcon->dropped = 0; init_seq = get_init_console_seq(newcon, bootcon_registered); if (newcon->flags & CON_NBCON) { have_nbcon_console = true; nbcon_seq_force(newcon, init_seq); } else { have_legacy_console = true; newcon->seq = init_seq; } if (newcon->flags & CON_BOOT) have_boot_console = true; /* * If another context is actively using the hardware of this new * console, it will not be aware of the nbcon synchronization. This * is a risk that two contexts could access the hardware * simultaneously if this new console is used for atomic printing * and the other context is still using the hardware. * * Use the driver synchronization to ensure that the hardware is not * in use while this new console transitions to being registered. */ if (use_device_lock) newcon->device_lock(newcon, &flags); /* * Put this console in the list - keep the * preferred driver at the head of the list. */ if (hlist_empty(&console_list)) { /* Ensure CON_CONSDEV is always set for the head. */ newcon->flags |= CON_CONSDEV; hlist_add_head_rcu(&newcon->node, &console_list); } else if (newcon->flags & CON_CONSDEV) { /* Only the new head can have CON_CONSDEV set. */ console_srcu_write_flags(console_first(), console_first()->flags & ~CON_CONSDEV); hlist_add_head_rcu(&newcon->node, &console_list); } else { hlist_add_behind_rcu(&newcon->node, console_list.first); } /* * No need to synchronize SRCU here! The caller does not rely * on all contexts being able to see the new console before * register_console() completes. */ /* This new console is now registered. */ if (use_device_lock) newcon->device_unlock(newcon, flags); console_sysfs_notify(); /* * By unregistering the bootconsoles after we enable the real console * we get the "console xxx enabled" message on all the consoles - * boot consoles, real consoles, etc - this is to ensure that end * users know there might be something in the kernel's log buffer that * went to the bootconsole (that they do not see on the real console) */ con_printk(KERN_INFO, newcon, "enabled\n"); if (bootcon_registered && ((newcon->flags & (CON_CONSDEV | CON_BOOT)) == CON_CONSDEV) && !keep_bootcon) { struct hlist_node *tmp; hlist_for_each_entry_safe(con, tmp, &console_list, node) { if (con->flags & CON_BOOT) unregister_console_locked(con); } } /* Changed console list, may require printer threads to start/stop. */ printk_kthreads_check_locked(); unlock: console_list_unlock(); } EXPORT_SYMBOL(register_console); /* Must be called under console_list_lock(). */ static int unregister_console_locked(struct console *console) { bool use_device_lock = (console->flags & CON_NBCON) && console->write_atomic; bool found_legacy_con = false; bool found_nbcon_con = false; bool found_boot_con = false; unsigned long flags; struct console *c; int res; lockdep_assert_console_list_lock_held(); con_printk(KERN_INFO, console, "disabled\n"); res = _braille_unregister_console(console); if (res < 0) return res; if (res > 0) return 0; if (!console_is_registered_locked(console)) res = -ENODEV; else if (console_is_usable(console, console->flags, true)) __pr_flush(console, 1000, true); /* Disable it unconditionally */ console_srcu_write_flags(console, console->flags & ~CON_ENABLED); if (res < 0) return res; /* * Use the driver synchronization to ensure that the hardware is not * in use while this console transitions to being unregistered. */ if (use_device_lock) console->device_lock(console, &flags); hlist_del_init_rcu(&console->node); if (use_device_lock) console->device_unlock(console, flags); /* * <HISTORICAL> * If this isn't the last console and it has CON_CONSDEV set, we * need to set it on the next preferred console. * </HISTORICAL> * * The above makes no sense as there is no guarantee that the next * console has any device attached. Oh well.... */ if (!hlist_empty(&console_list) && console->flags & CON_CONSDEV) console_srcu_write_flags(console_first(), console_first()->flags | CON_CONSDEV); /* * Ensure that all SRCU list walks have completed. All contexts * must not be able to see this console in the list so that any * exit/cleanup routines can be performed safely. */ synchronize_srcu(&console_srcu); if (console->flags & CON_NBCON) nbcon_free(console); console_sysfs_notify(); if (console->exit) res = console->exit(console); /* * With this console gone, the global flags tracking registered * console types may have changed. Update them. */ for_each_console(c) { if (c->flags & CON_BOOT) found_boot_con = true; if (c->flags & CON_NBCON) found_nbcon_con = true; else found_legacy_con = true; } if (!found_boot_con) have_boot_console = found_boot_con; if (!found_legacy_con) have_legacy_console = found_legacy_con; if (!found_nbcon_con) have_nbcon_console = found_nbcon_con; /* Changed console list, may require printer threads to start/stop. */ printk_kthreads_check_locked(); return res; } int unregister_console(struct console *console) { int res; console_list_lock(); res = unregister_console_locked(console); console_list_unlock(); return res; } EXPORT_SYMBOL(unregister_console); /** * console_force_preferred_locked - force a registered console preferred * @con: The registered console to force preferred. * * Must be called under console_list_lock(). */ void console_force_preferred_locked(struct console *con) { struct console *cur_pref_con; if (!console_is_registered_locked(con)) return; cur_pref_con = console_first(); /* Already preferred? */ if (cur_pref_con == con) return; /* * Delete, but do not re-initialize the entry. This allows the console * to continue to appear registered (via any hlist_unhashed_lockless() * checks), even though it was briefly removed from the console list. */ hlist_del_rcu(&con->node); /* * Ensure that all SRCU list walks have completed so that the console * can be added to the beginning of the console list and its forward * list pointer can be re-initialized. */ synchronize_srcu(&console_srcu); con->flags |= CON_CONSDEV; WARN_ON(!con->device); /* Only the new head can have CON_CONSDEV set. */ console_srcu_write_flags(cur_pref_con, cur_pref_con->flags & ~CON_CONSDEV); hlist_add_head_rcu(&con->node, &console_list); } EXPORT_SYMBOL(console_force_preferred_locked); /* * Initialize the console device. This is called *early*, so * we can't necessarily depend on lots of kernel help here. * Just do some early initializations, and do the complex setup * later. */ void __init console_init(void) { int ret; initcall_t call; initcall_entry_t *ce; #ifdef CONFIG_NULL_TTY_DEFAULT_CONSOLE if (!console_set_on_cmdline) add_preferred_console("ttynull", 0, NULL); #endif /* Setup the default TTY line discipline. */ n_tty_init(); /* * set up the console device so that later boot sequences can * inform about problems etc.. */ ce = __con_initcall_start; trace_initcall_level("console"); while (ce < __con_initcall_end) { call = initcall_from_entry(ce); trace_initcall_start(call); ret = call(); trace_initcall_finish(call, ret); ce++; } } /* * Some boot consoles access data that is in the init section and which will * be discarded after the initcalls have been run. To make sure that no code * will access this data, unregister the boot consoles in a late initcall. * * If for some reason, such as deferred probe or the driver being a loadable * module, the real console hasn't registered yet at this point, there will * be a brief interval in which no messages are logged to the console, which * makes it difficult to diagnose problems that occur during this time. * * To mitigate this problem somewhat, only unregister consoles whose memory * intersects with the init section. Note that all other boot consoles will * get unregistered when the real preferred console is registered. */ static int __init printk_late_init(void) { struct hlist_node *tmp; struct console *con; int ret; console_list_lock(); hlist_for_each_entry_safe(con, tmp, &console_list, node) { if (!(con->flags & CON_BOOT)) continue; /* Check addresses that might be used for enabled consoles. */ if (init_section_intersects(con, sizeof(*con)) || init_section_contains(con->write, 0) || init_section_contains(con->read, 0) || init_section_contains(con->device, 0) || init_section_contains(con->unblank, 0) || init_section_contains(con->data, 0)) { /* * Please, consider moving the reported consoles out * of the init section. */ pr_warn("bootconsole [%s%d] uses init memory and must be disabled even before the real one is ready\n", con->name, con->index); unregister_console_locked(con); } } console_list_unlock(); ret = cpuhp_setup_state_nocalls(CPUHP_PRINTK_DEAD, "printk:dead", NULL, console_cpu_notify); WARN_ON(ret < 0); ret = cpuhp_setup_state_nocalls(CPUHP_AP_ONLINE_DYN, "printk:online", console_cpu_notify, NULL); WARN_ON(ret < 0); printk_sysctl_init(); return 0; } late_initcall(printk_late_init); #if defined CONFIG_PRINTK /* If @con is specified, only wait for that console. Otherwise wait for all. */ static bool __pr_flush(struct console *con, int timeout_ms, bool reset_on_progress) { unsigned long timeout_jiffies = msecs_to_jiffies(timeout_ms); unsigned long remaining_jiffies = timeout_jiffies; struct console_flush_type ft; struct console *c; u64 last_diff = 0; u64 printk_seq; short flags; int cookie; u64 diff; u64 seq; /* Sorry, pr_flush() will not work this early. */ if (system_state < SYSTEM_SCHEDULING) return false; might_sleep(); seq = prb_next_reserve_seq(prb); /* Flush the consoles so that records up to @seq are printed. */ printk_get_console_flush_type(&ft); if (ft.nbcon_atomic) nbcon_atomic_flush_pending(); if (ft.legacy_direct) { console_lock(); console_unlock(); } for (;;) { unsigned long begin_jiffies; unsigned long slept_jiffies; diff = 0; /* * Hold the console_lock to guarantee safe access to * console->seq. Releasing console_lock flushes more * records in case @seq is still not printed on all * usable consoles. * * Holding the console_lock is not necessary if there * are no legacy or boot consoles. However, such a * console could register at any time. Always hold the * console_lock as a precaution rather than * synchronizing against register_console(). */ console_lock(); cookie = console_srcu_read_lock(); for_each_console_srcu(c) { if (con && con != c) continue; flags = console_srcu_read_flags(c); /* * If consoles are not usable, it cannot be expected * that they make forward progress, so only increment * @diff for usable consoles. */ if (!console_is_usable(c, flags, true) && !console_is_usable(c, flags, false)) { continue; } if (flags & CON_NBCON) { printk_seq = nbcon_seq_read(c); } else { printk_seq = c->seq; } if (printk_seq < seq) diff += seq - printk_seq; } console_srcu_read_unlock(cookie); if (diff != last_diff && reset_on_progress) remaining_jiffies = timeout_jiffies; console_unlock(); /* Note: @diff is 0 if there are no usable consoles. */ if (diff == 0 || remaining_jiffies == 0) break; /* msleep(1) might sleep much longer. Check time by jiffies. */ begin_jiffies = jiffies; msleep(1); slept_jiffies = jiffies - begin_jiffies; remaining_jiffies -= min(slept_jiffies, remaining_jiffies); last_diff = diff; } return (diff == 0); } /** * pr_flush() - Wait for printing threads to catch up. * * @timeout_ms: The maximum time (in ms) to wait. * @reset_on_progress: Reset the timeout if forward progress is seen. * * A value of 0 for @timeout_ms means no waiting will occur. A value of -1 * represents infinite waiting. * * If @reset_on_progress is true, the timeout will be reset whenever any * printer has been seen to make some forward progress. * * Context: Process context. May sleep while acquiring console lock. * Return: true if all usable printers are caught up. */ bool pr_flush(int timeout_ms, bool reset_on_progress) { return __pr_flush(NULL, timeout_ms, reset_on_progress); } /* * Delayed printk version, for scheduler-internal messages: */ #define PRINTK_PENDING_WAKEUP 0x01 #define PRINTK_PENDING_OUTPUT 0x02 static DEFINE_PER_CPU(int, printk_pending); static void wake_up_klogd_work_func(struct irq_work *irq_work) { int pending = this_cpu_xchg(printk_pending, 0); if (pending & PRINTK_PENDING_OUTPUT) { if (force_legacy_kthread()) { if (printk_legacy_kthread) wake_up_interruptible(&legacy_wait); } else { if (console_trylock()) console_unlock(); } } if (pending & PRINTK_PENDING_WAKEUP) wake_up_interruptible(&log_wait); } static DEFINE_PER_CPU(struct irq_work, wake_up_klogd_work) = IRQ_WORK_INIT_LAZY(wake_up_klogd_work_func); static void __wake_up_klogd(int val) { if (!printk_percpu_data_ready()) return; preempt_disable(); /* * Guarantee any new records can be seen by tasks preparing to wait * before this context checks if the wait queue is empty. * * The full memory barrier within wq_has_sleeper() pairs with the full * memory barrier within set_current_state() of * prepare_to_wait_event(), which is called after ___wait_event() adds * the waiter but before it has checked the wait condition. * * This pairs with devkmsg_read:A and syslog_print:A. */ if (wq_has_sleeper(&log_wait) || /* LMM(__wake_up_klogd:A) */ (val & PRINTK_PENDING_OUTPUT)) { this_cpu_or(printk_pending, val); irq_work_queue(this_cpu_ptr(&wake_up_klogd_work)); } preempt_enable(); } /** * wake_up_klogd - Wake kernel logging daemon * * Use this function when new records have been added to the ringbuffer * and the console printing of those records has already occurred or is * known to be handled by some other context. This function will only * wake the logging daemon. * * Context: Any context. */ void wake_up_klogd(void) { __wake_up_klogd(PRINTK_PENDING_WAKEUP); } /** * defer_console_output - Wake kernel logging daemon and trigger * console printing in a deferred context * * Use this function when new records have been added to the ringbuffer, * this context is responsible for console printing those records, but * the current context is not allowed to perform the console printing. * Trigger an irq_work context to perform the console printing. This * function also wakes the logging daemon. * * Context: Any context. */ void defer_console_output(void) { /* * New messages may have been added directly to the ringbuffer * using vprintk_store(), so wake any waiters as well. */ __wake_up_klogd(PRINTK_PENDING_WAKEUP | PRINTK_PENDING_OUTPUT); } void printk_trigger_flush(void) { defer_console_output(); } int vprintk_deferred(const char *fmt, va_list args) { return vprintk_emit(0, LOGLEVEL_SCHED, NULL, fmt, args); } int _printk_deferred(const char *fmt, ...) { va_list args; int r; va_start(args, fmt); r = vprintk_deferred(fmt, args); va_end(args); return r; } /* * printk rate limiting, lifted from the networking subsystem. * * This enforces a rate limit: not more than 10 kernel messages * every 5s to make a denial-of-service attack impossible. */ DEFINE_RATELIMIT_STATE(printk_ratelimit_state, 5 * HZ, 10); int __printk_ratelimit(const char *func) { return ___ratelimit(&printk_ratelimit_state, func); } EXPORT_SYMBOL(__printk_ratelimit); /** * printk_timed_ratelimit - caller-controlled printk ratelimiting * @caller_jiffies: pointer to caller's state * @interval_msecs: minimum interval between prints * * printk_timed_ratelimit() returns true if more than @interval_msecs * milliseconds have elapsed since the last time printk_timed_ratelimit() * returned true. */ bool printk_timed_ratelimit(unsigned long *caller_jiffies, unsigned int interval_msecs) { unsigned long elapsed = jiffies - *caller_jiffies; if (*caller_jiffies && elapsed <= msecs_to_jiffies(interval_msecs)) return false; *caller_jiffies = jiffies; return true; } EXPORT_SYMBOL(printk_timed_ratelimit); static DEFINE_SPINLOCK(dump_list_lock); static LIST_HEAD(dump_list); /** * kmsg_dump_register - register a kernel log dumper. * @dumper: pointer to the kmsg_dumper structure * * Adds a kernel log dumper to the system. The dump callback in the * structure will be called when the kernel oopses or panics and must be * set. Returns zero on success and %-EINVAL or %-EBUSY otherwise. */ int kmsg_dump_register(struct kmsg_dumper *dumper) { unsigned long flags; int err = -EBUSY; /* The dump callback needs to be set */ if (!dumper->dump) return -EINVAL; spin_lock_irqsave(&dump_list_lock, flags); /* Don't allow registering multiple times */ if (!dumper->registered) { dumper->registered = 1; list_add_tail_rcu(&dumper->list, &dump_list); err = 0; } spin_unlock_irqrestore(&dump_list_lock, flags); return err; } EXPORT_SYMBOL_GPL(kmsg_dump_register); /** * kmsg_dump_unregister - unregister a kmsg dumper. * @dumper: pointer to the kmsg_dumper structure * * Removes a dump device from the system. Returns zero on success and * %-EINVAL otherwise. */ int kmsg_dump_unregister(struct kmsg_dumper *dumper) { unsigned long flags; int err = -EINVAL; spin_lock_irqsave(&dump_list_lock, flags); if (dumper->registered) { dumper->registered = 0; list_del_rcu(&dumper->list); err = 0; } spin_unlock_irqrestore(&dump_list_lock, flags); synchronize_rcu(); return err; } EXPORT_SYMBOL_GPL(kmsg_dump_unregister); static bool always_kmsg_dump; module_param_named(always_kmsg_dump, always_kmsg_dump, bool, S_IRUGO | S_IWUSR); const char *kmsg_dump_reason_str(enum kmsg_dump_reason reason) { switch (reason) { case KMSG_DUMP_PANIC: return "Panic"; case KMSG_DUMP_OOPS: return "Oops"; case KMSG_DUMP_EMERG: return "Emergency"; case KMSG_DUMP_SHUTDOWN: return "Shutdown"; default: return "Unknown"; } } EXPORT_SYMBOL_GPL(kmsg_dump_reason_str); /** * kmsg_dump_desc - dump kernel log to kernel message dumpers. * @reason: the reason (oops, panic etc) for dumping * @desc: a short string to describe what caused the panic or oops. Can be NULL * if no additional description is available. * * Call each of the registered dumper's dump() callback, which can * retrieve the kmsg records with kmsg_dump_get_line() or * kmsg_dump_get_buffer(). */ void kmsg_dump_desc(enum kmsg_dump_reason reason, const char *desc) { struct kmsg_dumper *dumper; struct kmsg_dump_detail detail = { .reason = reason, .description = desc}; rcu_read_lock(); list_for_each_entry_rcu(dumper, &dump_list, list) { enum kmsg_dump_reason max_reason = dumper->max_reason; /* * If client has not provided a specific max_reason, default * to KMSG_DUMP_OOPS, unless always_kmsg_dump was set. */ if (max_reason == KMSG_DUMP_UNDEF) { max_reason = always_kmsg_dump ? KMSG_DUMP_MAX : KMSG_DUMP_OOPS; } if (reason > max_reason) continue; /* invoke dumper which will iterate over records */ dumper->dump(dumper, &detail); } rcu_read_unlock(); } /** * kmsg_dump_get_line - retrieve one kmsg log line * @iter: kmsg dump iterator * @syslog: include the "<4>" prefixes * @line: buffer to copy the line to * @size: maximum size of the buffer * @len: length of line placed into buffer * * Start at the beginning of the kmsg buffer, with the oldest kmsg * record, and copy one record into the provided buffer. * * Consecutive calls will return the next available record moving * towards the end of the buffer with the youngest messages. * * A return value of FALSE indicates that there are no more records to * read. */ bool kmsg_dump_get_line(struct kmsg_dump_iter *iter, bool syslog, char *line, size_t size, size_t *len) { u64 min_seq = latched_seq_read_nolock(&clear_seq); struct printk_info info; unsigned int line_count; struct printk_record r; size_t l = 0; bool ret = false; if (iter->cur_seq < min_seq) iter->cur_seq = min_seq; prb_rec_init_rd(&r, &info, line, size); /* Read text or count text lines? */ if (line) { if (!prb_read_valid(prb, iter->cur_seq, &r)) goto out; l = record_print_text(&r, syslog, printk_time); } else { if (!prb_read_valid_info(prb, iter->cur_seq, &info, &line_count)) { goto out; } l = get_record_print_text_size(&info, line_count, syslog, printk_time); } iter->cur_seq = r.info->seq + 1; ret = true; out: if (len) *len = l; return ret; } EXPORT_SYMBOL_GPL(kmsg_dump_get_line); /** * kmsg_dump_get_buffer - copy kmsg log lines * @iter: kmsg dump iterator * @syslog: include the "<4>" prefixes * @buf: buffer to copy the line to * @size: maximum size of the buffer * @len_out: length of line placed into buffer * * Start at the end of the kmsg buffer and fill the provided buffer * with as many of the *youngest* kmsg records that fit into it. * If the buffer is large enough, all available kmsg records will be * copied with a single call. * * Consecutive calls will fill the buffer with the next block of * available older records, not including the earlier retrieved ones. * * A return value of FALSE indicates that there are no more records to * read. */ bool kmsg_dump_get_buffer(struct kmsg_dump_iter *iter, bool syslog, char *buf, size_t size, size_t *len_out) { u64 min_seq = latched_seq_read_nolock(&clear_seq); struct printk_info info; struct printk_record r; u64 seq; u64 next_seq; size_t len = 0; bool ret = false; bool time = printk_time; if (!buf || !size) goto out; if (iter->cur_seq < min_seq) iter->cur_seq = min_seq; if (prb_read_valid_info(prb, iter->cur_seq, &info, NULL)) { if (info.seq != iter->cur_seq) { /* messages are gone, move to first available one */ iter->cur_seq = info.seq; } } /* last entry */ if (iter->cur_seq >= iter->next_seq) goto out; /* * Find first record that fits, including all following records, * into the user-provided buffer for this dump. Pass in size-1 * because this function (by way of record_print_text()) will * not write more than size-1 bytes of text into @buf. */ seq = find_first_fitting_seq(iter->cur_seq, iter->next_seq, size - 1, syslog, time); /* * Next kmsg_dump_get_buffer() invocation will dump block of * older records stored right before this one. */ next_seq = seq; prb_rec_init_rd(&r, &info, buf, size); prb_for_each_record(seq, prb, seq, &r) { if (r.info->seq >= iter->next_seq) break; len += record_print_text(&r, syslog, time); /* Adjust record to store to remaining buffer space. */ prb_rec_init_rd(&r, &info, buf + len, size - len); } iter->next_seq = next_seq; ret = true; out: if (len_out) *len_out = len; return ret; } EXPORT_SYMBOL_GPL(kmsg_dump_get_buffer); /** * kmsg_dump_rewind - reset the iterator * @iter: kmsg dump iterator * * Reset the dumper's iterator so that kmsg_dump_get_line() and * kmsg_dump_get_buffer() can be called again and used multiple * times within the same dumper.dump() callback. */ void kmsg_dump_rewind(struct kmsg_dump_iter *iter) { iter->cur_seq = latched_seq_read_nolock(&clear_seq); iter->next_seq = prb_next_seq(prb); } EXPORT_SYMBOL_GPL(kmsg_dump_rewind); /** * console_try_replay_all - try to replay kernel log on consoles * * Try to obtain lock on console subsystem and replay all * available records in printk buffer on the consoles. * Does nothing if lock is not obtained. * * Context: Any, except for NMI. */ void console_try_replay_all(void) { struct console_flush_type ft; printk_get_console_flush_type(&ft); if (console_trylock()) { __console_rewind_all(); if (ft.nbcon_atomic) nbcon_atomic_flush_pending(); if (ft.nbcon_offload) nbcon_kthreads_wake(); if (ft.legacy_offload) defer_console_output(); /* Consoles are flushed as part of console_unlock(). */ console_unlock(); } } #endif #ifdef CONFIG_SMP static atomic_t printk_cpu_sync_owner = ATOMIC_INIT(-1); static atomic_t printk_cpu_sync_nested = ATOMIC_INIT(0); bool is_printk_cpu_sync_owner(void) { return (atomic_read(&printk_cpu_sync_owner) == raw_smp_processor_id()); } /** * __printk_cpu_sync_wait() - Busy wait until the printk cpu-reentrant * spinning lock is not owned by any CPU. * * Context: Any context. */ void __printk_cpu_sync_wait(void) { do { cpu_relax(); } while (atomic_read(&printk_cpu_sync_owner) != -1); } EXPORT_SYMBOL(__printk_cpu_sync_wait); /** * __printk_cpu_sync_try_get() - Try to acquire the printk cpu-reentrant * spinning lock. * * If no processor has the lock, the calling processor takes the lock and * becomes the owner. If the calling processor is already the owner of the * lock, this function succeeds immediately. * * Context: Any context. Expects interrupts to be disabled. * Return: 1 on success, otherwise 0. */ int __printk_cpu_sync_try_get(void) { int cpu; int old; cpu = smp_processor_id(); /* * Guarantee loads and stores from this CPU when it is the lock owner * are _not_ visible to the previous lock owner. This pairs with * __printk_cpu_sync_put:B. * * Memory barrier involvement: * * If __printk_cpu_sync_try_get:A reads from __printk_cpu_sync_put:B, * then __printk_cpu_sync_put:A can never read from * __printk_cpu_sync_try_get:B. * * Relies on: * * RELEASE from __printk_cpu_sync_put:A to __printk_cpu_sync_put:B * of the previous CPU * matching * ACQUIRE from __printk_cpu_sync_try_get:A to * __printk_cpu_sync_try_get:B of this CPU */ old = atomic_cmpxchg_acquire(&printk_cpu_sync_owner, -1, cpu); /* LMM(__printk_cpu_sync_try_get:A) */ if (old == -1) { /* * This CPU is now the owner and begins loading/storing * data: LMM(__printk_cpu_sync_try_get:B) */ return 1; } else if (old == cpu) { /* This CPU is already the owner. */ atomic_inc(&printk_cpu_sync_nested); return 1; } return 0; } EXPORT_SYMBOL(__printk_cpu_sync_try_get); /** * __printk_cpu_sync_put() - Release the printk cpu-reentrant spinning lock. * * The calling processor must be the owner of the lock. * * Context: Any context. Expects interrupts to be disabled. */ void __printk_cpu_sync_put(void) { if (atomic_read(&printk_cpu_sync_nested)) { atomic_dec(&printk_cpu_sync_nested); return; } /* * This CPU is finished loading/storing data: * LMM(__printk_cpu_sync_put:A) */ /* * Guarantee loads and stores from this CPU when it was the * lock owner are visible to the next lock owner. This pairs * with __printk_cpu_sync_try_get:A. * * Memory barrier involvement: * * If __printk_cpu_sync_try_get:A reads from __printk_cpu_sync_put:B, * then __printk_cpu_sync_try_get:B reads from __printk_cpu_sync_put:A. * * Relies on: * * RELEASE from __printk_cpu_sync_put:A to __printk_cpu_sync_put:B * of this CPU * matching * ACQUIRE from __printk_cpu_sync_try_get:A to * __printk_cpu_sync_try_get:B of the next CPU */ atomic_set_release(&printk_cpu_sync_owner, -1); /* LMM(__printk_cpu_sync_put:B) */ } EXPORT_SYMBOL(__printk_cpu_sync_put); #endif /* CONFIG_SMP */
2 2 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 /* SPDX-License-Identifier: GPL-2.0-only */ /* * Copyright (C) 2012 ARM Ltd. */ #ifndef __ASM_HW_BREAKPOINT_H #define __ASM_HW_BREAKPOINT_H #include <asm/cputype.h> #include <asm/cpufeature.h> #include <asm/sysreg.h> #include <asm/virt.h> struct arch_hw_breakpoint_ctrl { u32 __reserved : 19, len : 8, type : 2, privilege : 2, enabled : 1; }; struct arch_hw_breakpoint { u64 address; u64 trigger; struct arch_hw_breakpoint_ctrl ctrl; }; /* Privilege Levels */ #define AARCH64_BREAKPOINT_EL1 1 #define AARCH64_BREAKPOINT_EL0 2 #define DBG_HMC_HYP (1 << 13) static inline u32 encode_ctrl_reg(struct arch_hw_breakpoint_ctrl ctrl) { u32 val = (ctrl.len << 5) | (ctrl.type << 3) | (ctrl.privilege << 1) | ctrl.enabled; if (is_kernel_in_hyp_mode() && ctrl.privilege == AARCH64_BREAKPOINT_EL1) val |= DBG_HMC_HYP; return val; } static inline void decode_ctrl_reg(u32 reg, struct arch_hw_breakpoint_ctrl *ctrl) { ctrl->enabled = reg & 0x1; reg >>= 1; ctrl->privilege = reg & 0x3; reg >>= 2; ctrl->type = reg & 0x3; reg >>= 2; ctrl->len = reg & 0xff; } /* Breakpoint */ #define ARM_BREAKPOINT_EXECUTE 0 /* Watchpoints */ #define ARM_BREAKPOINT_LOAD 1 #define ARM_BREAKPOINT_STORE 2 /* Lengths */ #define ARM_BREAKPOINT_LEN_1 0x1 #define ARM_BREAKPOINT_LEN_2 0x3 #define ARM_BREAKPOINT_LEN_3 0x7 #define ARM_BREAKPOINT_LEN_4 0xf #define ARM_BREAKPOINT_LEN_5 0x1f #define ARM_BREAKPOINT_LEN_6 0x3f #define ARM_BREAKPOINT_LEN_7 0x7f #define ARM_BREAKPOINT_LEN_8 0xff /* Kernel stepping */ #define ARM_KERNEL_STEP_NONE 0 #define ARM_KERNEL_STEP_ACTIVE 1 #define ARM_KERNEL_STEP_SUSPEND 2 /* * Limits. * Changing these will require modifications to the register accessors. */ #define ARM_MAX_BRP 16 #define ARM_MAX_WRP 16 /* Virtual debug register bases. */ #define AARCH64_DBG_REG_BVR 0 #define AARCH64_DBG_REG_BCR (AARCH64_DBG_REG_BVR + ARM_MAX_BRP) #define AARCH64_DBG_REG_WVR (AARCH64_DBG_REG_BCR + ARM_MAX_BRP) #define AARCH64_DBG_REG_WCR (AARCH64_DBG_REG_WVR + ARM_MAX_WRP) /* Debug register names. */ #define AARCH64_DBG_REG_NAME_BVR bvr #define AARCH64_DBG_REG_NAME_BCR bcr #define AARCH64_DBG_REG_NAME_WVR wvr #define AARCH64_DBG_REG_NAME_WCR wcr /* Accessor macros for the debug registers. */ #define AARCH64_DBG_READ(N, REG, VAL) do {\ VAL = read_sysreg(dbg##REG##N##_el1);\ } while (0) #define AARCH64_DBG_WRITE(N, REG, VAL) do {\ write_sysreg(VAL, dbg##REG##N##_el1);\ } while (0) struct task_struct; struct notifier_block; struct perf_event_attr; struct perf_event; struct pmu; extern int arch_bp_generic_fields(struct arch_hw_breakpoint_ctrl ctrl, int *gen_len, int *gen_type, int *offset); extern int arch_check_bp_in_kernelspace(struct arch_hw_breakpoint *hw); extern int hw_breakpoint_arch_parse(struct perf_event *bp, const struct perf_event_attr *attr, struct arch_hw_breakpoint *hw); extern int hw_breakpoint_exceptions_notify(struct notifier_block *unused, unsigned long val, void *data); extern int arch_install_hw_breakpoint(struct perf_event *bp); extern void arch_uninstall_hw_breakpoint(struct perf_event *bp); extern void hw_breakpoint_pmu_read(struct perf_event *bp); extern int hw_breakpoint_slots(int type); #ifdef CONFIG_HAVE_HW_BREAKPOINT extern void hw_breakpoint_thread_switch(struct task_struct *next); extern void ptrace_hw_copy_thread(struct task_struct *task); #else static inline void hw_breakpoint_thread_switch(struct task_struct *next) { } static inline void ptrace_hw_copy_thread(struct task_struct *task) { } #endif /* Determine number of BRP registers available. */ static inline int get_num_brps(void) { u64 dfr0 = read_sanitised_ftr_reg(SYS_ID_AA64DFR0_EL1); return 1 + cpuid_feature_extract_unsigned_field(dfr0, ID_AA64DFR0_EL1_BRPs_SHIFT); } /* Determine number of WRP registers available. */ static inline int get_num_wrps(void) { u64 dfr0 = read_sanitised_ftr_reg(SYS_ID_AA64DFR0_EL1); return 1 + cpuid_feature_extract_unsigned_field(dfr0, ID_AA64DFR0_EL1_WRPs_SHIFT); } #ifdef CONFIG_CPU_PM extern void cpu_suspend_set_dbg_restorer(int (*hw_bp_restore)(unsigned int)); #else static inline void cpu_suspend_set_dbg_restorer(int (*hw_bp_restore)(unsigned int)) { } #endif #endif /* __ASM_BREAKPOINT_H */
28 18 10 28 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 /* SPDX-License-Identifier: GPL-2.0-or-later */ /* * INET An implementation of the TCP/IP protocol suite for the LINUX * operating system. NET is implemented using the BSD Socket * interface as the means of communication with the user level. * * Definitions for the Ethernet handlers. * * Version: @(#)eth.h 1.0.4 05/13/93 * * Authors: Ross Biro * Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG> * * Relocated to include/linux where it belongs by Alan Cox * <gw4pts@gw4pts.ampr.org> */ #ifndef _LINUX_ETHERDEVICE_H #define _LINUX_ETHERDEVICE_H #include <linux/if_ether.h> #include <linux/netdevice.h> #include <linux/random.h> #include <linux/crc32.h> #include <linux/unaligned.h> #include <asm/bitsperlong.h> #ifdef __KERNEL__ struct device; struct fwnode_handle; int eth_platform_get_mac_address(struct device *dev, u8 *mac_addr); int platform_get_ethdev_address(struct device *dev, struct net_device *netdev); unsigned char *arch_get_platform_mac_address(void); int nvmem_get_mac_address(struct device *dev, void *addrbuf); int device_get_mac_address(struct device *dev, char *addr); int device_get_ethdev_address(struct device *dev, struct net_device *netdev); int fwnode_get_mac_address(struct fwnode_handle *fwnode, char *addr); u32 eth_get_headlen(const struct net_device *dev, const void *data, u32 len); __be16 eth_type_trans(struct sk_buff *skb, struct net_device *dev); extern const struct header_ops eth_header_ops; int eth_header(struct sk_buff *skb, struct net_device *dev, unsigned short type, const void *daddr, const void *saddr, unsigned len); int eth_header_parse(const struct sk_buff *skb, unsigned char *haddr); int eth_header_cache(const struct neighbour *neigh, struct hh_cache *hh, __be16 type); void eth_header_cache_update(struct hh_cache *hh, const struct net_device *dev, const unsigned char *haddr); __be16 eth_header_parse_protocol(const struct sk_buff *skb); int eth_prepare_mac_addr_change(struct net_device *dev, void *p); void eth_commit_mac_addr_change(struct net_device *dev, void *p); int eth_mac_addr(struct net_device *dev, void *p); int eth_validate_addr(struct net_device *dev); struct net_device *alloc_etherdev_mqs(int sizeof_priv, unsigned int txqs, unsigned int rxqs); #define alloc_etherdev(sizeof_priv) alloc_etherdev_mq(sizeof_priv, 1) #define alloc_etherdev_mq(sizeof_priv, count) alloc_etherdev_mqs(sizeof_priv, count, count) struct net_device *devm_alloc_etherdev_mqs(struct device *dev, int sizeof_priv, unsigned int txqs, unsigned int rxqs); #define devm_alloc_etherdev(dev, sizeof_priv) devm_alloc_etherdev_mqs(dev, sizeof_priv, 1, 1) struct sk_buff *eth_gro_receive(struct list_head *head, struct sk_buff *skb); int eth_gro_complete(struct sk_buff *skb, int nhoff); /* Reserved Ethernet Addresses per IEEE 802.1Q */ static const u8 eth_reserved_addr_base[ETH_ALEN] __aligned(2) = { 0x01, 0x80, 0xc2, 0x00, 0x00, 0x00 }; #define eth_stp_addr eth_reserved_addr_base static const u8 eth_ipv4_mcast_addr_base[ETH_ALEN] __aligned(2) = { 0x01, 0x00, 0x5e, 0x00, 0x00, 0x00 }; static const u8 eth_ipv6_mcast_addr_base[ETH_ALEN] __aligned(2) = { 0x33, 0x33, 0x00, 0x00, 0x00, 0x00 }; /** * is_link_local_ether_addr - Determine if given Ethernet address is link-local * @addr: Pointer to a six-byte array containing the Ethernet address * * Return: true if address is link local reserved addr (01:80:c2:00:00:0X) per * IEEE 802.1Q 8.6.3 Frame filtering. * * Please note: addr must be aligned to u16. */ static inline bool is_link_local_ether_addr(const u8 *addr) { __be16 *a = (__be16 *)addr; static const __be16 *b = (const __be16 *)eth_reserved_addr_base; static const __be16 m = cpu_to_be16(0xfff0); #if defined(CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS) return (((*(const u32 *)addr) ^ (*(const u32 *)b)) | (__force int)((a[2] ^ b[2]) & m)) == 0; #else return ((a[0] ^ b[0]) | (a[1] ^ b[1]) | ((a[2] ^ b[2]) & m)) == 0; #endif } /** * is_zero_ether_addr - Determine if give Ethernet address is all zeros. * @addr: Pointer to a six-byte array containing the Ethernet address * * Return: true if the address is all zeroes. * * Please note: addr must be aligned to u16. */ static inline bool is_zero_ether_addr(const u8 *addr) { #if defined(CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS) return ((*(const u32 *)addr) | (*(const u16 *)(addr + 4))) == 0; #else return (*(const u16 *)(addr + 0) | *(const u16 *)(addr + 2) | *(const u16 *)(addr + 4)) == 0; #endif } /** * is_multicast_ether_addr - Determine if the Ethernet address is a multicast. * @addr: Pointer to a six-byte array containing the Ethernet address * * Return: true if the address is a multicast address. * By definition the broadcast address is also a multicast address. */ static inline bool is_multicast_ether_addr(const u8 *addr) { #if defined(CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS) u32 a = *(const u32 *)addr; #else u16 a = *(const u16 *)addr; #endif #ifdef __BIG_ENDIAN return 0x01 & (a >> ((sizeof(a) * 8) - 8)); #else return 0x01 & a; #endif } static inline bool is_multicast_ether_addr_64bits(const u8 *addr) { #if defined(CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS) && BITS_PER_LONG == 64 #ifdef __BIG_ENDIAN return 0x01 & ((*(const u64 *)addr) >> 56); #else return 0x01 & (*(const u64 *)addr); #endif #else return is_multicast_ether_addr(addr); #endif } /** * is_local_ether_addr - Determine if the Ethernet address is locally-assigned one (IEEE 802). * @addr: Pointer to a six-byte array containing the Ethernet address * * Return: true if the address is a local address. */ static inline bool is_local_ether_addr(const u8 *addr) { return 0x02 & addr[0]; } /** * is_broadcast_ether_addr - Determine if the Ethernet address is broadcast * @addr: Pointer to a six-byte array containing the Ethernet address * * Return: true if the address is the broadcast address. * * Please note: addr must be aligned to u16. */ static inline bool is_broadcast_ether_addr(const u8 *addr) { return (*(const u16 *)(addr + 0) & *(const u16 *)(addr + 2) & *(const u16 *)(addr + 4)) == 0xffff; } /** * is_unicast_ether_addr - Determine if the Ethernet address is unicast * @addr: Pointer to a six-byte array containing the Ethernet address * * Return: true if the address is a unicast address. */ static inline bool is_unicast_ether_addr(const u8 *addr) { return !is_multicast_ether_addr(addr); } /** * is_valid_ether_addr - Determine if the given Ethernet address is valid * @addr: Pointer to a six-byte array containing the Ethernet address * * Check that the Ethernet address (MAC) is not 00:00:00:00:00:00, is not * a multicast address, and is not FF:FF:FF:FF:FF:FF. * * Return: true if the address is valid. * * Please note: addr must be aligned to u16. */ static inline bool is_valid_ether_addr(const u8 *addr) { /* FF:FF:FF:FF:FF:FF is a multicast address so we don't need to * explicitly check for it here. */ return !is_multicast_ether_addr(addr) && !is_zero_ether_addr(addr); } /** * eth_proto_is_802_3 - Determine if a given Ethertype/length is a protocol * @proto: Ethertype/length value to be tested * * Check that the value from the Ethertype/length field is a valid Ethertype. * * Return: true if the valid is an 802.3 supported Ethertype. */ static inline bool eth_proto_is_802_3(__be16 proto) { #ifndef __BIG_ENDIAN /* if CPU is little endian mask off bits representing LSB */ proto &= htons(0xFF00); #endif /* cast both to u16 and compare since LSB can be ignored */ return (__force u16)proto >= (__force u16)htons(ETH_P_802_3_MIN); } /** * eth_random_addr - Generate software assigned random Ethernet address * @addr: Pointer to a six-byte array containing the Ethernet address * * Generate a random Ethernet address (MAC) that is not multicast * and has the local assigned bit set. */ static inline void eth_random_addr(u8 *addr) { get_random_bytes(addr, ETH_ALEN); addr[0] &= 0xfe; /* clear multicast bit */ addr[0] |= 0x02; /* set local assignment bit (IEEE802) */ } /** * eth_broadcast_addr - Assign broadcast address * @addr: Pointer to a six-byte array containing the Ethernet address * * Assign the broadcast address to the given address array. */ static inline void eth_broadcast_addr(u8 *addr) { memset(addr, 0xff, ETH_ALEN); } /** * eth_zero_addr - Assign zero address * @addr: Pointer to a six-byte array containing the Ethernet address * * Assign the zero address to the given address array. */ static inline void eth_zero_addr(u8 *addr) { memset(addr, 0x00, ETH_ALEN); } /** * eth_hw_addr_random - Generate software assigned random Ethernet and * set device flag * @dev: pointer to net_device structure * * Generate a random Ethernet address (MAC) to be used by a net device * and set addr_assign_type so the state can be read by sysfs and be * used by userspace. */ static inline void eth_hw_addr_random(struct net_device *dev) { u8 addr[ETH_ALEN]; eth_random_addr(addr); __dev_addr_set(dev, addr, ETH_ALEN); dev->addr_assign_type = NET_ADDR_RANDOM; } /** * eth_hw_addr_crc - Calculate CRC from netdev_hw_addr * @ha: pointer to hardware address * * Calculate CRC from a hardware address as basis for filter hashes. */ static inline u32 eth_hw_addr_crc(struct netdev_hw_addr *ha) { return ether_crc(ETH_ALEN, ha->addr); } /** * ether_addr_copy - Copy an Ethernet address * @dst: Pointer to a six-byte array Ethernet address destination * @src: Pointer to a six-byte array Ethernet address source * * Please note: dst & src must both be aligned to u16. */ static inline void ether_addr_copy(u8 *dst, const u8 *src) { #if defined(CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS) *(u32 *)dst = *(const u32 *)src; *(u16 *)(dst + 4) = *(const u16 *)(src + 4); #else u16 *a = (u16 *)dst; const u16 *b = (const u16 *)src; a[0] = b[0]; a[1] = b[1]; a[2] = b[2]; #endif } /** * eth_hw_addr_set - Assign Ethernet address to a net_device * @dev: pointer to net_device structure * @addr: address to assign * * Assign given address to the net_device, addr_assign_type is not changed. */ static inline void eth_hw_addr_set(struct net_device *dev, const u8 *addr) { __dev_addr_set(dev, addr, ETH_ALEN); } /** * eth_hw_addr_inherit - Copy dev_addr from another net_device * @dst: pointer to net_device to copy dev_addr to * @src: pointer to net_device to copy dev_addr from * * Copy the Ethernet address from one net_device to another along with * the address attributes (addr_assign_type). */ static inline void eth_hw_addr_inherit(struct net_device *dst, struct net_device *src) { dst->addr_assign_type = src->addr_assign_type; eth_hw_addr_set(dst, src->dev_addr); } /** * ether_addr_equal - Compare two Ethernet addresses * @addr1: Pointer to a six-byte array containing the Ethernet address * @addr2: Pointer other six-byte array containing the Ethernet address * * Compare two Ethernet addresses, returns true if equal * * Please note: addr1 & addr2 must both be aligned to u16. */ static inline bool ether_addr_equal(const u8 *addr1, const u8 *addr2) { #if defined(CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS) u32 fold = ((*(const u32 *)addr1) ^ (*(const u32 *)addr2)) | ((*(const u16 *)(addr1 + 4)) ^ (*(const u16 *)(addr2 + 4))); return fold == 0; #else const u16 *a = (const u16 *)addr1; const u16 *b = (const u16 *)addr2; return ((a[0] ^ b[0]) | (a[1] ^ b[1]) | (a[2] ^ b[2])) == 0; #endif } /** * ether_addr_equal_64bits - Compare two Ethernet addresses * @addr1: Pointer to an array of 8 bytes * @addr2: Pointer to an other array of 8 bytes * * Compare two Ethernet addresses, returns true if equal, false otherwise. * * The function doesn't need any conditional branches and possibly uses * word memory accesses on CPU allowing cheap unaligned memory reads. * arrays = { byte1, byte2, byte3, byte4, byte5, byte6, pad1, pad2 } * * Please note that alignment of addr1 & addr2 are only guaranteed to be 16 bits. */ static inline bool ether_addr_equal_64bits(const u8 *addr1, const u8 *addr2) { #if defined(CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS) && BITS_PER_LONG == 64 u64 fold = (*(const u64 *)addr1) ^ (*(const u64 *)addr2); #ifdef __BIG_ENDIAN return (fold >> 16) == 0; #else return (fold << 16) == 0; #endif #else return ether_addr_equal(addr1, addr2); #endif } /** * ether_addr_equal_unaligned - Compare two not u16 aligned Ethernet addresses * @addr1: Pointer to a six-byte array containing the Ethernet address * @addr2: Pointer other six-byte array containing the Ethernet address * * Compare two Ethernet addresses, returns true if equal * * Please note: Use only when any Ethernet address may not be u16 aligned. */ static inline bool ether_addr_equal_unaligned(const u8 *addr1, const u8 *addr2) { #if defined(CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS) return ether_addr_equal(addr1, addr2); #else return memcmp(addr1, addr2, ETH_ALEN) == 0; #endif } /** * ether_addr_equal_masked - Compare two Ethernet addresses with a mask * @addr1: Pointer to a six-byte array containing the 1st Ethernet address * @addr2: Pointer to a six-byte array containing the 2nd Ethernet address * @mask: Pointer to a six-byte array containing the Ethernet address bitmask * * Compare two Ethernet addresses with a mask, returns true if for every bit * set in the bitmask the equivalent bits in the ethernet addresses are equal. * Using a mask with all bits set is a slower ether_addr_equal. */ static inline bool ether_addr_equal_masked(const u8 *addr1, const u8 *addr2, const u8 *mask) { int i; for (i = 0; i < ETH_ALEN; i++) { if ((addr1[i] ^ addr2[i]) & mask[i]) return false; } return true; } static inline bool ether_addr_is_ipv4_mcast(const u8 *addr) { u8 mask[ETH_ALEN] = { 0xff, 0xff, 0xff, 0x80, 0x00, 0x00 }; return ether_addr_equal_masked(addr, eth_ipv4_mcast_addr_base, mask); } static inline bool ether_addr_is_ipv6_mcast(const u8 *addr) { u8 mask[ETH_ALEN] = { 0xff, 0xff, 0x00, 0x00, 0x00, 0x00 }; return ether_addr_equal_masked(addr, eth_ipv6_mcast_addr_base, mask); } static inline bool ether_addr_is_ip_mcast(const u8 *addr) { return ether_addr_is_ipv4_mcast(addr) || ether_addr_is_ipv6_mcast(addr); } /** * ether_addr_to_u64 - Convert an Ethernet address into a u64 value. * @addr: Pointer to a six-byte array containing the Ethernet address * * Return: a u64 value of the address */ static inline u64 ether_addr_to_u64(const u8 *addr) { u64 u = 0; int i; for (i = 0; i < ETH_ALEN; i++) u = u << 8 | addr[i]; return u; } /** * u64_to_ether_addr - Convert a u64 to an Ethernet address. * @u: u64 to convert to an Ethernet MAC address * @addr: Pointer to a six-byte array to contain the Ethernet address */ static inline void u64_to_ether_addr(u64 u, u8 *addr) { int i; for (i = ETH_ALEN - 1; i >= 0; i--) { addr[i] = u & 0xff; u = u >> 8; } } /** * eth_addr_dec - Decrement the given MAC address * * @addr: Pointer to a six-byte array containing Ethernet address to decrement */ static inline void eth_addr_dec(u8 *addr) { u64 u = ether_addr_to_u64(addr); u--; u64_to_ether_addr(u, addr); } /** * eth_addr_inc() - Increment the given MAC address. * @addr: Pointer to a six-byte array containing Ethernet address to increment. */ static inline void eth_addr_inc(u8 *addr) { u64 u = ether_addr_to_u64(addr); u++; u64_to_ether_addr(u, addr); } /** * eth_addr_add() - Add (or subtract) an offset to/from the given MAC address. * * @offset: Offset to add. * @addr: Pointer to a six-byte array containing Ethernet address to increment. */ static inline void eth_addr_add(u8 *addr, long offset) { u64 u = ether_addr_to_u64(addr); u += offset; u64_to_ether_addr(u, addr); } /** * is_etherdev_addr - Tell if given Ethernet address belongs to the device. * @dev: Pointer to a device structure * @addr: Pointer to a six-byte array containing the Ethernet address * * Compare passed address with all addresses of the device. Return true if the * address if one of the device addresses. * * Note that this function calls ether_addr_equal_64bits() so take care of * the right padding. */ static inline bool is_etherdev_addr(const struct net_device *dev, const u8 addr[6 + 2]) { struct netdev_hw_addr *ha; bool res = false; rcu_read_lock(); for_each_dev_addr(dev, ha) { res = ether_addr_equal_64bits(addr, ha->addr); if (res) break; } rcu_read_unlock(); return res; } #endif /* __KERNEL__ */ /** * compare_ether_header - Compare two Ethernet headers * @a: Pointer to Ethernet header * @b: Pointer to Ethernet header * * Compare two Ethernet headers, returns 0 if equal. * This assumes that the network header (i.e., IP header) is 4-byte * aligned OR the platform can handle unaligned access. This is the * case for all packets coming into netif_receive_skb or similar * entry points. */ static inline unsigned long compare_ether_header(const void *a, const void *b) { #if defined(CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS) && BITS_PER_LONG == 64 unsigned long fold; /* * We want to compare 14 bytes: * [a0 ... a13] ^ [b0 ... b13] * Use two long XOR, ORed together, with an overlap of two bytes. * [a0 a1 a2 a3 a4 a5 a6 a7 ] ^ [b0 b1 b2 b3 b4 b5 b6 b7 ] | * [a6 a7 a8 a9 a10 a11 a12 a13] ^ [b6 b7 b8 b9 b10 b11 b12 b13] * This means the [a6 a7] ^ [b6 b7] part is done two times. */ fold = *(unsigned long *)a ^ *(unsigned long *)b; fold |= *(unsigned long *)(a + 6) ^ *(unsigned long *)(b + 6); return fold; #else u32 *a32 = (u32 *)((u8 *)a + 2); u32 *b32 = (u32 *)((u8 *)b + 2); return (*(u16 *)a ^ *(u16 *)b) | (a32[0] ^ b32[0]) | (a32[1] ^ b32[1]) | (a32[2] ^ b32[2]); #endif } /** * eth_hw_addr_gen - Generate and assign Ethernet address to a port * @dev: pointer to port's net_device structure * @base_addr: base Ethernet address * @id: offset to add to the base address * * Generate a MAC address using a base address and an offset and assign it * to a net_device. Commonly used by switch drivers which need to compute * addresses for all their ports. addr_assign_type is not changed. */ static inline void eth_hw_addr_gen(struct net_device *dev, const u8 *base_addr, unsigned int id) { u64 u = ether_addr_to_u64(base_addr); u8 addr[ETH_ALEN]; u += id; u64_to_ether_addr(u, addr); eth_hw_addr_set(dev, addr); } /** * eth_skb_pkt_type - Assign packet type if destination address does not match * @skb: Assigned a packet type if address does not match @dev address * @dev: Network device used to compare packet address against * * If the destination MAC address of the packet does not match the network * device address, assign an appropriate packet type. */ static inline void eth_skb_pkt_type(struct sk_buff *skb, const struct net_device *dev) { const struct ethhdr *eth = eth_hdr(skb); if (unlikely(!ether_addr_equal_64bits(eth->h_dest, dev->dev_addr))) { if (unlikely(is_multicast_ether_addr_64bits(eth->h_dest))) { if (ether_addr_equal_64bits(eth->h_dest, dev->broadcast)) skb->pkt_type = PACKET_BROADCAST; else skb->pkt_type = PACKET_MULTICAST; } else { skb->pkt_type = PACKET_OTHERHOST; } } } static inline struct ethhdr *eth_skb_pull_mac(struct sk_buff *skb) { struct ethhdr *eth = (struct ethhdr *)skb->data; skb_pull_inline(skb, ETH_HLEN); return eth; } /** * eth_skb_pad - Pad buffer to minimum number of octets for Ethernet frame * @skb: Buffer to pad * * An Ethernet frame should have a minimum size of 60 bytes. This function * takes short frames and pads them with zeros up to the 60 byte limit. */ static inline int eth_skb_pad(struct sk_buff *skb) { return skb_put_padto(skb, ETH_ZLEN); } #endif /* _LINUX_ETHERDEVICE_H */
27 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 /* SPDX-License-Identifier: GPL-2.0 */ /* * Statically sized hash table implementation * (C) 2012 Sasha Levin <levinsasha928@gmail.com> */ #ifndef _LINUX_HASHTABLE_H #define _LINUX_HASHTABLE_H #include <linux/list.h> #include <linux/types.h> #include <linux/kernel.h> #include <linux/hash.h> #include <linux/rculist.h> #define DEFINE_HASHTABLE(name, bits) \ struct hlist_head name[1 << (bits)] = \ { [0 ... ((1 << (bits)) - 1)] = HLIST_HEAD_INIT } #define DEFINE_READ_MOSTLY_HASHTABLE(name, bits) \ struct hlist_head name[1 << (bits)] __read_mostly = \ { [0 ... ((1 << (bits)) - 1)] = HLIST_HEAD_INIT } #define DECLARE_HASHTABLE(name, bits) \ struct hlist_head name[1 << (bits)] #define HASH_SIZE(name) (ARRAY_SIZE(name)) #define HASH_BITS(name) ilog2(HASH_SIZE(name)) /* Use hash_32 when possible to allow for fast 32bit hashing in 64bit kernels. */ #define hash_min(val, bits) \ (sizeof(val) <= 4 ? hash_32(val, bits) : hash_long(val, bits)) static inline void __hash_init(struct hlist_head *ht, unsigned int sz) { unsigned int i; for (i = 0; i < sz; i++) INIT_HLIST_HEAD(&ht[i]); } /** * hash_init - initialize a hash table * @hashtable: hashtable to be initialized * * Calculates the size of the hashtable from the given parameter, otherwise * same as hash_init_size. * * This has to be a macro since HASH_BITS() will not work on pointers since * it calculates the size during preprocessing. */ #define hash_init(hashtable) __hash_init(hashtable, HASH_SIZE(hashtable)) /** * hash_add - add an object to a hashtable * @hashtable: hashtable to add to * @node: the &struct hlist_node of the object to be added * @key: the key of the object to be added */ #define hash_add(hashtable, node, key) \ hlist_add_head(node, &hashtable[hash_min(key, HASH_BITS(hashtable))]) /** * hash_add_rcu - add an object to a rcu enabled hashtable * @hashtable: hashtable to add to * @node: the &struct hlist_node of the object to be added * @key: the key of the object to be added */ #define hash_add_rcu(hashtable, node, key) \ hlist_add_head_rcu(node, &hashtable[hash_min(key, HASH_BITS(hashtable))]) /** * hash_hashed - check whether an object is in any hashtable * @node: the &struct hlist_node of the object to be checked */ static inline bool hash_hashed(struct hlist_node *node) { return !hlist_unhashed(node); } static inline bool __hash_empty(struct hlist_head *ht, unsigned int sz) { unsigned int i; for (i = 0; i < sz; i++) if (!hlist_empty(&ht[i])) return false; return true; } /** * hash_empty - check whether a hashtable is empty * @hashtable: hashtable to check * * This has to be a macro since HASH_BITS() will not work on pointers since * it calculates the size during preprocessing. */ #define hash_empty(hashtable) __hash_empty(hashtable, HASH_SIZE(hashtable)) /** * hash_del - remove an object from a hashtable * @node: &struct hlist_node of the object to remove */ static inline void hash_del(struct hlist_node *node) { hlist_del_init(node); } /** * hash_del_rcu - remove an object from a rcu enabled hashtable * @node: &struct hlist_node of the object to remove */ static inline void hash_del_rcu(struct hlist_node *node) { hlist_del_init_rcu(node); } /** * hash_for_each - iterate over a hashtable * @name: hashtable to iterate * @bkt: integer to use as bucket loop cursor * @obj: the type * to use as a loop cursor for each entry * @member: the name of the hlist_node within the struct */ #define hash_for_each(name, bkt, obj, member) \ for ((bkt) = 0, obj = NULL; obj == NULL && (bkt) < HASH_SIZE(name);\ (bkt)++)\ hlist_for_each_entry(obj, &name[bkt], member) /** * hash_for_each_rcu - iterate over a rcu enabled hashtable * @name: hashtable to iterate * @bkt: integer to use as bucket loop cursor * @obj: the type * to use as a loop cursor for each entry * @member: the name of the hlist_node within the struct */ #define hash_for_each_rcu(name, bkt, obj, member) \ for ((bkt) = 0, obj = NULL; obj == NULL && (bkt) < HASH_SIZE(name);\ (bkt)++)\ hlist_for_each_entry_rcu(obj, &name[bkt], member) /** * hash_for_each_safe - iterate over a hashtable safe against removal of * hash entry * @name: hashtable to iterate * @bkt: integer to use as bucket loop cursor * @tmp: a &struct hlist_node used for temporary storage * @obj: the type * to use as a loop cursor for each entry * @member: the name of the hlist_node within the struct */ #define hash_for_each_safe(name, bkt, tmp, obj, member) \ for ((bkt) = 0, obj = NULL; obj == NULL && (bkt) < HASH_SIZE(name);\ (bkt)++)\ hlist_for_each_entry_safe(obj, tmp, &name[bkt], member) /** * hash_for_each_possible - iterate over all possible objects hashing to the * same bucket * @name: hashtable to iterate * @obj: the type * to use as a loop cursor for each entry * @member: the name of the hlist_node within the struct * @key: the key of the objects to iterate over */ #define hash_for_each_possible(name, obj, member, key) \ hlist_for_each_entry(obj, &name[hash_min(key, HASH_BITS(name))], member) /** * hash_for_each_possible_rcu - iterate over all possible objects hashing to the * same bucket in an rcu enabled hashtable * @name: hashtable to iterate * @obj: the type * to use as a loop cursor for each entry * @member: the name of the hlist_node within the struct * @key: the key of the objects to iterate over */ #define hash_for_each_possible_rcu(name, obj, member, key, cond...) \ hlist_for_each_entry_rcu(obj, &name[hash_min(key, HASH_BITS(name))],\ member, ## cond) /** * hash_for_each_possible_rcu_notrace - iterate over all possible objects hashing * to the same bucket in an rcu enabled hashtable in a rcu enabled hashtable * @name: hashtable to iterate * @obj: the type * to use as a loop cursor for each entry * @member: the name of the hlist_node within the struct * @key: the key of the objects to iterate over * * This is the same as hash_for_each_possible_rcu() except that it does * not do any RCU debugging or tracing. */ #define hash_for_each_possible_rcu_notrace(name, obj, member, key) \ hlist_for_each_entry_rcu_notrace(obj, \ &name[hash_min(key, HASH_BITS(name))], member) /** * hash_for_each_possible_safe - iterate over all possible objects hashing to the * same bucket safe against removals * @name: hashtable to iterate * @obj: the type * to use as a loop cursor for each entry * @tmp: a &struct hlist_node used for temporary storage * @member: the name of the hlist_node within the struct * @key: the key of the objects to iterate over */ #define hash_for_each_possible_safe(name, obj, tmp, member, key) \ hlist_for_each_entry_safe(obj, tmp,\ &name[hash_min(key, HASH_BITS(name))], member) #endif
134 5 134 127 129 15 15 15 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 // SPDX-License-Identifier: GPL-2.0 /* * linux/kernel/capability.c * * Copyright (C) 1997 Andrew Main <zefram@fysh.org> * * Integrated into 2.1.97+, Andrew G. Morgan <morgan@kernel.org> * 30 May 2002: Cleanup, Robert M. Love <rml@tech9.net> */ #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt #include <linux/audit.h> #include <linux/capability.h> #include <linux/mm.h> #include <linux/export.h> #include <linux/security.h> #include <linux/syscalls.h> #include <linux/pid_namespace.h> #include <linux/user_namespace.h> #include <linux/uaccess.h> int file_caps_enabled = 1; static int __init file_caps_disable(char *str) { file_caps_enabled = 0; return 1; } __setup("no_file_caps", file_caps_disable); #ifdef CONFIG_MULTIUSER /* * More recent versions of libcap are available from: * * http://www.kernel.org/pub/linux/libs/security/linux-privs/ */ static void warn_legacy_capability_use(void) { pr_info_once("warning: `%s' uses 32-bit capabilities (legacy support in use)\n", current->comm); } /* * Version 2 capabilities worked fine, but the linux/capability.h file * that accompanied their introduction encouraged their use without * the necessary user-space source code changes. As such, we have * created a version 3 with equivalent functionality to version 2, but * with a header change to protect legacy source code from using * version 2 when it wanted to use version 1. If your system has code * that trips the following warning, it is using version 2 specific * capabilities and may be doing so insecurely. * * The remedy is to either upgrade your version of libcap (to 2.10+, * if the application is linked against it), or recompile your * application with modern kernel headers and this warning will go * away. */ static void warn_deprecated_v2(void) { pr_info_once("warning: `%s' uses deprecated v2 capabilities in a way that may be insecure\n", current->comm); } /* * Version check. Return the number of u32s in each capability flag * array, or a negative value on error. */ static int cap_validate_magic(cap_user_header_t header, unsigned *tocopy) { __u32 version; if (get_user(version, &header->version)) return -EFAULT; switch (version) { case _LINUX_CAPABILITY_VERSION_1: warn_legacy_capability_use(); *tocopy = _LINUX_CAPABILITY_U32S_1; break; case _LINUX_CAPABILITY_VERSION_2: warn_deprecated_v2(); fallthrough; /* v3 is otherwise equivalent to v2 */ case _LINUX_CAPABILITY_VERSION_3: *tocopy = _LINUX_CAPABILITY_U32S_3; break; default: if (put_user((u32)_KERNEL_CAPABILITY_VERSION, &header->version)) return -EFAULT; return -EINVAL; } return 0; } /* * The only thing that can change the capabilities of the current * process is the current process. As such, we can't be in this code * at the same time as we are in the process of setting capabilities * in this process. The net result is that we can limit our use of * locks to when we are reading the caps of another process. */ static inline int cap_get_target_pid(pid_t pid, kernel_cap_t *pEp, kernel_cap_t *pIp, kernel_cap_t *pPp) { int ret; if (pid && (pid != task_pid_vnr(current))) { const struct task_struct *target; rcu_read_lock(); target = find_task_by_vpid(pid); if (!target) ret = -ESRCH; else ret = security_capget(target, pEp, pIp, pPp); rcu_read_unlock(); } else ret = security_capget(current, pEp, pIp, pPp); return ret; } /** * sys_capget - get the capabilities of a given process. * @header: pointer to struct that contains capability version and * target pid data * @dataptr: pointer to struct that contains the effective, permitted, * and inheritable capabilities that are returned * * Returns 0 on success and < 0 on error. */ SYSCALL_DEFINE2(capget, cap_user_header_t, header, cap_user_data_t, dataptr) { int ret = 0; pid_t pid; unsigned tocopy; kernel_cap_t pE, pI, pP; struct __user_cap_data_struct kdata[2]; ret = cap_validate_magic(header, &tocopy); if ((dataptr == NULL) || (ret != 0)) return ((dataptr == NULL) && (ret == -EINVAL)) ? 0 : ret; if (get_user(pid, &header->pid)) return -EFAULT; if (pid < 0) return -EINVAL; ret = cap_get_target_pid(pid, &pE, &pI, &pP); if (ret) return ret; /* * Annoying legacy format with 64-bit capabilities exposed * as two sets of 32-bit fields, so we need to split the * capability values up. */ kdata[0].effective = pE.val; kdata[1].effective = pE.val >> 32; kdata[0].permitted = pP.val; kdata[1].permitted = pP.val >> 32; kdata[0].inheritable = pI.val; kdata[1].inheritable = pI.val >> 32; /* * Note, in the case, tocopy < _KERNEL_CAPABILITY_U32S, * we silently drop the upper capabilities here. This * has the effect of making older libcap * implementations implicitly drop upper capability * bits when they perform a: capget/modify/capset * sequence. * * This behavior is considered fail-safe * behavior. Upgrading the application to a newer * version of libcap will enable access to the newer * capabilities. * * An alternative would be to return an error here * (-ERANGE), but that causes legacy applications to * unexpectedly fail; the capget/modify/capset aborts * before modification is attempted and the application * fails. */ if (copy_to_user(dataptr, kdata, tocopy * sizeof(kdata[0]))) return -EFAULT; return 0; } static kernel_cap_t mk_kernel_cap(u32 low, u32 high) { return (kernel_cap_t) { (low | ((u64)high << 32)) & CAP_VALID_MASK }; } /** * sys_capset - set capabilities for a process or (*) a group of processes * @header: pointer to struct that contains capability version and * target pid data * @data: pointer to struct that contains the effective, permitted, * and inheritable capabilities * * Set capabilities for the current process only. The ability to any other * process(es) has been deprecated and removed. * * The restrictions on setting capabilities are specified as: * * I: any raised capabilities must be a subset of the old permitted * P: any raised capabilities must be a subset of the old permitted * E: must be set to a subset of new permitted * * Returns 0 on success and < 0 on error. */ SYSCALL_DEFINE2(capset, cap_user_header_t, header, const cap_user_data_t, data) { struct __user_cap_data_struct kdata[2] = { { 0, }, }; unsigned tocopy, copybytes; kernel_cap_t inheritable, permitted, effective; struct cred *new; int ret; pid_t pid; ret = cap_validate_magic(header, &tocopy); if (ret != 0) return ret; if (get_user(pid, &header->pid)) return -EFAULT; /* may only affect current now */ if (pid != 0 && pid != task_pid_vnr(current)) return -EPERM; copybytes = tocopy * sizeof(struct __user_cap_data_struct); if (copybytes > sizeof(kdata)) return -EFAULT; if (copy_from_user(&kdata, data, copybytes)) return -EFAULT; effective = mk_kernel_cap(kdata[0].effective, kdata[1].effective); permitted = mk_kernel_cap(kdata[0].permitted, kdata[1].permitted); inheritable = mk_kernel_cap(kdata[0].inheritable, kdata[1].inheritable); new = prepare_creds(); if (!new) return -ENOMEM; ret = security_capset(new, current_cred(), &effective, &inheritable, &permitted); if (ret < 0) goto error; audit_log_capset(new, current_cred()); return commit_creds(new); error: abort_creds(new); return ret; } /** * has_ns_capability - Does a task have a capability in a specific user ns * @t: The task in question * @ns: target user namespace * @cap: The capability to be tested for * * Return true if the specified task has the given superior capability * currently in effect to the specified user namespace, false if not. * * Note that this does not set PF_SUPERPRIV on the task. */ bool has_ns_capability(struct task_struct *t, struct user_namespace *ns, int cap) { int ret; rcu_read_lock(); ret = security_capable(__task_cred(t), ns, cap, CAP_OPT_NONE); rcu_read_unlock(); return (ret == 0); } /** * has_ns_capability_noaudit - Does a task have a capability (unaudited) * in a specific user ns. * @t: The task in question * @ns: target user namespace * @cap: The capability to be tested for * * Return true if the specified task has the given superior capability * currently in effect to the specified user namespace, false if not. * Do not write an audit message for the check. * * Note that this does not set PF_SUPERPRIV on the task. */ bool has_ns_capability_noaudit(struct task_struct *t, struct user_namespace *ns, int cap) { int ret; rcu_read_lock(); ret = security_capable(__task_cred(t), ns, cap, CAP_OPT_NOAUDIT); rcu_read_unlock(); return (ret == 0); } /** * has_capability_noaudit - Does a task have a capability (unaudited) in the * initial user ns * @t: The task in question * @cap: The capability to be tested for * * Return true if the specified task has the given superior capability * currently in effect to init_user_ns, false if not. Don't write an * audit message for the check. * * Note that this does not set PF_SUPERPRIV on the task. */ bool has_capability_noaudit(struct task_struct *t, int cap) { return has_ns_capability_noaudit(t, &init_user_ns, cap); } EXPORT_SYMBOL(has_capability_noaudit); static bool ns_capable_common(struct user_namespace *ns, int cap, unsigned int opts) { int capable; if (unlikely(!cap_valid(cap))) { pr_crit("capable() called with invalid cap=%u\n", cap); BUG(); } capable = security_capable(current_cred(), ns, cap, opts); if (capable == 0) { current->flags |= PF_SUPERPRIV; return true; } return false; } /** * ns_capable - Determine if the current task has a superior capability in effect * @ns: The usernamespace we want the capability in * @cap: The capability to be tested for * * Return true if the current task has the given superior capability currently * available for use, false if not. * * This sets PF_SUPERPRIV on the task if the capability is available on the * assumption that it's about to be used. */ bool ns_capable(struct user_namespace *ns, int cap) { return ns_capable_common(ns, cap, CAP_OPT_NONE); } EXPORT_SYMBOL(ns_capable); /** * ns_capable_noaudit - Determine if the current task has a superior capability * (unaudited) in effect * @ns: The usernamespace we want the capability in * @cap: The capability to be tested for * * Return true if the current task has the given superior capability currently * available for use, false if not. * * This sets PF_SUPERPRIV on the task if the capability is available on the * assumption that it's about to be used. */ bool ns_capable_noaudit(struct user_namespace *ns, int cap) { return ns_capable_common(ns, cap, CAP_OPT_NOAUDIT); } EXPORT_SYMBOL(ns_capable_noaudit); /** * ns_capable_setid - Determine if the current task has a superior capability * in effect, while signalling that this check is being done from within a * setid or setgroups syscall. * @ns: The usernamespace we want the capability in * @cap: The capability to be tested for * * Return true if the current task has the given superior capability currently * available for use, false if not. * * This sets PF_SUPERPRIV on the task if the capability is available on the * assumption that it's about to be used. */ bool ns_capable_setid(struct user_namespace *ns, int cap) { return ns_capable_common(ns, cap, CAP_OPT_INSETID); } EXPORT_SYMBOL(ns_capable_setid); /** * capable - Determine if the current task has a superior capability in effect * @cap: The capability to be tested for * * Return true if the current task has the given superior capability currently * available for use, false if not. * * This sets PF_SUPERPRIV on the task if the capability is available on the * assumption that it's about to be used. */ bool capable(int cap) { return ns_capable(&init_user_ns, cap); } EXPORT_SYMBOL(capable); #endif /* CONFIG_MULTIUSER */ /** * file_ns_capable - Determine if the file's opener had a capability in effect * @file: The file we want to check * @ns: The usernamespace we want the capability in * @cap: The capability to be tested for * * Return true if task that opened the file had a capability in effect * when the file was opened. * * This does not set PF_SUPERPRIV because the caller may not * actually be privileged. */ bool file_ns_capable(const struct file *file, struct user_namespace *ns, int cap) { if (WARN_ON_ONCE(!cap_valid(cap))) return false; if (security_capable(file->f_cred, ns, cap, CAP_OPT_NONE) == 0) return true; return false; } EXPORT_SYMBOL(file_ns_capable); /** * privileged_wrt_inode_uidgid - Do capabilities in the namespace work over the inode? * @ns: The user namespace in question * @idmap: idmap of the mount @inode was found from * @inode: The inode in question * * Return true if the inode uid and gid are within the namespace. */ bool privileged_wrt_inode_uidgid(struct user_namespace *ns, struct mnt_idmap *idmap, const struct inode *inode) { return vfsuid_has_mapping(ns, i_uid_into_vfsuid(idmap, inode)) && vfsgid_has_mapping(ns, i_gid_into_vfsgid(idmap, inode)); } /** * capable_wrt_inode_uidgid - Check nsown_capable and uid and gid mapped * @idmap: idmap of the mount @inode was found from * @inode: The inode in question * @cap: The capability in question * * Return true if the current task has the given capability targeted at * its own user namespace and that the given inode's uid and gid are * mapped into the current user namespace. */ bool capable_wrt_inode_uidgid(struct mnt_idmap *idmap, const struct inode *inode, int cap) { struct user_namespace *ns = current_user_ns(); return ns_capable(ns, cap) && privileged_wrt_inode_uidgid(ns, idmap, inode); } EXPORT_SYMBOL(capable_wrt_inode_uidgid); /** * ptracer_capable - Determine if the ptracer holds CAP_SYS_PTRACE in the namespace * @tsk: The task that may be ptraced * @ns: The user namespace to search for CAP_SYS_PTRACE in * * Return true if the task that is ptracing the current task had CAP_SYS_PTRACE * in the specified user namespace. */ bool ptracer_capable(struct task_struct *tsk, struct user_namespace *ns) { int ret = 0; /* An absent tracer adds no restrictions */ const struct cred *cred; rcu_read_lock(); cred = rcu_dereference(tsk->ptracer_cred); if (cred) ret = security_capable(cred, ns, CAP_SYS_PTRACE, CAP_OPT_NOAUDIT); rcu_read_unlock(); return (ret == 0); }
6 6 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 1517 1518 1519 1520 1521 1522 1523 1524 1525 1526 1527 1528 1529 1530 1531 1532 1533 1534 1535 1536 1537 1538 1539 1540 1541 1542 1543 1544 1545 1546 1547 1548 1549 1550 1551 1552 1553 1554 1555 1556 1557 1558 1559 1560 1561 1562 1563 1564 1565 1566 1567 1568 1569 1570 1571 1572 1573 1574 1575 1576 1577 1578 1579 1580 1581 1582 1583 1584 1585 1586 1587 1588 1589 1590 1591 1592 1593 1594 1595 1596 1597 1598 1599 1600 1601 1602 1603 1604 1605 1606 1607 1608 1609 1610 1611 1612 1613 1614 1615 1616 1617 1618 1619 1620 1621 1622 1623 1624 1625 1626 1627 1628 1629 1630 1631 1632 1633 1634 1635 1636 1637 1638 1639 1640 1641 1642 1643 1644 1645 1646 1647 1648 1649 1650 1651 1652 1653 1654 1655 1656 1657 1658 1659 1660 1661 1662 1663 1664 1665 1666 1667 1668 1669 1670 1671 1672 1673 1674 1675 1676 1677 1678 1679 1680 1681 1682 1683 1684 1685 1686 1687 1688 1689 1690 1691 1692 1693 1694 1695 1696 1697 1698 1699 1700 1701 1702 1703 1704 1705 1706 1707 1708 1709 1710 1711 1712 1713 1714 1715 1716 1717 1718 1719 1720 1721 1722 1723 1724 1725 1726 1727 1728 1729 1730 1731 1732 1733 1734 1735 1736 1737 1738 1739 1740 1741 1742 1743 1744 1745 1746 1747 1748 1749 1750 1751 1752 1753 1754 1755 1756 1757 1758 1759 1760 1761 1762 1763 1764 1765 1766 1767 1768 1769 1770 1771 1772 1773 1774 1775 1776 1777 1778 1779 1780 1781 1782 1783 1784 1785 1786 1787 1788 1789 1790 1791 1792 1793 1794 1795 1796 1797 1798 1799 1800 1801 1802 1803 1804 1805 1806 1807 1808 1809 1810 1811 1812 1813 1814 1815 1816 1817 1818 1819 1820 1821 1822 1823 1824 1825 1826 1827 1828 1829 1830 1831 1832 1833 // SPDX-License-Identifier: GPL-2.0-only /* * This is a module which is used for queueing packets and communicating with * userspace via nfnetlink. * * (C) 2005 by Harald Welte <laforge@netfilter.org> * (C) 2007 by Patrick McHardy <kaber@trash.net> * * Based on the old ipv4-only ip_queue.c: * (C) 2000-2002 James Morris <jmorris@intercode.com.au> * (C) 2003-2005 Netfilter Core Team <coreteam@netfilter.org> */ #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt #include <linux/module.h> #include <linux/skbuff.h> #include <linux/init.h> #include <linux/spinlock.h> #include <linux/slab.h> #include <linux/notifier.h> #include <linux/netdevice.h> #include <linux/netfilter.h> #include <linux/proc_fs.h> #include <linux/netfilter_ipv4.h> #include <linux/netfilter_ipv6.h> #include <linux/netfilter_bridge.h> #include <linux/netfilter/nfnetlink.h> #include <linux/netfilter/nfnetlink_queue.h> #include <linux/netfilter/nf_conntrack_common.h> #include <linux/list.h> #include <linux/cgroup-defs.h> #include <net/gso.h> #include <net/sock.h> #include <net/tcp_states.h> #include <net/netfilter/nf_queue.h> #include <net/netns/generic.h> #include <linux/atomic.h> #if IS_ENABLED(CONFIG_BRIDGE_NETFILTER) #include "../bridge/br_private.h" #endif #if IS_ENABLED(CONFIG_NF_CONNTRACK) #include <net/netfilter/nf_conntrack.h> #endif #define NFQNL_QMAX_DEFAULT 1024 /* We're using struct nlattr which has 16bit nla_len. Note that nla_len * includes the header length. Thus, the maximum packet length that we * support is 65531 bytes. We send truncated packets if the specified length * is larger than that. Userspace can check for presence of NFQA_CAP_LEN * attribute to detect truncation. */ #define NFQNL_MAX_COPY_RANGE (0xffff - NLA_HDRLEN) struct nfqnl_instance { struct hlist_node hlist; /* global list of queues */ struct rcu_head rcu; u32 peer_portid; unsigned int queue_maxlen; unsigned int copy_range; unsigned int queue_dropped; unsigned int queue_user_dropped; u_int16_t queue_num; /* number of this queue */ u_int8_t copy_mode; u_int32_t flags; /* Set using NFQA_CFG_FLAGS */ /* * Following fields are dirtied for each queued packet, * keep them in same cache line if possible. */ spinlock_t lock ____cacheline_aligned_in_smp; unsigned int queue_total; unsigned int id_sequence; /* 'sequence' of pkt ids */ struct list_head queue_list; /* packets in queue */ }; typedef int (*nfqnl_cmpfn)(struct nf_queue_entry *, unsigned long); static unsigned int nfnl_queue_net_id __read_mostly; #define INSTANCE_BUCKETS 16 struct nfnl_queue_net { spinlock_t instances_lock; struct hlist_head instance_table[INSTANCE_BUCKETS]; }; static struct nfnl_queue_net *nfnl_queue_pernet(struct net *net) { return net_generic(net, nfnl_queue_net_id); } static inline u_int8_t instance_hashfn(u_int16_t queue_num) { return ((queue_num >> 8) ^ queue_num) % INSTANCE_BUCKETS; } static struct nfqnl_instance * instance_lookup(struct nfnl_queue_net *q, u_int16_t queue_num) { struct hlist_head *head; struct nfqnl_instance *inst; head = &q->instance_table[instance_hashfn(queue_num)]; hlist_for_each_entry_rcu(inst, head, hlist) { if (inst->queue_num == queue_num) return inst; } return NULL; } static struct nfqnl_instance * instance_create(struct nfnl_queue_net *q, u_int16_t queue_num, u32 portid) { struct nfqnl_instance *inst; unsigned int h; int err; spin_lock(&q->instances_lock); if (instance_lookup(q, queue_num)) { err = -EEXIST; goto out_unlock; } inst = kzalloc(sizeof(*inst), GFP_ATOMIC); if (!inst) { err = -ENOMEM; goto out_unlock; } inst->queue_num = queue_num; inst->peer_portid = portid; inst->queue_maxlen = NFQNL_QMAX_DEFAULT; inst->copy_range = NFQNL_MAX_COPY_RANGE; inst->copy_mode = NFQNL_COPY_NONE; spin_lock_init(&inst->lock); INIT_LIST_HEAD(&inst->queue_list); if (!try_module_get(THIS_MODULE)) { err = -EAGAIN; goto out_free; } h = instance_hashfn(queue_num); hlist_add_head_rcu(&inst->hlist, &q->instance_table[h]); spin_unlock(&q->instances_lock); return inst; out_free: kfree(inst); out_unlock: spin_unlock(&q->instances_lock); return ERR_PTR(err); } static void nfqnl_flush(struct nfqnl_instance *queue, nfqnl_cmpfn cmpfn, unsigned long data); static void instance_destroy_rcu(struct rcu_head *head) { struct nfqnl_instance *inst = container_of(head, struct nfqnl_instance, rcu); rcu_read_lock(); nfqnl_flush(inst, NULL, 0); rcu_read_unlock(); kfree(inst); module_put(THIS_MODULE); } static void __instance_destroy(struct nfqnl_instance *inst) { hlist_del_rcu(&inst->hlist); call_rcu(&inst->rcu, instance_destroy_rcu); } static void instance_destroy(struct nfnl_queue_net *q, struct nfqnl_instance *inst) { spin_lock(&q->instances_lock); __instance_destroy(inst); spin_unlock(&q->instances_lock); } static inline void __enqueue_entry(struct nfqnl_instance *queue, struct nf_queue_entry *entry) { list_add_tail(&entry->list, &queue->queue_list); queue->queue_total++; } static void __dequeue_entry(struct nfqnl_instance *queue, struct nf_queue_entry *entry) { list_del(&entry->list); queue->queue_total--; } static struct nf_queue_entry * find_dequeue_entry(struct nfqnl_instance *queue, unsigned int id) { struct nf_queue_entry *entry = NULL, *i; spin_lock_bh(&queue->lock); list_for_each_entry(i, &queue->queue_list, list) { if (i->id == id) { entry = i; break; } } if (entry) __dequeue_entry(queue, entry); spin_unlock_bh(&queue->lock); return entry; } static unsigned int nf_iterate(struct sk_buff *skb, struct nf_hook_state *state, const struct nf_hook_entries *hooks, unsigned int *index) { const struct nf_hook_entry *hook; unsigned int verdict, i = *index; while (i < hooks->num_hook_entries) { hook = &hooks->hooks[i]; repeat: verdict = nf_hook_entry_hookfn(hook, skb, state); if (verdict != NF_ACCEPT) { *index = i; if (verdict != NF_REPEAT) return verdict; goto repeat; } i++; } *index = i; return NF_ACCEPT; } static struct nf_hook_entries *nf_hook_entries_head(const struct net *net, u8 pf, u8 hooknum) { switch (pf) { #ifdef CONFIG_NETFILTER_FAMILY_BRIDGE case NFPROTO_BRIDGE: return rcu_dereference(net->nf.hooks_bridge[hooknum]); #endif case NFPROTO_IPV4: return rcu_dereference(net->nf.hooks_ipv4[hooknum]); case NFPROTO_IPV6: return rcu_dereference(net->nf.hooks_ipv6[hooknum]); default: WARN_ON_ONCE(1); return NULL; } return NULL; } static int nf_ip_reroute(struct sk_buff *skb, const struct nf_queue_entry *entry) { #ifdef CONFIG_INET const struct ip_rt_info *rt_info = nf_queue_entry_reroute(entry); if (entry->state.hook == NF_INET_LOCAL_OUT) { const struct iphdr *iph = ip_hdr(skb); if (!(iph->tos == rt_info->tos && skb->mark == rt_info->mark && iph->daddr == rt_info->daddr && iph->saddr == rt_info->saddr)) return ip_route_me_harder(entry->state.net, entry->state.sk, skb, RTN_UNSPEC); } #endif return 0; } static int nf_reroute(struct sk_buff *skb, struct nf_queue_entry *entry) { const struct nf_ipv6_ops *v6ops; int ret = 0; switch (entry->state.pf) { case AF_INET: ret = nf_ip_reroute(skb, entry); break; case AF_INET6: v6ops = rcu_dereference(nf_ipv6_ops); if (v6ops) ret = v6ops->reroute(skb, entry); break; } return ret; } /* caller must hold rcu read-side lock */ static void nf_reinject(struct nf_queue_entry *entry, unsigned int verdict) { const struct nf_hook_entry *hook_entry; const struct nf_hook_entries *hooks; struct sk_buff *skb = entry->skb; const struct net *net; unsigned int i; int err; u8 pf; net = entry->state.net; pf = entry->state.pf; hooks = nf_hook_entries_head(net, pf, entry->state.hook); i = entry->hook_index; if (!hooks || i >= hooks->num_hook_entries) { kfree_skb_reason(skb, SKB_DROP_REASON_NETFILTER_DROP); nf_queue_entry_free(entry); return; } hook_entry = &hooks->hooks[i]; /* Continue traversal iff userspace said ok... */ if (verdict == NF_REPEAT) verdict = nf_hook_entry_hookfn(hook_entry, skb, &entry->state); if (verdict == NF_ACCEPT) { if (nf_reroute(skb, entry) < 0) verdict = NF_DROP; } if (verdict == NF_ACCEPT) { next_hook: ++i; verdict = nf_iterate(skb, &entry->state, hooks, &i); } switch (verdict & NF_VERDICT_MASK) { case NF_ACCEPT: case NF_STOP: local_bh_disable(); entry->state.okfn(entry->state.net, entry->state.sk, skb); local_bh_enable(); break; case NF_QUEUE: err = nf_queue(skb, &entry->state, i, verdict); if (err == 1) goto next_hook; break; case NF_STOLEN: break; default: kfree_skb(skb); } nf_queue_entry_free(entry); } static void nfqnl_reinject(struct nf_queue_entry *entry, unsigned int verdict) { const struct nf_ct_hook *ct_hook; if (verdict == NF_ACCEPT || verdict == NF_REPEAT || verdict == NF_STOP) { unsigned int ct_verdict = verdict; rcu_read_lock(); ct_hook = rcu_dereference(nf_ct_hook); if (ct_hook) ct_verdict = ct_hook->update(entry->state.net, entry->skb); rcu_read_unlock(); switch (ct_verdict & NF_VERDICT_MASK) { case NF_ACCEPT: /* follow userspace verdict, could be REPEAT */ break; case NF_STOLEN: nf_queue_entry_free(entry); return; default: verdict = ct_verdict & NF_VERDICT_MASK; break; } } nf_reinject(entry, verdict); } static void nfqnl_flush(struct nfqnl_instance *queue, nfqnl_cmpfn cmpfn, unsigned long data) { struct nf_queue_entry *entry, *next; spin_lock_bh(&queue->lock); list_for_each_entry_safe(entry, next, &queue->queue_list, list) { if (!cmpfn || cmpfn(entry, data)) { list_del(&entry->list); queue->queue_total--; nfqnl_reinject(entry, NF_DROP); } } spin_unlock_bh(&queue->lock); } static int nfqnl_put_packet_info(struct sk_buff *nlskb, struct sk_buff *packet, bool csum_verify) { __u32 flags = 0; if (packet->ip_summed == CHECKSUM_PARTIAL) flags = NFQA_SKB_CSUMNOTREADY; else if (csum_verify) flags = NFQA_SKB_CSUM_NOTVERIFIED; if (skb_is_gso(packet)) flags |= NFQA_SKB_GSO; return flags ? nla_put_be32(nlskb, NFQA_SKB_INFO, htonl(flags)) : 0; } static int nfqnl_put_sk_uidgid(struct sk_buff *skb, struct sock *sk) { const struct cred *cred; if (!sk_fullsock(sk)) return 0; read_lock_bh(&sk->sk_callback_lock); if (sk->sk_socket && sk->sk_socket->file) { cred = sk->sk_socket->file->f_cred; if (nla_put_be32(skb, NFQA_UID, htonl(from_kuid_munged(&init_user_ns, cred->fsuid)))) goto nla_put_failure; if (nla_put_be32(skb, NFQA_GID, htonl(from_kgid_munged(&init_user_ns, cred->fsgid)))) goto nla_put_failure; } read_unlock_bh(&sk->sk_callback_lock); return 0; nla_put_failure: read_unlock_bh(&sk->sk_callback_lock); return -1; } static int nfqnl_put_sk_classid(struct sk_buff *skb, struct sock *sk) { #if IS_ENABLED(CONFIG_CGROUP_NET_CLASSID) if (sk && sk_fullsock(sk)) { u32 classid = sock_cgroup_classid(&sk->sk_cgrp_data); if (classid && nla_put_be32(skb, NFQA_CGROUP_CLASSID, htonl(classid))) return -1; } #endif return 0; } static int nfqnl_get_sk_secctx(struct sk_buff *skb, struct lsm_context *ctx) { int seclen = 0; #if IS_ENABLED(CONFIG_NETWORK_SECMARK) if (!skb || !sk_fullsock(skb->sk)) return 0; read_lock_bh(&skb->sk->sk_callback_lock); if (skb->secmark) seclen = security_secid_to_secctx(skb->secmark, ctx); read_unlock_bh(&skb->sk->sk_callback_lock); #endif return seclen; } static u32 nfqnl_get_bridge_size(struct nf_queue_entry *entry) { struct sk_buff *entskb = entry->skb; u32 nlalen = 0; if (entry->state.pf != PF_BRIDGE || !skb_mac_header_was_set(entskb)) return 0; if (skb_vlan_tag_present(entskb)) nlalen += nla_total_size(nla_total_size(sizeof(__be16)) + nla_total_size(sizeof(__be16))); if (entskb->network_header > entskb->mac_header) nlalen += nla_total_size((entskb->network_header - entskb->mac_header)); return nlalen; } static int nfqnl_put_bridge(struct nf_queue_entry *entry, struct sk_buff *skb) { struct sk_buff *entskb = entry->skb; if (entry->state.pf != PF_BRIDGE || !skb_mac_header_was_set(entskb)) return 0; if (skb_vlan_tag_present(entskb)) { struct nlattr *nest; nest = nla_nest_start(skb, NFQA_VLAN); if (!nest) goto nla_put_failure; if (nla_put_be16(skb, NFQA_VLAN_TCI, htons(entskb->vlan_tci)) || nla_put_be16(skb, NFQA_VLAN_PROTO, entskb->vlan_proto)) goto nla_put_failure; nla_nest_end(skb, nest); } if (entskb->mac_header < entskb->network_header) { int len = (int)(entskb->network_header - entskb->mac_header); if (nla_put(skb, NFQA_L2HDR, len, skb_mac_header(entskb))) goto nla_put_failure; } return 0; nla_put_failure: return -1; } static int nf_queue_checksum_help(struct sk_buff *entskb) { if (skb_csum_is_sctp(entskb)) return skb_crc32c_csum_help(entskb); return skb_checksum_help(entskb); } static struct sk_buff * nfqnl_build_packet_message(struct net *net, struct nfqnl_instance *queue, struct nf_queue_entry *entry, __be32 **packet_id_ptr) { size_t size; size_t data_len = 0, cap_len = 0; unsigned int hlen = 0; struct sk_buff *skb; struct nlattr *nla; struct nfqnl_msg_packet_hdr *pmsg; struct nlmsghdr *nlh; struct sk_buff *entskb = entry->skb; struct net_device *indev; struct net_device *outdev; struct nf_conn *ct = NULL; enum ip_conntrack_info ctinfo = 0; const struct nfnl_ct_hook *nfnl_ct; bool csum_verify; struct lsm_context ctx = { NULL, 0, 0 }; int seclen = 0; ktime_t tstamp; size = nlmsg_total_size(sizeof(struct nfgenmsg)) + nla_total_size(sizeof(struct nfqnl_msg_packet_hdr)) + nla_total_size(sizeof(u_int32_t)) /* ifindex */ + nla_total_size(sizeof(u_int32_t)) /* ifindex */ #if IS_ENABLED(CONFIG_BRIDGE_NETFILTER) + nla_total_size(sizeof(u_int32_t)) /* ifindex */ + nla_total_size(sizeof(u_int32_t)) /* ifindex */ #endif + nla_total_size(sizeof(u_int32_t)) /* mark */ + nla_total_size(sizeof(u_int32_t)) /* priority */ + nla_total_size(sizeof(struct nfqnl_msg_packet_hw)) + nla_total_size(sizeof(u_int32_t)) /* skbinfo */ #if IS_ENABLED(CONFIG_CGROUP_NET_CLASSID) + nla_total_size(sizeof(u_int32_t)) /* classid */ #endif + nla_total_size(sizeof(u_int32_t)); /* cap_len */ tstamp = skb_tstamp_cond(entskb, false); if (tstamp) size += nla_total_size(sizeof(struct nfqnl_msg_packet_timestamp)); size += nfqnl_get_bridge_size(entry); if (entry->state.hook <= NF_INET_FORWARD || (entry->state.hook == NF_INET_POST_ROUTING && entskb->sk == NULL)) csum_verify = !skb_csum_unnecessary(entskb); else csum_verify = false; outdev = entry->state.out; switch ((enum nfqnl_config_mode)READ_ONCE(queue->copy_mode)) { case NFQNL_COPY_META: case NFQNL_COPY_NONE: break; case NFQNL_COPY_PACKET: if (!(queue->flags & NFQA_CFG_F_GSO) && entskb->ip_summed == CHECKSUM_PARTIAL && nf_queue_checksum_help(entskb)) return NULL; data_len = READ_ONCE(queue->copy_range); if (data_len > entskb->len) data_len = entskb->len; hlen = skb_zerocopy_headlen(entskb); hlen = min_t(unsigned int, hlen, data_len); size += sizeof(struct nlattr) + hlen; cap_len = entskb->len; break; } nfnl_ct = rcu_dereference(nfnl_ct_hook); #if IS_ENABLED(CONFIG_NF_CONNTRACK) if (queue->flags & NFQA_CFG_F_CONNTRACK) { if (nfnl_ct != NULL) { ct = nf_ct_get(entskb, &ctinfo); if (ct != NULL) size += nfnl_ct->build_size(ct); } } #endif if (queue->flags & NFQA_CFG_F_UID_GID) { size += (nla_total_size(sizeof(u_int32_t)) /* uid */ + nla_total_size(sizeof(u_int32_t))); /* gid */ } if ((queue->flags & NFQA_CFG_F_SECCTX) && entskb->sk) { seclen = nfqnl_get_sk_secctx(entskb, &ctx); if (seclen < 0) return NULL; if (seclen) size += nla_total_size(seclen); } skb = alloc_skb(size, GFP_ATOMIC); if (!skb) { skb_tx_error(entskb); goto nlmsg_failure; } nlh = nfnl_msg_put(skb, 0, 0, nfnl_msg_type(NFNL_SUBSYS_QUEUE, NFQNL_MSG_PACKET), 0, entry->state.pf, NFNETLINK_V0, htons(queue->queue_num)); if (!nlh) { skb_tx_error(entskb); kfree_skb(skb); goto nlmsg_failure; } nla = __nla_reserve(skb, NFQA_PACKET_HDR, sizeof(*pmsg)); pmsg = nla_data(nla); pmsg->hw_protocol = entskb->protocol; pmsg->hook = entry->state.hook; *packet_id_ptr = &pmsg->packet_id; indev = entry->state.in; if (indev) { #if !IS_ENABLED(CONFIG_BRIDGE_NETFILTER) if (nla_put_be32(skb, NFQA_IFINDEX_INDEV, htonl(indev->ifindex))) goto nla_put_failure; #else if (entry->state.pf == PF_BRIDGE) { /* Case 1: indev is physical input device, we need to * look for bridge group (when called from * netfilter_bridge) */ if (nla_put_be32(skb, NFQA_IFINDEX_PHYSINDEV, htonl(indev->ifindex)) || /* this is the bridge group "brX" */ /* rcu_read_lock()ed by __nf_queue */ nla_put_be32(skb, NFQA_IFINDEX_INDEV, htonl(br_port_get_rcu(indev)->br->dev->ifindex))) goto nla_put_failure; } else { int physinif; /* Case 2: indev is bridge group, we need to look for * physical device (when called from ipv4) */ if (nla_put_be32(skb, NFQA_IFINDEX_INDEV, htonl(indev->ifindex))) goto nla_put_failure; physinif = nf_bridge_get_physinif(entskb); if (physinif && nla_put_be32(skb, NFQA_IFINDEX_PHYSINDEV, htonl(physinif))) goto nla_put_failure; } #endif } if (outdev) { #if !IS_ENABLED(CONFIG_BRIDGE_NETFILTER) if (nla_put_be32(skb, NFQA_IFINDEX_OUTDEV, htonl(outdev->ifindex))) goto nla_put_failure; #else if (entry->state.pf == PF_BRIDGE) { /* Case 1: outdev is physical output device, we need to * look for bridge group (when called from * netfilter_bridge) */ if (nla_put_be32(skb, NFQA_IFINDEX_PHYSOUTDEV, htonl(outdev->ifindex)) || /* this is the bridge group "brX" */ /* rcu_read_lock()ed by __nf_queue */ nla_put_be32(skb, NFQA_IFINDEX_OUTDEV, htonl(br_port_get_rcu(outdev)->br->dev->ifindex))) goto nla_put_failure; } else { int physoutif; /* Case 2: outdev is bridge group, we need to look for * physical output device (when called from ipv4) */ if (nla_put_be32(skb, NFQA_IFINDEX_OUTDEV, htonl(outdev->ifindex))) goto nla_put_failure; physoutif = nf_bridge_get_physoutif(entskb); if (physoutif && nla_put_be32(skb, NFQA_IFINDEX_PHYSOUTDEV, htonl(physoutif))) goto nla_put_failure; } #endif } if (entskb->mark && nla_put_be32(skb, NFQA_MARK, htonl(entskb->mark))) goto nla_put_failure; if (entskb->priority && nla_put_be32(skb, NFQA_PRIORITY, htonl(entskb->priority))) goto nla_put_failure; if (indev && entskb->dev && skb_mac_header_was_set(entskb) && skb_mac_header_len(entskb) != 0) { struct nfqnl_msg_packet_hw phw; int len; memset(&phw, 0, sizeof(phw)); len = dev_parse_header(entskb, phw.hw_addr); if (len) { phw.hw_addrlen = htons(len); if (nla_put(skb, NFQA_HWADDR, sizeof(phw), &phw)) goto nla_put_failure; } } if (nfqnl_put_bridge(entry, skb) < 0) goto nla_put_failure; if (entry->state.hook <= NF_INET_FORWARD && tstamp) { struct nfqnl_msg_packet_timestamp ts; struct timespec64 kts = ktime_to_timespec64(tstamp); ts.sec = cpu_to_be64(kts.tv_sec); ts.usec = cpu_to_be64(kts.tv_nsec / NSEC_PER_USEC); if (nla_put(skb, NFQA_TIMESTAMP, sizeof(ts), &ts)) goto nla_put_failure; } if ((queue->flags & NFQA_CFG_F_UID_GID) && entskb->sk && nfqnl_put_sk_uidgid(skb, entskb->sk) < 0) goto nla_put_failure; if (nfqnl_put_sk_classid(skb, entskb->sk) < 0) goto nla_put_failure; if (seclen > 0 && nla_put(skb, NFQA_SECCTX, ctx.len, ctx.context)) goto nla_put_failure; if (ct && nfnl_ct->build(skb, ct, ctinfo, NFQA_CT, NFQA_CT_INFO) < 0) goto nla_put_failure; if (cap_len > data_len && nla_put_be32(skb, NFQA_CAP_LEN, htonl(cap_len))) goto nla_put_failure; if (nfqnl_put_packet_info(skb, entskb, csum_verify)) goto nla_put_failure; if (data_len) { struct nlattr *nla; if (skb_tailroom(skb) < sizeof(*nla) + hlen) goto nla_put_failure; nla = skb_put(skb, sizeof(*nla)); nla->nla_type = NFQA_PAYLOAD; nla->nla_len = nla_attr_size(data_len); if (skb_zerocopy(skb, entskb, data_len, hlen)) goto nla_put_failure; } nlh->nlmsg_len = skb->len; if (seclen >= 0) security_release_secctx(&ctx); return skb; nla_put_failure: skb_tx_error(entskb); kfree_skb(skb); net_err_ratelimited("nf_queue: error creating packet message\n"); nlmsg_failure: if (seclen >= 0) security_release_secctx(&ctx); return NULL; } static bool nf_ct_drop_unconfirmed(const struct nf_queue_entry *entry) { #if IS_ENABLED(CONFIG_NF_CONNTRACK) static const unsigned long flags = IPS_CONFIRMED | IPS_DYING; struct nf_conn *ct = (void *)skb_nfct(entry->skb); unsigned long status; unsigned int use; if (!ct) return false; status = READ_ONCE(ct->status); if ((status & flags) == IPS_DYING) return true; if (status & IPS_CONFIRMED) return false; /* in some cases skb_clone() can occur after initial conntrack * pickup, but conntrack assumes exclusive skb->_nfct ownership for * unconfirmed entries. * * This happens for br_netfilter and with ip multicast routing. * We can't be solved with serialization here because one clone could * have been queued for local delivery. */ use = refcount_read(&ct->ct_general.use); if (likely(use == 1)) return false; /* Can't decrement further? Exclusive ownership. */ if (!refcount_dec_not_one(&ct->ct_general.use)) return false; skb_set_nfct(entry->skb, 0); /* No nf_ct_put(): we already decremented .use and it cannot * drop down to 0. */ return true; #endif return false; } static int __nfqnl_enqueue_packet(struct net *net, struct nfqnl_instance *queue, struct nf_queue_entry *entry) { struct sk_buff *nskb; int err = -ENOBUFS; __be32 *packet_id_ptr; int failopen = 0; nskb = nfqnl_build_packet_message(net, queue, entry, &packet_id_ptr); if (nskb == NULL) { err = -ENOMEM; goto err_out; } spin_lock_bh(&queue->lock); if (nf_ct_drop_unconfirmed(entry)) goto err_out_free_nskb; if (queue->queue_total >= queue->queue_maxlen) { if (queue->flags & NFQA_CFG_F_FAIL_OPEN) { failopen = 1; err = 0; } else { queue->queue_dropped++; net_warn_ratelimited("nf_queue: full at %d entries, dropping packets(s)\n", queue->queue_total); } goto err_out_free_nskb; } entry->id = ++queue->id_sequence; *packet_id_ptr = htonl(entry->id); /* nfnetlink_unicast will either free the nskb or add it to a socket */ err = nfnetlink_unicast(nskb, net, queue->peer_portid); if (err < 0) { if (queue->flags & NFQA_CFG_F_FAIL_OPEN) { failopen = 1; err = 0; } else { queue->queue_user_dropped++; } goto err_out_unlock; } __enqueue_entry(queue, entry); spin_unlock_bh(&queue->lock); return 0; err_out_free_nskb: kfree_skb(nskb); err_out_unlock: spin_unlock_bh(&queue->lock); if (failopen) nfqnl_reinject(entry, NF_ACCEPT); err_out: return err; } static struct nf_queue_entry * nf_queue_entry_dup(struct nf_queue_entry *e) { struct nf_queue_entry *entry = kmemdup(e, e->size, GFP_ATOMIC); if (!entry) return NULL; if (nf_queue_entry_get_refs(entry)) return entry; kfree(entry); return NULL; } #if IS_ENABLED(CONFIG_BRIDGE_NETFILTER) /* When called from bridge netfilter, skb->data must point to MAC header * before calling skb_gso_segment(). Else, original MAC header is lost * and segmented skbs will be sent to wrong destination. */ static void nf_bridge_adjust_skb_data(struct sk_buff *skb) { if (nf_bridge_info_get(skb)) __skb_push(skb, skb->network_header - skb->mac_header); } static void nf_bridge_adjust_segmented_data(struct sk_buff *skb) { if (nf_bridge_info_get(skb)) __skb_pull(skb, skb->network_header - skb->mac_header); } #else #define nf_bridge_adjust_skb_data(s) do {} while (0) #define nf_bridge_adjust_segmented_data(s) do {} while (0) #endif static int __nfqnl_enqueue_packet_gso(struct net *net, struct nfqnl_instance *queue, struct sk_buff *skb, struct nf_queue_entry *entry) { int ret = -ENOMEM; struct nf_queue_entry *entry_seg; nf_bridge_adjust_segmented_data(skb); if (skb->next == NULL) { /* last packet, no need to copy entry */ struct sk_buff *gso_skb = entry->skb; entry->skb = skb; ret = __nfqnl_enqueue_packet(net, queue, entry); if (ret) entry->skb = gso_skb; return ret; } skb_mark_not_on_list(skb); entry_seg = nf_queue_entry_dup(entry); if (entry_seg) { entry_seg->skb = skb; ret = __nfqnl_enqueue_packet(net, queue, entry_seg); if (ret) nf_queue_entry_free(entry_seg); } return ret; } static int nfqnl_enqueue_packet(struct nf_queue_entry *entry, unsigned int queuenum) { unsigned int queued; struct nfqnl_instance *queue; struct sk_buff *skb, *segs, *nskb; int err = -ENOBUFS; struct net *net = entry->state.net; struct nfnl_queue_net *q = nfnl_queue_pernet(net); /* rcu_read_lock()ed by nf_hook_thresh */ queue = instance_lookup(q, queuenum); if (!queue) return -ESRCH; if (queue->copy_mode == NFQNL_COPY_NONE) return -EINVAL; skb = entry->skb; switch (entry->state.pf) { case NFPROTO_IPV4: skb->protocol = htons(ETH_P_IP); break; case NFPROTO_IPV6: skb->protocol = htons(ETH_P_IPV6); break; } if (!skb_is_gso(skb) || ((queue->flags & NFQA_CFG_F_GSO) && !skb_is_gso_sctp(skb))) return __nfqnl_enqueue_packet(net, queue, entry); nf_bridge_adjust_skb_data(skb); segs = skb_gso_segment(skb, 0); /* Does not use PTR_ERR to limit the number of error codes that can be * returned by nf_queue. For instance, callers rely on -ESRCH to * mean 'ignore this hook'. */ if (IS_ERR_OR_NULL(segs)) goto out_err; queued = 0; err = 0; skb_list_walk_safe(segs, segs, nskb) { if (err == 0) err = __nfqnl_enqueue_packet_gso(net, queue, segs, entry); if (err == 0) queued++; else kfree_skb(segs); } if (queued) { if (err) /* some segments are already queued */ nf_queue_entry_free(entry); kfree_skb(skb); return 0; } out_err: nf_bridge_adjust_segmented_data(skb); return err; } static int nfqnl_mangle(void *data, unsigned int data_len, struct nf_queue_entry *e, int diff) { struct sk_buff *nskb; if (diff < 0) { unsigned int min_len = skb_transport_offset(e->skb); if (data_len < min_len) return -EINVAL; if (pskb_trim(e->skb, data_len)) return -ENOMEM; } else if (diff > 0) { if (data_len > 0xFFFF) return -EINVAL; if (diff > skb_tailroom(e->skb)) { nskb = skb_copy_expand(e->skb, skb_headroom(e->skb), diff, GFP_ATOMIC); if (!nskb) return -ENOMEM; kfree_skb(e->skb); e->skb = nskb; } skb_put(e->skb, diff); } if (skb_ensure_writable(e->skb, data_len)) return -ENOMEM; skb_copy_to_linear_data(e->skb, data, data_len); e->skb->ip_summed = CHECKSUM_NONE; return 0; } static int nfqnl_set_mode(struct nfqnl_instance *queue, unsigned char mode, unsigned int range) { int status = 0; spin_lock_bh(&queue->lock); switch (mode) { case NFQNL_COPY_NONE: case NFQNL_COPY_META: queue->copy_mode = mode; queue->copy_range = 0; break; case NFQNL_COPY_PACKET: queue->copy_mode = mode; if (range == 0 || range > NFQNL_MAX_COPY_RANGE) queue->copy_range = NFQNL_MAX_COPY_RANGE; else queue->copy_range = range; break; default: status = -EINVAL; } spin_unlock_bh(&queue->lock); return status; } static int dev_cmp(struct nf_queue_entry *entry, unsigned long ifindex) { #if IS_ENABLED(CONFIG_BRIDGE_NETFILTER) int physinif, physoutif; physinif = nf_bridge_get_physinif(entry->skb); physoutif = nf_bridge_get_physoutif(entry->skb); if (physinif == ifindex || physoutif == ifindex) return 1; #endif if (entry->state.in) if (entry->state.in->ifindex == ifindex) return 1; if (entry->state.out) if (entry->state.out->ifindex == ifindex) return 1; return 0; } /* drop all packets with either indev or outdev == ifindex from all queue * instances */ static void nfqnl_dev_drop(struct net *net, int ifindex) { int i; struct nfnl_queue_net *q = nfnl_queue_pernet(net); rcu_read_lock(); for (i = 0; i < INSTANCE_BUCKETS; i++) { struct nfqnl_instance *inst; struct hlist_head *head = &q->instance_table[i]; hlist_for_each_entry_rcu(inst, head, hlist) nfqnl_flush(inst, dev_cmp, ifindex); } rcu_read_unlock(); } static int nfqnl_rcv_dev_event(struct notifier_block *this, unsigned long event, void *ptr) { struct net_device *dev = netdev_notifier_info_to_dev(ptr); /* Drop any packets associated with the downed device */ if (event == NETDEV_DOWN) nfqnl_dev_drop(dev_net(dev), dev->ifindex); return NOTIFY_DONE; } static struct notifier_block nfqnl_dev_notifier = { .notifier_call = nfqnl_rcv_dev_event, }; static void nfqnl_nf_hook_drop(struct net *net) { struct nfnl_queue_net *q = nfnl_queue_pernet(net); int i; /* This function is also called on net namespace error unwind, * when pernet_ops->init() failed and ->exit() functions of the * previous pernet_ops gets called. * * This may result in a call to nfqnl_nf_hook_drop() before * struct nfnl_queue_net was allocated. */ if (!q) return; for (i = 0; i < INSTANCE_BUCKETS; i++) { struct nfqnl_instance *inst; struct hlist_head *head = &q->instance_table[i]; hlist_for_each_entry_rcu(inst, head, hlist) nfqnl_flush(inst, NULL, 0); } } static int nfqnl_rcv_nl_event(struct notifier_block *this, unsigned long event, void *ptr) { struct netlink_notify *n = ptr; struct nfnl_queue_net *q = nfnl_queue_pernet(n->net); if (event == NETLINK_URELEASE && n->protocol == NETLINK_NETFILTER) { int i; /* destroy all instances for this portid */ spin_lock(&q->instances_lock); for (i = 0; i < INSTANCE_BUCKETS; i++) { struct hlist_node *t2; struct nfqnl_instance *inst; struct hlist_head *head = &q->instance_table[i]; hlist_for_each_entry_safe(inst, t2, head, hlist) { if (n->portid == inst->peer_portid) __instance_destroy(inst); } } spin_unlock(&q->instances_lock); } return NOTIFY_DONE; } static struct notifier_block nfqnl_rtnl_notifier = { .notifier_call = nfqnl_rcv_nl_event, }; static const struct nla_policy nfqa_vlan_policy[NFQA_VLAN_MAX + 1] = { [NFQA_VLAN_TCI] = { .type = NLA_U16}, [NFQA_VLAN_PROTO] = { .type = NLA_U16}, }; static const struct nla_policy nfqa_verdict_policy[NFQA_MAX+1] = { [NFQA_VERDICT_HDR] = { .len = sizeof(struct nfqnl_msg_verdict_hdr) }, [NFQA_MARK] = { .type = NLA_U32 }, [NFQA_PAYLOAD] = { .type = NLA_UNSPEC }, [NFQA_CT] = { .type = NLA_UNSPEC }, [NFQA_EXP] = { .type = NLA_UNSPEC }, [NFQA_VLAN] = { .type = NLA_NESTED }, [NFQA_PRIORITY] = { .type = NLA_U32 }, }; static const struct nla_policy nfqa_verdict_batch_policy[NFQA_MAX+1] = { [NFQA_VERDICT_HDR] = { .len = sizeof(struct nfqnl_msg_verdict_hdr) }, [NFQA_MARK] = { .type = NLA_U32 }, [NFQA_PRIORITY] = { .type = NLA_U32 }, }; static struct nfqnl_instance * verdict_instance_lookup(struct nfnl_queue_net *q, u16 queue_num, u32 nlportid) { struct nfqnl_instance *queue; queue = instance_lookup(q, queue_num); if (!queue) return ERR_PTR(-ENODEV); if (queue->peer_portid != nlportid) return ERR_PTR(-EPERM); return queue; } static struct nfqnl_msg_verdict_hdr* verdicthdr_get(const struct nlattr * const nfqa[]) { struct nfqnl_msg_verdict_hdr *vhdr; unsigned int verdict; if (!nfqa[NFQA_VERDICT_HDR]) return NULL; vhdr = nla_data(nfqa[NFQA_VERDICT_HDR]); verdict = ntohl(vhdr->verdict) & NF_VERDICT_MASK; if (verdict > NF_MAX_VERDICT || verdict == NF_STOLEN) return NULL; return vhdr; } static int nfq_id_after(unsigned int id, unsigned int max) { return (int)(id - max) > 0; } static int nfqnl_recv_verdict_batch(struct sk_buff *skb, const struct nfnl_info *info, const struct nlattr * const nfqa[]) { struct nfnl_queue_net *q = nfnl_queue_pernet(info->net); u16 queue_num = ntohs(info->nfmsg->res_id); struct nf_queue_entry *entry, *tmp; struct nfqnl_msg_verdict_hdr *vhdr; struct nfqnl_instance *queue; unsigned int verdict, maxid; LIST_HEAD(batch_list); queue = verdict_instance_lookup(q, queue_num, NETLINK_CB(skb).portid); if (IS_ERR(queue)) return PTR_ERR(queue); vhdr = verdicthdr_get(nfqa); if (!vhdr) return -EINVAL; verdict = ntohl(vhdr->verdict); maxid = ntohl(vhdr->id); spin_lock_bh(&queue->lock); list_for_each_entry_safe(entry, tmp, &queue->queue_list, list) { if (nfq_id_after(entry->id, maxid)) break; __dequeue_entry(queue, entry); list_add_tail(&entry->list, &batch_list); } spin_unlock_bh(&queue->lock); if (list_empty(&batch_list)) return -ENOENT; list_for_each_entry_safe(entry, tmp, &batch_list, list) { if (nfqa[NFQA_MARK]) entry->skb->mark = ntohl(nla_get_be32(nfqa[NFQA_MARK])); if (nfqa[NFQA_PRIORITY]) entry->skb->priority = ntohl(nla_get_be32(nfqa[NFQA_PRIORITY])); nfqnl_reinject(entry, verdict); } return 0; } static struct nf_conn *nfqnl_ct_parse(const struct nfnl_ct_hook *nfnl_ct, const struct nlmsghdr *nlh, const struct nlattr * const nfqa[], struct nf_queue_entry *entry, enum ip_conntrack_info *ctinfo) { #if IS_ENABLED(CONFIG_NF_CONNTRACK) struct nf_conn *ct; ct = nf_ct_get(entry->skb, ctinfo); if (ct == NULL) return NULL; if (nfnl_ct->parse(nfqa[NFQA_CT], ct) < 0) return NULL; if (nfqa[NFQA_EXP]) nfnl_ct->attach_expect(nfqa[NFQA_EXP], ct, NETLINK_CB(entry->skb).portid, nlmsg_report(nlh)); return ct; #else return NULL; #endif } static int nfqa_parse_bridge(struct nf_queue_entry *entry, const struct nlattr * const nfqa[]) { if (nfqa[NFQA_VLAN]) { struct nlattr *tb[NFQA_VLAN_MAX + 1]; int err; err = nla_parse_nested_deprecated(tb, NFQA_VLAN_MAX, nfqa[NFQA_VLAN], nfqa_vlan_policy, NULL); if (err < 0) return err; if (!tb[NFQA_VLAN_TCI] || !tb[NFQA_VLAN_PROTO]) return -EINVAL; __vlan_hwaccel_put_tag(entry->skb, nla_get_be16(tb[NFQA_VLAN_PROTO]), ntohs(nla_get_be16(tb[NFQA_VLAN_TCI]))); } if (nfqa[NFQA_L2HDR]) { int mac_header_len = entry->skb->network_header - entry->skb->mac_header; if (mac_header_len != nla_len(nfqa[NFQA_L2HDR])) return -EINVAL; else if (mac_header_len > 0) memcpy(skb_mac_header(entry->skb), nla_data(nfqa[NFQA_L2HDR]), mac_header_len); } return 0; } static int nfqnl_recv_verdict(struct sk_buff *skb, const struct nfnl_info *info, const struct nlattr * const nfqa[]) { struct nfnl_queue_net *q = nfnl_queue_pernet(info->net); u_int16_t queue_num = ntohs(info->nfmsg->res_id); const struct nfnl_ct_hook *nfnl_ct; struct nfqnl_msg_verdict_hdr *vhdr; enum ip_conntrack_info ctinfo; struct nfqnl_instance *queue; struct nf_queue_entry *entry; struct nf_conn *ct = NULL; unsigned int verdict; int err; queue = verdict_instance_lookup(q, queue_num, NETLINK_CB(skb).portid); if (IS_ERR(queue)) return PTR_ERR(queue); vhdr = verdicthdr_get(nfqa); if (!vhdr) return -EINVAL; verdict = ntohl(vhdr->verdict); entry = find_dequeue_entry(queue, ntohl(vhdr->id)); if (entry == NULL) return -ENOENT; /* rcu lock already held from nfnl->call_rcu. */ nfnl_ct = rcu_dereference(nfnl_ct_hook); if (nfqa[NFQA_CT]) { if (nfnl_ct != NULL) ct = nfqnl_ct_parse(nfnl_ct, info->nlh, nfqa, entry, &ctinfo); } if (entry->state.pf == PF_BRIDGE) { err = nfqa_parse_bridge(entry, nfqa); if (err < 0) return err; } if (nfqa[NFQA_PAYLOAD]) { u16 payload_len = nla_len(nfqa[NFQA_PAYLOAD]); int diff = payload_len - entry->skb->len; if (nfqnl_mangle(nla_data(nfqa[NFQA_PAYLOAD]), payload_len, entry, diff) < 0) verdict = NF_DROP; if (ct && diff) nfnl_ct->seq_adjust(entry->skb, ct, ctinfo, diff); } if (nfqa[NFQA_MARK]) entry->skb->mark = ntohl(nla_get_be32(nfqa[NFQA_MARK])); if (nfqa[NFQA_PRIORITY]) entry->skb->priority = ntohl(nla_get_be32(nfqa[NFQA_PRIORITY])); nfqnl_reinject(entry, verdict); return 0; } static int nfqnl_recv_unsupp(struct sk_buff *skb, const struct nfnl_info *info, const struct nlattr * const cda[]) { return -ENOTSUPP; } static const struct nla_policy nfqa_cfg_policy[NFQA_CFG_MAX+1] = { [NFQA_CFG_CMD] = { .len = sizeof(struct nfqnl_msg_config_cmd) }, [NFQA_CFG_PARAMS] = { .len = sizeof(struct nfqnl_msg_config_params) }, [NFQA_CFG_QUEUE_MAXLEN] = { .type = NLA_U32 }, [NFQA_CFG_MASK] = { .type = NLA_U32 }, [NFQA_CFG_FLAGS] = { .type = NLA_U32 }, }; static const struct nf_queue_handler nfqh = { .outfn = nfqnl_enqueue_packet, .nf_hook_drop = nfqnl_nf_hook_drop, }; static int nfqnl_recv_config(struct sk_buff *skb, const struct nfnl_info *info, const struct nlattr * const nfqa[]) { struct nfnl_queue_net *q = nfnl_queue_pernet(info->net); u_int16_t queue_num = ntohs(info->nfmsg->res_id); struct nfqnl_msg_config_cmd *cmd = NULL; struct nfqnl_instance *queue; __u32 flags = 0, mask = 0; int ret = 0; if (nfqa[NFQA_CFG_CMD]) { cmd = nla_data(nfqa[NFQA_CFG_CMD]); /* Obsolete commands without queue context */ switch (cmd->command) { case NFQNL_CFG_CMD_PF_BIND: return 0; case NFQNL_CFG_CMD_PF_UNBIND: return 0; } } /* Check if we support these flags in first place, dependencies should * be there too not to break atomicity. */ if (nfqa[NFQA_CFG_FLAGS]) { if (!nfqa[NFQA_CFG_MASK]) { /* A mask is needed to specify which flags are being * changed. */ return -EINVAL; } flags = ntohl(nla_get_be32(nfqa[NFQA_CFG_FLAGS])); mask = ntohl(nla_get_be32(nfqa[NFQA_CFG_MASK])); if (flags >= NFQA_CFG_F_MAX) return -EOPNOTSUPP; #if !IS_ENABLED(CONFIG_NETWORK_SECMARK) if (flags & mask & NFQA_CFG_F_SECCTX) return -EOPNOTSUPP; #endif if ((flags & mask & NFQA_CFG_F_CONNTRACK) && !rcu_access_pointer(nfnl_ct_hook)) { #ifdef CONFIG_MODULES nfnl_unlock(NFNL_SUBSYS_QUEUE); request_module("ip_conntrack_netlink"); nfnl_lock(NFNL_SUBSYS_QUEUE); if (rcu_access_pointer(nfnl_ct_hook)) return -EAGAIN; #endif return -EOPNOTSUPP; } } rcu_read_lock(); queue = instance_lookup(q, queue_num); if (queue && queue->peer_portid != NETLINK_CB(skb).portid) { ret = -EPERM; goto err_out_unlock; } if (cmd != NULL) { switch (cmd->command) { case NFQNL_CFG_CMD_BIND: if (queue) { ret = -EBUSY; goto err_out_unlock; } queue = instance_create(q, queue_num, NETLINK_CB(skb).portid); if (IS_ERR(queue)) { ret = PTR_ERR(queue); goto err_out_unlock; } break; case NFQNL_CFG_CMD_UNBIND: if (!queue) { ret = -ENODEV; goto err_out_unlock; } instance_destroy(q, queue); goto err_out_unlock; case NFQNL_CFG_CMD_PF_BIND: case NFQNL_CFG_CMD_PF_UNBIND: break; default: ret = -ENOTSUPP; goto err_out_unlock; } } if (!queue) { ret = -ENODEV; goto err_out_unlock; } if (nfqa[NFQA_CFG_PARAMS]) { struct nfqnl_msg_config_params *params = nla_data(nfqa[NFQA_CFG_PARAMS]); nfqnl_set_mode(queue, params->copy_mode, ntohl(params->copy_range)); } if (nfqa[NFQA_CFG_QUEUE_MAXLEN]) { __be32 *queue_maxlen = nla_data(nfqa[NFQA_CFG_QUEUE_MAXLEN]); spin_lock_bh(&queue->lock); queue->queue_maxlen = ntohl(*queue_maxlen); spin_unlock_bh(&queue->lock); } if (nfqa[NFQA_CFG_FLAGS]) { spin_lock_bh(&queue->lock); queue->flags &= ~mask; queue->flags |= flags & mask; spin_unlock_bh(&queue->lock); } err_out_unlock: rcu_read_unlock(); return ret; } static const struct nfnl_callback nfqnl_cb[NFQNL_MSG_MAX] = { [NFQNL_MSG_PACKET] = { .call = nfqnl_recv_unsupp, .type = NFNL_CB_RCU, .attr_count = NFQA_MAX, }, [NFQNL_MSG_VERDICT] = { .call = nfqnl_recv_verdict, .type = NFNL_CB_RCU, .attr_count = NFQA_MAX, .policy = nfqa_verdict_policy }, [NFQNL_MSG_CONFIG] = { .call = nfqnl_recv_config, .type = NFNL_CB_MUTEX, .attr_count = NFQA_CFG_MAX, .policy = nfqa_cfg_policy }, [NFQNL_MSG_VERDICT_BATCH] = { .call = nfqnl_recv_verdict_batch, .type = NFNL_CB_RCU, .attr_count = NFQA_MAX, .policy = nfqa_verdict_batch_policy }, }; static const struct nfnetlink_subsystem nfqnl_subsys = { .name = "nf_queue", .subsys_id = NFNL_SUBSYS_QUEUE, .cb_count = NFQNL_MSG_MAX, .cb = nfqnl_cb, }; #ifdef CONFIG_PROC_FS struct iter_state { struct seq_net_private p; unsigned int bucket; }; static struct hlist_node *get_first(struct seq_file *seq) { struct iter_state *st = seq->private; struct net *net; struct nfnl_queue_net *q; if (!st) return NULL; net = seq_file_net(seq); q = nfnl_queue_pernet(net); for (st->bucket = 0; st->bucket < INSTANCE_BUCKETS; st->bucket++) { if (!hlist_empty(&q->instance_table[st->bucket])) return q->instance_table[st->bucket].first; } return NULL; } static struct hlist_node *get_next(struct seq_file *seq, struct hlist_node *h) { struct iter_state *st = seq->private; struct net *net = seq_file_net(seq); h = h->next; while (!h) { struct nfnl_queue_net *q; if (++st->bucket >= INSTANCE_BUCKETS) return NULL; q = nfnl_queue_pernet(net); h = q->instance_table[st->bucket].first; } return h; } static struct hlist_node *get_idx(struct seq_file *seq, loff_t pos) { struct hlist_node *head; head = get_first(seq); if (head) while (pos && (head = get_next(seq, head))) pos--; return pos ? NULL : head; } static void *seq_start(struct seq_file *s, loff_t *pos) __acquires(nfnl_queue_pernet(seq_file_net(s))->instances_lock) { spin_lock(&nfnl_queue_pernet(seq_file_net(s))->instances_lock); return get_idx(s, *pos); } static void *seq_next(struct seq_file *s, void *v, loff_t *pos) { (*pos)++; return get_next(s, v); } static void seq_stop(struct seq_file *s, void *v) __releases(nfnl_queue_pernet(seq_file_net(s))->instances_lock) { spin_unlock(&nfnl_queue_pernet(seq_file_net(s))->instances_lock); } static int seq_show(struct seq_file *s, void *v) { const struct nfqnl_instance *inst = v; seq_printf(s, "%5u %6u %5u %1u %5u %5u %5u %8u %2d\n", inst->queue_num, inst->peer_portid, inst->queue_total, inst->copy_mode, inst->copy_range, inst->queue_dropped, inst->queue_user_dropped, inst->id_sequence, 1); return 0; } static const struct seq_operations nfqnl_seq_ops = { .start = seq_start, .next = seq_next, .stop = seq_stop, .show = seq_show, }; #endif /* PROC_FS */ static int __net_init nfnl_queue_net_init(struct net *net) { unsigned int i; struct nfnl_queue_net *q = nfnl_queue_pernet(net); for (i = 0; i < INSTANCE_BUCKETS; i++) INIT_HLIST_HEAD(&q->instance_table[i]); spin_lock_init(&q->instances_lock); #ifdef CONFIG_PROC_FS if (!proc_create_net("nfnetlink_queue", 0440, net->nf.proc_netfilter, &nfqnl_seq_ops, sizeof(struct iter_state))) return -ENOMEM; #endif return 0; } static void __net_exit nfnl_queue_net_exit(struct net *net) { struct nfnl_queue_net *q = nfnl_queue_pernet(net); unsigned int i; #ifdef CONFIG_PROC_FS remove_proc_entry("nfnetlink_queue", net->nf.proc_netfilter); #endif for (i = 0; i < INSTANCE_BUCKETS; i++) WARN_ON_ONCE(!hlist_empty(&q->instance_table[i])); } static struct pernet_operations nfnl_queue_net_ops = { .init = nfnl_queue_net_init, .exit = nfnl_queue_net_exit, .id = &nfnl_queue_net_id, .size = sizeof(struct nfnl_queue_net), }; static int __init nfnetlink_queue_init(void) { int status; status = register_pernet_subsys(&nfnl_queue_net_ops); if (status < 0) { pr_err("failed to register pernet ops\n"); goto out; } netlink_register_notifier(&nfqnl_rtnl_notifier); status = nfnetlink_subsys_register(&nfqnl_subsys); if (status < 0) { pr_err("failed to create netlink socket\n"); goto cleanup_netlink_notifier; } status = register_netdevice_notifier(&nfqnl_dev_notifier); if (status < 0) { pr_err("failed to register netdevice notifier\n"); goto cleanup_netlink_subsys; } nf_register_queue_handler(&nfqh); return status; cleanup_netlink_subsys: nfnetlink_subsys_unregister(&nfqnl_subsys); cleanup_netlink_notifier: netlink_unregister_notifier(&nfqnl_rtnl_notifier); unregister_pernet_subsys(&nfnl_queue_net_ops); out: return status; } static void __exit nfnetlink_queue_fini(void) { nf_unregister_queue_handler(); unregister_netdevice_notifier(&nfqnl_dev_notifier); nfnetlink_subsys_unregister(&nfqnl_subsys); netlink_unregister_notifier(&nfqnl_rtnl_notifier); unregister_pernet_subsys(&nfnl_queue_net_ops); rcu_barrier(); /* Wait for completion of call_rcu()'s */ } MODULE_DESCRIPTION("netfilter packet queue handler"); MODULE_AUTHOR("Harald Welte <laforge@netfilter.org>"); MODULE_LICENSE("GPL"); MODULE_ALIAS_NFNL_SUBSYS(NFNL_SUBSYS_QUEUE); module_init(nfnetlink_queue_init); module_exit(nfnetlink_queue_fini);
6 6 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 // SPDX-License-Identifier: GPL-2.0 /* * Shared Memory Communications over RDMA (SMC-R) and RoCE * * Generic netlink support functions to configure an SMC-R PNET table * * Copyright IBM Corp. 2016 * * Author(s): Thomas Richter <tmricht@linux.vnet.ibm.com> */ #include <linux/module.h> #include <linux/list.h> #include <linux/ctype.h> #include <linux/mutex.h> #include <net/netlink.h> #include <net/genetlink.h> #include <uapi/linux/if.h> #include <uapi/linux/smc.h> #include <rdma/ib_verbs.h> #include <net/netns/generic.h> #include "smc_netns.h" #include "smc_pnet.h" #include "smc_ib.h" #include "smc_ism.h" #include "smc_core.h" static struct net_device *__pnet_find_base_ndev(struct net_device *ndev); static struct net_device *pnet_find_base_ndev(struct net_device *ndev); static const struct nla_policy smc_pnet_policy[SMC_PNETID_MAX + 1] = { [SMC_PNETID_NAME] = { .type = NLA_NUL_STRING, .len = SMC_MAX_PNETID_LEN }, [SMC_PNETID_ETHNAME] = { .type = NLA_NUL_STRING, .len = IFNAMSIZ - 1 }, [SMC_PNETID_IBNAME] = { .type = NLA_NUL_STRING, .len = IB_DEVICE_NAME_MAX - 1 }, [SMC_PNETID_IBPORT] = { .type = NLA_U8 } }; static struct genl_family smc_pnet_nl_family; enum smc_pnet_nametype { SMC_PNET_ETH = 1, SMC_PNET_IB = 2, }; /* pnet entry stored in pnet table */ struct smc_pnetentry { struct list_head list; char pnet_name[SMC_MAX_PNETID_LEN + 1]; enum smc_pnet_nametype type; union { struct { char eth_name[IFNAMSIZ + 1]; struct net_device *ndev; netdevice_tracker dev_tracker; }; struct { char ib_name[IB_DEVICE_NAME_MAX + 1]; u8 ib_port; }; }; }; /* Check if the pnetid is set */ bool smc_pnet_is_pnetid_set(u8 *pnetid) { if (pnetid[0] == 0 || pnetid[0] == _S) return false; return true; } /* Check if two given pnetids match */ static bool smc_pnet_match(u8 *pnetid1, u8 *pnetid2) { int i; for (i = 0; i < SMC_MAX_PNETID_LEN; i++) { if ((pnetid1[i] == 0 || pnetid1[i] == _S) && (pnetid2[i] == 0 || pnetid2[i] == _S)) break; if (pnetid1[i] != pnetid2[i]) return false; } return true; } /* Remove a pnetid from the pnet table. */ static int smc_pnet_remove_by_pnetid(struct net *net, char *pnet_name) { struct smc_pnetentry *pnetelem, *tmp_pe; struct smc_pnettable *pnettable; struct smc_ib_device *ibdev; struct smcd_dev *smcd; struct smc_net *sn; int rc = -ENOENT; int ibport; /* get pnettable for namespace */ sn = net_generic(net, smc_net_id); pnettable = &sn->pnettable; /* remove table entry */ mutex_lock(&pnettable->lock); list_for_each_entry_safe(pnetelem, tmp_pe, &pnettable->pnetlist, list) { if (!pnet_name || smc_pnet_match(pnetelem->pnet_name, pnet_name)) { list_del(&pnetelem->list); if (pnetelem->type == SMC_PNET_ETH && pnetelem->ndev) { netdev_put(pnetelem->ndev, &pnetelem->dev_tracker); pr_warn_ratelimited("smc: net device %s " "erased user defined " "pnetid %.16s\n", pnetelem->eth_name, pnetelem->pnet_name); } kfree(pnetelem); rc = 0; } } mutex_unlock(&pnettable->lock); /* if this is not the initial namespace, stop here */ if (net != &init_net) return rc; /* remove ib devices */ mutex_lock(&smc_ib_devices.mutex); list_for_each_entry(ibdev, &smc_ib_devices.list, list) { for (ibport = 0; ibport < SMC_MAX_PORTS; ibport++) { if (ibdev->pnetid_by_user[ibport] && (!pnet_name || smc_pnet_match(pnet_name, ibdev->pnetid[ibport]))) { pr_warn_ratelimited("smc: ib device %s ibport " "%d erased user defined " "pnetid %.16s\n", ibdev->ibdev->name, ibport + 1, ibdev->pnetid[ibport]); memset(ibdev->pnetid[ibport], 0, SMC_MAX_PNETID_LEN); ibdev->pnetid_by_user[ibport] = false; rc = 0; } } } mutex_unlock(&smc_ib_devices.mutex); /* remove smcd devices */ mutex_lock(&smcd_dev_list.mutex); list_for_each_entry(smcd, &smcd_dev_list.list, list) { if (smcd->pnetid_by_user && (!pnet_name || smc_pnet_match(pnet_name, smcd->pnetid))) { pr_warn_ratelimited("smc: smcd device %s " "erased user defined pnetid " "%.16s\n", dev_name(smcd->ops->get_dev(smcd)), smcd->pnetid); memset(smcd->pnetid, 0, SMC_MAX_PNETID_LEN); smcd->pnetid_by_user = false; rc = 0; } } mutex_unlock(&smcd_dev_list.mutex); return rc; } /* Add the reference to a given network device to the pnet table. */ static int smc_pnet_add_by_ndev(struct net_device *ndev) { struct smc_pnetentry *pnetelem, *tmp_pe; struct smc_pnettable *pnettable; struct net *net = dev_net(ndev); struct smc_net *sn; int rc = -ENOENT; /* get pnettable for namespace */ sn = net_generic(net, smc_net_id); pnettable = &sn->pnettable; mutex_lock(&pnettable->lock); list_for_each_entry_safe(pnetelem, tmp_pe, &pnettable->pnetlist, list) { if (pnetelem->type == SMC_PNET_ETH && !pnetelem->ndev && !strncmp(pnetelem->eth_name, ndev->name, IFNAMSIZ)) { netdev_hold(ndev, &pnetelem->dev_tracker, GFP_ATOMIC); pnetelem->ndev = ndev; rc = 0; pr_warn_ratelimited("smc: adding net device %s with " "user defined pnetid %.16s\n", pnetelem->eth_name, pnetelem->pnet_name); break; } } mutex_unlock(&pnettable->lock); return rc; } /* Remove the reference to a given network device from the pnet table. */ static int smc_pnet_remove_by_ndev(struct net_device *ndev) { struct smc_pnetentry *pnetelem, *tmp_pe; struct smc_pnettable *pnettable; struct net *net = dev_net(ndev); struct smc_net *sn; int rc = -ENOENT; /* get pnettable for namespace */ sn = net_generic(net, smc_net_id); pnettable = &sn->pnettable; mutex_lock(&pnettable->lock); list_for_each_entry_safe(pnetelem, tmp_pe, &pnettable->pnetlist, list) { if (pnetelem->type == SMC_PNET_ETH && pnetelem->ndev == ndev) { netdev_put(pnetelem->ndev, &pnetelem->dev_tracker); pnetelem->ndev = NULL; rc = 0; pr_warn_ratelimited("smc: removing net device %s with " "user defined pnetid %.16s\n", pnetelem->eth_name, pnetelem->pnet_name); break; } } mutex_unlock(&pnettable->lock); return rc; } /* Apply pnetid to ib device when no pnetid is set. */ static bool smc_pnet_apply_ib(struct smc_ib_device *ib_dev, u8 ib_port, char *pnet_name) { bool applied = false; mutex_lock(&smc_ib_devices.mutex); if (!smc_pnet_is_pnetid_set(ib_dev->pnetid[ib_port - 1])) { memcpy(ib_dev->pnetid[ib_port - 1], pnet_name, SMC_MAX_PNETID_LEN); ib_dev->pnetid_by_user[ib_port - 1] = true; applied = true; } mutex_unlock(&smc_ib_devices.mutex); return applied; } /* Apply pnetid to smcd device when no pnetid is set. */ static bool smc_pnet_apply_smcd(struct smcd_dev *smcd_dev, char *pnet_name) { bool applied = false; mutex_lock(&smcd_dev_list.mutex); if (!smc_pnet_is_pnetid_set(smcd_dev->pnetid)) { memcpy(smcd_dev->pnetid, pnet_name, SMC_MAX_PNETID_LEN); smcd_dev->pnetid_by_user = true; applied = true; } mutex_unlock(&smcd_dev_list.mutex); return applied; } /* The limit for pnetid is 16 characters. * Valid characters should be (single-byte character set) a-z, A-Z, 0-9. * Lower case letters are converted to upper case. * Interior blanks should not be used. */ static bool smc_pnetid_valid(const char *pnet_name, char *pnetid) { char *bf = skip_spaces(pnet_name); size_t len = strlen(bf); char *end = bf + len; if (!len) return false; while (--end >= bf && isspace(*end)) ; if (end - bf >= SMC_MAX_PNETID_LEN) return false; while (bf <= end) { if (!isalnum(*bf)) return false; *pnetid++ = islower(*bf) ? toupper(*bf) : *bf; bf++; } *pnetid = '\0'; return true; } /* Find an infiniband device by a given name. The device might not exist. */ static struct smc_ib_device *smc_pnet_find_ib(char *ib_name) { struct smc_ib_device *ibdev; mutex_lock(&smc_ib_devices.mutex); list_for_each_entry(ibdev, &smc_ib_devices.list, list) { if (!strncmp(ibdev->ibdev->name, ib_name, sizeof(ibdev->ibdev->name)) || (ibdev->ibdev->dev.parent && !strncmp(dev_name(ibdev->ibdev->dev.parent), ib_name, IB_DEVICE_NAME_MAX - 1))) { goto out; } } ibdev = NULL; out: mutex_unlock(&smc_ib_devices.mutex); return ibdev; } /* Find an smcd device by a given name. The device might not exist. */ static struct smcd_dev *smc_pnet_find_smcd(char *smcd_name) { struct smcd_dev *smcd_dev; mutex_lock(&smcd_dev_list.mutex); list_for_each_entry(smcd_dev, &smcd_dev_list.list, list) { if (!strncmp(dev_name(smcd_dev->ops->get_dev(smcd_dev)), smcd_name, IB_DEVICE_NAME_MAX - 1)) goto out; } smcd_dev = NULL; out: mutex_unlock(&smcd_dev_list.mutex); return smcd_dev; } static int smc_pnet_add_eth(struct smc_pnettable *pnettable, struct net *net, char *eth_name, char *pnet_name) { struct smc_pnetentry *tmp_pe, *new_pe; struct net_device *ndev, *base_ndev; u8 ndev_pnetid[SMC_MAX_PNETID_LEN]; bool new_netdev; int rc; /* check if (base) netdev already has a pnetid. If there is one, we do * not want to add a pnet table entry */ rc = -EEXIST; ndev = dev_get_by_name(net, eth_name); /* dev_hold() */ if (ndev) { base_ndev = pnet_find_base_ndev(ndev); if (!smc_pnetid_by_dev_port(base_ndev->dev.parent, base_ndev->dev_port, ndev_pnetid)) goto out_put; } /* add a new netdev entry to the pnet table if there isn't one */ rc = -ENOMEM; new_pe = kzalloc(sizeof(*new_pe), GFP_KERNEL); if (!new_pe) goto out_put; new_pe->type = SMC_PNET_ETH; memcpy(new_pe->pnet_name, pnet_name, SMC_MAX_PNETID_LEN); strncpy(new_pe->eth_name, eth_name, IFNAMSIZ); rc = -EEXIST; new_netdev = true; mutex_lock(&pnettable->lock); list_for_each_entry(tmp_pe, &pnettable->pnetlist, list) { if (tmp_pe->type == SMC_PNET_ETH && !strncmp(tmp_pe->eth_name, eth_name, IFNAMSIZ)) { new_netdev = false; break; } } if (new_netdev) { if (ndev) { new_pe->ndev = ndev; netdev_tracker_alloc(ndev, &new_pe->dev_tracker, GFP_ATOMIC); } list_add_tail(&new_pe->list, &pnettable->pnetlist); mutex_unlock(&pnettable->lock); } else { mutex_unlock(&pnettable->lock); kfree(new_pe); goto out_put; } if (ndev) pr_warn_ratelimited("smc: net device %s " "applied user defined pnetid %.16s\n", new_pe->eth_name, new_pe->pnet_name); return 0; out_put: dev_put(ndev); return rc; } static int smc_pnet_add_ib(struct smc_pnettable *pnettable, char *ib_name, u8 ib_port, char *pnet_name) { struct smc_pnetentry *tmp_pe, *new_pe; struct smc_ib_device *ib_dev; bool smcddev_applied = true; bool ibdev_applied = true; struct smcd_dev *smcd; struct device *dev; bool new_ibdev; /* try to apply the pnetid to active devices */ ib_dev = smc_pnet_find_ib(ib_name); if (ib_dev) { ibdev_applied = smc_pnet_apply_ib(ib_dev, ib_port, pnet_name); if (ibdev_applied) pr_warn_ratelimited("smc: ib device %s ibport %d " "applied user defined pnetid " "%.16s\n", ib_dev->ibdev->name, ib_port, ib_dev->pnetid[ib_port - 1]); } smcd = smc_pnet_find_smcd(ib_name); if (smcd) { smcddev_applied = smc_pnet_apply_smcd(smcd, pnet_name); if (smcddev_applied) { dev = smcd->ops->get_dev(smcd); pr_warn_ratelimited("smc: smcd device %s " "applied user defined pnetid " "%.16s\n", dev_name(dev), smcd->pnetid); } } /* Apply fails when a device has a hardware-defined pnetid set, do not * add a pnet table entry in that case. */ if (!ibdev_applied || !smcddev_applied) return -EEXIST; /* add a new ib entry to the pnet table if there isn't one */ new_pe = kzalloc(sizeof(*new_pe), GFP_KERNEL); if (!new_pe) return -ENOMEM; new_pe->type = SMC_PNET_IB; memcpy(new_pe->pnet_name, pnet_name, SMC_MAX_PNETID_LEN); strncpy(new_pe->ib_name, ib_name, IB_DEVICE_NAME_MAX); new_pe->ib_port = ib_port; new_ibdev = true; mutex_lock(&pnettable->lock); list_for_each_entry(tmp_pe, &pnettable->pnetlist, list) { if (tmp_pe->type == SMC_PNET_IB && !strncmp(tmp_pe->ib_name, ib_name, IB_DEVICE_NAME_MAX)) { new_ibdev = false; break; } } if (new_ibdev) { list_add_tail(&new_pe->list, &pnettable->pnetlist); mutex_unlock(&pnettable->lock); } else { mutex_unlock(&pnettable->lock); kfree(new_pe); } return (new_ibdev) ? 0 : -EEXIST; } /* Append a pnetid to the end of the pnet table if not already on this list. */ static int smc_pnet_enter(struct net *net, struct nlattr *tb[]) { char pnet_name[SMC_MAX_PNETID_LEN + 1]; struct smc_pnettable *pnettable; bool new_netdev = false; bool new_ibdev = false; struct smc_net *sn; u8 ibport = 1; char *string; int rc; /* get pnettable for namespace */ sn = net_generic(net, smc_net_id); pnettable = &sn->pnettable; rc = -EINVAL; if (!tb[SMC_PNETID_NAME]) goto error; string = (char *)nla_data(tb[SMC_PNETID_NAME]); if (!smc_pnetid_valid(string, pnet_name)) goto error; if (tb[SMC_PNETID_ETHNAME]) { string = (char *)nla_data(tb[SMC_PNETID_ETHNAME]); rc = smc_pnet_add_eth(pnettable, net, string, pnet_name); if (!rc) new_netdev = true; else if (rc != -EEXIST) goto error; } /* if this is not the initial namespace, stop here */ if (net != &init_net) return new_netdev ? 0 : -EEXIST; rc = -EINVAL; if (tb[SMC_PNETID_IBNAME]) { string = (char *)nla_data(tb[SMC_PNETID_IBNAME]); string = strim(string); if (tb[SMC_PNETID_IBPORT]) { ibport = nla_get_u8(tb[SMC_PNETID_IBPORT]); if (ibport < 1 || ibport > SMC_MAX_PORTS) goto error; } rc = smc_pnet_add_ib(pnettable, string, ibport, pnet_name); if (!rc) new_ibdev = true; else if (rc != -EEXIST) goto error; } return (new_netdev || new_ibdev) ? 0 : -EEXIST; error: return rc; } /* Convert an smc_pnetentry to a netlink attribute sequence */ static int smc_pnet_set_nla(struct sk_buff *msg, struct smc_pnetentry *pnetelem) { if (nla_put_string(msg, SMC_PNETID_NAME, pnetelem->pnet_name)) return -1; if (pnetelem->type == SMC_PNET_ETH) { if (nla_put_string(msg, SMC_PNETID_ETHNAME, pnetelem->eth_name)) return -1; } else { if (nla_put_string(msg, SMC_PNETID_ETHNAME, "n/a")) return -1; } if (pnetelem->type == SMC_PNET_IB) { if (nla_put_string(msg, SMC_PNETID_IBNAME, pnetelem->ib_name) || nla_put_u8(msg, SMC_PNETID_IBPORT, pnetelem->ib_port)) return -1; } else { if (nla_put_string(msg, SMC_PNETID_IBNAME, "n/a") || nla_put_u8(msg, SMC_PNETID_IBPORT, 0xff)) return -1; } return 0; } static int smc_pnet_add(struct sk_buff *skb, struct genl_info *info) { struct net *net = genl_info_net(info); return smc_pnet_enter(net, info->attrs); } static int smc_pnet_del(struct sk_buff *skb, struct genl_info *info) { struct net *net = genl_info_net(info); if (!info->attrs[SMC_PNETID_NAME]) return -EINVAL; return smc_pnet_remove_by_pnetid(net, (char *)nla_data(info->attrs[SMC_PNETID_NAME])); } static int smc_pnet_dump_start(struct netlink_callback *cb) { cb->args[0] = 0; return 0; } static int smc_pnet_dumpinfo(struct sk_buff *skb, u32 portid, u32 seq, u32 flags, struct smc_pnetentry *pnetelem) { void *hdr; hdr = genlmsg_put(skb, portid, seq, &smc_pnet_nl_family, flags, SMC_PNETID_GET); if (!hdr) return -ENOMEM; if (smc_pnet_set_nla(skb, pnetelem) < 0) { genlmsg_cancel(skb, hdr); return -EMSGSIZE; } genlmsg_end(skb, hdr); return 0; } static int _smc_pnet_dump(struct net *net, struct sk_buff *skb, u32 portid, u32 seq, u8 *pnetid, int start_idx) { struct smc_pnettable *pnettable; struct smc_pnetentry *pnetelem; struct smc_net *sn; int idx = 0; /* get pnettable for namespace */ sn = net_generic(net, smc_net_id); pnettable = &sn->pnettable; /* dump pnettable entries */ mutex_lock(&pnettable->lock); list_for_each_entry(pnetelem, &pnettable->pnetlist, list) { if (pnetid && !smc_pnet_match(pnetelem->pnet_name, pnetid)) continue; if (idx++ < start_idx) continue; /* if this is not the initial namespace, dump only netdev */ if (net != &init_net && pnetelem->type != SMC_PNET_ETH) continue; if (smc_pnet_dumpinfo(skb, portid, seq, NLM_F_MULTI, pnetelem)) { --idx; break; } } mutex_unlock(&pnettable->lock); return idx; } static int smc_pnet_dump(struct sk_buff *skb, struct netlink_callback *cb) { struct net *net = sock_net(skb->sk); int idx; idx = _smc_pnet_dump(net, skb, NETLINK_CB(cb->skb).portid, cb->nlh->nlmsg_seq, NULL, cb->args[0]); cb->args[0] = idx; return skb->len; } /* Retrieve one PNETID entry */ static int smc_pnet_get(struct sk_buff *skb, struct genl_info *info) { struct net *net = genl_info_net(info); struct sk_buff *msg; void *hdr; if (!info->attrs[SMC_PNETID_NAME]) return -EINVAL; msg = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL); if (!msg) return -ENOMEM; _smc_pnet_dump(net, msg, info->snd_portid, info->snd_seq, nla_data(info->attrs[SMC_PNETID_NAME]), 0); /* finish multi part message and send it */ hdr = nlmsg_put(msg, info->snd_portid, info->snd_seq, NLMSG_DONE, 0, NLM_F_MULTI); if (!hdr) { nlmsg_free(msg); return -EMSGSIZE; } return genlmsg_reply(msg, info); } /* Remove and delete all pnetids from pnet table. */ static int smc_pnet_flush(struct sk_buff *skb, struct genl_info *info) { struct net *net = genl_info_net(info); smc_pnet_remove_by_pnetid(net, NULL); return 0; } /* SMC_PNETID generic netlink operation definition */ static const struct genl_ops smc_pnet_ops[] = { { .cmd = SMC_PNETID_GET, .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, /* can be retrieved by unprivileged users */ .doit = smc_pnet_get, .dumpit = smc_pnet_dump, .start = smc_pnet_dump_start }, { .cmd = SMC_PNETID_ADD, .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, .flags = GENL_ADMIN_PERM, .doit = smc_pnet_add }, { .cmd = SMC_PNETID_DEL, .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, .flags = GENL_ADMIN_PERM, .doit = smc_pnet_del }, { .cmd = SMC_PNETID_FLUSH, .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, .flags = GENL_ADMIN_PERM, .doit = smc_pnet_flush } }; /* SMC_PNETID family definition */ static struct genl_family smc_pnet_nl_family __ro_after_init = { .hdrsize = 0, .name = SMCR_GENL_FAMILY_NAME, .version = SMCR_GENL_FAMILY_VERSION, .maxattr = SMC_PNETID_MAX, .policy = smc_pnet_policy, .netnsok = true, .module = THIS_MODULE, .ops = smc_pnet_ops, .n_ops = ARRAY_SIZE(smc_pnet_ops), .resv_start_op = SMC_PNETID_FLUSH + 1, }; bool smc_pnet_is_ndev_pnetid(struct net *net, u8 *pnetid) { struct smc_net *sn = net_generic(net, smc_net_id); struct smc_pnetids_ndev_entry *pe; bool rc = false; read_lock(&sn->pnetids_ndev.lock); list_for_each_entry(pe, &sn->pnetids_ndev.list, list) { if (smc_pnet_match(pnetid, pe->pnetid)) { rc = true; goto unlock; } } unlock: read_unlock(&sn->pnetids_ndev.lock); return rc; } static int smc_pnet_add_pnetid(struct net *net, u8 *pnetid) { struct smc_net *sn = net_generic(net, smc_net_id); struct smc_pnetids_ndev_entry *pe, *pi; pe = kzalloc(sizeof(*pe), GFP_KERNEL); if (!pe) return -ENOMEM; write_lock(&sn->pnetids_ndev.lock); list_for_each_entry(pi, &sn->pnetids_ndev.list, list) { if (smc_pnet_match(pnetid, pi->pnetid)) { refcount_inc(&pi->refcnt); kfree(pe); goto unlock; } } refcount_set(&pe->refcnt, 1); memcpy(pe->pnetid, pnetid, SMC_MAX_PNETID_LEN); list_add_tail(&pe->list, &sn->pnetids_ndev.list); unlock: write_unlock(&sn->pnetids_ndev.lock); return 0; } static void smc_pnet_remove_pnetid(struct net *net, u8 *pnetid) { struct smc_net *sn = net_generic(net, smc_net_id); struct smc_pnetids_ndev_entry *pe, *pe2; write_lock(&sn->pnetids_ndev.lock); list_for_each_entry_safe(pe, pe2, &sn->pnetids_ndev.list, list) { if (smc_pnet_match(pnetid, pe->pnetid)) { if (refcount_dec_and_test(&pe->refcnt)) { list_del(&pe->list); kfree(pe); } break; } } write_unlock(&sn->pnetids_ndev.lock); } static void smc_pnet_add_base_pnetid(struct net *net, struct net_device *dev, u8 *ndev_pnetid) { struct net_device *base_dev; base_dev = __pnet_find_base_ndev(dev); if (base_dev->flags & IFF_UP && !smc_pnetid_by_dev_port(base_dev->dev.parent, base_dev->dev_port, ndev_pnetid)) { /* add to PNETIDs list */ smc_pnet_add_pnetid(net, ndev_pnetid); } } /* create initial list of netdevice pnetids */ static void smc_pnet_create_pnetids_list(struct net *net) { u8 ndev_pnetid[SMC_MAX_PNETID_LEN]; struct net_device *dev; /* Newly created netns do not have devices. * Do not even acquire rtnl. */ if (list_empty(&net->dev_base_head)) return; /* Note: This might not be needed, because smc_pnet_netdev_event() * is also calling smc_pnet_add_base_pnetid() when handling * NETDEV_UP event. */ rtnl_lock(); for_each_netdev(net, dev) smc_pnet_add_base_pnetid(net, dev, ndev_pnetid); rtnl_unlock(); } /* clean up list of netdevice pnetids */ static void smc_pnet_destroy_pnetids_list(struct net *net) { struct smc_net *sn = net_generic(net, smc_net_id); struct smc_pnetids_ndev_entry *pe, *temp_pe; write_lock(&sn->pnetids_ndev.lock); list_for_each_entry_safe(pe, temp_pe, &sn->pnetids_ndev.list, list) { list_del(&pe->list); kfree(pe); } write_unlock(&sn->pnetids_ndev.lock); } static int smc_pnet_netdev_event(struct notifier_block *this, unsigned long event, void *ptr) { struct net_device *event_dev = netdev_notifier_info_to_dev(ptr); struct net *net = dev_net(event_dev); u8 ndev_pnetid[SMC_MAX_PNETID_LEN]; switch (event) { case NETDEV_REBOOT: case NETDEV_UNREGISTER: smc_pnet_remove_by_ndev(event_dev); smc_ib_ndev_change(event_dev, event); return NOTIFY_OK; case NETDEV_REGISTER: smc_pnet_add_by_ndev(event_dev); smc_ib_ndev_change(event_dev, event); return NOTIFY_OK; case NETDEV_UP: smc_pnet_add_base_pnetid(net, event_dev, ndev_pnetid); return NOTIFY_OK; case NETDEV_DOWN: event_dev = __pnet_find_base_ndev(event_dev); if (!smc_pnetid_by_dev_port(event_dev->dev.parent, event_dev->dev_port, ndev_pnetid)) { /* remove from PNETIDs list */ smc_pnet_remove_pnetid(net, ndev_pnetid); } return NOTIFY_OK; default: return NOTIFY_DONE; } } static struct notifier_block smc_netdev_notifier = { .notifier_call = smc_pnet_netdev_event }; /* init network namespace */ int smc_pnet_net_init(struct net *net) { struct smc_net *sn = net_generic(net, smc_net_id); struct smc_pnettable *pnettable = &sn->pnettable; struct smc_pnetids_ndev *pnetids_ndev = &sn->pnetids_ndev; INIT_LIST_HEAD(&pnettable->pnetlist); mutex_init(&pnettable->lock); INIT_LIST_HEAD(&pnetids_ndev->list); rwlock_init(&pnetids_ndev->lock); smc_pnet_create_pnetids_list(net); return 0; } int __init smc_pnet_init(void) { int rc; rc = genl_register_family(&smc_pnet_nl_family); if (rc) return rc; rc = register_netdevice_notifier(&smc_netdev_notifier); if (rc) genl_unregister_family(&smc_pnet_nl_family); return rc; } /* exit network namespace */ void smc_pnet_net_exit(struct net *net) { /* flush pnet table */ smc_pnet_remove_by_pnetid(net, NULL); smc_pnet_destroy_pnetids_list(net); } void smc_pnet_exit(void) { unregister_netdevice_notifier(&smc_netdev_notifier); genl_unregister_family(&smc_pnet_nl_family); } static struct net_device *__pnet_find_base_ndev(struct net_device *ndev) { int i, nest_lvl; ASSERT_RTNL(); nest_lvl = ndev->lower_level; for (i = 0; i < nest_lvl; i++) { struct list_head *lower = &ndev->adj_list.lower; if (list_empty(lower)) break; lower = lower->next; ndev = netdev_lower_get_next(ndev, &lower); } return ndev; } /* Determine one base device for stacked net devices. * If the lower device level contains more than one devices * (for instance with bonding slaves), just the first device * is used to reach a base device. */ static struct net_device *pnet_find_base_ndev(struct net_device *ndev) { rtnl_lock(); ndev = __pnet_find_base_ndev(ndev); rtnl_unlock(); return ndev; } static int smc_pnet_find_ndev_pnetid_by_table(struct net_device *ndev, u8 *pnetid) { struct smc_pnettable *pnettable; struct net *net = dev_net(ndev); struct smc_pnetentry *pnetelem; struct smc_net *sn; int rc = -ENOENT; /* get pnettable for namespace */ sn = net_generic(net, smc_net_id); pnettable = &sn->pnettable; mutex_lock(&pnettable->lock); list_for_each_entry(pnetelem, &pnettable->pnetlist, list) { if (pnetelem->type == SMC_PNET_ETH && ndev == pnetelem->ndev) { /* get pnetid of netdev device */ memcpy(pnetid, pnetelem->pnet_name, SMC_MAX_PNETID_LEN); rc = 0; break; } } mutex_unlock(&pnettable->lock); return rc; } static int smc_pnet_determine_gid(struct smc_ib_device *ibdev, int i, struct smc_init_info *ini) { if (!ini->check_smcrv2 && !smc_ib_determine_gid(ibdev, i, ini->vlan_id, ini->ib_gid, NULL, NULL)) { ini->ib_dev = ibdev; ini->ib_port = i; return 0; } if (ini->check_smcrv2 && !smc_ib_determine_gid(ibdev, i, ini->vlan_id, ini->smcrv2.ib_gid_v2, NULL, &ini->smcrv2)) { ini->smcrv2.ib_dev_v2 = ibdev; ini->smcrv2.ib_port_v2 = i; return 0; } return -ENODEV; } /* find a roce device for the given pnetid */ static void _smc_pnet_find_roce_by_pnetid(u8 *pnet_id, struct smc_init_info *ini, struct smc_ib_device *known_dev, struct net *net) { struct smc_ib_device *ibdev; int i; mutex_lock(&smc_ib_devices.mutex); list_for_each_entry(ibdev, &smc_ib_devices.list, list) { if (ibdev == known_dev || !rdma_dev_access_netns(ibdev->ibdev, net)) continue; for (i = 1; i <= SMC_MAX_PORTS; i++) { if (!rdma_is_port_valid(ibdev->ibdev, i)) continue; if (smc_pnet_match(ibdev->pnetid[i - 1], pnet_id) && smc_ib_port_active(ibdev, i) && !test_bit(i - 1, ibdev->ports_going_away)) { if (!smc_pnet_determine_gid(ibdev, i, ini)) goto out; } } } out: mutex_unlock(&smc_ib_devices.mutex); } /* find alternate roce device with same pnet_id, vlan_id and net namespace */ void smc_pnet_find_alt_roce(struct smc_link_group *lgr, struct smc_init_info *ini, struct smc_ib_device *known_dev) { struct net *net = lgr->net; _smc_pnet_find_roce_by_pnetid(lgr->pnet_id, ini, known_dev, net); } /* if handshake network device belongs to a roce device, return its * IB device and port */ static void smc_pnet_find_rdma_dev(struct net_device *netdev, struct smc_init_info *ini) { struct net *net = dev_net(netdev); struct smc_ib_device *ibdev; mutex_lock(&smc_ib_devices.mutex); list_for_each_entry(ibdev, &smc_ib_devices.list, list) { struct net_device *ndev; int i; /* check rdma net namespace */ if (!rdma_dev_access_netns(ibdev->ibdev, net)) continue; for (i = 1; i <= SMC_MAX_PORTS; i++) { if (!rdma_is_port_valid(ibdev->ibdev, i)) continue; ndev = ib_device_get_netdev(ibdev->ibdev, i); if (!ndev) continue; dev_put(ndev); if (netdev == ndev && smc_ib_port_active(ibdev, i) && !test_bit(i - 1, ibdev->ports_going_away)) { if (!smc_pnet_determine_gid(ibdev, i, ini)) break; } } } mutex_unlock(&smc_ib_devices.mutex); } /* Determine the corresponding IB device port based on the hardware PNETID. * Searching stops at the first matching active IB device port with vlan_id * configured. * If nothing found, check pnetid table. * If nothing found, try to use handshake device */ static void smc_pnet_find_roce_by_pnetid(struct net_device *ndev, struct smc_init_info *ini) { u8 ndev_pnetid[SMC_MAX_PNETID_LEN]; struct net_device *base_ndev; struct net *net; base_ndev = pnet_find_base_ndev(ndev); net = dev_net(ndev); if (smc_pnetid_by_dev_port(base_ndev->dev.parent, base_ndev->dev_port, ndev_pnetid) && smc_pnet_find_ndev_pnetid_by_table(base_ndev, ndev_pnetid) && smc_pnet_find_ndev_pnetid_by_table(ndev, ndev_pnetid)) { smc_pnet_find_rdma_dev(base_ndev, ini); return; /* pnetid could not be determined */ } _smc_pnet_find_roce_by_pnetid(ndev_pnetid, ini, NULL, net); } static void smc_pnet_find_ism_by_pnetid(struct net_device *ndev, struct smc_init_info *ini) { u8 ndev_pnetid[SMC_MAX_PNETID_LEN]; struct smcd_dev *ismdev; ndev = pnet_find_base_ndev(ndev); if (smc_pnetid_by_dev_port(ndev->dev.parent, ndev->dev_port, ndev_pnetid) && smc_pnet_find_ndev_pnetid_by_table(ndev, ndev_pnetid)) return; /* pnetid could not be determined */ mutex_lock(&smcd_dev_list.mutex); list_for_each_entry(ismdev, &smcd_dev_list.list, list) { if (smc_pnet_match(ismdev->pnetid, ndev_pnetid) && !ismdev->going_away && (!ini->ism_peer_gid[0].gid || !smc_ism_cantalk(&ini->ism_peer_gid[0], ini->vlan_id, ismdev))) { ini->ism_dev[0] = ismdev; break; } } mutex_unlock(&smcd_dev_list.mutex); } /* PNET table analysis for a given sock: * determine ib_device and port belonging to used internal TCP socket * ethernet interface. */ void smc_pnet_find_roce_resource(struct sock *sk, struct smc_init_info *ini) { struct dst_entry *dst = sk_dst_get(sk); if (!dst) goto out; if (!dst->dev) goto out_rel; smc_pnet_find_roce_by_pnetid(dst->dev, ini); out_rel: dst_release(dst); out: return; } void smc_pnet_find_ism_resource(struct sock *sk, struct smc_init_info *ini) { struct dst_entry *dst = sk_dst_get(sk); ini->ism_dev[0] = NULL; if (!dst) goto out; if (!dst->dev) goto out_rel; smc_pnet_find_ism_by_pnetid(dst->dev, ini); out_rel: dst_release(dst); out: return; } /* Lookup and apply a pnet table entry to the given ib device. */ int smc_pnetid_by_table_ib(struct smc_ib_device *smcibdev, u8 ib_port) { char *ib_name = smcibdev->ibdev->name; struct smc_pnettable *pnettable; struct smc_pnetentry *tmp_pe; struct smc_net *sn; int rc = -ENOENT; /* get pnettable for init namespace */ sn = net_generic(&init_net, smc_net_id); pnettable = &sn->pnettable; mutex_lock(&pnettable->lock); list_for_each_entry(tmp_pe, &pnettable->pnetlist, list) { if (tmp_pe->type == SMC_PNET_IB && !strncmp(tmp_pe->ib_name, ib_name, IB_DEVICE_NAME_MAX) && tmp_pe->ib_port == ib_port) { smc_pnet_apply_ib(smcibdev, ib_port, tmp_pe->pnet_name); rc = 0; break; } } mutex_unlock(&pnettable->lock); return rc; } /* Lookup and apply a pnet table entry to the given smcd device. */ int smc_pnetid_by_table_smcd(struct smcd_dev *smcddev) { const char *ib_name = dev_name(smcddev->ops->get_dev(smcddev)); struct smc_pnettable *pnettable; struct smc_pnetentry *tmp_pe; struct smc_net *sn; int rc = -ENOENT; /* get pnettable for init namespace */ sn = net_generic(&init_net, smc_net_id); pnettable = &sn->pnettable; mutex_lock(&pnettable->lock); list_for_each_entry(tmp_pe, &pnettable->pnetlist, list) { if (tmp_pe->type == SMC_PNET_IB && !strncmp(tmp_pe->ib_name, ib_name, IB_DEVICE_NAME_MAX)) { smc_pnet_apply_smcd(smcddev, tmp_pe->pnet_name); rc = 0; break; } } mutex_unlock(&pnettable->lock); return rc; }
5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 6 6 6 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 1517 1518 1519 1520 1521 1522 1523 1524 1525 1526 1527 1528 1529 1530 1531 1532 1533 1534 1535 1536 1537 1538 1539 1540 1541 1542 1543 1544 1545 1546 1547 1548 1549 1550 1551 1552 1553 1554 1555 1556 1557 1558 1559 1560 1561 1562 1563 1564 1565 1566 1567 1568 1569 1570 1571 1572 1573 1574 1575 1576 1577 1578 1579 1580 1581 1582 1583 1584 1585 1586 1587 1588 1589 1590 1591 1592 1593 1594 1595 1596 1597 1598 1599 1600 1601 1602 1603 1604 1605 1606 1607 1608 1609 1610 1611 1612 1613 1614 1615 1616 1617 1618 1619 1620 1621 1622 1623 1624 1625 1626 1627 1628 1629 1630 1631 1632 1633 1634 1635 1636 1637 1638 1639 1640 1641 1642 1643 1644 1645 1646 1647 1648 1649 1650 1651 1652 1653 1654 1655 1656 1657 1658 1659 1660 1661 1662 1663 1664 1665 1666 1667 1668 1669 1670 1671 1672 1673 1674 1675 1676 1677 1678 1679 1680 1681 1682 1683 1684 1685 1686 1687 1688 1689 1690 1691 1692 1693 1694 1695 1696 1697 1698 1699 1700 1701 1702 1703 1704 1705 1706 1707 1708 1709 1710 1711 1712 1713 1714 1715 1716 1717 1718 1719 1720 1721 1722 1723 1724 1725 1726 1727 1728 1729 1730 1731 1732 1733 1734 1735 1736 1737 1738 1739 1740 1741 1742 1743 1744 1745 1746 1747 1748 1749 1750 1751 1752 1753 1754 1755 1756 1757 1758 1759 1760 1761 1762 1763 1764 1765 1766 1767 1768 1769 1770 1771 1772 1773 1774 1775 1776 1777 1778 1779 1780 1781 1782 1783 1784 1785 1786 1787 1788 1789 1790 1791 1792 1793 1794 1795 1796 1797 1798 1799 1800 1801 1802 1803 1804 1805 1806 1807 1808 1809 1810 1811 1812 1813 1814 1815 1816 1817 1818 1819 1820 1821 1822 1823 1824 1825 1826 1827 1828 1829 1830 1831 1832 1833 1834 1835 1836 1837 1838 1839 1840 1841 1842 1843 1844 1845 1846 1847 1848 1849 1850 1851 1852 1853 1854 1855 1856 1857 1858 1859 1860 1861 1862 1863 1864 1865 1866 1867 1868 1869 1870 1871 1872 1873 1874 1875 1876 1877 1878 1879 1880 1881 1882 1883 1884 1885 1886 1887 1888 1889 1890 1891 1892 1893 1894 1895 1896 1897 1898 1899 1900 1901 1902 1903 1904 1905 1906 1907 1908 1909 1910 1911 1912 1913 1914 1915 1916 1917 1918 1919 1920 1921 1922 1923 1924 1925 1926 1927 1928 1929 1930 1931 1932 1933 1934 1935 1936 1937 1938 1939 1940 1941 1942 1943 1944 1945 1946 1947 1948 1949 1950 1951 1952 1953 1954 1955 1956 1957 1958 1959 1960 1961 1962 1963 1964 1965 1966 1967 1968 1969 1970 1971 1972 1973 1974 1975 1976 1977 1978 1979 1980 1981 1982 1983 1984 1985 1986 1987 1988 1989 1990 1991 1992 1993 1994 1995 1996 1997 1998 1999 2000 2001 2002 2003 2004 2005 2006 2007 2008 2009 2010 2011 2012 2013 2014 2015 2016 2017 2018 2019 2020 2021 2022 2023 2024 2025 2026 2027 2028 2029 2030 2031 2032 2033 2034 2035 2036 2037 2038 2039 2040 2041 2042 2043 2044 2045 2046 2047 2048 2049 2050 2051 2052 2053 2054 2055 2056 2057 2058 2059 2060 2061 2062 2063 2064 2065 2066 2067 2068 2069 2070 2071 2072 2073 2074 2075 2076 2077 2078 2079 2080 2081 2082 2083 2084 2085 2086 2087 2088 2089 2090 2091 2092 2093 2094 2095 2096 2097 2098 2099 2100 2101 2102 2103 2104 2105 2106 2107 2108 2109 2110 2111 2112 2113 2114 2115 2116 2117 2118 2119 2120 2121 2122 2123 2124 2125 2126 2127 2128 2129 2130 2131 2132 2133 2134 2135 2136 2137 2138 2139 2140 2141 2142 2143 2144 2145 2146 2147 2148 2149 2150 2151 2152 2153 2154 2155 2156 2157 2158 2159 2160 2161 2162 2163 2164 2165 2166 2167 2168 2169 2170 2171 2172 2173 2174 2175 2176 2177 2178 2179 2180 2181 2182 2183 2184 2185 2186 2187 2188 2189 2190 2191 2192 2193 2194 2195 2196 2197 2198 2199 2200 2201 2202 2203 2204 2205 2206 2207 2208 2209 2210 2211 2212 2213 2214 2215 2216 2217 2218 2219 2220 2221 2222 2223 2224 2225 2226 2227 2228 2229 2230 2231 2232 2233 2234 2235 2236 2237 2238 2239 2240 2241 2242 2243 2244 2245 2246 2247 2248 2249 2250 2251 2252 2253 2254 2255 2256 2257 2258 2259 2260 2261 2262 2263 2264 2265 2266 2267 2268 2269 2270 2271 2272 2273 2274 2275 2276 2277 2278 2279 2280 2281 2282 2283 2284 2285 2286 2287 2288 2289 2290 2291 2292 2293 2294 2295 2296 2297 2298 2299 2300 2301 2302 2303 2304 2305 2306 2307 2308 2309 2310 2311 2312 2313 2314 2315 2316 2317 2318 2319 2320 2321 2322 2323 2324 2325 2326 2327 2328 2329 2330 2331 2332 2333 2334 2335 2336 2337 2338 2339 2340 2341 2342 2343 2344 2345 2346 2347 2348 2349 2350 2351 2352 2353 2354 2355 2356 2357 2358 2359 2360 2361 2362 2363 2364 2365 2366 2367 2368 2369 2370 2371 2372 2373 2374 2375 2376 2377 2378 2379 2380 2381 2382 2383 2384 2385 2386 2387 2388 2389 2390 2391 2392 2393 2394 2395 2396 2397 2398 2399 2400 2401 2402 2403 2404 2405 2406 2407 2408 2409 2410 2411 2412 2413 2414 2415 2416 2417 2418 2419 2420 2421 2422 2423 2424 2425 2426 2427 2428 2429 2430 2431 2432 2433 2434 2435 2436 2437 2438 2439 2440 2441 2442 2443 2444 2445 2446 2447 2448 2449 2450 2451 2452 2453 2454 2455 2456 2457 2458 2459 2460 2461 2462 2463 2464 2465 2466 2467 2468 2469 2470 2471 2472 2473 2474 2475 2476 2477 2478 2479 2480 2481 2482 2483 2484 2485 2486 2487 2488 2489 2490 2491 2492 2493 2494 2495 2496 2497 2498 2499 2500 2501 2502 2503 2504 2505 2506 2507 2508 2509 2510 2511 2512 2513 2514 2515 2516 2517 2518 2519 2520 2521 2522 2523 2524 2525 2526 2527 2528 2529 2530 2531 2532 2533 2534 2535 2536 2537 2538 2539 2540 2541 2542 2543 2544 2545 2546 2547 2548 2549 2550 2551 2552 2553 2554 2555 2556 2557 2558 2559 2560 2561 2562 2563 2564 2565 2566 2567 2568 2569 2570 2571 2572 2573 2574 2575 2576 2577 2578 2579 2580 2581 2582 2583 2584 2585 2586 2587 2588 2589 2590 2591 2592 2593 2594 2595 2596 2597 2598 2599 2600 2601 2602 2603 2604 2605 2606 2607 2608 2609 2610 2611 2612 2613 2614 2615 2616 2617 2618 2619 2620 2621 2622 2623 2624 2625 2626 2627 2628 2629 2630 2631 2632 2633 2634 2635 2636 2637 2638 2639 2640 2641 2642 2643 2644 2645 2646 2647 2648 2649 2650 2651 2652 2653 2654 2655 2656 2657 2658 2659 2660 2661 2662 2663 2664 2665 2666 2667 2668 2669 2670 2671 2672 2673 2674 2675 2676 2677 2678 2679 2680 2681 2682 2683 2684 2685 2686 2687 2688 2689 2690 2691 2692 2693 2694 2695 2696 2697 2698 2699 2700 2701 2702 2703 2704 2705 2706 2707 2708 2709 2710 2711 2712 2713 2714 2715 2716 2717 2718 2719 2720 2721 2722 2723 2724 2725 2726 2727 2728 2729 2730 2731 2732 2733 2734 2735 2736 2737 2738 2739 2740 2741 2742 2743 2744 2745 2746 2747 2748 2749 2750 2751 2752 2753 2754 2755 2756 2757 2758 2759 2760 2761 2762 2763 2764 2765 2766 2767 2768 2769 2770 2771 2772 2773 2774 2775 2776 2777 2778 2779 2780 2781 2782 2783 2784 2785 2786 2787 2788 2789 2790 2791 2792 2793 2794 2795 2796 2797 2798 2799 2800 2801 2802 2803 2804 2805 2806 2807 2808 2809 2810 2811 2812 2813 2814 2815 2816 2817 2818 2819 2820 2821 2822 2823 2824 2825 2826 2827 2828 2829 2830 2831 2832 2833 2834 2835 2836 2837 2838 2839 2840 2841 2842 2843 2844 2845 2846 2847 2848 2849 2850 2851 2852 2853 2854 2855 2856 2857 2858 2859 2860 2861 2862 2863 2864 2865 2866 2867 2868 2869 2870 2871 2872 2873 2874 2875 2876 2877 2878 2879 2880 2881 2882 2883 2884 2885 2886 2887 2888 2889 2890 2891 2892 2893 2894 2895 2896 2897 2898 2899 2900 2901 2902 2903 2904 2905 2906 2907 2908 2909 2910 2911 2912 2913 2914 2915 2916 2917 2918 2919 2920 2921 2922 2923 2924 2925 2926 2927 2928 2929 2930 2931 2932 2933 2934 2935 2936 2937 2938 2939 2940 2941 2942 2943 2944 2945 2946 2947 2948 2949 2950 2951 2952 2953 2954 2955 2956 2957 2958 2959 2960 2961 2962 2963 2964 2965 2966 2967 2968 2969 2970 2971 2972 2973 2974 2975 2976 2977 2978 2979 2980 2981 2982 2983 2984 2985 2986 2987 2988 2989 2990 2991 2992 2993 2994 2995 2996 2997 2998 2999 3000 3001 3002 3003 3004 3005 3006 3007 3008 3009 3010 3011 3012 3013 3014 3015 3016 3017 3018 3019 3020 3021 3022 3023 3024 3025 3026 3027 3028 3029 3030 3031 3032 3033 3034 3035 3036 3037 3038 3039 3040 3041 3042 3043 3044 3045 3046 3047 3048 3049 3050 3051 3052 3053 3054 3055 3056 3057 3058 3059 3060 3061 3062 3063 3064 3065 3066 3067 3068 3069 3070 3071 3072 3073 3074 3075 3076 3077 3078 3079 3080 3081 3082 3083 3084 3085 3086 3087 3088 3089 3090 3091 3092 3093 3094 3095 3096 3097 3098 3099 3100 3101 3102 3103 3104 3105 3106 3107 3108 3109 3110 3111 3112 3113 3114 3115 3116 3117 3118 3119 3120 3121 3122 3123 3124 3125 3126 3127 3128 3129 3130 3131 3132 3133 3134 3135 3136 3137 3138 3139 3140 3141 3142 3143 3144 3145 3146 3147 3148 3149 3150 3151 3152 3153 3154 3155 3156 3157 3158 3159 3160 3161 3162 3163 3164 3165 3166 3167 3168 3169 3170 3171 3172 3173 3174 3175 3176 3177 3178 3179 3180 3181 3182 3183 3184 3185 3186 3187 3188 3189 3190 3191 3192 3193 3194 3195 3196 3197 3198 3199 3200 3201 3202 3203 3204 3205 3206 3207 3208 3209 3210 3211 3212 3213 3214 3215 3216 3217 3218 3219 3220 3221 3222 3223 3224 3225 3226 3227 3228 3229 3230 3231 3232 3233 3234 3235 3236 3237 3238 3239 3240 3241 3242 3243 3244 3245 3246 3247 3248 3249 3250 3251 3252 3253 3254 3255 3256 3257 3258 // SPDX-License-Identifier: GPL-2.0-or-later /* * Multicast support for IPv6 * Linux INET6 implementation * * Authors: * Pedro Roque <roque@di.fc.ul.pt> * * Based on linux/ipv4/igmp.c and linux/ipv4/ip_sockglue.c */ /* Changes: * * yoshfuji : fix format of router-alert option * YOSHIFUJI Hideaki @USAGI: * Fixed source address for MLD message based on * <draft-ietf-magma-mld-source-05.txt>. * YOSHIFUJI Hideaki @USAGI: * - Ignore Queries for invalid addresses. * - MLD for link-local addresses. * David L Stevens <dlstevens@us.ibm.com>: * - MLDv2 support */ #include <linux/module.h> #include <linux/errno.h> #include <linux/types.h> #include <linux/string.h> #include <linux/socket.h> #include <linux/sockios.h> #include <linux/jiffies.h> #include <linux/net.h> #include <linux/in.h> #include <linux/in6.h> #include <linux/netdevice.h> #include <linux/if_addr.h> #include <linux/if_arp.h> #include <linux/route.h> #include <linux/rtnetlink.h> #include <linux/init.h> #include <linux/proc_fs.h> #include <linux/seq_file.h> #include <linux/slab.h> #include <linux/pkt_sched.h> #include <net/mld.h> #include <linux/workqueue.h> #include <linux/netfilter.h> #include <linux/netfilter_ipv6.h> #include <net/net_namespace.h> #include <net/netlink.h> #include <net/sock.h> #include <net/snmp.h> #include <net/ipv6.h> #include <net/protocol.h> #include <net/if_inet6.h> #include <net/ndisc.h> #include <net/addrconf.h> #include <net/ip6_route.h> #include <net/inet_common.h> #include <net/ip6_checksum.h> /* Ensure that we have struct in6_addr aligned on 32bit word. */ static int __mld2_query_bugs[] __attribute__((__unused__)) = { BUILD_BUG_ON_ZERO(offsetof(struct mld2_query, mld2q_srcs) % 4), BUILD_BUG_ON_ZERO(offsetof(struct mld2_report, mld2r_grec) % 4), BUILD_BUG_ON_ZERO(offsetof(struct mld2_grec, grec_mca) % 4) }; static struct workqueue_struct *mld_wq; static struct in6_addr mld2_all_mcr = MLD2_ALL_MCR_INIT; static void igmp6_join_group(struct ifmcaddr6 *ma); static void igmp6_leave_group(struct ifmcaddr6 *ma); static void mld_mca_work(struct work_struct *work); static void mld_ifc_event(struct inet6_dev *idev); static bool mld_in_v1_mode(const struct inet6_dev *idev); static int sf_setstate(struct ifmcaddr6 *pmc); static void sf_markstate(struct ifmcaddr6 *pmc); static void ip6_mc_clear_src(struct ifmcaddr6 *pmc); static int ip6_mc_del_src(struct inet6_dev *idev, const struct in6_addr *pmca, int sfmode, int sfcount, const struct in6_addr *psfsrc, int delta); static int ip6_mc_add_src(struct inet6_dev *idev, const struct in6_addr *pmca, int sfmode, int sfcount, const struct in6_addr *psfsrc, int delta); static int ip6_mc_leave_src(struct sock *sk, struct ipv6_mc_socklist *iml, struct inet6_dev *idev); static int __ipv6_dev_mc_inc(struct net_device *dev, const struct in6_addr *addr, unsigned int mode); #define MLD_QRV_DEFAULT 2 /* RFC3810, 9.2. Query Interval */ #define MLD_QI_DEFAULT (125 * HZ) /* RFC3810, 9.3. Query Response Interval */ #define MLD_QRI_DEFAULT (10 * HZ) /* RFC3810, 8.1 Query Version Distinctions */ #define MLD_V1_QUERY_LEN 24 #define MLD_V2_QUERY_LEN_MIN 28 #define IPV6_MLD_MAX_MSF 64 int sysctl_mld_max_msf __read_mostly = IPV6_MLD_MAX_MSF; int sysctl_mld_qrv __read_mostly = MLD_QRV_DEFAULT; /* * socket join on multicast group */ #define mc_dereference(e, idev) \ rcu_dereference_protected(e, lockdep_is_held(&(idev)->mc_lock)) #define sock_dereference(e, sk) \ rcu_dereference_protected(e, lockdep_sock_is_held(sk)) #define for_each_pmc_socklock(np, sk, pmc) \ for (pmc = sock_dereference((np)->ipv6_mc_list, sk); \ pmc; \ pmc = sock_dereference(pmc->next, sk)) #define for_each_pmc_rcu(np, pmc) \ for (pmc = rcu_dereference((np)->ipv6_mc_list); \ pmc; \ pmc = rcu_dereference(pmc->next)) #define for_each_psf_mclock(mc, psf) \ for (psf = mc_dereference((mc)->mca_sources, mc->idev); \ psf; \ psf = mc_dereference(psf->sf_next, mc->idev)) #define for_each_psf_rcu(mc, psf) \ for (psf = rcu_dereference((mc)->mca_sources); \ psf; \ psf = rcu_dereference(psf->sf_next)) #define for_each_psf_tomb(mc, psf) \ for (psf = mc_dereference((mc)->mca_tomb, mc->idev); \ psf; \ psf = mc_dereference(psf->sf_next, mc->idev)) #define for_each_mc_mclock(idev, mc) \ for (mc = mc_dereference((idev)->mc_list, idev); \ mc; \ mc = mc_dereference(mc->next, idev)) #define for_each_mc_rcu(idev, mc) \ for (mc = rcu_dereference((idev)->mc_list); \ mc; \ mc = rcu_dereference(mc->next)) #define for_each_mc_tomb(idev, mc) \ for (mc = mc_dereference((idev)->mc_tomb, idev); \ mc; \ mc = mc_dereference(mc->next, idev)) static int unsolicited_report_interval(struct inet6_dev *idev) { int iv; if (mld_in_v1_mode(idev)) iv = READ_ONCE(idev->cnf.mldv1_unsolicited_report_interval); else iv = READ_ONCE(idev->cnf.mldv2_unsolicited_report_interval); return iv > 0 ? iv : 1; } static int __ipv6_sock_mc_join(struct sock *sk, int ifindex, const struct in6_addr *addr, unsigned int mode) { struct net_device *dev = NULL; struct ipv6_mc_socklist *mc_lst; struct ipv6_pinfo *np = inet6_sk(sk); struct net *net = sock_net(sk); int err; ASSERT_RTNL(); if (!ipv6_addr_is_multicast(addr)) return -EINVAL; for_each_pmc_socklock(np, sk, mc_lst) { if ((ifindex == 0 || mc_lst->ifindex == ifindex) && ipv6_addr_equal(&mc_lst->addr, addr)) return -EADDRINUSE; } mc_lst = sock_kmalloc(sk, sizeof(struct ipv6_mc_socklist), GFP_KERNEL); if (!mc_lst) return -ENOMEM; mc_lst->next = NULL; mc_lst->addr = *addr; if (ifindex == 0) { struct rt6_info *rt; rt = rt6_lookup(net, addr, NULL, 0, NULL, 0); if (rt) { dev = rt->dst.dev; ip6_rt_put(rt); } } else dev = __dev_get_by_index(net, ifindex); if (!dev) { sock_kfree_s(sk, mc_lst, sizeof(*mc_lst)); return -ENODEV; } mc_lst->ifindex = dev->ifindex; mc_lst->sfmode = mode; RCU_INIT_POINTER(mc_lst->sflist, NULL); /* * now add/increase the group membership on the device */ err = __ipv6_dev_mc_inc(dev, addr, mode); if (err) { sock_kfree_s(sk, mc_lst, sizeof(*mc_lst)); return err; } mc_lst->next = np->ipv6_mc_list; rcu_assign_pointer(np->ipv6_mc_list, mc_lst); return 0; } int ipv6_sock_mc_join(struct sock *sk, int ifindex, const struct in6_addr *addr) { return __ipv6_sock_mc_join(sk, ifindex, addr, MCAST_EXCLUDE); } EXPORT_SYMBOL(ipv6_sock_mc_join); int ipv6_sock_mc_join_ssm(struct sock *sk, int ifindex, const struct in6_addr *addr, unsigned int mode) { return __ipv6_sock_mc_join(sk, ifindex, addr, mode); } /* * socket leave on multicast group */ int ipv6_sock_mc_drop(struct sock *sk, int ifindex, const struct in6_addr *addr) { struct ipv6_pinfo *np = inet6_sk(sk); struct ipv6_mc_socklist *mc_lst; struct ipv6_mc_socklist __rcu **lnk; struct net *net = sock_net(sk); ASSERT_RTNL(); if (!ipv6_addr_is_multicast(addr)) return -EINVAL; for (lnk = &np->ipv6_mc_list; (mc_lst = sock_dereference(*lnk, sk)) != NULL; lnk = &mc_lst->next) { if ((ifindex == 0 || mc_lst->ifindex == ifindex) && ipv6_addr_equal(&mc_lst->addr, addr)) { struct net_device *dev; *lnk = mc_lst->next; dev = __dev_get_by_index(net, mc_lst->ifindex); if (dev) { struct inet6_dev *idev = __in6_dev_get(dev); ip6_mc_leave_src(sk, mc_lst, idev); if (idev) __ipv6_dev_mc_dec(idev, &mc_lst->addr); } else { ip6_mc_leave_src(sk, mc_lst, NULL); } atomic_sub(sizeof(*mc_lst), &sk->sk_omem_alloc); kfree_rcu(mc_lst, rcu); return 0; } } return -EADDRNOTAVAIL; } EXPORT_SYMBOL(ipv6_sock_mc_drop); static struct inet6_dev *ip6_mc_find_dev_rtnl(struct net *net, const struct in6_addr *group, int ifindex) { struct net_device *dev = NULL; struct inet6_dev *idev = NULL; if (ifindex == 0) { struct rt6_info *rt = rt6_lookup(net, group, NULL, 0, NULL, 0); if (rt) { dev = rt->dst.dev; ip6_rt_put(rt); } } else { dev = __dev_get_by_index(net, ifindex); } if (!dev) return NULL; idev = __in6_dev_get(dev); if (!idev) return NULL; if (idev->dead) return NULL; return idev; } void __ipv6_sock_mc_close(struct sock *sk) { struct ipv6_pinfo *np = inet6_sk(sk); struct ipv6_mc_socklist *mc_lst; struct net *net = sock_net(sk); ASSERT_RTNL(); while ((mc_lst = sock_dereference(np->ipv6_mc_list, sk)) != NULL) { struct net_device *dev; np->ipv6_mc_list = mc_lst->next; dev = __dev_get_by_index(net, mc_lst->ifindex); if (dev) { struct inet6_dev *idev = __in6_dev_get(dev); ip6_mc_leave_src(sk, mc_lst, idev); if (idev) __ipv6_dev_mc_dec(idev, &mc_lst->addr); } else { ip6_mc_leave_src(sk, mc_lst, NULL); } atomic_sub(sizeof(*mc_lst), &sk->sk_omem_alloc); kfree_rcu(mc_lst, rcu); } } void ipv6_sock_mc_close(struct sock *sk) { struct ipv6_pinfo *np = inet6_sk(sk); if (!rcu_access_pointer(np->ipv6_mc_list)) return; rtnl_lock(); lock_sock(sk); __ipv6_sock_mc_close(sk); release_sock(sk); rtnl_unlock(); } int ip6_mc_source(int add, int omode, struct sock *sk, struct group_source_req *pgsr) { struct in6_addr *source, *group; struct ipv6_mc_socklist *pmc; struct inet6_dev *idev; struct ipv6_pinfo *inet6 = inet6_sk(sk); struct ip6_sf_socklist *psl; struct net *net = sock_net(sk); int i, j, rv; int leavegroup = 0; int err; source = &((struct sockaddr_in6 *)&pgsr->gsr_source)->sin6_addr; group = &((struct sockaddr_in6 *)&pgsr->gsr_group)->sin6_addr; if (!ipv6_addr_is_multicast(group)) return -EINVAL; idev = ip6_mc_find_dev_rtnl(net, group, pgsr->gsr_interface); if (!idev) return -ENODEV; err = -EADDRNOTAVAIL; mutex_lock(&idev->mc_lock); for_each_pmc_socklock(inet6, sk, pmc) { if (pgsr->gsr_interface && pmc->ifindex != pgsr->gsr_interface) continue; if (ipv6_addr_equal(&pmc->addr, group)) break; } if (!pmc) { /* must have a prior join */ err = -EINVAL; goto done; } /* if a source filter was set, must be the same mode as before */ if (rcu_access_pointer(pmc->sflist)) { if (pmc->sfmode != omode) { err = -EINVAL; goto done; } } else if (pmc->sfmode != omode) { /* allow mode switches for empty-set filters */ ip6_mc_add_src(idev, group, omode, 0, NULL, 0); ip6_mc_del_src(idev, group, pmc->sfmode, 0, NULL, 0); pmc->sfmode = omode; } psl = sock_dereference(pmc->sflist, sk); if (!add) { if (!psl) goto done; /* err = -EADDRNOTAVAIL */ rv = !0; for (i = 0; i < psl->sl_count; i++) { rv = !ipv6_addr_equal(&psl->sl_addr[i], source); if (rv == 0) break; } if (rv) /* source not found */ goto done; /* err = -EADDRNOTAVAIL */ /* special case - (INCLUDE, empty) == LEAVE_GROUP */ if (psl->sl_count == 1 && omode == MCAST_INCLUDE) { leavegroup = 1; goto done; } /* update the interface filter */ ip6_mc_del_src(idev, group, omode, 1, source, 1); for (j = i+1; j < psl->sl_count; j++) psl->sl_addr[j-1] = psl->sl_addr[j]; psl->sl_count--; err = 0; goto done; } /* else, add a new source to the filter */ if (psl && psl->sl_count >= sysctl_mld_max_msf) { err = -ENOBUFS; goto done; } if (!psl || psl->sl_count == psl->sl_max) { struct ip6_sf_socklist *newpsl; int count = IP6_SFBLOCK; if (psl) count += psl->sl_max; newpsl = sock_kmalloc(sk, struct_size(newpsl, sl_addr, count), GFP_KERNEL); if (!newpsl) { err = -ENOBUFS; goto done; } newpsl->sl_max = count; newpsl->sl_count = count - IP6_SFBLOCK; if (psl) { for (i = 0; i < psl->sl_count; i++) newpsl->sl_addr[i] = psl->sl_addr[i]; atomic_sub(struct_size(psl, sl_addr, psl->sl_max), &sk->sk_omem_alloc); } rcu_assign_pointer(pmc->sflist, newpsl); kfree_rcu(psl, rcu); psl = newpsl; } rv = 1; /* > 0 for insert logic below if sl_count is 0 */ for (i = 0; i < psl->sl_count; i++) { rv = !ipv6_addr_equal(&psl->sl_addr[i], source); if (rv == 0) /* There is an error in the address. */ goto done; } for (j = psl->sl_count-1; j >= i; j--) psl->sl_addr[j+1] = psl->sl_addr[j]; psl->sl_addr[i] = *source; psl->sl_count++; err = 0; /* update the interface list */ ip6_mc_add_src(idev, group, omode, 1, source, 1); done: mutex_unlock(&idev->mc_lock); if (leavegroup) err = ipv6_sock_mc_drop(sk, pgsr->gsr_interface, group); return err; } int ip6_mc_msfilter(struct sock *sk, struct group_filter *gsf, struct sockaddr_storage *list) { const struct in6_addr *group; struct ipv6_mc_socklist *pmc; struct inet6_dev *idev; struct ipv6_pinfo *inet6 = inet6_sk(sk); struct ip6_sf_socklist *newpsl, *psl; struct net *net = sock_net(sk); int leavegroup = 0; int i, err; group = &((struct sockaddr_in6 *)&gsf->gf_group)->sin6_addr; if (!ipv6_addr_is_multicast(group)) return -EINVAL; if (gsf->gf_fmode != MCAST_INCLUDE && gsf->gf_fmode != MCAST_EXCLUDE) return -EINVAL; idev = ip6_mc_find_dev_rtnl(net, group, gsf->gf_interface); if (!idev) return -ENODEV; err = 0; if (gsf->gf_fmode == MCAST_INCLUDE && gsf->gf_numsrc == 0) { leavegroup = 1; goto done; } for_each_pmc_socklock(inet6, sk, pmc) { if (pmc->ifindex != gsf->gf_interface) continue; if (ipv6_addr_equal(&pmc->addr, group)) break; } if (!pmc) { /* must have a prior join */ err = -EINVAL; goto done; } if (gsf->gf_numsrc) { newpsl = sock_kmalloc(sk, struct_size(newpsl, sl_addr, gsf->gf_numsrc), GFP_KERNEL); if (!newpsl) { err = -ENOBUFS; goto done; } newpsl->sl_max = newpsl->sl_count = gsf->gf_numsrc; for (i = 0; i < newpsl->sl_count; ++i, ++list) { struct sockaddr_in6 *psin6; psin6 = (struct sockaddr_in6 *)list; newpsl->sl_addr[i] = psin6->sin6_addr; } mutex_lock(&idev->mc_lock); err = ip6_mc_add_src(idev, group, gsf->gf_fmode, newpsl->sl_count, newpsl->sl_addr, 0); if (err) { mutex_unlock(&idev->mc_lock); sock_kfree_s(sk, newpsl, struct_size(newpsl, sl_addr, newpsl->sl_max)); goto done; } mutex_unlock(&idev->mc_lock); } else { newpsl = NULL; mutex_lock(&idev->mc_lock); ip6_mc_add_src(idev, group, gsf->gf_fmode, 0, NULL, 0); mutex_unlock(&idev->mc_lock); } mutex_lock(&idev->mc_lock); psl = sock_dereference(pmc->sflist, sk); if (psl) { ip6_mc_del_src(idev, group, pmc->sfmode, psl->sl_count, psl->sl_addr, 0); atomic_sub(struct_size(psl, sl_addr, psl->sl_max), &sk->sk_omem_alloc); } else { ip6_mc_del_src(idev, group, pmc->sfmode, 0, NULL, 0); } rcu_assign_pointer(pmc->sflist, newpsl); mutex_unlock(&idev->mc_lock); kfree_rcu(psl, rcu); pmc->sfmode = gsf->gf_fmode; err = 0; done: if (leavegroup) err = ipv6_sock_mc_drop(sk, gsf->gf_interface, group); return err; } int ip6_mc_msfget(struct sock *sk, struct group_filter *gsf, sockptr_t optval, size_t ss_offset) { struct ipv6_pinfo *inet6 = inet6_sk(sk); const struct in6_addr *group; struct ipv6_mc_socklist *pmc; struct ip6_sf_socklist *psl; unsigned int count; int i, copycount; group = &((struct sockaddr_in6 *)&gsf->gf_group)->sin6_addr; if (!ipv6_addr_is_multicast(group)) return -EINVAL; /* changes to the ipv6_mc_list require the socket lock and * rtnl lock. We have the socket lock, so reading the list is safe. */ for_each_pmc_socklock(inet6, sk, pmc) { if (pmc->ifindex != gsf->gf_interface) continue; if (ipv6_addr_equal(group, &pmc->addr)) break; } if (!pmc) /* must have a prior join */ return -EADDRNOTAVAIL; gsf->gf_fmode = pmc->sfmode; psl = sock_dereference(pmc->sflist, sk); count = psl ? psl->sl_count : 0; copycount = min(count, gsf->gf_numsrc); gsf->gf_numsrc = count; for (i = 0; i < copycount; i++) { struct sockaddr_in6 *psin6; struct sockaddr_storage ss; psin6 = (struct sockaddr_in6 *)&ss; memset(&ss, 0, sizeof(ss)); psin6->sin6_family = AF_INET6; psin6->sin6_addr = psl->sl_addr[i]; if (copy_to_sockptr_offset(optval, ss_offset, &ss, sizeof(ss))) return -EFAULT; ss_offset += sizeof(ss); } return 0; } bool inet6_mc_check(const struct sock *sk, const struct in6_addr *mc_addr, const struct in6_addr *src_addr) { const struct ipv6_pinfo *np = inet6_sk(sk); const struct ipv6_mc_socklist *mc; const struct ip6_sf_socklist *psl; bool rv = true; rcu_read_lock(); for_each_pmc_rcu(np, mc) { if (ipv6_addr_equal(&mc->addr, mc_addr)) break; } if (!mc) { rcu_read_unlock(); return inet6_test_bit(MC6_ALL, sk); } psl = rcu_dereference(mc->sflist); if (!psl) { rv = mc->sfmode == MCAST_EXCLUDE; } else { int i; for (i = 0; i < psl->sl_count; i++) { if (ipv6_addr_equal(&psl->sl_addr[i], src_addr)) break; } if (mc->sfmode == MCAST_INCLUDE && i >= psl->sl_count) rv = false; if (mc->sfmode == MCAST_EXCLUDE && i < psl->sl_count) rv = false; } rcu_read_unlock(); return rv; } /* called with mc_lock */ static void igmp6_group_added(struct ifmcaddr6 *mc) { struct net_device *dev = mc->idev->dev; char buf[MAX_ADDR_LEN]; if (IPV6_ADDR_MC_SCOPE(&mc->mca_addr) < IPV6_ADDR_SCOPE_LINKLOCAL) return; if (!(mc->mca_flags&MAF_LOADED)) { mc->mca_flags |= MAF_LOADED; if (ndisc_mc_map(&mc->mca_addr, buf, dev, 0) == 0) dev_mc_add(dev, buf); } if (!(dev->flags & IFF_UP) || (mc->mca_flags & MAF_NOREPORT)) return; if (mld_in_v1_mode(mc->idev)) { igmp6_join_group(mc); return; } /* else v2 */ /* Based on RFC3810 6.1, for newly added INCLUDE SSM, we * should not send filter-mode change record as the mode * should be from IN() to IN(A). */ if (mc->mca_sfmode == MCAST_EXCLUDE) mc->mca_crcount = mc->idev->mc_qrv; mld_ifc_event(mc->idev); } /* called with mc_lock */ static void igmp6_group_dropped(struct ifmcaddr6 *mc) { struct net_device *dev = mc->idev->dev; char buf[MAX_ADDR_LEN]; if (IPV6_ADDR_MC_SCOPE(&mc->mca_addr) < IPV6_ADDR_SCOPE_LINKLOCAL) return; if (mc->mca_flags&MAF_LOADED) { mc->mca_flags &= ~MAF_LOADED; if (ndisc_mc_map(&mc->mca_addr, buf, dev, 0) == 0) dev_mc_del(dev, buf); } if (mc->mca_flags & MAF_NOREPORT) return; if (!mc->idev->dead) igmp6_leave_group(mc); if (cancel_delayed_work(&mc->mca_work)) refcount_dec(&mc->mca_refcnt); } /* * deleted ifmcaddr6 manipulation * called with mc_lock */ static void mld_add_delrec(struct inet6_dev *idev, struct ifmcaddr6 *im) { struct ifmcaddr6 *pmc; /* this is an "ifmcaddr6" for convenience; only the fields below * are actually used. In particular, the refcnt and users are not * used for management of the delete list. Using the same structure * for deleted items allows change reports to use common code with * non-deleted or query-response MCA's. */ pmc = kzalloc(sizeof(*pmc), GFP_KERNEL); if (!pmc) return; pmc->idev = im->idev; in6_dev_hold(idev); pmc->mca_addr = im->mca_addr; pmc->mca_crcount = idev->mc_qrv; pmc->mca_sfmode = im->mca_sfmode; if (pmc->mca_sfmode == MCAST_INCLUDE) { struct ip6_sf_list *psf; rcu_assign_pointer(pmc->mca_tomb, mc_dereference(im->mca_tomb, idev)); rcu_assign_pointer(pmc->mca_sources, mc_dereference(im->mca_sources, idev)); RCU_INIT_POINTER(im->mca_tomb, NULL); RCU_INIT_POINTER(im->mca_sources, NULL); for_each_psf_mclock(pmc, psf) psf->sf_crcount = pmc->mca_crcount; } rcu_assign_pointer(pmc->next, idev->mc_tomb); rcu_assign_pointer(idev->mc_tomb, pmc); } /* called with mc_lock */ static void mld_del_delrec(struct inet6_dev *idev, struct ifmcaddr6 *im) { struct ip6_sf_list *psf, *sources, *tomb; struct in6_addr *pmca = &im->mca_addr; struct ifmcaddr6 *pmc, *pmc_prev; pmc_prev = NULL; for_each_mc_tomb(idev, pmc) { if (ipv6_addr_equal(&pmc->mca_addr, pmca)) break; pmc_prev = pmc; } if (pmc) { if (pmc_prev) rcu_assign_pointer(pmc_prev->next, pmc->next); else rcu_assign_pointer(idev->mc_tomb, pmc->next); } if (pmc) { im->idev = pmc->idev; if (im->mca_sfmode == MCAST_INCLUDE) { tomb = rcu_replace_pointer(im->mca_tomb, mc_dereference(pmc->mca_tomb, pmc->idev), lockdep_is_held(&im->idev->mc_lock)); rcu_assign_pointer(pmc->mca_tomb, tomb); sources = rcu_replace_pointer(im->mca_sources, mc_dereference(pmc->mca_sources, pmc->idev), lockdep_is_held(&im->idev->mc_lock)); rcu_assign_pointer(pmc->mca_sources, sources); for_each_psf_mclock(im, psf) psf->sf_crcount = idev->mc_qrv; } else { im->mca_crcount = idev->mc_qrv; } in6_dev_put(pmc->idev); ip6_mc_clear_src(pmc); kfree_rcu(pmc, rcu); } } /* called with mc_lock */ static void mld_clear_delrec(struct inet6_dev *idev) { struct ifmcaddr6 *pmc, *nextpmc; pmc = mc_dereference(idev->mc_tomb, idev); RCU_INIT_POINTER(idev->mc_tomb, NULL); for (; pmc; pmc = nextpmc) { nextpmc = mc_dereference(pmc->next, idev); ip6_mc_clear_src(pmc); in6_dev_put(pmc->idev); kfree_rcu(pmc, rcu); } /* clear dead sources, too */ for_each_mc_mclock(idev, pmc) { struct ip6_sf_list *psf, *psf_next; psf = mc_dereference(pmc->mca_tomb, idev); RCU_INIT_POINTER(pmc->mca_tomb, NULL); for (; psf; psf = psf_next) { psf_next = mc_dereference(psf->sf_next, idev); kfree_rcu(psf, rcu); } } } static void mld_clear_query(struct inet6_dev *idev) { struct sk_buff *skb; spin_lock_bh(&idev->mc_query_lock); while ((skb = __skb_dequeue(&idev->mc_query_queue))) kfree_skb(skb); spin_unlock_bh(&idev->mc_query_lock); } static void mld_clear_report(struct inet6_dev *idev) { struct sk_buff *skb; spin_lock_bh(&idev->mc_report_lock); while ((skb = __skb_dequeue(&idev->mc_report_queue))) kfree_skb(skb); spin_unlock_bh(&idev->mc_report_lock); } static void mca_get(struct ifmcaddr6 *mc) { refcount_inc(&mc->mca_refcnt); } static void ma_put(struct ifmcaddr6 *mc) { if (refcount_dec_and_test(&mc->mca_refcnt)) { in6_dev_put(mc->idev); kfree_rcu(mc, rcu); } } /* called with mc_lock */ static struct ifmcaddr6 *mca_alloc(struct inet6_dev *idev, const struct in6_addr *addr, unsigned int mode) { struct ifmcaddr6 *mc; mc = kzalloc(sizeof(*mc), GFP_KERNEL); if (!mc) return NULL; INIT_DELAYED_WORK(&mc->mca_work, mld_mca_work); mc->mca_addr = *addr; mc->idev = idev; /* reference taken by caller */ mc->mca_users = 1; /* mca_stamp should be updated upon changes */ mc->mca_cstamp = mc->mca_tstamp = jiffies; refcount_set(&mc->mca_refcnt, 1); mc->mca_sfmode = mode; mc->mca_sfcount[mode] = 1; if (ipv6_addr_is_ll_all_nodes(&mc->mca_addr) || IPV6_ADDR_MC_SCOPE(&mc->mca_addr) < IPV6_ADDR_SCOPE_LINKLOCAL) mc->mca_flags |= MAF_NOREPORT; return mc; } static void inet6_ifmcaddr_notify(struct net_device *dev, const struct ifmcaddr6 *ifmca, int event) { struct inet6_fill_args fillargs = { .portid = 0, .seq = 0, .event = event, .flags = 0, .netnsid = -1, .force_rt_scope_universe = true, }; struct net *net = dev_net(dev); struct sk_buff *skb; int err = -ENOMEM; skb = nlmsg_new(NLMSG_ALIGN(sizeof(struct ifaddrmsg)) + nla_total_size(sizeof(struct in6_addr)) + nla_total_size(sizeof(struct ifa_cacheinfo)), GFP_KERNEL); if (!skb) goto error; err = inet6_fill_ifmcaddr(skb, ifmca, &fillargs); if (err < 0) { WARN_ON_ONCE(err == -EMSGSIZE); nlmsg_free(skb); goto error; } rtnl_notify(skb, net, 0, RTNLGRP_IPV6_MCADDR, NULL, GFP_KERNEL); return; error: rtnl_set_sk_err(net, RTNLGRP_IPV6_MCADDR, err); } /* * device multicast group inc (add if not found) */ static int __ipv6_dev_mc_inc(struct net_device *dev, const struct in6_addr *addr, unsigned int mode) { struct ifmcaddr6 *mc; struct inet6_dev *idev; ASSERT_RTNL(); /* we need to take a reference on idev */ idev = in6_dev_get(dev); if (!idev) return -EINVAL; if (idev->dead) { in6_dev_put(idev); return -ENODEV; } mutex_lock(&idev->mc_lock); for_each_mc_mclock(idev, mc) { if (ipv6_addr_equal(&mc->mca_addr, addr)) { mc->mca_users++; ip6_mc_add_src(idev, &mc->mca_addr, mode, 0, NULL, 0); mutex_unlock(&idev->mc_lock); in6_dev_put(idev); return 0; } } mc = mca_alloc(idev, addr, mode); if (!mc) { mutex_unlock(&idev->mc_lock); in6_dev_put(idev); return -ENOMEM; } rcu_assign_pointer(mc->next, idev->mc_list); rcu_assign_pointer(idev->mc_list, mc); mca_get(mc); mld_del_delrec(idev, mc); igmp6_group_added(mc); inet6_ifmcaddr_notify(dev, mc, RTM_NEWMULTICAST); mutex_unlock(&idev->mc_lock); ma_put(mc); return 0; } int ipv6_dev_mc_inc(struct net_device *dev, const struct in6_addr *addr) { return __ipv6_dev_mc_inc(dev, addr, MCAST_EXCLUDE); } EXPORT_SYMBOL(ipv6_dev_mc_inc); /* * device multicast group del */ int __ipv6_dev_mc_dec(struct inet6_dev *idev, const struct in6_addr *addr) { struct ifmcaddr6 *ma, __rcu **map; ASSERT_RTNL(); mutex_lock(&idev->mc_lock); for (map = &idev->mc_list; (ma = mc_dereference(*map, idev)); map = &ma->next) { if (ipv6_addr_equal(&ma->mca_addr, addr)) { if (--ma->mca_users == 0) { *map = ma->next; igmp6_group_dropped(ma); inet6_ifmcaddr_notify(idev->dev, ma, RTM_DELMULTICAST); ip6_mc_clear_src(ma); mutex_unlock(&idev->mc_lock); ma_put(ma); return 0; } mutex_unlock(&idev->mc_lock); return 0; } } mutex_unlock(&idev->mc_lock); return -ENOENT; } int ipv6_dev_mc_dec(struct net_device *dev, const struct in6_addr *addr) { struct inet6_dev *idev; int err; ASSERT_RTNL(); idev = __in6_dev_get(dev); if (!idev) err = -ENODEV; else err = __ipv6_dev_mc_dec(idev, addr); return err; } EXPORT_SYMBOL(ipv6_dev_mc_dec); /* * check if the interface/address pair is valid */ bool ipv6_chk_mcast_addr(struct net_device *dev, const struct in6_addr *group, const struct in6_addr *src_addr) { struct inet6_dev *idev; struct ifmcaddr6 *mc; bool rv = false; rcu_read_lock(); idev = __in6_dev_get(dev); if (!idev) goto unlock; for_each_mc_rcu(idev, mc) { if (ipv6_addr_equal(&mc->mca_addr, group)) break; } if (!mc) goto unlock; if (src_addr && !ipv6_addr_any(src_addr)) { struct ip6_sf_list *psf; for_each_psf_rcu(mc, psf) { if (ipv6_addr_equal(&psf->sf_addr, src_addr)) break; } if (psf) rv = READ_ONCE(psf->sf_count[MCAST_INCLUDE]) || READ_ONCE(psf->sf_count[MCAST_EXCLUDE]) != READ_ONCE(mc->mca_sfcount[MCAST_EXCLUDE]); else rv = READ_ONCE(mc->mca_sfcount[MCAST_EXCLUDE]) != 0; } else { rv = true; /* don't filter unspecified source */ } unlock: rcu_read_unlock(); return rv; } /* called with mc_lock */ static void mld_gq_start_work(struct inet6_dev *idev) { unsigned long tv = get_random_u32_below(idev->mc_maxdelay); idev->mc_gq_running = 1; if (!mod_delayed_work(mld_wq, &idev->mc_gq_work, tv + 2)) in6_dev_hold(idev); } /* called with mc_lock */ static void mld_gq_stop_work(struct inet6_dev *idev) { idev->mc_gq_running = 0; if (cancel_delayed_work(&idev->mc_gq_work)) __in6_dev_put(idev); } /* called with mc_lock */ static void mld_ifc_start_work(struct inet6_dev *idev, unsigned long delay) { unsigned long tv = get_random_u32_below(delay); if (!mod_delayed_work(mld_wq, &idev->mc_ifc_work, tv + 2)) in6_dev_hold(idev); } /* called with mc_lock */ static void mld_ifc_stop_work(struct inet6_dev *idev) { idev->mc_ifc_count = 0; if (cancel_delayed_work(&idev->mc_ifc_work)) __in6_dev_put(idev); } /* called with mc_lock */ static void mld_dad_start_work(struct inet6_dev *idev, unsigned long delay) { unsigned long tv = get_random_u32_below(delay); if (!mod_delayed_work(mld_wq, &idev->mc_dad_work, tv + 2)) in6_dev_hold(idev); } static void mld_dad_stop_work(struct inet6_dev *idev) { if (cancel_delayed_work(&idev->mc_dad_work)) __in6_dev_put(idev); } static void mld_query_stop_work(struct inet6_dev *idev) { spin_lock_bh(&idev->mc_query_lock); if (cancel_delayed_work(&idev->mc_query_work)) __in6_dev_put(idev); spin_unlock_bh(&idev->mc_query_lock); } static void mld_report_stop_work(struct inet6_dev *idev) { if (cancel_delayed_work_sync(&idev->mc_report_work)) __in6_dev_put(idev); } /* * IGMP handling (alias multicast ICMPv6 messages) * called with mc_lock */ static void igmp6_group_queried(struct ifmcaddr6 *ma, unsigned long resptime) { unsigned long delay = resptime; /* Do not start work for these addresses */ if (ipv6_addr_is_ll_all_nodes(&ma->mca_addr) || IPV6_ADDR_MC_SCOPE(&ma->mca_addr) < IPV6_ADDR_SCOPE_LINKLOCAL) return; if (cancel_delayed_work(&ma->mca_work)) { refcount_dec(&ma->mca_refcnt); delay = ma->mca_work.timer.expires - jiffies; } if (delay >= resptime) delay = get_random_u32_below(resptime); if (!mod_delayed_work(mld_wq, &ma->mca_work, delay)) refcount_inc(&ma->mca_refcnt); ma->mca_flags |= MAF_TIMER_RUNNING; } /* mark EXCLUDE-mode sources * called with mc_lock */ static bool mld_xmarksources(struct ifmcaddr6 *pmc, int nsrcs, const struct in6_addr *srcs) { struct ip6_sf_list *psf; int i, scount; scount = 0; for_each_psf_mclock(pmc, psf) { if (scount == nsrcs) break; for (i = 0; i < nsrcs; i++) { /* skip inactive filters */ if (psf->sf_count[MCAST_INCLUDE] || pmc->mca_sfcount[MCAST_EXCLUDE] != psf->sf_count[MCAST_EXCLUDE]) break; if (ipv6_addr_equal(&srcs[i], &psf->sf_addr)) { scount++; break; } } } pmc->mca_flags &= ~MAF_GSQUERY; if (scount == nsrcs) /* all sources excluded */ return false; return true; } /* called with mc_lock */ static bool mld_marksources(struct ifmcaddr6 *pmc, int nsrcs, const struct in6_addr *srcs) { struct ip6_sf_list *psf; int i, scount; if (pmc->mca_sfmode == MCAST_EXCLUDE) return mld_xmarksources(pmc, nsrcs, srcs); /* mark INCLUDE-mode sources */ scount = 0; for_each_psf_mclock(pmc, psf) { if (scount == nsrcs) break; for (i = 0; i < nsrcs; i++) { if (ipv6_addr_equal(&srcs[i], &psf->sf_addr)) { psf->sf_gsresp = 1; scount++; break; } } } if (!scount) { pmc->mca_flags &= ~MAF_GSQUERY; return false; } pmc->mca_flags |= MAF_GSQUERY; return true; } static int mld_force_mld_version(const struct inet6_dev *idev) { const struct net *net = dev_net(idev->dev); int all_force; all_force = READ_ONCE(net->ipv6.devconf_all->force_mld_version); /* Normally, both are 0 here. If enforcement to a particular is * being used, individual device enforcement will have a lower * precedence over 'all' device (.../conf/all/force_mld_version). */ return all_force ?: READ_ONCE(idev->cnf.force_mld_version); } static bool mld_in_v2_mode_only(const struct inet6_dev *idev) { return mld_force_mld_version(idev) == 2; } static bool mld_in_v1_mode_only(const struct inet6_dev *idev) { return mld_force_mld_version(idev) == 1; } static bool mld_in_v1_mode(const struct inet6_dev *idev) { if (mld_in_v2_mode_only(idev)) return false; if (mld_in_v1_mode_only(idev)) return true; if (idev->mc_v1_seen && time_before(jiffies, idev->mc_v1_seen)) return true; return false; } static void mld_set_v1_mode(struct inet6_dev *idev) { /* RFC3810, relevant sections: * - 9.1. Robustness Variable * - 9.2. Query Interval * - 9.3. Query Response Interval * - 9.12. Older Version Querier Present Timeout */ unsigned long switchback; switchback = (idev->mc_qrv * idev->mc_qi) + idev->mc_qri; idev->mc_v1_seen = jiffies + switchback; } static void mld_update_qrv(struct inet6_dev *idev, const struct mld2_query *mlh2) { /* RFC3810, relevant sections: * - 5.1.8. QRV (Querier's Robustness Variable) * - 9.1. Robustness Variable */ /* The value of the Robustness Variable MUST NOT be zero, * and SHOULD NOT be one. Catch this here if we ever run * into such a case in future. */ const int min_qrv = min(MLD_QRV_DEFAULT, sysctl_mld_qrv); WARN_ON(idev->mc_qrv == 0); if (mlh2->mld2q_qrv > 0) idev->mc_qrv = mlh2->mld2q_qrv; if (unlikely(idev->mc_qrv < min_qrv)) { net_warn_ratelimited("IPv6: MLD: clamping QRV from %u to %u!\n", idev->mc_qrv, min_qrv); idev->mc_qrv = min_qrv; } } static void mld_update_qi(struct inet6_dev *idev, const struct mld2_query *mlh2) { /* RFC3810, relevant sections: * - 5.1.9. QQIC (Querier's Query Interval Code) * - 9.2. Query Interval * - 9.12. Older Version Querier Present Timeout * (the [Query Interval] in the last Query received) */ unsigned long mc_qqi; if (mlh2->mld2q_qqic < 128) { mc_qqi = mlh2->mld2q_qqic; } else { unsigned long mc_man, mc_exp; mc_exp = MLDV2_QQIC_EXP(mlh2->mld2q_qqic); mc_man = MLDV2_QQIC_MAN(mlh2->mld2q_qqic); mc_qqi = (mc_man | 0x10) << (mc_exp + 3); } idev->mc_qi = mc_qqi * HZ; } static void mld_update_qri(struct inet6_dev *idev, const struct mld2_query *mlh2) { /* RFC3810, relevant sections: * - 5.1.3. Maximum Response Code * - 9.3. Query Response Interval */ idev->mc_qri = msecs_to_jiffies(mldv2_mrc(mlh2)); } static int mld_process_v1(struct inet6_dev *idev, struct mld_msg *mld, unsigned long *max_delay, bool v1_query) { unsigned long mldv1_md; /* Ignore v1 queries */ if (mld_in_v2_mode_only(idev)) return -EINVAL; mldv1_md = ntohs(mld->mld_maxdelay); /* When in MLDv1 fallback and a MLDv2 router start-up being * unaware of current MLDv1 operation, the MRC == MRD mapping * only works when the exponential algorithm is not being * used (as MLDv1 is unaware of such things). * * According to the RFC author, the MLDv2 implementations * he's aware of all use a MRC < 32768 on start up queries. * * Thus, should we *ever* encounter something else larger * than that, just assume the maximum possible within our * reach. */ if (!v1_query) mldv1_md = min(mldv1_md, MLDV1_MRD_MAX_COMPAT); *max_delay = max(msecs_to_jiffies(mldv1_md), 1UL); /* MLDv1 router present: we need to go into v1 mode *only* * when an MLDv1 query is received as per section 9.12. of * RFC3810! And we know from RFC2710 section 3.7 that MLDv1 * queries MUST be of exactly 24 octets. */ if (v1_query) mld_set_v1_mode(idev); /* cancel MLDv2 report work */ mld_gq_stop_work(idev); /* cancel the interface change work */ mld_ifc_stop_work(idev); /* clear deleted report items */ mld_clear_delrec(idev); return 0; } static void mld_process_v2(struct inet6_dev *idev, struct mld2_query *mld, unsigned long *max_delay) { *max_delay = max(msecs_to_jiffies(mldv2_mrc(mld)), 1UL); mld_update_qrv(idev, mld); mld_update_qi(idev, mld); mld_update_qri(idev, mld); idev->mc_maxdelay = *max_delay; return; } /* called with rcu_read_lock() */ void igmp6_event_query(struct sk_buff *skb) { struct inet6_dev *idev = __in6_dev_get(skb->dev); if (!idev || idev->dead) goto out; spin_lock_bh(&idev->mc_query_lock); if (skb_queue_len(&idev->mc_query_queue) < MLD_MAX_SKBS) { __skb_queue_tail(&idev->mc_query_queue, skb); if (!mod_delayed_work(mld_wq, &idev->mc_query_work, 0)) in6_dev_hold(idev); skb = NULL; } spin_unlock_bh(&idev->mc_query_lock); out: kfree_skb(skb); } static void __mld_query_work(struct sk_buff *skb) { struct mld2_query *mlh2 = NULL; const struct in6_addr *group; unsigned long max_delay; struct inet6_dev *idev; struct ifmcaddr6 *ma; struct mld_msg *mld; int group_type; int mark = 0; int len, err; if (!pskb_may_pull(skb, sizeof(struct in6_addr))) goto kfree_skb; /* compute payload length excluding extension headers */ len = ntohs(ipv6_hdr(skb)->payload_len) + sizeof(struct ipv6hdr); len -= skb_network_header_len(skb); /* RFC3810 6.2 * Upon reception of an MLD message that contains a Query, the node * checks if the source address of the message is a valid link-local * address, if the Hop Limit is set to 1, and if the Router Alert * option is present in the Hop-By-Hop Options header of the IPv6 * packet. If any of these checks fails, the packet is dropped. */ if (!(ipv6_addr_type(&ipv6_hdr(skb)->saddr) & IPV6_ADDR_LINKLOCAL) || ipv6_hdr(skb)->hop_limit != 1 || !(IP6CB(skb)->flags & IP6SKB_ROUTERALERT) || IP6CB(skb)->ra != htons(IPV6_OPT_ROUTERALERT_MLD)) goto kfree_skb; idev = in6_dev_get(skb->dev); if (!idev) goto kfree_skb; mld = (struct mld_msg *)icmp6_hdr(skb); group = &mld->mld_mca; group_type = ipv6_addr_type(group); if (group_type != IPV6_ADDR_ANY && !(group_type&IPV6_ADDR_MULTICAST)) goto out; if (len < MLD_V1_QUERY_LEN) { goto out; } else if (len == MLD_V1_QUERY_LEN || mld_in_v1_mode(idev)) { err = mld_process_v1(idev, mld, &max_delay, len == MLD_V1_QUERY_LEN); if (err < 0) goto out; } else if (len >= MLD_V2_QUERY_LEN_MIN) { int srcs_offset = sizeof(struct mld2_query) - sizeof(struct icmp6hdr); if (!pskb_may_pull(skb, srcs_offset)) goto out; mlh2 = (struct mld2_query *)skb_transport_header(skb); mld_process_v2(idev, mlh2, &max_delay); if (group_type == IPV6_ADDR_ANY) { /* general query */ if (mlh2->mld2q_nsrcs) goto out; /* no sources allowed */ mld_gq_start_work(idev); goto out; } /* mark sources to include, if group & source-specific */ if (mlh2->mld2q_nsrcs != 0) { if (!pskb_may_pull(skb, srcs_offset + ntohs(mlh2->mld2q_nsrcs) * sizeof(struct in6_addr))) goto out; mlh2 = (struct mld2_query *)skb_transport_header(skb); mark = 1; } } else { goto out; } if (group_type == IPV6_ADDR_ANY) { for_each_mc_mclock(idev, ma) { igmp6_group_queried(ma, max_delay); } } else { for_each_mc_mclock(idev, ma) { if (!ipv6_addr_equal(group, &ma->mca_addr)) continue; if (ma->mca_flags & MAF_TIMER_RUNNING) { /* gsquery <- gsquery && mark */ if (!mark) ma->mca_flags &= ~MAF_GSQUERY; } else { /* gsquery <- mark */ if (mark) ma->mca_flags |= MAF_GSQUERY; else ma->mca_flags &= ~MAF_GSQUERY; } if (!(ma->mca_flags & MAF_GSQUERY) || mld_marksources(ma, ntohs(mlh2->mld2q_nsrcs), mlh2->mld2q_srcs)) igmp6_group_queried(ma, max_delay); break; } } out: in6_dev_put(idev); kfree_skb: consume_skb(skb); } static void mld_query_work(struct work_struct *work) { struct inet6_dev *idev = container_of(to_delayed_work(work), struct inet6_dev, mc_query_work); struct sk_buff_head q; struct sk_buff *skb; bool rework = false; int cnt = 0; skb_queue_head_init(&q); spin_lock_bh(&idev->mc_query_lock); while ((skb = __skb_dequeue(&idev->mc_query_queue))) { __skb_queue_tail(&q, skb); if (++cnt >= MLD_MAX_QUEUE) { rework = true; break; } } spin_unlock_bh(&idev->mc_query_lock); mutex_lock(&idev->mc_lock); while ((skb = __skb_dequeue(&q))) __mld_query_work(skb); mutex_unlock(&idev->mc_lock); if (rework && queue_delayed_work(mld_wq, &idev->mc_query_work, 0)) return; in6_dev_put(idev); } /* called with rcu_read_lock() */ void igmp6_event_report(struct sk_buff *skb) { struct inet6_dev *idev = __in6_dev_get(skb->dev); if (!idev || idev->dead) goto out; spin_lock_bh(&idev->mc_report_lock); if (skb_queue_len(&idev->mc_report_queue) < MLD_MAX_SKBS) { __skb_queue_tail(&idev->mc_report_queue, skb); if (!mod_delayed_work(mld_wq, &idev->mc_report_work, 0)) in6_dev_hold(idev); skb = NULL; } spin_unlock_bh(&idev->mc_report_lock); out: kfree_skb(skb); } static void __mld_report_work(struct sk_buff *skb) { struct inet6_dev *idev; struct ifmcaddr6 *ma; struct mld_msg *mld; int addr_type; /* Our own report looped back. Ignore it. */ if (skb->pkt_type == PACKET_LOOPBACK) goto kfree_skb; /* send our report if the MC router may not have heard this report */ if (skb->pkt_type != PACKET_MULTICAST && skb->pkt_type != PACKET_BROADCAST) goto kfree_skb; if (!pskb_may_pull(skb, sizeof(*mld) - sizeof(struct icmp6hdr))) goto kfree_skb; mld = (struct mld_msg *)icmp6_hdr(skb); /* Drop reports with not link local source */ addr_type = ipv6_addr_type(&ipv6_hdr(skb)->saddr); if (addr_type != IPV6_ADDR_ANY && !(addr_type&IPV6_ADDR_LINKLOCAL)) goto kfree_skb; idev = in6_dev_get(skb->dev); if (!idev) goto kfree_skb; /* * Cancel the work for this group */ for_each_mc_mclock(idev, ma) { if (ipv6_addr_equal(&ma->mca_addr, &mld->mld_mca)) { if (cancel_delayed_work(&ma->mca_work)) refcount_dec(&ma->mca_refcnt); ma->mca_flags &= ~(MAF_LAST_REPORTER | MAF_TIMER_RUNNING); break; } } in6_dev_put(idev); kfree_skb: consume_skb(skb); } static void mld_report_work(struct work_struct *work) { struct inet6_dev *idev = container_of(to_delayed_work(work), struct inet6_dev, mc_report_work); struct sk_buff_head q; struct sk_buff *skb; bool rework = false; int cnt = 0; skb_queue_head_init(&q); spin_lock_bh(&idev->mc_report_lock); while ((skb = __skb_dequeue(&idev->mc_report_queue))) { __skb_queue_tail(&q, skb); if (++cnt >= MLD_MAX_QUEUE) { rework = true; break; } } spin_unlock_bh(&idev->mc_report_lock); mutex_lock(&idev->mc_lock); while ((skb = __skb_dequeue(&q))) __mld_report_work(skb); mutex_unlock(&idev->mc_lock); if (rework && queue_delayed_work(mld_wq, &idev->mc_report_work, 0)) return; in6_dev_put(idev); } static bool is_in(struct ifmcaddr6 *pmc, struct ip6_sf_list *psf, int type, int gdeleted, int sdeleted) { switch (type) { case MLD2_MODE_IS_INCLUDE: case MLD2_MODE_IS_EXCLUDE: if (gdeleted || sdeleted) return false; if (!((pmc->mca_flags & MAF_GSQUERY) && !psf->sf_gsresp)) { if (pmc->mca_sfmode == MCAST_INCLUDE) return true; /* don't include if this source is excluded * in all filters */ if (psf->sf_count[MCAST_INCLUDE]) return type == MLD2_MODE_IS_INCLUDE; return pmc->mca_sfcount[MCAST_EXCLUDE] == psf->sf_count[MCAST_EXCLUDE]; } return false; case MLD2_CHANGE_TO_INCLUDE: if (gdeleted || sdeleted) return false; return psf->sf_count[MCAST_INCLUDE] != 0; case MLD2_CHANGE_TO_EXCLUDE: if (gdeleted || sdeleted) return false; if (pmc->mca_sfcount[MCAST_EXCLUDE] == 0 || psf->sf_count[MCAST_INCLUDE]) return false; return pmc->mca_sfcount[MCAST_EXCLUDE] == psf->sf_count[MCAST_EXCLUDE]; case MLD2_ALLOW_NEW_SOURCES: if (gdeleted || !psf->sf_crcount) return false; return (pmc->mca_sfmode == MCAST_INCLUDE) ^ sdeleted; case MLD2_BLOCK_OLD_SOURCES: if (pmc->mca_sfmode == MCAST_INCLUDE) return gdeleted || (psf->sf_crcount && sdeleted); return psf->sf_crcount && !gdeleted && !sdeleted; } return false; } static int mld_scount(struct ifmcaddr6 *pmc, int type, int gdeleted, int sdeleted) { struct ip6_sf_list *psf; int scount = 0; for_each_psf_mclock(pmc, psf) { if (!is_in(pmc, psf, type, gdeleted, sdeleted)) continue; scount++; } return scount; } static void ip6_mc_hdr(const struct sock *sk, struct sk_buff *skb, struct net_device *dev, const struct in6_addr *saddr, const struct in6_addr *daddr, int proto, int len) { struct ipv6hdr *hdr; skb->protocol = htons(ETH_P_IPV6); skb->dev = dev; skb_reset_network_header(skb); skb_put(skb, sizeof(struct ipv6hdr)); hdr = ipv6_hdr(skb); ip6_flow_hdr(hdr, 0, 0); hdr->payload_len = htons(len); hdr->nexthdr = proto; hdr->hop_limit = READ_ONCE(inet6_sk(sk)->hop_limit); hdr->saddr = *saddr; hdr->daddr = *daddr; } static struct sk_buff *mld_newpack(struct inet6_dev *idev, unsigned int mtu) { u8 ra[8] = { IPPROTO_ICMPV6, 0, IPV6_TLV_ROUTERALERT, 2, 0, 0, IPV6_TLV_PADN, 0 }; struct net_device *dev = idev->dev; int hlen = LL_RESERVED_SPACE(dev); int tlen = dev->needed_tailroom; const struct in6_addr *saddr; struct in6_addr addr_buf; struct mld2_report *pmr; struct sk_buff *skb; unsigned int size; struct sock *sk; struct net *net; /* we assume size > sizeof(ra) here * Also try to not allocate high-order pages for big MTU */ size = min_t(int, mtu, PAGE_SIZE / 2) + hlen + tlen; skb = alloc_skb(size, GFP_KERNEL); if (!skb) return NULL; skb->priority = TC_PRIO_CONTROL; skb_reserve(skb, hlen); skb_tailroom_reserve(skb, mtu, tlen); rcu_read_lock(); net = dev_net_rcu(dev); sk = net->ipv6.igmp_sk; skb_set_owner_w(skb, sk); if (ipv6_get_lladdr(dev, &addr_buf, IFA_F_TENTATIVE)) { /* <draft-ietf-magma-mld-source-05.txt>: * use unspecified address as the source address * when a valid link-local address is not available. */ saddr = &in6addr_any; } else saddr = &addr_buf; ip6_mc_hdr(sk, skb, dev, saddr, &mld2_all_mcr, NEXTHDR_HOP, 0); rcu_read_unlock(); skb_put_data(skb, ra, sizeof(ra)); skb_set_transport_header(skb, skb_tail_pointer(skb) - skb->data); skb_put(skb, sizeof(*pmr)); pmr = (struct mld2_report *)skb_transport_header(skb); pmr->mld2r_type = ICMPV6_MLD2_REPORT; pmr->mld2r_resv1 = 0; pmr->mld2r_cksum = 0; pmr->mld2r_resv2 = 0; pmr->mld2r_ngrec = 0; return skb; } static void mld_sendpack(struct sk_buff *skb) { struct ipv6hdr *pip6 = ipv6_hdr(skb); struct mld2_report *pmr = (struct mld2_report *)skb_transport_header(skb); int payload_len, mldlen; struct inet6_dev *idev; struct net *net = dev_net(skb->dev); int err; struct flowi6 fl6; struct dst_entry *dst; rcu_read_lock(); idev = __in6_dev_get(skb->dev); IP6_INC_STATS(net, idev, IPSTATS_MIB_OUTREQUESTS); payload_len = (skb_tail_pointer(skb) - skb_network_header(skb)) - sizeof(*pip6); mldlen = skb_tail_pointer(skb) - skb_transport_header(skb); pip6->payload_len = htons(payload_len); pmr->mld2r_cksum = csum_ipv6_magic(&pip6->saddr, &pip6->daddr, mldlen, IPPROTO_ICMPV6, csum_partial(skb_transport_header(skb), mldlen, 0)); icmpv6_flow_init(net->ipv6.igmp_sk, &fl6, ICMPV6_MLD2_REPORT, &ipv6_hdr(skb)->saddr, &ipv6_hdr(skb)->daddr, skb->dev->ifindex); dst = icmp6_dst_alloc(skb->dev, &fl6); err = 0; if (IS_ERR(dst)) { err = PTR_ERR(dst); dst = NULL; } skb_dst_set(skb, dst); if (err) goto err_out; err = NF_HOOK(NFPROTO_IPV6, NF_INET_LOCAL_OUT, net, net->ipv6.igmp_sk, skb, NULL, skb->dev, dst_output); out: if (!err) { ICMP6MSGOUT_INC_STATS(net, idev, ICMPV6_MLD2_REPORT); ICMP6_INC_STATS(net, idev, ICMP6_MIB_OUTMSGS); } else { IP6_INC_STATS(net, idev, IPSTATS_MIB_OUTDISCARDS); } rcu_read_unlock(); return; err_out: kfree_skb(skb); goto out; } static int grec_size(struct ifmcaddr6 *pmc, int type, int gdel, int sdel) { return sizeof(struct mld2_grec) + 16 * mld_scount(pmc,type,gdel,sdel); } static struct sk_buff *add_grhead(struct sk_buff *skb, struct ifmcaddr6 *pmc, int type, struct mld2_grec **ppgr, unsigned int mtu) { struct mld2_report *pmr; struct mld2_grec *pgr; if (!skb) { skb = mld_newpack(pmc->idev, mtu); if (!skb) return NULL; } pgr = skb_put(skb, sizeof(struct mld2_grec)); pgr->grec_type = type; pgr->grec_auxwords = 0; pgr->grec_nsrcs = 0; pgr->grec_mca = pmc->mca_addr; /* structure copy */ pmr = (struct mld2_report *)skb_transport_header(skb); pmr->mld2r_ngrec = htons(ntohs(pmr->mld2r_ngrec)+1); *ppgr = pgr; return skb; } #define AVAILABLE(skb) ((skb) ? skb_availroom(skb) : 0) /* called with mc_lock */ static struct sk_buff *add_grec(struct sk_buff *skb, struct ifmcaddr6 *pmc, int type, int gdeleted, int sdeleted, int crsend) { struct ip6_sf_list *psf, *psf_prev, *psf_next; int scount, stotal, first, isquery, truncate; struct ip6_sf_list __rcu **psf_list; struct inet6_dev *idev = pmc->idev; struct net_device *dev = idev->dev; struct mld2_grec *pgr = NULL; struct mld2_report *pmr; unsigned int mtu; if (pmc->mca_flags & MAF_NOREPORT) return skb; mtu = READ_ONCE(dev->mtu); if (mtu < IPV6_MIN_MTU) return skb; isquery = type == MLD2_MODE_IS_INCLUDE || type == MLD2_MODE_IS_EXCLUDE; truncate = type == MLD2_MODE_IS_EXCLUDE || type == MLD2_CHANGE_TO_EXCLUDE; stotal = scount = 0; psf_list = sdeleted ? &pmc->mca_tomb : &pmc->mca_sources; if (!rcu_access_pointer(*psf_list)) goto empty_source; pmr = skb ? (struct mld2_report *)skb_transport_header(skb) : NULL; /* EX and TO_EX get a fresh packet, if needed */ if (truncate) { if (pmr && pmr->mld2r_ngrec && AVAILABLE(skb) < grec_size(pmc, type, gdeleted, sdeleted)) { if (skb) mld_sendpack(skb); skb = mld_newpack(idev, mtu); } } first = 1; psf_prev = NULL; for (psf = mc_dereference(*psf_list, idev); psf; psf = psf_next) { struct in6_addr *psrc; psf_next = mc_dereference(psf->sf_next, idev); if (!is_in(pmc, psf, type, gdeleted, sdeleted) && !crsend) { psf_prev = psf; continue; } /* Based on RFC3810 6.1. Should not send source-list change * records when there is a filter mode change. */ if (((gdeleted && pmc->mca_sfmode == MCAST_EXCLUDE) || (!gdeleted && pmc->mca_crcount)) && (type == MLD2_ALLOW_NEW_SOURCES || type == MLD2_BLOCK_OLD_SOURCES) && psf->sf_crcount) goto decrease_sf_crcount; /* clear marks on query responses */ if (isquery) psf->sf_gsresp = 0; if (AVAILABLE(skb) < sizeof(*psrc) + first*sizeof(struct mld2_grec)) { if (truncate && !first) break; /* truncate these */ if (pgr) pgr->grec_nsrcs = htons(scount); if (skb) mld_sendpack(skb); skb = mld_newpack(idev, mtu); first = 1; scount = 0; } if (first) { skb = add_grhead(skb, pmc, type, &pgr, mtu); first = 0; } if (!skb) return NULL; psrc = skb_put(skb, sizeof(*psrc)); *psrc = psf->sf_addr; scount++; stotal++; if ((type == MLD2_ALLOW_NEW_SOURCES || type == MLD2_BLOCK_OLD_SOURCES) && psf->sf_crcount) { decrease_sf_crcount: psf->sf_crcount--; if ((sdeleted || gdeleted) && psf->sf_crcount == 0) { if (psf_prev) rcu_assign_pointer(psf_prev->sf_next, mc_dereference(psf->sf_next, idev)); else rcu_assign_pointer(*psf_list, mc_dereference(psf->sf_next, idev)); kfree_rcu(psf, rcu); continue; } } psf_prev = psf; } empty_source: if (!stotal) { if (type == MLD2_ALLOW_NEW_SOURCES || type == MLD2_BLOCK_OLD_SOURCES) return skb; if (pmc->mca_crcount || isquery || crsend) { /* make sure we have room for group header */ if (skb && AVAILABLE(skb) < sizeof(struct mld2_grec)) { mld_sendpack(skb); skb = NULL; /* add_grhead will get a new one */ } skb = add_grhead(skb, pmc, type, &pgr, mtu); } } if (pgr) pgr->grec_nsrcs = htons(scount); if (isquery) pmc->mca_flags &= ~MAF_GSQUERY; /* clear query state */ return skb; } /* called with mc_lock */ static void mld_send_report(struct inet6_dev *idev, struct ifmcaddr6 *pmc) { struct sk_buff *skb = NULL; int type; if (!pmc) { for_each_mc_mclock(idev, pmc) { if (pmc->mca_flags & MAF_NOREPORT) continue; if (pmc->mca_sfcount[MCAST_EXCLUDE]) type = MLD2_MODE_IS_EXCLUDE; else type = MLD2_MODE_IS_INCLUDE; skb = add_grec(skb, pmc, type, 0, 0, 0); } } else { if (pmc->mca_sfcount[MCAST_EXCLUDE]) type = MLD2_MODE_IS_EXCLUDE; else type = MLD2_MODE_IS_INCLUDE; skb = add_grec(skb, pmc, type, 0, 0, 0); } if (skb) mld_sendpack(skb); } /* * remove zero-count source records from a source filter list * called with mc_lock */ static void mld_clear_zeros(struct ip6_sf_list __rcu **ppsf, struct inet6_dev *idev) { struct ip6_sf_list *psf_prev, *psf_next, *psf; psf_prev = NULL; for (psf = mc_dereference(*ppsf, idev); psf; psf = psf_next) { psf_next = mc_dereference(psf->sf_next, idev); if (psf->sf_crcount == 0) { if (psf_prev) rcu_assign_pointer(psf_prev->sf_next, mc_dereference(psf->sf_next, idev)); else rcu_assign_pointer(*ppsf, mc_dereference(psf->sf_next, idev)); kfree_rcu(psf, rcu); } else { psf_prev = psf; } } } /* called with mc_lock */ static void mld_send_cr(struct inet6_dev *idev) { struct ifmcaddr6 *pmc, *pmc_prev, *pmc_next; struct sk_buff *skb = NULL; int type, dtype; /* deleted MCA's */ pmc_prev = NULL; for (pmc = mc_dereference(idev->mc_tomb, idev); pmc; pmc = pmc_next) { pmc_next = mc_dereference(pmc->next, idev); if (pmc->mca_sfmode == MCAST_INCLUDE) { type = MLD2_BLOCK_OLD_SOURCES; dtype = MLD2_BLOCK_OLD_SOURCES; skb = add_grec(skb, pmc, type, 1, 0, 0); skb = add_grec(skb, pmc, dtype, 1, 1, 0); } if (pmc->mca_crcount) { if (pmc->mca_sfmode == MCAST_EXCLUDE) { type = MLD2_CHANGE_TO_INCLUDE; skb = add_grec(skb, pmc, type, 1, 0, 0); } pmc->mca_crcount--; if (pmc->mca_crcount == 0) { mld_clear_zeros(&pmc->mca_tomb, idev); mld_clear_zeros(&pmc->mca_sources, idev); } } if (pmc->mca_crcount == 0 && !rcu_access_pointer(pmc->mca_tomb) && !rcu_access_pointer(pmc->mca_sources)) { if (pmc_prev) rcu_assign_pointer(pmc_prev->next, pmc_next); else rcu_assign_pointer(idev->mc_tomb, pmc_next); in6_dev_put(pmc->idev); kfree_rcu(pmc, rcu); } else pmc_prev = pmc; } /* change recs */ for_each_mc_mclock(idev, pmc) { if (pmc->mca_sfcount[MCAST_EXCLUDE]) { type = MLD2_BLOCK_OLD_SOURCES; dtype = MLD2_ALLOW_NEW_SOURCES; } else { type = MLD2_ALLOW_NEW_SOURCES; dtype = MLD2_BLOCK_OLD_SOURCES; } skb = add_grec(skb, pmc, type, 0, 0, 0); skb = add_grec(skb, pmc, dtype, 0, 1, 0); /* deleted sources */ /* filter mode changes */ if (pmc->mca_crcount) { if (pmc->mca_sfmode == MCAST_EXCLUDE) type = MLD2_CHANGE_TO_EXCLUDE; else type = MLD2_CHANGE_TO_INCLUDE; skb = add_grec(skb, pmc, type, 0, 0, 0); pmc->mca_crcount--; } } if (!skb) return; (void) mld_sendpack(skb); } static void igmp6_send(struct in6_addr *addr, struct net_device *dev, int type) { const struct in6_addr *snd_addr, *saddr; int err, len, payload_len, full_len; struct in6_addr addr_buf; struct inet6_dev *idev; struct sk_buff *skb; struct mld_msg *hdr; int hlen = LL_RESERVED_SPACE(dev); int tlen = dev->needed_tailroom; u8 ra[8] = { IPPROTO_ICMPV6, 0, IPV6_TLV_ROUTERALERT, 2, 0, 0, IPV6_TLV_PADN, 0 }; struct dst_entry *dst; struct flowi6 fl6; struct net *net; struct sock *sk; if (type == ICMPV6_MGM_REDUCTION) snd_addr = &in6addr_linklocal_allrouters; else snd_addr = addr; len = sizeof(struct icmp6hdr) + sizeof(struct in6_addr); payload_len = len + sizeof(ra); full_len = sizeof(struct ipv6hdr) + payload_len; skb = alloc_skb(hlen + tlen + full_len, GFP_KERNEL); rcu_read_lock(); net = dev_net_rcu(dev); idev = __in6_dev_get(dev); IP6_INC_STATS(net, idev, IPSTATS_MIB_OUTREQUESTS); if (!skb) { IP6_INC_STATS(net, idev, IPSTATS_MIB_OUTDISCARDS); rcu_read_unlock(); return; } sk = net->ipv6.igmp_sk; skb_set_owner_w(skb, sk); skb->priority = TC_PRIO_CONTROL; skb_reserve(skb, hlen); if (ipv6_get_lladdr(dev, &addr_buf, IFA_F_TENTATIVE)) { /* <draft-ietf-magma-mld-source-05.txt>: * use unspecified address as the source address * when a valid link-local address is not available. */ saddr = &in6addr_any; } else saddr = &addr_buf; ip6_mc_hdr(sk, skb, dev, saddr, snd_addr, NEXTHDR_HOP, payload_len); skb_put_data(skb, ra, sizeof(ra)); hdr = skb_put_zero(skb, sizeof(struct mld_msg)); hdr->mld_type = type; hdr->mld_mca = *addr; hdr->mld_cksum = csum_ipv6_magic(saddr, snd_addr, len, IPPROTO_ICMPV6, csum_partial(hdr, len, 0)); icmpv6_flow_init(sk, &fl6, type, &ipv6_hdr(skb)->saddr, &ipv6_hdr(skb)->daddr, skb->dev->ifindex); dst = icmp6_dst_alloc(skb->dev, &fl6); if (IS_ERR(dst)) { err = PTR_ERR(dst); goto err_out; } skb_dst_set(skb, dst); err = NF_HOOK(NFPROTO_IPV6, NF_INET_LOCAL_OUT, net, sk, skb, NULL, skb->dev, dst_output); out: if (!err) { ICMP6MSGOUT_INC_STATS(net, idev, type); ICMP6_INC_STATS(net, idev, ICMP6_MIB_OUTMSGS); } else IP6_INC_STATS(net, idev, IPSTATS_MIB_OUTDISCARDS); rcu_read_unlock(); return; err_out: kfree_skb(skb); goto out; } /* called with mc_lock */ static void mld_send_initial_cr(struct inet6_dev *idev) { struct sk_buff *skb; struct ifmcaddr6 *pmc; int type; if (mld_in_v1_mode(idev)) return; skb = NULL; for_each_mc_mclock(idev, pmc) { if (pmc->mca_sfcount[MCAST_EXCLUDE]) type = MLD2_CHANGE_TO_EXCLUDE; else type = MLD2_ALLOW_NEW_SOURCES; skb = add_grec(skb, pmc, type, 0, 0, 1); } if (skb) mld_sendpack(skb); } void ipv6_mc_dad_complete(struct inet6_dev *idev) { mutex_lock(&idev->mc_lock); idev->mc_dad_count = idev->mc_qrv; if (idev->mc_dad_count) { mld_send_initial_cr(idev); idev->mc_dad_count--; if (idev->mc_dad_count) mld_dad_start_work(idev, unsolicited_report_interval(idev)); } mutex_unlock(&idev->mc_lock); } static void mld_dad_work(struct work_struct *work) { struct inet6_dev *idev = container_of(to_delayed_work(work), struct inet6_dev, mc_dad_work); mutex_lock(&idev->mc_lock); mld_send_initial_cr(idev); if (idev->mc_dad_count) { idev->mc_dad_count--; if (idev->mc_dad_count) mld_dad_start_work(idev, unsolicited_report_interval(idev)); } mutex_unlock(&idev->mc_lock); in6_dev_put(idev); } /* called with mc_lock */ static int ip6_mc_del1_src(struct ifmcaddr6 *pmc, int sfmode, const struct in6_addr *psfsrc) { struct ip6_sf_list *psf, *psf_prev; int rv = 0; psf_prev = NULL; for_each_psf_mclock(pmc, psf) { if (ipv6_addr_equal(&psf->sf_addr, psfsrc)) break; psf_prev = psf; } if (!psf || psf->sf_count[sfmode] == 0) { /* source filter not found, or count wrong => bug */ return -ESRCH; } WRITE_ONCE(psf->sf_count[sfmode], psf->sf_count[sfmode] - 1); if (!psf->sf_count[MCAST_INCLUDE] && !psf->sf_count[MCAST_EXCLUDE]) { struct inet6_dev *idev = pmc->idev; /* no more filters for this source */ if (psf_prev) rcu_assign_pointer(psf_prev->sf_next, mc_dereference(psf->sf_next, idev)); else rcu_assign_pointer(pmc->mca_sources, mc_dereference(psf->sf_next, idev)); if (psf->sf_oldin && !(pmc->mca_flags & MAF_NOREPORT) && !mld_in_v1_mode(idev)) { psf->sf_crcount = idev->mc_qrv; rcu_assign_pointer(psf->sf_next, mc_dereference(pmc->mca_tomb, idev)); rcu_assign_pointer(pmc->mca_tomb, psf); rv = 1; } else { kfree_rcu(psf, rcu); } } return rv; } /* called with mc_lock */ static int ip6_mc_del_src(struct inet6_dev *idev, const struct in6_addr *pmca, int sfmode, int sfcount, const struct in6_addr *psfsrc, int delta) { struct ifmcaddr6 *pmc; int changerec = 0; int i, err; if (!idev) return -ENODEV; for_each_mc_mclock(idev, pmc) { if (ipv6_addr_equal(pmca, &pmc->mca_addr)) break; } if (!pmc) return -ESRCH; sf_markstate(pmc); if (!delta) { if (!pmc->mca_sfcount[sfmode]) return -EINVAL; pmc->mca_sfcount[sfmode]--; } err = 0; for (i = 0; i < sfcount; i++) { int rv = ip6_mc_del1_src(pmc, sfmode, &psfsrc[i]); changerec |= rv > 0; if (!err && rv < 0) err = rv; } if (pmc->mca_sfmode == MCAST_EXCLUDE && pmc->mca_sfcount[MCAST_EXCLUDE] == 0 && pmc->mca_sfcount[MCAST_INCLUDE]) { struct ip6_sf_list *psf; /* filter mode change */ pmc->mca_sfmode = MCAST_INCLUDE; pmc->mca_crcount = idev->mc_qrv; idev->mc_ifc_count = pmc->mca_crcount; for_each_psf_mclock(pmc, psf) psf->sf_crcount = 0; mld_ifc_event(pmc->idev); } else if (sf_setstate(pmc) || changerec) { mld_ifc_event(pmc->idev); } return err; } /* * Add multicast single-source filter to the interface list * called with mc_lock */ static int ip6_mc_add1_src(struct ifmcaddr6 *pmc, int sfmode, const struct in6_addr *psfsrc) { struct ip6_sf_list *psf, *psf_prev; psf_prev = NULL; for_each_psf_mclock(pmc, psf) { if (ipv6_addr_equal(&psf->sf_addr, psfsrc)) break; psf_prev = psf; } if (!psf) { psf = kzalloc(sizeof(*psf), GFP_KERNEL); if (!psf) return -ENOBUFS; psf->sf_addr = *psfsrc; if (psf_prev) { rcu_assign_pointer(psf_prev->sf_next, psf); } else { rcu_assign_pointer(pmc->mca_sources, psf); } } WRITE_ONCE(psf->sf_count[sfmode], psf->sf_count[sfmode] + 1); return 0; } /* called with mc_lock */ static void sf_markstate(struct ifmcaddr6 *pmc) { struct ip6_sf_list *psf; int mca_xcount = pmc->mca_sfcount[MCAST_EXCLUDE]; for_each_psf_mclock(pmc, psf) { if (pmc->mca_sfcount[MCAST_EXCLUDE]) { psf->sf_oldin = mca_xcount == psf->sf_count[MCAST_EXCLUDE] && !psf->sf_count[MCAST_INCLUDE]; } else { psf->sf_oldin = psf->sf_count[MCAST_INCLUDE] != 0; } } } /* called with mc_lock */ static int sf_setstate(struct ifmcaddr6 *pmc) { struct ip6_sf_list *psf, *dpsf; int mca_xcount = pmc->mca_sfcount[MCAST_EXCLUDE]; int qrv = pmc->idev->mc_qrv; int new_in, rv; rv = 0; for_each_psf_mclock(pmc, psf) { if (pmc->mca_sfcount[MCAST_EXCLUDE]) { new_in = mca_xcount == psf->sf_count[MCAST_EXCLUDE] && !psf->sf_count[MCAST_INCLUDE]; } else new_in = psf->sf_count[MCAST_INCLUDE] != 0; if (new_in) { if (!psf->sf_oldin) { struct ip6_sf_list *prev = NULL; for_each_psf_tomb(pmc, dpsf) { if (ipv6_addr_equal(&dpsf->sf_addr, &psf->sf_addr)) break; prev = dpsf; } if (dpsf) { if (prev) rcu_assign_pointer(prev->sf_next, mc_dereference(dpsf->sf_next, pmc->idev)); else rcu_assign_pointer(pmc->mca_tomb, mc_dereference(dpsf->sf_next, pmc->idev)); kfree_rcu(dpsf, rcu); } psf->sf_crcount = qrv; rv++; } } else if (psf->sf_oldin) { psf->sf_crcount = 0; /* * add or update "delete" records if an active filter * is now inactive */ for_each_psf_tomb(pmc, dpsf) if (ipv6_addr_equal(&dpsf->sf_addr, &psf->sf_addr)) break; if (!dpsf) { dpsf = kmalloc(sizeof(*dpsf), GFP_KERNEL); if (!dpsf) continue; *dpsf = *psf; rcu_assign_pointer(dpsf->sf_next, mc_dereference(pmc->mca_tomb, pmc->idev)); rcu_assign_pointer(pmc->mca_tomb, dpsf); } dpsf->sf_crcount = qrv; rv++; } } return rv; } /* * Add multicast source filter list to the interface list * called with mc_lock */ static int ip6_mc_add_src(struct inet6_dev *idev, const struct in6_addr *pmca, int sfmode, int sfcount, const struct in6_addr *psfsrc, int delta) { struct ifmcaddr6 *pmc; int isexclude; int i, err; if (!idev) return -ENODEV; for_each_mc_mclock(idev, pmc) { if (ipv6_addr_equal(pmca, &pmc->mca_addr)) break; } if (!pmc) return -ESRCH; sf_markstate(pmc); isexclude = pmc->mca_sfmode == MCAST_EXCLUDE; if (!delta) WRITE_ONCE(pmc->mca_sfcount[sfmode], pmc->mca_sfcount[sfmode] + 1); err = 0; for (i = 0; i < sfcount; i++) { err = ip6_mc_add1_src(pmc, sfmode, &psfsrc[i]); if (err) break; } if (err) { int j; if (!delta) WRITE_ONCE(pmc->mca_sfcount[sfmode], pmc->mca_sfcount[sfmode] - 1); for (j = 0; j < i; j++) ip6_mc_del1_src(pmc, sfmode, &psfsrc[j]); } else if (isexclude != (pmc->mca_sfcount[MCAST_EXCLUDE] != 0)) { struct ip6_sf_list *psf; /* filter mode change */ if (pmc->mca_sfcount[MCAST_EXCLUDE]) pmc->mca_sfmode = MCAST_EXCLUDE; else if (pmc->mca_sfcount[MCAST_INCLUDE]) pmc->mca_sfmode = MCAST_INCLUDE; /* else no filters; keep old mode for reports */ pmc->mca_crcount = idev->mc_qrv; idev->mc_ifc_count = pmc->mca_crcount; for_each_psf_mclock(pmc, psf) psf->sf_crcount = 0; mld_ifc_event(idev); } else if (sf_setstate(pmc)) { mld_ifc_event(idev); } return err; } /* called with mc_lock */ static void ip6_mc_clear_src(struct ifmcaddr6 *pmc) { struct ip6_sf_list *psf, *nextpsf; for (psf = mc_dereference(pmc->mca_tomb, pmc->idev); psf; psf = nextpsf) { nextpsf = mc_dereference(psf->sf_next, pmc->idev); kfree_rcu(psf, rcu); } RCU_INIT_POINTER(pmc->mca_tomb, NULL); for (psf = mc_dereference(pmc->mca_sources, pmc->idev); psf; psf = nextpsf) { nextpsf = mc_dereference(psf->sf_next, pmc->idev); kfree_rcu(psf, rcu); } RCU_INIT_POINTER(pmc->mca_sources, NULL); pmc->mca_sfmode = MCAST_EXCLUDE; pmc->mca_sfcount[MCAST_INCLUDE] = 0; /* Paired with the READ_ONCE() from ipv6_chk_mcast_addr() */ WRITE_ONCE(pmc->mca_sfcount[MCAST_EXCLUDE], 1); } /* called with mc_lock */ static void igmp6_join_group(struct ifmcaddr6 *ma) { unsigned long delay; if (ma->mca_flags & MAF_NOREPORT) return; igmp6_send(&ma->mca_addr, ma->idev->dev, ICMPV6_MGM_REPORT); delay = get_random_u32_below(unsolicited_report_interval(ma->idev)); if (cancel_delayed_work(&ma->mca_work)) { refcount_dec(&ma->mca_refcnt); delay = ma->mca_work.timer.expires - jiffies; } if (!mod_delayed_work(mld_wq, &ma->mca_work, delay)) refcount_inc(&ma->mca_refcnt); ma->mca_flags |= MAF_TIMER_RUNNING | MAF_LAST_REPORTER; } static int ip6_mc_leave_src(struct sock *sk, struct ipv6_mc_socklist *iml, struct inet6_dev *idev) { struct ip6_sf_socklist *psl; int err; psl = sock_dereference(iml->sflist, sk); if (idev) mutex_lock(&idev->mc_lock); if (!psl) { /* any-source empty exclude case */ err = ip6_mc_del_src(idev, &iml->addr, iml->sfmode, 0, NULL, 0); } else { err = ip6_mc_del_src(idev, &iml->addr, iml->sfmode, psl->sl_count, psl->sl_addr, 0); RCU_INIT_POINTER(iml->sflist, NULL); atomic_sub(struct_size(psl, sl_addr, psl->sl_max), &sk->sk_omem_alloc); kfree_rcu(psl, rcu); } if (idev) mutex_unlock(&idev->mc_lock); return err; } /* called with mc_lock */ static void igmp6_leave_group(struct ifmcaddr6 *ma) { if (mld_in_v1_mode(ma->idev)) { if (ma->mca_flags & MAF_LAST_REPORTER) { igmp6_send(&ma->mca_addr, ma->idev->dev, ICMPV6_MGM_REDUCTION); } } else { mld_add_delrec(ma->idev, ma); mld_ifc_event(ma->idev); } } static void mld_gq_work(struct work_struct *work) { struct inet6_dev *idev = container_of(to_delayed_work(work), struct inet6_dev, mc_gq_work); mutex_lock(&idev->mc_lock); mld_send_report(idev, NULL); idev->mc_gq_running = 0; mutex_unlock(&idev->mc_lock); in6_dev_put(idev); } static void mld_ifc_work(struct work_struct *work) { struct inet6_dev *idev = container_of(to_delayed_work(work), struct inet6_dev, mc_ifc_work); mutex_lock(&idev->mc_lock); mld_send_cr(idev); if (idev->mc_ifc_count) { idev->mc_ifc_count--; if (idev->mc_ifc_count) mld_ifc_start_work(idev, unsolicited_report_interval(idev)); } mutex_unlock(&idev->mc_lock); in6_dev_put(idev); } /* called with mc_lock */ static void mld_ifc_event(struct inet6_dev *idev) { if (mld_in_v1_mode(idev)) return; idev->mc_ifc_count = idev->mc_qrv; mld_ifc_start_work(idev, 1); } static void mld_mca_work(struct work_struct *work) { struct ifmcaddr6 *ma = container_of(to_delayed_work(work), struct ifmcaddr6, mca_work); mutex_lock(&ma->idev->mc_lock); if (mld_in_v1_mode(ma->idev)) igmp6_send(&ma->mca_addr, ma->idev->dev, ICMPV6_MGM_REPORT); else mld_send_report(ma->idev, ma); ma->mca_flags |= MAF_LAST_REPORTER; ma->mca_flags &= ~MAF_TIMER_RUNNING; mutex_unlock(&ma->idev->mc_lock); ma_put(ma); } /* Device changing type */ void ipv6_mc_unmap(struct inet6_dev *idev) { struct ifmcaddr6 *i; /* Install multicast list, except for all-nodes (already installed) */ mutex_lock(&idev->mc_lock); for_each_mc_mclock(idev, i) igmp6_group_dropped(i); mutex_unlock(&idev->mc_lock); } void ipv6_mc_remap(struct inet6_dev *idev) { ipv6_mc_up(idev); } /* Device going down */ void ipv6_mc_down(struct inet6_dev *idev) { struct ifmcaddr6 *i; mutex_lock(&idev->mc_lock); /* Withdraw multicast list */ for_each_mc_mclock(idev, i) igmp6_group_dropped(i); mutex_unlock(&idev->mc_lock); /* Should stop work after group drop. or we will * start work again in mld_ifc_event() */ mld_query_stop_work(idev); mld_report_stop_work(idev); mutex_lock(&idev->mc_lock); mld_ifc_stop_work(idev); mld_gq_stop_work(idev); mutex_unlock(&idev->mc_lock); mld_dad_stop_work(idev); } static void ipv6_mc_reset(struct inet6_dev *idev) { idev->mc_qrv = sysctl_mld_qrv; idev->mc_qi = MLD_QI_DEFAULT; idev->mc_qri = MLD_QRI_DEFAULT; idev->mc_v1_seen = 0; idev->mc_maxdelay = unsolicited_report_interval(idev); } /* Device going up */ void ipv6_mc_up(struct inet6_dev *idev) { struct ifmcaddr6 *i; /* Install multicast list, except for all-nodes (already installed) */ ipv6_mc_reset(idev); mutex_lock(&idev->mc_lock); for_each_mc_mclock(idev, i) { mld_del_delrec(idev, i); igmp6_group_added(i); } mutex_unlock(&idev->mc_lock); } /* IPv6 device initialization. */ void ipv6_mc_init_dev(struct inet6_dev *idev) { idev->mc_gq_running = 0; INIT_DELAYED_WORK(&idev->mc_gq_work, mld_gq_work); RCU_INIT_POINTER(idev->mc_tomb, NULL); idev->mc_ifc_count = 0; INIT_DELAYED_WORK(&idev->mc_ifc_work, mld_ifc_work); INIT_DELAYED_WORK(&idev->mc_dad_work, mld_dad_work); INIT_DELAYED_WORK(&idev->mc_query_work, mld_query_work); INIT_DELAYED_WORK(&idev->mc_report_work, mld_report_work); skb_queue_head_init(&idev->mc_query_queue); skb_queue_head_init(&idev->mc_report_queue); spin_lock_init(&idev->mc_query_lock); spin_lock_init(&idev->mc_report_lock); mutex_init(&idev->mc_lock); ipv6_mc_reset(idev); } /* * Device is about to be destroyed: clean up. */ void ipv6_mc_destroy_dev(struct inet6_dev *idev) { struct ifmcaddr6 *i; /* Deactivate works */ ipv6_mc_down(idev); mutex_lock(&idev->mc_lock); mld_clear_delrec(idev); mutex_unlock(&idev->mc_lock); mld_clear_query(idev); mld_clear_report(idev); /* Delete all-nodes address. */ /* We cannot call ipv6_dev_mc_dec() directly, our caller in * addrconf.c has NULL'd out dev->ip6_ptr so in6_dev_get() will * fail. */ __ipv6_dev_mc_dec(idev, &in6addr_linklocal_allnodes); if (idev->cnf.forwarding) __ipv6_dev_mc_dec(idev, &in6addr_linklocal_allrouters); mutex_lock(&idev->mc_lock); while ((i = mc_dereference(idev->mc_list, idev))) { rcu_assign_pointer(idev->mc_list, mc_dereference(i->next, idev)); ip6_mc_clear_src(i); ma_put(i); } mutex_unlock(&idev->mc_lock); } static void ipv6_mc_rejoin_groups(struct inet6_dev *idev) { struct ifmcaddr6 *pmc; ASSERT_RTNL(); mutex_lock(&idev->mc_lock); if (mld_in_v1_mode(idev)) { for_each_mc_mclock(idev, pmc) igmp6_join_group(pmc); } else { mld_send_report(idev, NULL); } mutex_unlock(&idev->mc_lock); } static int ipv6_mc_netdev_event(struct notifier_block *this, unsigned long event, void *ptr) { struct net_device *dev = netdev_notifier_info_to_dev(ptr); struct inet6_dev *idev = __in6_dev_get(dev); switch (event) { case NETDEV_RESEND_IGMP: if (idev) ipv6_mc_rejoin_groups(idev); break; default: break; } return NOTIFY_DONE; } static struct notifier_block igmp6_netdev_notifier = { .notifier_call = ipv6_mc_netdev_event, }; #ifdef CONFIG_PROC_FS struct igmp6_mc_iter_state { struct seq_net_private p; struct net_device *dev; struct inet6_dev *idev; }; #define igmp6_mc_seq_private(seq) ((struct igmp6_mc_iter_state *)(seq)->private) static inline struct ifmcaddr6 *igmp6_mc_get_first(struct seq_file *seq) { struct ifmcaddr6 *im = NULL; struct igmp6_mc_iter_state *state = igmp6_mc_seq_private(seq); struct net *net = seq_file_net(seq); state->idev = NULL; for_each_netdev_rcu(net, state->dev) { struct inet6_dev *idev; idev = __in6_dev_get(state->dev); if (!idev) continue; im = rcu_dereference(idev->mc_list); if (im) { state->idev = idev; break; } } return im; } static struct ifmcaddr6 *igmp6_mc_get_next(struct seq_file *seq, struct ifmcaddr6 *im) { struct igmp6_mc_iter_state *state = igmp6_mc_seq_private(seq); im = rcu_dereference(im->next); while (!im) { state->dev = next_net_device_rcu(state->dev); if (!state->dev) { state->idev = NULL; break; } state->idev = __in6_dev_get(state->dev); if (!state->idev) continue; im = rcu_dereference(state->idev->mc_list); } return im; } static struct ifmcaddr6 *igmp6_mc_get_idx(struct seq_file *seq, loff_t pos) { struct ifmcaddr6 *im = igmp6_mc_get_first(seq); if (im) while (pos && (im = igmp6_mc_get_next(seq, im)) != NULL) --pos; return pos ? NULL : im; } static void *igmp6_mc_seq_start(struct seq_file *seq, loff_t *pos) __acquires(RCU) { rcu_read_lock(); return igmp6_mc_get_idx(seq, *pos); } static void *igmp6_mc_seq_next(struct seq_file *seq, void *v, loff_t *pos) { struct ifmcaddr6 *im = igmp6_mc_get_next(seq, v); ++*pos; return im; } static void igmp6_mc_seq_stop(struct seq_file *seq, void *v) __releases(RCU) { struct igmp6_mc_iter_state *state = igmp6_mc_seq_private(seq); if (likely(state->idev)) state->idev = NULL; state->dev = NULL; rcu_read_unlock(); } static int igmp6_mc_seq_show(struct seq_file *seq, void *v) { struct ifmcaddr6 *im = (struct ifmcaddr6 *)v; struct igmp6_mc_iter_state *state = igmp6_mc_seq_private(seq); seq_printf(seq, "%-4d %-15s %pi6 %5d %08X %ld\n", state->dev->ifindex, state->dev->name, &im->mca_addr, im->mca_users, im->mca_flags, (im->mca_flags & MAF_TIMER_RUNNING) ? jiffies_to_clock_t(im->mca_work.timer.expires - jiffies) : 0); return 0; } static const struct seq_operations igmp6_mc_seq_ops = { .start = igmp6_mc_seq_start, .next = igmp6_mc_seq_next, .stop = igmp6_mc_seq_stop, .show = igmp6_mc_seq_show, }; struct igmp6_mcf_iter_state { struct seq_net_private p; struct net_device *dev; struct inet6_dev *idev; struct ifmcaddr6 *im; }; #define igmp6_mcf_seq_private(seq) ((struct igmp6_mcf_iter_state *)(seq)->private) static inline struct ip6_sf_list *igmp6_mcf_get_first(struct seq_file *seq) { struct ip6_sf_list *psf = NULL; struct ifmcaddr6 *im = NULL; struct igmp6_mcf_iter_state *state = igmp6_mcf_seq_private(seq); struct net *net = seq_file_net(seq); state->idev = NULL; state->im = NULL; for_each_netdev_rcu(net, state->dev) { struct inet6_dev *idev; idev = __in6_dev_get(state->dev); if (unlikely(idev == NULL)) continue; im = rcu_dereference(idev->mc_list); if (likely(im)) { psf = rcu_dereference(im->mca_sources); if (likely(psf)) { state->im = im; state->idev = idev; break; } } } return psf; } static struct ip6_sf_list *igmp6_mcf_get_next(struct seq_file *seq, struct ip6_sf_list *psf) { struct igmp6_mcf_iter_state *state = igmp6_mcf_seq_private(seq); psf = rcu_dereference(psf->sf_next); while (!psf) { state->im = rcu_dereference(state->im->next); while (!state->im) { state->dev = next_net_device_rcu(state->dev); if (!state->dev) { state->idev = NULL; goto out; } state->idev = __in6_dev_get(state->dev); if (!state->idev) continue; state->im = rcu_dereference(state->idev->mc_list); } psf = rcu_dereference(state->im->mca_sources); } out: return psf; } static struct ip6_sf_list *igmp6_mcf_get_idx(struct seq_file *seq, loff_t pos) { struct ip6_sf_list *psf = igmp6_mcf_get_first(seq); if (psf) while (pos && (psf = igmp6_mcf_get_next(seq, psf)) != NULL) --pos; return pos ? NULL : psf; } static void *igmp6_mcf_seq_start(struct seq_file *seq, loff_t *pos) __acquires(RCU) { rcu_read_lock(); return *pos ? igmp6_mcf_get_idx(seq, *pos - 1) : SEQ_START_TOKEN; } static void *igmp6_mcf_seq_next(struct seq_file *seq, void *v, loff_t *pos) { struct ip6_sf_list *psf; if (v == SEQ_START_TOKEN) psf = igmp6_mcf_get_first(seq); else psf = igmp6_mcf_get_next(seq, v); ++*pos; return psf; } static void igmp6_mcf_seq_stop(struct seq_file *seq, void *v) __releases(RCU) { struct igmp6_mcf_iter_state *state = igmp6_mcf_seq_private(seq); if (likely(state->im)) state->im = NULL; if (likely(state->idev)) state->idev = NULL; state->dev = NULL; rcu_read_unlock(); } static int igmp6_mcf_seq_show(struct seq_file *seq, void *v) { struct ip6_sf_list *psf = (struct ip6_sf_list *)v; struct igmp6_mcf_iter_state *state = igmp6_mcf_seq_private(seq); if (v == SEQ_START_TOKEN) { seq_puts(seq, "Idx Device Multicast Address Source Address INC EXC\n"); } else { seq_printf(seq, "%3d %6.6s %pi6 %pi6 %6lu %6lu\n", state->dev->ifindex, state->dev->name, &state->im->mca_addr, &psf->sf_addr, READ_ONCE(psf->sf_count[MCAST_INCLUDE]), READ_ONCE(psf->sf_count[MCAST_EXCLUDE])); } return 0; } static const struct seq_operations igmp6_mcf_seq_ops = { .start = igmp6_mcf_seq_start, .next = igmp6_mcf_seq_next, .stop = igmp6_mcf_seq_stop, .show = igmp6_mcf_seq_show, }; static int __net_init igmp6_proc_init(struct net *net) { int err; err = -ENOMEM; if (!proc_create_net("igmp6", 0444, net->proc_net, &igmp6_mc_seq_ops, sizeof(struct igmp6_mc_iter_state))) goto out; if (!proc_create_net("mcfilter6", 0444, net->proc_net, &igmp6_mcf_seq_ops, sizeof(struct igmp6_mcf_iter_state))) goto out_proc_net_igmp6; err = 0; out: return err; out_proc_net_igmp6: remove_proc_entry("igmp6", net->proc_net); goto out; } static void __net_exit igmp6_proc_exit(struct net *net) { remove_proc_entry("mcfilter6", net->proc_net); remove_proc_entry("igmp6", net->proc_net); } #else static inline int igmp6_proc_init(struct net *net) { return 0; } static inline void igmp6_proc_exit(struct net *net) { } #endif static int __net_init igmp6_net_init(struct net *net) { int err; err = inet_ctl_sock_create(&net->ipv6.igmp_sk, PF_INET6, SOCK_RAW, IPPROTO_ICMPV6, net); if (err < 0) { pr_err("Failed to initialize the IGMP6 control socket (err %d)\n", err); goto out; } inet6_sk(net->ipv6.igmp_sk)->hop_limit = 1; net->ipv6.igmp_sk->sk_allocation = GFP_KERNEL; err = inet_ctl_sock_create(&net->ipv6.mc_autojoin_sk, PF_INET6, SOCK_RAW, IPPROTO_ICMPV6, net); if (err < 0) { pr_err("Failed to initialize the IGMP6 autojoin socket (err %d)\n", err); goto out_sock_create; } err = igmp6_proc_init(net); if (err) goto out_sock_create_autojoin; return 0; out_sock_create_autojoin: inet_ctl_sock_destroy(net->ipv6.mc_autojoin_sk); out_sock_create: inet_ctl_sock_destroy(net->ipv6.igmp_sk); out: return err; } static void __net_exit igmp6_net_exit(struct net *net) { inet_ctl_sock_destroy(net->ipv6.igmp_sk); inet_ctl_sock_destroy(net->ipv6.mc_autojoin_sk); igmp6_proc_exit(net); } static struct pernet_operations igmp6_net_ops = { .init = igmp6_net_init, .exit = igmp6_net_exit, }; int __init igmp6_init(void) { int err; err = register_pernet_subsys(&igmp6_net_ops); if (err) return err; mld_wq = create_workqueue("mld"); if (!mld_wq) { unregister_pernet_subsys(&igmp6_net_ops); return -ENOMEM; } return err; } int __init igmp6_late_init(void) { return register_netdevice_notifier(&igmp6_netdev_notifier); } void igmp6_cleanup(void) { unregister_pernet_subsys(&igmp6_net_ops); destroy_workqueue(mld_wq); } void igmp6_late_cleanup(void) { unregister_netdevice_notifier(&igmp6_netdev_notifier); }
115 114 114 448 30 191 602 604 90 603 603 603 603 602 604 602 90 90 90 1 89 39 588 588 587 587 55 55 54 2 2 52 54 501 1664 1666 1 1668 1668 1678 1670 76 485 1630 1556 1635 4 4 59 25 80 82 81 24 3 3 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 // SPDX-License-Identifier: GPL-2.0 /* * linux/fs/file.c * * Copyright (C) 1998-1999, Stephen Tweedie and Bill Hawes * * Manage the dynamic fd arrays in the process files_struct. */ #include <linux/syscalls.h> #include <linux/export.h> #include <linux/fs.h> #include <linux/kernel.h> #include <linux/mm.h> #include <linux/sched/signal.h> #include <linux/slab.h> #include <linux/file.h> #include <linux/fdtable.h> #include <linux/bitops.h> #include <linux/spinlock.h> #include <linux/rcupdate.h> #include <linux/close_range.h> #include <linux/file_ref.h> #include <net/sock.h> #include <linux/init_task.h> #include "internal.h" static noinline bool __file_ref_put_badval(file_ref_t *ref, unsigned long cnt) { /* * If the reference count was already in the dead zone, then this * put() operation is imbalanced. Warn, put the reference count back to * DEAD and tell the caller to not deconstruct the object. */ if (WARN_ONCE(cnt >= FILE_REF_RELEASED, "imbalanced put on file reference count")) { atomic_long_set(&ref->refcnt, FILE_REF_DEAD); return false; } /* * This is a put() operation on a saturated refcount. Restore the * mean saturation value and tell the caller to not deconstruct the * object. */ if (cnt > FILE_REF_MAXREF) atomic_long_set(&ref->refcnt, FILE_REF_SATURATED); return false; } /** * __file_ref_put - Slowpath of file_ref_put() * @ref: Pointer to the reference count * @cnt: Current reference count * * Invoked when the reference count is outside of the valid zone. * * Return: * True if this was the last reference with no future references * possible. This signals the caller that it can safely schedule the * object, which is protected by the reference counter, for * deconstruction. * * False if there are still active references or the put() raced * with a concurrent get()/put() pair. Caller is not allowed to * deconstruct the protected object. */ bool __file_ref_put(file_ref_t *ref, unsigned long cnt) { /* Did this drop the last reference? */ if (likely(cnt == FILE_REF_NOREF)) { /* * Carefully try to set the reference count to FILE_REF_DEAD. * * This can fail if a concurrent get() operation has * elevated it again or the corresponding put() even marked * it dead already. Both are valid situations and do not * require a retry. If this fails the caller is not * allowed to deconstruct the object. */ if (!atomic_long_try_cmpxchg_release(&ref->refcnt, &cnt, FILE_REF_DEAD)) return false; /* * The caller can safely schedule the object for * deconstruction. Provide acquire ordering. */ smp_acquire__after_ctrl_dep(); return true; } return __file_ref_put_badval(ref, cnt); } EXPORT_SYMBOL_GPL(__file_ref_put); unsigned int sysctl_nr_open __read_mostly = 1024*1024; unsigned int sysctl_nr_open_min = BITS_PER_LONG; /* our min() is unusable in constant expressions ;-/ */ #define __const_min(x, y) ((x) < (y) ? (x) : (y)) unsigned int sysctl_nr_open_max = __const_min(INT_MAX, ~(size_t)0/sizeof(void *)) & -BITS_PER_LONG; static void __free_fdtable(struct fdtable *fdt) { kvfree(fdt->fd); kvfree(fdt->open_fds); kfree(fdt); } static void free_fdtable_rcu(struct rcu_head *rcu) { __free_fdtable(container_of(rcu, struct fdtable, rcu)); } #define BITBIT_NR(nr) BITS_TO_LONGS(BITS_TO_LONGS(nr)) #define BITBIT_SIZE(nr) (BITBIT_NR(nr) * sizeof(long)) #define fdt_words(fdt) ((fdt)->max_fds / BITS_PER_LONG) // words in ->open_fds /* * Copy 'count' fd bits from the old table to the new table and clear the extra * space if any. This does not copy the file pointers. Called with the files * spinlock held for write. */ static inline void copy_fd_bitmaps(struct fdtable *nfdt, struct fdtable *ofdt, unsigned int copy_words) { unsigned int nwords = fdt_words(nfdt); bitmap_copy_and_extend(nfdt->open_fds, ofdt->open_fds, copy_words * BITS_PER_LONG, nwords * BITS_PER_LONG); bitmap_copy_and_extend(nfdt->close_on_exec, ofdt->close_on_exec, copy_words * BITS_PER_LONG, nwords * BITS_PER_LONG); bitmap_copy_and_extend(nfdt->full_fds_bits, ofdt->full_fds_bits, copy_words, nwords); } /* * Copy all file descriptors from the old table to the new, expanded table and * clear the extra space. Called with the files spinlock held for write. */ static void copy_fdtable(struct fdtable *nfdt, struct fdtable *ofdt) { size_t cpy, set; BUG_ON(nfdt->max_fds < ofdt->max_fds); cpy = ofdt->max_fds * sizeof(struct file *); set = (nfdt->max_fds - ofdt->max_fds) * sizeof(struct file *); memcpy(nfdt->fd, ofdt->fd, cpy); memset((char *)nfdt->fd + cpy, 0, set); copy_fd_bitmaps(nfdt, ofdt, fdt_words(ofdt)); } /* * Note how the fdtable bitmap allocations very much have to be a multiple of * BITS_PER_LONG. This is not only because we walk those things in chunks of * 'unsigned long' in some places, but simply because that is how the Linux * kernel bitmaps are defined to work: they are not "bits in an array of bytes", * they are very much "bits in an array of unsigned long". */ static struct fdtable *alloc_fdtable(unsigned int slots_wanted) { struct fdtable *fdt; unsigned int nr; void *data; /* * Figure out how many fds we actually want to support in this fdtable. * Allocation steps are keyed to the size of the fdarray, since it * grows far faster than any of the other dynamic data. We try to fit * the fdarray into comfortable page-tuned chunks: starting at 1024B * and growing in powers of two from there on. Since we called only * with slots_wanted > BITS_PER_LONG (embedded instance in files->fdtab * already gives BITS_PER_LONG slots), the above boils down to * 1. use the smallest power of two large enough to give us that many * slots. * 2. on 32bit skip 64 and 128 - the minimal capacity we want there is * 256 slots (i.e. 1Kb fd array). * 3. on 64bit don't skip anything, 1Kb fd array means 128 slots there * and we are never going to be asked for 64 or less. */ if (IS_ENABLED(CONFIG_32BIT) && slots_wanted < 256) nr = 256; else nr = roundup_pow_of_two(slots_wanted); /* * Note that this can drive nr *below* what we had passed if sysctl_nr_open * had been set lower between the check in expand_files() and here. * * We make sure that nr remains a multiple of BITS_PER_LONG - otherwise * bitmaps handling below becomes unpleasant, to put it mildly... */ if (unlikely(nr > sysctl_nr_open)) { nr = round_down(sysctl_nr_open, BITS_PER_LONG); if (nr < slots_wanted) return ERR_PTR(-EMFILE); } fdt = kmalloc(sizeof(struct fdtable), GFP_KERNEL_ACCOUNT); if (!fdt) goto out; fdt->max_fds = nr; data = kvmalloc_array(nr, sizeof(struct file *), GFP_KERNEL_ACCOUNT); if (!data) goto out_fdt; fdt->fd = data; data = kvmalloc(max_t(size_t, 2 * nr / BITS_PER_BYTE + BITBIT_SIZE(nr), L1_CACHE_BYTES), GFP_KERNEL_ACCOUNT); if (!data) goto out_arr; fdt->open_fds = data; data += nr / BITS_PER_BYTE; fdt->close_on_exec = data; data += nr / BITS_PER_BYTE; fdt->full_fds_bits = data; return fdt; out_arr: kvfree(fdt->fd); out_fdt: kfree(fdt); out: return ERR_PTR(-ENOMEM); } /* * Expand the file descriptor table. * This function will allocate a new fdtable and both fd array and fdset, of * the given size. * Return <0 error code on error; 0 on successful completion. * The files->file_lock should be held on entry, and will be held on exit. */ static int expand_fdtable(struct files_struct *files, unsigned int nr) __releases(files->file_lock) __acquires(files->file_lock) { struct fdtable *new_fdt, *cur_fdt; spin_unlock(&files->file_lock); new_fdt = alloc_fdtable(nr + 1); /* make sure all fd_install() have seen resize_in_progress * or have finished their rcu_read_lock_sched() section. */ if (atomic_read(&files->count) > 1) synchronize_rcu(); spin_lock(&files->file_lock); if (IS_ERR(new_fdt)) return PTR_ERR(new_fdt); cur_fdt = files_fdtable(files); BUG_ON(nr < cur_fdt->max_fds); copy_fdtable(new_fdt, cur_fdt); rcu_assign_pointer(files->fdt, new_fdt); if (cur_fdt != &files->fdtab) call_rcu(&cur_fdt->rcu, free_fdtable_rcu); /* coupled with smp_rmb() in fd_install() */ smp_wmb(); return 0; } /* * Expand files. * This function will expand the file structures, if the requested size exceeds * the current capacity and there is room for expansion. * Return <0 error code on error; 0 on success. * The files->file_lock should be held on entry, and will be held on exit. */ static int expand_files(struct files_struct *files, unsigned int nr) __releases(files->file_lock) __acquires(files->file_lock) { struct fdtable *fdt; int error; repeat: fdt = files_fdtable(files); /* Do we need to expand? */ if (nr < fdt->max_fds) return 0; if (unlikely(files->resize_in_progress)) { spin_unlock(&files->file_lock); wait_event(files->resize_wait, !files->resize_in_progress); spin_lock(&files->file_lock); goto repeat; } /* Can we expand? */ if (unlikely(nr >= sysctl_nr_open)) return -EMFILE; /* All good, so we try */ files->resize_in_progress = true; error = expand_fdtable(files, nr); files->resize_in_progress = false; wake_up_all(&files->resize_wait); return error; } static inline void __set_close_on_exec(unsigned int fd, struct fdtable *fdt, bool set) { if (set) { __set_bit(fd, fdt->close_on_exec); } else { if (test_bit(fd, fdt->close_on_exec)) __clear_bit(fd, fdt->close_on_exec); } } static inline void __set_open_fd(unsigned int fd, struct fdtable *fdt, bool set) { __set_bit(fd, fdt->open_fds); __set_close_on_exec(fd, fdt, set); fd /= BITS_PER_LONG; if (!~fdt->open_fds[fd]) __set_bit(fd, fdt->full_fds_bits); } static inline void __clear_open_fd(unsigned int fd, struct fdtable *fdt) { __clear_bit(fd, fdt->open_fds); fd /= BITS_PER_LONG; if (test_bit(fd, fdt->full_fds_bits)) __clear_bit(fd, fdt->full_fds_bits); } static inline bool fd_is_open(unsigned int fd, const struct fdtable *fdt) { return test_bit(fd, fdt->open_fds); } /* * Note that a sane fdtable size always has to be a multiple of * BITS_PER_LONG, since we have bitmaps that are sized by this. * * punch_hole is optional - when close_range() is asked to unshare * and close, we don't need to copy descriptors in that range, so * a smaller cloned descriptor table might suffice if the last * currently opened descriptor falls into that range. */ static unsigned int sane_fdtable_size(struct fdtable *fdt, struct fd_range *punch_hole) { unsigned int last = find_last_bit(fdt->open_fds, fdt->max_fds); if (last == fdt->max_fds) return NR_OPEN_DEFAULT; if (punch_hole && punch_hole->to >= last && punch_hole->from <= last) { last = find_last_bit(fdt->open_fds, punch_hole->from); if (last == punch_hole->from) return NR_OPEN_DEFAULT; } return ALIGN(last + 1, BITS_PER_LONG); } /* * Allocate a new descriptor table and copy contents from the passed in * instance. Returns a pointer to cloned table on success, ERR_PTR() * on failure. For 'punch_hole' see sane_fdtable_size(). */ struct files_struct *dup_fd(struct files_struct *oldf, struct fd_range *punch_hole) { struct files_struct *newf; struct file **old_fds, **new_fds; unsigned int open_files, i; struct fdtable *old_fdt, *new_fdt; newf = kmem_cache_alloc(files_cachep, GFP_KERNEL); if (!newf) return ERR_PTR(-ENOMEM); atomic_set(&newf->count, 1); spin_lock_init(&newf->file_lock); newf->resize_in_progress = false; init_waitqueue_head(&newf->resize_wait); newf->next_fd = 0; new_fdt = &newf->fdtab; new_fdt->max_fds = NR_OPEN_DEFAULT; new_fdt->close_on_exec = newf->close_on_exec_init; new_fdt->open_fds = newf->open_fds_init; new_fdt->full_fds_bits = newf->full_fds_bits_init; new_fdt->fd = &newf->fd_array[0]; spin_lock(&oldf->file_lock); old_fdt = files_fdtable(oldf); open_files = sane_fdtable_size(old_fdt, punch_hole); /* * Check whether we need to allocate a larger fd array and fd set. */ while (unlikely(open_files > new_fdt->max_fds)) { spin_unlock(&oldf->file_lock); if (new_fdt != &newf->fdtab) __free_fdtable(new_fdt); new_fdt = alloc_fdtable(open_files); if (IS_ERR(new_fdt)) { kmem_cache_free(files_cachep, newf); return ERR_CAST(new_fdt); } /* * Reacquire the oldf lock and a pointer to its fd table * who knows it may have a new bigger fd table. We need * the latest pointer. */ spin_lock(&oldf->file_lock); old_fdt = files_fdtable(oldf); open_files = sane_fdtable_size(old_fdt, punch_hole); } copy_fd_bitmaps(new_fdt, old_fdt, open_files / BITS_PER_LONG); old_fds = old_fdt->fd; new_fds = new_fdt->fd; /* * We may be racing against fd allocation from other threads using this * files_struct, despite holding ->file_lock. * * alloc_fd() might have already claimed a slot, while fd_install() * did not populate it yet. Note the latter operates locklessly, so * the file can show up as we are walking the array below. * * At the same time we know no files will disappear as all other * operations take the lock. * * Instead of trying to placate userspace racing with itself, we * ref the file if we see it and mark the fd slot as unused otherwise. */ for (i = open_files; i != 0; i--) { struct file *f = rcu_dereference_raw(*old_fds++); if (f) { get_file(f); } else { __clear_open_fd(open_files - i, new_fdt); } rcu_assign_pointer(*new_fds++, f); } spin_unlock(&oldf->file_lock); /* clear the remainder */ memset(new_fds, 0, (new_fdt->max_fds - open_files) * sizeof(struct file *)); rcu_assign_pointer(newf->fdt, new_fdt); return newf; } static struct fdtable *close_files(struct files_struct * files) { /* * It is safe to dereference the fd table without RCU or * ->file_lock because this is the last reference to the * files structure. */ struct fdtable *fdt = rcu_dereference_raw(files->fdt); unsigned int i, j = 0; for (;;) { unsigned long set; i = j * BITS_PER_LONG; if (i >= fdt->max_fds) break; set = fdt->open_fds[j++]; while (set) { if (set & 1) { struct file *file = fdt->fd[i]; if (file) { filp_close(file, files); cond_resched(); } } i++; set >>= 1; } } return fdt; } void put_files_struct(struct files_struct *files) { if (atomic_dec_and_test(&files->count)) { struct fdtable *fdt = close_files(files); /* free the arrays if they are not embedded */ if (fdt != &files->fdtab) __free_fdtable(fdt); kmem_cache_free(files_cachep, files); } } void exit_files(struct task_struct *tsk) { struct files_struct * files = tsk->files; if (files) { task_lock(tsk); tsk->files = NULL; task_unlock(tsk); put_files_struct(files); } } struct files_struct init_files = { .count = ATOMIC_INIT(1), .fdt = &init_files.fdtab, .fdtab = { .max_fds = NR_OPEN_DEFAULT, .fd = &init_files.fd_array[0], .close_on_exec = init_files.close_on_exec_init, .open_fds = init_files.open_fds_init, .full_fds_bits = init_files.full_fds_bits_init, }, .file_lock = __SPIN_LOCK_UNLOCKED(init_files.file_lock), .resize_wait = __WAIT_QUEUE_HEAD_INITIALIZER(init_files.resize_wait), }; static unsigned int find_next_fd(struct fdtable *fdt, unsigned int start) { unsigned int maxfd = fdt->max_fds; /* always multiple of BITS_PER_LONG */ unsigned int maxbit = maxfd / BITS_PER_LONG; unsigned int bitbit = start / BITS_PER_LONG; unsigned int bit; /* * Try to avoid looking at the second level bitmap */ bit = find_next_zero_bit(&fdt->open_fds[bitbit], BITS_PER_LONG, start & (BITS_PER_LONG - 1)); if (bit < BITS_PER_LONG) return bit + bitbit * BITS_PER_LONG; bitbit = find_next_zero_bit(fdt->full_fds_bits, maxbit, bitbit) * BITS_PER_LONG; if (bitbit >= maxfd) return maxfd; if (bitbit > start) start = bitbit; return find_next_zero_bit(fdt->open_fds, maxfd, start); } /* * allocate a file descriptor, mark it busy. */ static int alloc_fd(unsigned start, unsigned end, unsigned flags) { struct files_struct *files = current->files; unsigned int fd; int error; struct fdtable *fdt; spin_lock(&files->file_lock); repeat: fdt = files_fdtable(files); fd = start; if (fd < files->next_fd) fd = files->next_fd; if (likely(fd < fdt->max_fds)) fd = find_next_fd(fdt, fd); /* * N.B. For clone tasks sharing a files structure, this test * will limit the total number of files that can be opened. */ error = -EMFILE; if (unlikely(fd >= end)) goto out; if (unlikely(fd >= fdt->max_fds)) { error = expand_files(files, fd); if (error < 0) goto out; goto repeat; } if (start <= files->next_fd) files->next_fd = fd + 1; __set_open_fd(fd, fdt, flags & O_CLOEXEC); error = fd; VFS_BUG_ON(rcu_access_pointer(fdt->fd[fd]) != NULL); out: spin_unlock(&files->file_lock); return error; } int __get_unused_fd_flags(unsigned flags, unsigned long nofile) { return alloc_fd(0, nofile, flags); } int get_unused_fd_flags(unsigned flags) { return __get_unused_fd_flags(flags, rlimit(RLIMIT_NOFILE)); } EXPORT_SYMBOL(get_unused_fd_flags); static void __put_unused_fd(struct files_struct *files, unsigned int fd) { struct fdtable *fdt = files_fdtable(files); __clear_open_fd(fd, fdt); if (fd < files->next_fd) files->next_fd = fd; } void put_unused_fd(unsigned int fd) { struct files_struct *files = current->files; spin_lock(&files->file_lock); __put_unused_fd(files, fd); spin_unlock(&files->file_lock); } EXPORT_SYMBOL(put_unused_fd); /** * fd_install - install a file pointer in the fd array * @fd: file descriptor to install the file in * @file: the file to install * * This consumes the "file" refcount, so callers should treat it * as if they had called fput(file). */ void fd_install(unsigned int fd, struct file *file) { struct files_struct *files = current->files; struct fdtable *fdt; if (WARN_ON_ONCE(unlikely(file->f_mode & FMODE_BACKING))) return; rcu_read_lock_sched(); if (unlikely(files->resize_in_progress)) { rcu_read_unlock_sched(); spin_lock(&files->file_lock); fdt = files_fdtable(files); VFS_BUG_ON(rcu_access_pointer(fdt->fd[fd]) != NULL); rcu_assign_pointer(fdt->fd[fd], file); spin_unlock(&files->file_lock); return; } /* coupled with smp_wmb() in expand_fdtable() */ smp_rmb(); fdt = rcu_dereference_sched(files->fdt); VFS_BUG_ON(rcu_access_pointer(fdt->fd[fd]) != NULL); rcu_assign_pointer(fdt->fd[fd], file); rcu_read_unlock_sched(); } EXPORT_SYMBOL(fd_install); /** * file_close_fd_locked - return file associated with fd * @files: file struct to retrieve file from * @fd: file descriptor to retrieve file for * * Doesn't take a separate reference count. * * Context: files_lock must be held. * * Returns: The file associated with @fd (NULL if @fd is not open) */ struct file *file_close_fd_locked(struct files_struct *files, unsigned fd) { struct fdtable *fdt = files_fdtable(files); struct file *file; lockdep_assert_held(&files->file_lock); if (fd >= fdt->max_fds) return NULL; fd = array_index_nospec(fd, fdt->max_fds); file = rcu_dereference_raw(fdt->fd[fd]); if (file) { rcu_assign_pointer(fdt->fd[fd], NULL); __put_unused_fd(files, fd); } return file; } int close_fd(unsigned fd) { struct files_struct *files = current->files; struct file *file; spin_lock(&files->file_lock); file = file_close_fd_locked(files, fd); spin_unlock(&files->file_lock); if (!file) return -EBADF; return filp_close(file, files); } EXPORT_SYMBOL(close_fd); /** * last_fd - return last valid index into fd table * @fdt: File descriptor table. * * Context: Either rcu read lock or files_lock must be held. * * Returns: Last valid index into fdtable. */ static inline unsigned last_fd(struct fdtable *fdt) { return fdt->max_fds - 1; } static inline void __range_cloexec(struct files_struct *cur_fds, unsigned int fd, unsigned int max_fd) { struct fdtable *fdt; /* make sure we're using the correct maximum value */ spin_lock(&cur_fds->file_lock); fdt = files_fdtable(cur_fds); max_fd = min(last_fd(fdt), max_fd); if (fd <= max_fd) bitmap_set(fdt->close_on_exec, fd, max_fd - fd + 1); spin_unlock(&cur_fds->file_lock); } static inline void __range_close(struct files_struct *files, unsigned int fd, unsigned int max_fd) { struct file *file; unsigned n; spin_lock(&files->file_lock); n = last_fd(files_fdtable(files)); max_fd = min(max_fd, n); for (; fd <= max_fd; fd++) { file = file_close_fd_locked(files, fd); if (file) { spin_unlock(&files->file_lock); filp_close(file, files); cond_resched(); spin_lock(&files->file_lock); } else if (need_resched()) { spin_unlock(&files->file_lock); cond_resched(); spin_lock(&files->file_lock); } } spin_unlock(&files->file_lock); } /** * sys_close_range() - Close all file descriptors in a given range. * * @fd: starting file descriptor to close * @max_fd: last file descriptor to close * @flags: CLOSE_RANGE flags. * * This closes a range of file descriptors. All file descriptors * from @fd up to and including @max_fd are closed. * Currently, errors to close a given file descriptor are ignored. */ SYSCALL_DEFINE3(close_range, unsigned int, fd, unsigned int, max_fd, unsigned int, flags) { struct task_struct *me = current; struct files_struct *cur_fds = me->files, *fds = NULL; if (flags & ~(CLOSE_RANGE_UNSHARE | CLOSE_RANGE_CLOEXEC)) return -EINVAL; if (fd > max_fd) return -EINVAL; if ((flags & CLOSE_RANGE_UNSHARE) && atomic_read(&cur_fds->count) > 1) { struct fd_range range = {fd, max_fd}, *punch_hole = &range; /* * If the caller requested all fds to be made cloexec we always * copy all of the file descriptors since they still want to * use them. */ if (flags & CLOSE_RANGE_CLOEXEC) punch_hole = NULL; fds = dup_fd(cur_fds, punch_hole); if (IS_ERR(fds)) return PTR_ERR(fds); /* * We used to share our file descriptor table, and have now * created a private one, make sure we're using it below. */ swap(cur_fds, fds); } if (flags & CLOSE_RANGE_CLOEXEC) __range_cloexec(cur_fds, fd, max_fd); else __range_close(cur_fds, fd, max_fd); if (fds) { /* * We're done closing the files we were supposed to. Time to install * the new file descriptor table and drop the old one. */ task_lock(me); me->files = cur_fds; task_unlock(me); put_files_struct(fds); } return 0; } /** * file_close_fd - return file associated with fd * @fd: file descriptor to retrieve file for * * Doesn't take a separate reference count. * * Returns: The file associated with @fd (NULL if @fd is not open) */ struct file *file_close_fd(unsigned int fd) { struct files_struct *files = current->files; struct file *file; spin_lock(&files->file_lock); file = file_close_fd_locked(files, fd); spin_unlock(&files->file_lock); return file; } void do_close_on_exec(struct files_struct *files) { unsigned i; struct fdtable *fdt; /* exec unshares first */ spin_lock(&files->file_lock); for (i = 0; ; i++) { unsigned long set; unsigned fd = i * BITS_PER_LONG; fdt = files_fdtable(files); if (fd >= fdt->max_fds) break; set = fdt->close_on_exec[i]; if (!set) continue; fdt->close_on_exec[i] = 0; for ( ; set ; fd++, set >>= 1) { struct file *file; if (!(set & 1)) continue; file = fdt->fd[fd]; if (!file) continue; rcu_assign_pointer(fdt->fd[fd], NULL); __put_unused_fd(files, fd); spin_unlock(&files->file_lock); filp_close(file, files); cond_resched(); spin_lock(&files->file_lock); } } spin_unlock(&files->file_lock); } static struct file *__get_file_rcu(struct file __rcu **f) { struct file __rcu *file; struct file __rcu *file_reloaded; struct file __rcu *file_reloaded_cmp; file = rcu_dereference_raw(*f); if (!file) return NULL; if (unlikely(!file_ref_get(&file->f_ref))) return ERR_PTR(-EAGAIN); file_reloaded = rcu_dereference_raw(*f); /* * Ensure that all accesses have a dependency on the load from * rcu_dereference_raw() above so we get correct ordering * between reuse/allocation and the pointer check below. */ file_reloaded_cmp = file_reloaded; OPTIMIZER_HIDE_VAR(file_reloaded_cmp); /* * file_ref_get() above provided a full memory barrier when we * acquired a reference. * * This is paired with the write barrier from assigning to the * __rcu protected file pointer so that if that pointer still * matches the current file, we know we have successfully * acquired a reference to the right file. * * If the pointers don't match the file has been reallocated by * SLAB_TYPESAFE_BY_RCU. */ if (file == file_reloaded_cmp) return file_reloaded; fput(file); return ERR_PTR(-EAGAIN); } /** * get_file_rcu - try go get a reference to a file under rcu * @f: the file to get a reference on * * This function tries to get a reference on @f carefully verifying that * @f hasn't been reused. * * This function should rarely have to be used and only by users who * understand the implications of SLAB_TYPESAFE_BY_RCU. Try to avoid it. * * Return: Returns @f with the reference count increased or NULL. */ struct file *get_file_rcu(struct file __rcu **f) { for (;;) { struct file __rcu *file; file = __get_file_rcu(f); if (!IS_ERR(file)) return file; } } EXPORT_SYMBOL_GPL(get_file_rcu); /** * get_file_active - try go get a reference to a file * @f: the file to get a reference on * * In contast to get_file_rcu() the pointer itself isn't part of the * reference counting. * * This function should rarely have to be used and only by users who * understand the implications of SLAB_TYPESAFE_BY_RCU. Try to avoid it. * * Return: Returns @f with the reference count increased or NULL. */ struct file *get_file_active(struct file **f) { struct file __rcu *file; rcu_read_lock(); file = __get_file_rcu(f); rcu_read_unlock(); if (IS_ERR(file)) file = NULL; return file; } EXPORT_SYMBOL_GPL(get_file_active); static inline struct file *__fget_files_rcu(struct files_struct *files, unsigned int fd, fmode_t mask) { for (;;) { struct file *file; struct fdtable *fdt = rcu_dereference_raw(files->fdt); struct file __rcu **fdentry; unsigned long nospec_mask; /* Mask is a 0 for invalid fd's, ~0 for valid ones */ nospec_mask = array_index_mask_nospec(fd, fdt->max_fds); /* * fdentry points to the 'fd' offset, or fdt->fd[0]. * Loading from fdt->fd[0] is always safe, because the * array always exists. */ fdentry = fdt->fd + (fd & nospec_mask); /* Do the load, then mask any invalid result */ file = rcu_dereference_raw(*fdentry); file = (void *)(nospec_mask & (unsigned long)file); if (unlikely(!file)) return NULL; /* * Ok, we have a file pointer that was valid at * some point, but it might have become stale since. * * We need to confirm it by incrementing the refcount * and then check the lookup again. * * file_ref_get() gives us a full memory barrier. We * only really need an 'acquire' one to protect the * loads below, but we don't have that. */ if (unlikely(!file_ref_get(&file->f_ref))) continue; /* * Such a race can take two forms: * * (a) the file ref already went down to zero and the * file hasn't been reused yet or the file count * isn't zero but the file has already been reused. * * (b) the file table entry has changed under us. * Note that we don't need to re-check the 'fdt->fd' * pointer having changed, because it always goes * hand-in-hand with 'fdt'. * * If so, we need to put our ref and try again. */ if (unlikely(file != rcu_dereference_raw(*fdentry)) || unlikely(rcu_dereference_raw(files->fdt) != fdt)) { fput(file); continue; } /* * This isn't the file we're looking for or we're not * allowed to get a reference to it. */ if (unlikely(file->f_mode & mask)) { fput(file); return NULL; } /* * Ok, we have a ref to the file, and checked that it * still exists. */ return file; } } static struct file *__fget_files(struct files_struct *files, unsigned int fd, fmode_t mask) { struct file *file; rcu_read_lock(); file = __fget_files_rcu(files, fd, mask); rcu_read_unlock(); return file; } static inline struct file *__fget(unsigned int fd, fmode_t mask) { return __fget_files(current->files, fd, mask); } struct file *fget(unsigned int fd) { return __fget(fd, FMODE_PATH); } EXPORT_SYMBOL(fget); struct file *fget_raw(unsigned int fd) { return __fget(fd, 0); } EXPORT_SYMBOL(fget_raw); struct file *fget_task(struct task_struct *task, unsigned int fd) { struct file *file = NULL; task_lock(task); if (task->files) file = __fget_files(task->files, fd, 0); task_unlock(task); return file; } struct file *fget_task_next(struct task_struct *task, unsigned int *ret_fd) { /* Must be called with rcu_read_lock held */ struct files_struct *files; unsigned int fd = *ret_fd; struct file *file = NULL; task_lock(task); files = task->files; if (files) { rcu_read_lock(); for (; fd < files_fdtable(files)->max_fds; fd++) { file = __fget_files_rcu(files, fd, 0); if (file) break; } rcu_read_unlock(); } task_unlock(task); *ret_fd = fd; return file; } EXPORT_SYMBOL(fget_task_next); /* * Lightweight file lookup - no refcnt increment if fd table isn't shared. * * You can use this instead of fget if you satisfy all of the following * conditions: * 1) You must call fput_light before exiting the syscall and returning control * to userspace (i.e. you cannot remember the returned struct file * after * returning to userspace). * 2) You must not call filp_close on the returned struct file * in between * calls to fget_light and fput_light. * 3) You must not clone the current task in between the calls to fget_light * and fput_light. * * The fput_needed flag returned by fget_light should be passed to the * corresponding fput_light. * * (As an exception to rule 2, you can call filp_close between fget_light and * fput_light provided that you capture a real refcount with get_file before * the call to filp_close, and ensure that this real refcount is fput *after* * the fput_light call.) * * See also the documentation in rust/kernel/file.rs. */ static inline struct fd __fget_light(unsigned int fd, fmode_t mask) { struct files_struct *files = current->files; struct file *file; /* * If another thread is concurrently calling close_fd() followed * by put_files_struct(), we must not observe the old table * entry combined with the new refcount - otherwise we could * return a file that is concurrently being freed. * * atomic_read_acquire() pairs with atomic_dec_and_test() in * put_files_struct(). */ if (likely(atomic_read_acquire(&files->count) == 1)) { file = files_lookup_fd_raw(files, fd); if (!file || unlikely(file->f_mode & mask)) return EMPTY_FD; return BORROWED_FD(file); } else { file = __fget_files(files, fd, mask); if (!file) return EMPTY_FD; return CLONED_FD(file); } } struct fd fdget(unsigned int fd) { return __fget_light(fd, FMODE_PATH); } EXPORT_SYMBOL(fdget); struct fd fdget_raw(unsigned int fd) { return __fget_light(fd, 0); } /* * Try to avoid f_pos locking. We only need it if the * file is marked for FMODE_ATOMIC_POS, and it can be * accessed multiple ways. * * Always do it for directories, because pidfd_getfd() * can make a file accessible even if it otherwise would * not be, and for directories this is a correctness * issue, not a "POSIX requirement". */ static inline bool file_needs_f_pos_lock(struct file *file) { if (!(file->f_mode & FMODE_ATOMIC_POS)) return false; if (__file_ref_read_raw(&file->f_ref) != FILE_REF_ONEREF) return true; if (file->f_op->iterate_shared) return true; return false; } bool file_seek_cur_needs_f_lock(struct file *file) { if (!(file->f_mode & FMODE_ATOMIC_POS) && !file->f_op->iterate_shared) return false; /* * Note that we are not guaranteed to be called after fdget_pos() on * this file obj, in which case the caller is expected to provide the * appropriate locking. */ return true; } struct fd fdget_pos(unsigned int fd) { struct fd f = fdget(fd); struct file *file = fd_file(f); if (likely(file) && file_needs_f_pos_lock(file)) { f.word |= FDPUT_POS_UNLOCK; mutex_lock(&file->f_pos_lock); } return f; } void __f_unlock_pos(struct file *f) { mutex_unlock(&f->f_pos_lock); } /* * We only lock f_pos if we have threads or if the file might be * shared with another process. In both cases we'll have an elevated * file count (done either by fdget() or by fork()). */ void set_close_on_exec(unsigned int fd, int flag) { struct files_struct *files = current->files; spin_lock(&files->file_lock); __set_close_on_exec(fd, files_fdtable(files), flag); spin_unlock(&files->file_lock); } bool get_close_on_exec(unsigned int fd) { bool res; rcu_read_lock(); res = close_on_exec(fd, current->files); rcu_read_unlock(); return res; } static int do_dup2(struct files_struct *files, struct file *file, unsigned fd, unsigned flags) __releases(&files->file_lock) { struct file *tofree; struct fdtable *fdt; /* * dup2() is expected to close the file installed in the target fd slot * (if any). However, userspace hand-picking a fd may be racing against * its own threads which happened to allocate it in open() et al but did * not populate it yet. * * Broadly speaking we may be racing against the following: * fd = get_unused_fd_flags(); // fd slot reserved, ->fd[fd] == NULL * file = hard_work_goes_here(); * fd_install(fd, file); // only now ->fd[fd] == file * * It is an invariant that a successfully allocated fd has a NULL entry * in the array until the matching fd_install(). * * If we fit the window, we have the fd to populate, yet no target file * to close. Trying to ignore it and install our new file would violate * the invariant and make fd_install() overwrite our file. * * Things can be done(tm) to handle this. However, the issue does not * concern legitimate programs and we only need to make sure the kernel * does not trip over it. * * The simplest way out is to return an error if we find ourselves here. * * POSIX is silent on the issue, we return -EBUSY. */ fdt = files_fdtable(files); fd = array_index_nospec(fd, fdt->max_fds); tofree = rcu_dereference_raw(fdt->fd[fd]); if (!tofree && fd_is_open(fd, fdt)) goto Ebusy; get_file(file); rcu_assign_pointer(fdt->fd[fd], file); __set_open_fd(fd, fdt, flags & O_CLOEXEC); spin_unlock(&files->file_lock); if (tofree) filp_close(tofree, files); return fd; Ebusy: spin_unlock(&files->file_lock); return -EBUSY; } int replace_fd(unsigned fd, struct file *file, unsigned flags) { int err; struct files_struct *files = current->files; if (!file) return close_fd(fd); if (fd >= rlimit(RLIMIT_NOFILE)) return -EBADF; spin_lock(&files->file_lock); err = expand_files(files, fd); if (unlikely(err < 0)) goto out_unlock; return do_dup2(files, file, fd, flags); out_unlock: spin_unlock(&files->file_lock); return err; } /** * receive_fd() - Install received file into file descriptor table * @file: struct file that was received from another process * @ufd: __user pointer to write new fd number to * @o_flags: the O_* flags to apply to the new fd entry * * Installs a received file into the file descriptor table, with appropriate * checks and count updates. Optionally writes the fd number to userspace, if * @ufd is non-NULL. * * This helper handles its own reference counting of the incoming * struct file. * * Returns newly install fd or -ve on error. */ int receive_fd(struct file *file, int __user *ufd, unsigned int o_flags) { int new_fd; int error; error = security_file_receive(file); if (error) return error; new_fd = get_unused_fd_flags(o_flags); if (new_fd < 0) return new_fd; if (ufd) { error = put_user(new_fd, ufd); if (error) { put_unused_fd(new_fd); return error; } } fd_install(new_fd, get_file(file)); __receive_sock(file); return new_fd; } EXPORT_SYMBOL_GPL(receive_fd); int receive_fd_replace(int new_fd, struct file *file, unsigned int o_flags) { int error; error = security_file_receive(file); if (error) return error; error = replace_fd(new_fd, file, o_flags); if (error) return error; __receive_sock(file); return new_fd; } static int ksys_dup3(unsigned int oldfd, unsigned int newfd, int flags) { int err = -EBADF; struct file *file; struct files_struct *files = current->files; if ((flags & ~O_CLOEXEC) != 0) return -EINVAL; if (unlikely(oldfd == newfd)) return -EINVAL; if (newfd >= rlimit(RLIMIT_NOFILE)) return -EBADF; spin_lock(&files->file_lock); err = expand_files(files, newfd); file = files_lookup_fd_locked(files, oldfd); if (unlikely(!file)) goto Ebadf; if (unlikely(err < 0)) { if (err == -EMFILE) goto Ebadf; goto out_unlock; } return do_dup2(files, file, newfd, flags); Ebadf: err = -EBADF; out_unlock: spin_unlock(&files->file_lock); return err; } SYSCALL_DEFINE3(dup3, unsigned int, oldfd, unsigned int, newfd, int, flags) { return ksys_dup3(oldfd, newfd, flags); } SYSCALL_DEFINE2(dup2, unsigned int, oldfd, unsigned int, newfd) { if (unlikely(newfd == oldfd)) { /* corner case */ struct files_struct *files = current->files; struct file *f; int retval = oldfd; rcu_read_lock(); f = __fget_files_rcu(files, oldfd, 0); if (!f) retval = -EBADF; rcu_read_unlock(); if (f) fput(f); return retval; } return ksys_dup3(oldfd, newfd, 0); } SYSCALL_DEFINE1(dup, unsigned int, fildes) { int ret = -EBADF; struct file *file = fget_raw(fildes); if (file) { ret = get_unused_fd_flags(0); if (ret >= 0) fd_install(ret, file); else fput(file); } return ret; } int f_dupfd(unsigned int from, struct file *file, unsigned flags) { unsigned long nofile = rlimit(RLIMIT_NOFILE); int err; if (from >= nofile) return -EINVAL; err = alloc_fd(from, nofile, flags); if (err >= 0) { get_file(file); fd_install(err, file); } return err; } int iterate_fd(struct files_struct *files, unsigned n, int (*f)(const void *, struct file *, unsigned), const void *p) { struct fdtable *fdt; int res = 0; if (!files) return 0; spin_lock(&files->file_lock); for (fdt = files_fdtable(files); n < fdt->max_fds; n++) { struct file *file; file = rcu_dereference_check_fdtable(files, fdt->fd[n]); if (!file) continue; res = f(p, file, n); if (res) break; } spin_unlock(&files->file_lock); return res; } EXPORT_SYMBOL(iterate_fd);
47 27 6 6 2 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 // SPDX-License-Identifier: GPL-2.0-only /* * Copyright (C) 2016 Linaro * Author: Christoffer Dall <christoffer.dall@linaro.org> */ #include <linux/cpu.h> #include <linux/debugfs.h> #include <linux/interrupt.h> #include <linux/kvm_host.h> #include <linux/seq_file.h> #include <kvm/arm_vgic.h> #include <asm/kvm_mmu.h> #include "vgic.h" /* * Structure to control looping through the entire vgic state. We start at * zero for each field and move upwards. So, if dist_id is 0 we print the * distributor info. When dist_id is 1, we have already printed it and move * on. * * When vcpu_id < nr_cpus we print the vcpu info until vcpu_id == nr_cpus and * so on. */ struct vgic_state_iter { int nr_cpus; int nr_spis; int nr_lpis; int dist_id; int vcpu_id; unsigned long intid; int lpi_idx; }; static void iter_next(struct kvm *kvm, struct vgic_state_iter *iter) { struct vgic_dist *dist = &kvm->arch.vgic; if (iter->dist_id == 0) { iter->dist_id++; return; } /* * Let the xarray drive the iterator after the last SPI, as the iterator * has exhausted the sequentially-allocated INTID space. */ if (iter->intid >= (iter->nr_spis + VGIC_NR_PRIVATE_IRQS - 1) && iter->nr_lpis) { if (iter->lpi_idx < iter->nr_lpis) xa_find_after(&dist->lpi_xa, &iter->intid, VGIC_LPI_MAX_INTID, LPI_XA_MARK_DEBUG_ITER); iter->lpi_idx++; return; } iter->intid++; if (iter->intid == VGIC_NR_PRIVATE_IRQS && ++iter->vcpu_id < iter->nr_cpus) iter->intid = 0; } static int iter_mark_lpis(struct kvm *kvm) { struct vgic_dist *dist = &kvm->arch.vgic; struct vgic_irq *irq; unsigned long intid; int nr_lpis = 0; xa_for_each(&dist->lpi_xa, intid, irq) { if (!vgic_try_get_irq_kref(irq)) continue; xa_set_mark(&dist->lpi_xa, intid, LPI_XA_MARK_DEBUG_ITER); nr_lpis++; } return nr_lpis; } static void iter_unmark_lpis(struct kvm *kvm) { struct vgic_dist *dist = &kvm->arch.vgic; struct vgic_irq *irq; unsigned long intid; xa_for_each_marked(&dist->lpi_xa, intid, irq, LPI_XA_MARK_DEBUG_ITER) { xa_clear_mark(&dist->lpi_xa, intid, LPI_XA_MARK_DEBUG_ITER); vgic_put_irq(kvm, irq); } } static void iter_init(struct kvm *kvm, struct vgic_state_iter *iter, loff_t pos) { int nr_cpus = atomic_read(&kvm->online_vcpus); memset(iter, 0, sizeof(*iter)); iter->nr_cpus = nr_cpus; iter->nr_spis = kvm->arch.vgic.nr_spis; if (kvm->arch.vgic.vgic_model == KVM_DEV_TYPE_ARM_VGIC_V3) iter->nr_lpis = iter_mark_lpis(kvm); /* Fast forward to the right position if needed */ while (pos--) iter_next(kvm, iter); } static bool end_of_vgic(struct vgic_state_iter *iter) { return iter->dist_id > 0 && iter->vcpu_id == iter->nr_cpus && iter->intid >= (iter->nr_spis + VGIC_NR_PRIVATE_IRQS) && (!iter->nr_lpis || iter->lpi_idx > iter->nr_lpis); } static void *vgic_debug_start(struct seq_file *s, loff_t *pos) { struct kvm *kvm = s->private; struct vgic_state_iter *iter; mutex_lock(&kvm->arch.config_lock); iter = kvm->arch.vgic.iter; if (iter) { iter = ERR_PTR(-EBUSY); goto out; } iter = kmalloc(sizeof(*iter), GFP_KERNEL); if (!iter) { iter = ERR_PTR(-ENOMEM); goto out; } iter_init(kvm, iter, *pos); kvm->arch.vgic.iter = iter; if (end_of_vgic(iter)) iter = NULL; out: mutex_unlock(&kvm->arch.config_lock); return iter; } static void *vgic_debug_next(struct seq_file *s, void *v, loff_t *pos) { struct kvm *kvm = s->private; struct vgic_state_iter *iter = kvm->arch.vgic.iter; ++*pos; iter_next(kvm, iter); if (end_of_vgic(iter)) iter = NULL; return iter; } static void vgic_debug_stop(struct seq_file *s, void *v) { struct kvm *kvm = s->private; struct vgic_state_iter *iter; /* * If the seq file wasn't properly opened, there's nothing to clearn * up. */ if (IS_ERR(v)) return; mutex_lock(&kvm->arch.config_lock); iter = kvm->arch.vgic.iter; iter_unmark_lpis(kvm); kfree(iter); kvm->arch.vgic.iter = NULL; mutex_unlock(&kvm->arch.config_lock); } static void print_dist_state(struct seq_file *s, struct vgic_dist *dist, struct vgic_state_iter *iter) { bool v3 = dist->vgic_model == KVM_DEV_TYPE_ARM_VGIC_V3; seq_printf(s, "Distributor\n"); seq_printf(s, "===========\n"); seq_printf(s, "vgic_model:\t%s\n", v3 ? "GICv3" : "GICv2"); seq_printf(s, "nr_spis:\t%d\n", dist->nr_spis); if (v3) seq_printf(s, "nr_lpis:\t%d\n", iter->nr_lpis); seq_printf(s, "enabled:\t%d\n", dist->enabled); seq_printf(s, "\n"); seq_printf(s, "P=pending_latch, L=line_level, A=active\n"); seq_printf(s, "E=enabled, H=hw, C=config (level=1, edge=0)\n"); seq_printf(s, "G=group\n"); } static void print_header(struct seq_file *s, struct vgic_irq *irq, struct kvm_vcpu *vcpu) { int id = 0; char *hdr = "SPI "; if (vcpu) { hdr = "VCPU"; id = vcpu->vcpu_idx; } seq_printf(s, "\n"); seq_printf(s, "%s%2d TYP ID TGT_ID PLAEHCG HWID TARGET SRC PRI VCPU_ID\n", hdr, id); seq_printf(s, "----------------------------------------------------------------\n"); } static void print_irq_state(struct seq_file *s, struct vgic_irq *irq, struct kvm_vcpu *vcpu) { char *type; bool pending; if (irq->intid < VGIC_NR_SGIS) type = "SGI"; else if (irq->intid < VGIC_NR_PRIVATE_IRQS) type = "PPI"; else if (irq->intid < VGIC_MAX_SPI) type = "SPI"; else type = "LPI"; if (irq->intid ==0 || irq->intid == VGIC_NR_PRIVATE_IRQS) print_header(s, irq, vcpu); pending = irq->pending_latch; if (irq->hw && vgic_irq_is_sgi(irq->intid)) { int err; err = irq_get_irqchip_state(irq->host_irq, IRQCHIP_STATE_PENDING, &pending); WARN_ON_ONCE(err); } seq_printf(s, " %s %4d " " %2d " "%d%d%d%d%d%d%d " "%8d " "%8x " " %2x " "%3d " " %2d " "\n", type, irq->intid, (irq->target_vcpu) ? irq->target_vcpu->vcpu_idx : -1, pending, irq->line_level, irq->active, irq->enabled, irq->hw, irq->config == VGIC_CONFIG_LEVEL, irq->group, irq->hwintid, irq->mpidr, irq->source, irq->priority, (irq->vcpu) ? irq->vcpu->vcpu_idx : -1); } static int vgic_debug_show(struct seq_file *s, void *v) { struct kvm *kvm = s->private; struct vgic_state_iter *iter = v; struct vgic_irq *irq; struct kvm_vcpu *vcpu = NULL; unsigned long flags; if (iter->dist_id == 0) { print_dist_state(s, &kvm->arch.vgic, iter); return 0; } if (!kvm->arch.vgic.initialized) return 0; if (iter->vcpu_id < iter->nr_cpus) vcpu = kvm_get_vcpu(kvm, iter->vcpu_id); /* * Expect this to succeed, as iter_mark_lpis() takes a reference on * every LPI to be visited. */ if (iter->intid < VGIC_NR_PRIVATE_IRQS) irq = vgic_get_vcpu_irq(vcpu, iter->intid); else irq = vgic_get_irq(kvm, iter->intid); if (WARN_ON_ONCE(!irq)) return -EINVAL; raw_spin_lock_irqsave(&irq->irq_lock, flags); print_irq_state(s, irq, vcpu); raw_spin_unlock_irqrestore(&irq->irq_lock, flags); vgic_put_irq(kvm, irq); return 0; } static const struct seq_operations vgic_debug_sops = { .start = vgic_debug_start, .next = vgic_debug_next, .stop = vgic_debug_stop, .show = vgic_debug_show }; DEFINE_SEQ_ATTRIBUTE(vgic_debug); void vgic_debug_init(struct kvm *kvm) { debugfs_create_file("vgic-state", 0444, kvm->debugfs_dentry, kvm, &vgic_debug_fops); } void vgic_debug_destroy(struct kvm *kvm) { } /** * struct vgic_its_iter - Iterator for traversing VGIC ITS device tables. * @dev: Pointer to the current its_device being processed. * @ite: Pointer to the current its_ite within the device being processed. * * This structure is used to maintain the current position during iteration * over the ITS device tables. It holds pointers to both the current device * and the current ITE within that device. */ struct vgic_its_iter { struct its_device *dev; struct its_ite *ite; }; /** * end_of_iter - Checks if the iterator has reached the end. * @iter: The iterator to check. * * When the iterator completed processing the final ITE in the last device * table, it was marked to indicate the end of iteration by setting its * device and ITE pointers to NULL. * This function checks whether the iterator was marked as end. * * Return: True if the iterator is marked as end, false otherwise. */ static inline bool end_of_iter(struct vgic_its_iter *iter) { return !iter->dev && !iter->ite; } /** * vgic_its_iter_next - Advances the iterator to the next entry in the ITS tables. * @its: The VGIC ITS structure. * @iter: The iterator to advance. * * This function moves the iterator to the next ITE within the current device, * or to the first ITE of the next device if the current ITE is the last in * the device. If the current device is the last device, the iterator is set * to indicate the end of iteration. */ static void vgic_its_iter_next(struct vgic_its *its, struct vgic_its_iter *iter) { struct its_device *dev = iter->dev; struct its_ite *ite = iter->ite; if (!ite || list_is_last(&ite->ite_list, &dev->itt_head)) { if (list_is_last(&dev->dev_list, &its->device_list)) { dev = NULL; ite = NULL; } else { dev = list_next_entry(dev, dev_list); ite = list_first_entry_or_null(&dev->itt_head, struct its_ite, ite_list); } } else { ite = list_next_entry(ite, ite_list); } iter->dev = dev; iter->ite = ite; } /** * vgic_its_debug_start - Start function for the seq_file interface. * @s: The seq_file structure. * @pos: The starting position (offset). * * This function initializes the iterator to the beginning of the ITS tables * and advances it to the specified position. It acquires the its_lock mutex * to protect shared data. * * Return: An iterator pointer on success, NULL if no devices are found or * the end of the list is reached, or ERR_PTR(-ENOMEM) on memory * allocation failure. */ static void *vgic_its_debug_start(struct seq_file *s, loff_t *pos) { struct vgic_its *its = s->private; struct vgic_its_iter *iter; struct its_device *dev; loff_t offset = *pos; mutex_lock(&its->its_lock); dev = list_first_entry_or_null(&its->device_list, struct its_device, dev_list); if (!dev) return NULL; iter = kmalloc(sizeof(*iter), GFP_KERNEL); if (!iter) return ERR_PTR(-ENOMEM); iter->dev = dev; iter->ite = list_first_entry_or_null(&dev->itt_head, struct its_ite, ite_list); while (!end_of_iter(iter) && offset--) vgic_its_iter_next(its, iter); if (end_of_iter(iter)) { kfree(iter); return NULL; } return iter; } /** * vgic_its_debug_next - Next function for the seq_file interface. * @s: The seq_file structure. * @v: The current iterator. * @pos: The current position (offset). * * This function advances the iterator to the next entry and increments the * position. * * Return: An iterator pointer on success, or NULL if the end of the list is * reached. */ static void *vgic_its_debug_next(struct seq_file *s, void *v, loff_t *pos) { struct vgic_its *its = s->private; struct vgic_its_iter *iter = v; ++*pos; vgic_its_iter_next(its, iter); if (end_of_iter(iter)) { kfree(iter); return NULL; } return iter; } /** * vgic_its_debug_stop - Stop function for the seq_file interface. * @s: The seq_file structure. * @v: The current iterator. * * This function frees the iterator and releases the its_lock mutex. */ static void vgic_its_debug_stop(struct seq_file *s, void *v) { struct vgic_its *its = s->private; struct vgic_its_iter *iter = v; if (!IS_ERR_OR_NULL(iter)) kfree(iter); mutex_unlock(&its->its_lock); } /** * vgic_its_debug_show - Show function for the seq_file interface. * @s: The seq_file structure. * @v: The current iterator. * * This function formats and prints the ITS table entry information to the * seq_file output. * * Return: 0 on success. */ static int vgic_its_debug_show(struct seq_file *s, void *v) { struct vgic_its_iter *iter = v; struct its_device *dev = iter->dev; struct its_ite *ite = iter->ite; if (!ite) return 0; if (list_is_first(&ite->ite_list, &dev->itt_head)) { seq_printf(s, "\n"); seq_printf(s, "Device ID: 0x%x, Event ID Range: [0 - %llu]\n", dev->device_id, BIT_ULL(dev->num_eventid_bits) - 1); seq_printf(s, "EVENT_ID INTID HWINTID TARGET COL_ID HW\n"); seq_printf(s, "-----------------------------------------------\n"); } if (ite->irq && ite->collection) { seq_printf(s, "%8u %8u %8u %8u %8u %2d\n", ite->event_id, ite->irq->intid, ite->irq->hwintid, ite->collection->target_addr, ite->collection->collection_id, ite->irq->hw); } return 0; } static const struct seq_operations vgic_its_debug_sops = { .start = vgic_its_debug_start, .next = vgic_its_debug_next, .stop = vgic_its_debug_stop, .show = vgic_its_debug_show }; DEFINE_SEQ_ATTRIBUTE(vgic_its_debug); /** * vgic_its_debug_init - Initializes the debugfs interface for VGIC ITS. * @dev: The KVM device structure. * * This function creates a debugfs file named "vgic-its-state@%its_base" * to expose the ITS table information. * * Return: 0 on success. */ int vgic_its_debug_init(struct kvm_device *dev) { struct vgic_its *its = dev->private; char *name; name = kasprintf(GFP_KERNEL, "vgic-its-state@%llx", (u64)its->vgic_its_base); if (!name) return -ENOMEM; debugfs_create_file(name, 0444, dev->kvm->debugfs_dentry, its, &vgic_its_debug_fops); kfree(name); return 0; } void vgic_its_debug_destroy(struct kvm_device *dev) { }
307 307 307 306 307 307 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 // SPDX-License-Identifier: GPL-2.0-or-later /* * Generic Timer-queue * * Manages a simple queue of timers, ordered by expiration time. * Uses rbtrees for quick list adds and expiration. * * NOTE: All of the following functions need to be serialized * to avoid races. No locking is done by this library code. */ #include <linux/bug.h> #include <linux/timerqueue.h> #include <linux/rbtree.h> #include <linux/export.h> #define __node_2_tq(_n) \ rb_entry((_n), struct timerqueue_node, node) static inline bool __timerqueue_less(struct rb_node *a, const struct rb_node *b) { return __node_2_tq(a)->expires < __node_2_tq(b)->expires; } /** * timerqueue_add - Adds timer to timerqueue. * * @head: head of timerqueue * @node: timer node to be added * * Adds the timer node to the timerqueue, sorted by the node's expires * value. Returns true if the newly added timer is the first expiring timer in * the queue. */ bool timerqueue_add(struct timerqueue_head *head, struct timerqueue_node *node) { /* Make sure we don't add nodes that are already added */ WARN_ON_ONCE(!RB_EMPTY_NODE(&node->node)); return rb_add_cached(&node->node, &head->rb_root, __timerqueue_less); } EXPORT_SYMBOL_GPL(timerqueue_add); /** * timerqueue_del - Removes a timer from the timerqueue. * * @head: head of timerqueue * @node: timer node to be removed * * Removes the timer node from the timerqueue. Returns true if the queue is * not empty after the remove. */ bool timerqueue_del(struct timerqueue_head *head, struct timerqueue_node *node) { WARN_ON_ONCE(RB_EMPTY_NODE(&node->node)); rb_erase_cached(&node->node, &head->rb_root); RB_CLEAR_NODE(&node->node); return !RB_EMPTY_ROOT(&head->rb_root.rb_root); } EXPORT_SYMBOL_GPL(timerqueue_del); /** * timerqueue_iterate_next - Returns the timer after the provided timer * * @node: Pointer to a timer. * * Provides the timer that is after the given node. This is used, when * necessary, to iterate through the list of timers in a timer list * without modifying the list. */ struct timerqueue_node *timerqueue_iterate_next(struct timerqueue_node *node) { struct rb_node *next; if (!node) return NULL; next = rb_next(&node->node); if (!next) return NULL; return container_of(next, struct timerqueue_node, node); } EXPORT_SYMBOL_GPL(timerqueue_iterate_next);
25 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 // SPDX-License-Identifier: GPL-2.0-only /* * Based on arch/arm/kernel/setup.c * * Copyright (C) 1995-2001 Russell King * Copyright (C) 2012 ARM Ltd. */ #include <linux/acpi.h> #include <linux/export.h> #include <linux/kernel.h> #include <linux/stddef.h> #include <linux/ioport.h> #include <linux/delay.h> #include <linux/initrd.h> #include <linux/console.h> #include <linux/cache.h> #include <linux/screen_info.h> #include <linux/init.h> #include <linux/kexec.h> #include <linux/root_dev.h> #include <linux/cpu.h> #include <linux/interrupt.h> #include <linux/smp.h> #include <linux/fs.h> #include <linux/panic_notifier.h> #include <linux/proc_fs.h> #include <linux/memblock.h> #include <linux/of_fdt.h> #include <linux/efi.h> #include <linux/psci.h> #include <linux/sched/task.h> #include <linux/scs.h> #include <linux/mm.h> #include <asm/acpi.h> #include <asm/fixmap.h> #include <asm/cpu.h> #include <asm/cputype.h> #include <asm/daifflags.h> #include <asm/elf.h> #include <asm/cpufeature.h> #include <asm/cpu_ops.h> #include <asm/kasan.h> #include <asm/numa.h> #include <asm/rsi.h> #include <asm/scs.h> #include <asm/sections.h> #include <asm/setup.h> #include <asm/smp_plat.h> #include <asm/cacheflush.h> #include <asm/tlbflush.h> #include <asm/traps.h> #include <asm/efi.h> #include <asm/xen/hypervisor.h> #include <asm/mmu_context.h> static int num_standard_resources; static struct resource *standard_resources; phys_addr_t __fdt_pointer __initdata; u64 mmu_enabled_at_boot __initdata; /* * Standard memory resources */ static struct resource mem_res[] = { { .name = "Kernel code", .start = 0, .end = 0, .flags = IORESOURCE_SYSTEM_RAM }, { .name = "Kernel data", .start = 0, .end = 0, .flags = IORESOURCE_SYSTEM_RAM } }; #define kernel_code mem_res[0] #define kernel_data mem_res[1] /* * The recorded values of x0 .. x3 upon kernel entry. */ u64 __cacheline_aligned boot_args[4]; void __init smp_setup_processor_id(void) { u64 mpidr = read_cpuid_mpidr() & MPIDR_HWID_BITMASK; set_cpu_logical_map(0, mpidr); pr_info("Booting Linux on physical CPU 0x%010lx [0x%08x]\n", (unsigned long)mpidr, read_cpuid_id()); } bool arch_match_cpu_phys_id(int cpu, u64 phys_id) { return phys_id == cpu_logical_map(cpu); } struct mpidr_hash mpidr_hash; /** * smp_build_mpidr_hash - Pre-compute shifts required at each affinity * level in order to build a linear index from an * MPIDR value. Resulting algorithm is a collision * free hash carried out through shifting and ORing */ static void __init smp_build_mpidr_hash(void) { u32 i, affinity, fs[4], bits[4], ls; u64 mask = 0; /* * Pre-scan the list of MPIDRS and filter out bits that do * not contribute to affinity levels, ie they never toggle. */ for_each_possible_cpu(i) mask |= (cpu_logical_map(i) ^ cpu_logical_map(0)); pr_debug("mask of set bits %#llx\n", mask); /* * Find and stash the last and first bit set at all affinity levels to * check how many bits are required to represent them. */ for (i = 0; i < 4; i++) { affinity = MPIDR_AFFINITY_LEVEL(mask, i); /* * Find the MSB bit and LSB bits position * to determine how many bits are required * to express the affinity level. */ ls = fls(affinity); fs[i] = affinity ? ffs(affinity) - 1 : 0; bits[i] = ls - fs[i]; } /* * An index can be created from the MPIDR_EL1 by isolating the * significant bits at each affinity level and by shifting * them in order to compress the 32 bits values space to a * compressed set of values. This is equivalent to hashing * the MPIDR_EL1 through shifting and ORing. It is a collision free * hash though not minimal since some levels might contain a number * of CPUs that is not an exact power of 2 and their bit * representation might contain holes, eg MPIDR_EL1[7:0] = {0x2, 0x80}. */ mpidr_hash.shift_aff[0] = MPIDR_LEVEL_SHIFT(0) + fs[0]; mpidr_hash.shift_aff[1] = MPIDR_LEVEL_SHIFT(1) + fs[1] - bits[0]; mpidr_hash.shift_aff[2] = MPIDR_LEVEL_SHIFT(2) + fs[2] - (bits[1] + bits[0]); mpidr_hash.shift_aff[3] = MPIDR_LEVEL_SHIFT(3) + fs[3] - (bits[2] + bits[1] + bits[0]); mpidr_hash.mask = mask; mpidr_hash.bits = bits[3] + bits[2] + bits[1] + bits[0]; pr_debug("MPIDR hash: aff0[%u] aff1[%u] aff2[%u] aff3[%u] mask[%#llx] bits[%u]\n", mpidr_hash.shift_aff[0], mpidr_hash.shift_aff[1], mpidr_hash.shift_aff[2], mpidr_hash.shift_aff[3], mpidr_hash.mask, mpidr_hash.bits); /* * 4x is an arbitrary value used to warn on a hash table much bigger * than expected on most systems. */ if (mpidr_hash_size() > 4 * num_possible_cpus()) pr_warn("Large number of MPIDR hash buckets detected\n"); } static void __init setup_machine_fdt(phys_addr_t dt_phys) { int size = 0; void *dt_virt = fixmap_remap_fdt(dt_phys, &size, PAGE_KERNEL); const char *name; if (dt_virt) memblock_reserve(dt_phys, size); /* * dt_virt is a fixmap address, hence __pa(dt_virt) can't be used. * Pass dt_phys directly. */ if (!early_init_dt_scan(dt_virt, dt_phys)) { pr_crit("\n" "Error: invalid device tree blob: PA=%pa, VA=%px, size=%d bytes\n" "The dtb must be 8-byte aligned and must not exceed 2 MB in size.\n" "\nPlease check your bootloader.\n", &dt_phys, dt_virt, size); /* * Note that in this _really_ early stage we cannot even BUG() * or oops, so the least terrible thing to do is cpu_relax(), * or else we could end-up printing non-initialized data, etc. */ while (true) cpu_relax(); } /* Early fixups are done, map the FDT as read-only now */ fixmap_remap_fdt(dt_phys, &size, PAGE_KERNEL_RO); name = of_flat_dt_get_machine_name(); if (!name) return; pr_info("Machine model: %s\n", name); dump_stack_set_arch_desc("%s (DT)", name); } static void __init request_standard_resources(void) { struct memblock_region *region; struct resource *res; unsigned long i = 0; size_t res_size; kernel_code.start = __pa_symbol(_stext); kernel_code.end = __pa_symbol(__init_begin - 1); kernel_data.start = __pa_symbol(_sdata); kernel_data.end = __pa_symbol(_end - 1); insert_resource(&iomem_resource, &kernel_code); insert_resource(&iomem_resource, &kernel_data); num_standard_resources = memblock.memory.cnt; res_size = num_standard_resources * sizeof(*standard_resources); standard_resources = memblock_alloc_or_panic(res_size, SMP_CACHE_BYTES); for_each_mem_region(region) { res = &standard_resources[i++]; if (memblock_is_nomap(region)) { res->name = "reserved"; res->flags = IORESOURCE_MEM; res->start = __pfn_to_phys(memblock_region_reserved_base_pfn(region)); res->end = __pfn_to_phys(memblock_region_reserved_end_pfn(region)) - 1; } else { res->name = "System RAM"; res->flags = IORESOURCE_SYSTEM_RAM | IORESOURCE_BUSY; res->start = __pfn_to_phys(memblock_region_memory_base_pfn(region)); res->end = __pfn_to_phys(memblock_region_memory_end_pfn(region)) - 1; } insert_resource(&iomem_resource, res); } } static int __init reserve_memblock_reserved_regions(void) { u64 i, j; for (i = 0; i < num_standard_resources; ++i) { struct resource *mem = &standard_resources[i]; phys_addr_t r_start, r_end, mem_size = resource_size(mem); if (!memblock_is_region_reserved(mem->start, mem_size)) continue; for_each_reserved_mem_range(j, &r_start, &r_end) { resource_size_t start, end; start = max(PFN_PHYS(PFN_DOWN(r_start)), mem->start); end = min(PFN_PHYS(PFN_UP(r_end)) - 1, mem->end); if (start > mem->end || end < mem->start) continue; reserve_region_with_split(mem, start, end, "reserved"); } } return 0; } arch_initcall(reserve_memblock_reserved_regions); u64 __cpu_logical_map[NR_CPUS] = { [0 ... NR_CPUS-1] = INVALID_HWID }; u64 cpu_logical_map(unsigned int cpu) { return __cpu_logical_map[cpu]; } void __init __no_sanitize_address setup_arch(char **cmdline_p) { setup_initial_init_mm(_stext, _etext, _edata, _end); *cmdline_p = boot_command_line; kaslr_init(); early_fixmap_init(); early_ioremap_init(); setup_machine_fdt(__fdt_pointer); /* * Initialise the static keys early as they may be enabled by the * cpufeature code and early parameters. */ jump_label_init(); parse_early_param(); dynamic_scs_init(); /* * The primary CPU enters the kernel with all DAIF exceptions masked. * * We must unmask Debug and SError before preemption or scheduling is * possible to ensure that these are consistently unmasked across * threads, and we want to unmask SError as soon as possible after * initializing earlycon so that we can report any SErrors immediately. * * IRQ and FIQ will be unmasked after the root irqchip has been * detected and initialized. */ local_daif_restore(DAIF_PROCCTX_NOIRQ); /* * TTBR0 is only used for the identity mapping at this stage. Make it * point to zero page to avoid speculatively fetching new entries. */ cpu_uninstall_idmap(); xen_early_init(); efi_init(); if (!efi_enabled(EFI_BOOT)) { if ((u64)_text % MIN_KIMG_ALIGN) pr_warn(FW_BUG "Kernel image misaligned at boot, please fix your bootloader!"); WARN_TAINT(mmu_enabled_at_boot, TAINT_FIRMWARE_WORKAROUND, FW_BUG "Booted with MMU enabled!"); } arm64_memblock_init(); paging_init(); acpi_table_upgrade(); /* Parse the ACPI tables for possible boot-time configuration */ acpi_boot_table_init(); if (acpi_disabled) unflatten_device_tree(); bootmem_init(); kasan_init(); request_standard_resources(); early_ioremap_reset(); if (acpi_disabled) psci_dt_init(); else psci_acpi_init(); arm64_rsi_init(); init_bootcpu_ops(); smp_init_cpus(); smp_build_mpidr_hash(); #ifdef CONFIG_ARM64_SW_TTBR0_PAN /* * Make sure init_thread_info.ttbr0 always generates translation * faults in case uaccess_enable() is inadvertently called by the init * thread. */ init_task.thread_info.ttbr0 = phys_to_ttbr(__pa_symbol(reserved_pg_dir)); #endif if (boot_args[1] || boot_args[2] || boot_args[3]) { pr_err("WARNING: x1-x3 nonzero in violation of boot protocol:\n" "\tx1: %016llx\n\tx2: %016llx\n\tx3: %016llx\n" "This indicates a broken bootloader or old kernel\n", boot_args[1], boot_args[2], boot_args[3]); } } static inline bool cpu_can_disable(unsigned int cpu) { #ifdef CONFIG_HOTPLUG_CPU const struct cpu_operations *ops = get_cpu_ops(cpu); if (ops && ops->cpu_can_disable) return ops->cpu_can_disable(cpu); #endif return false; } bool arch_cpu_is_hotpluggable(int num) { return cpu_can_disable(num); } static void dump_kernel_offset(void) { const unsigned long offset = kaslr_offset(); if (IS_ENABLED(CONFIG_RANDOMIZE_BASE) && offset > 0) { pr_emerg("Kernel Offset: 0x%lx from 0x%lx\n", offset, KIMAGE_VADDR); pr_emerg("PHYS_OFFSET: 0x%llx\n", PHYS_OFFSET); } else { pr_emerg("Kernel Offset: disabled\n"); } } static int arm64_panic_block_dump(struct notifier_block *self, unsigned long v, void *p) { dump_kernel_offset(); dump_cpu_features(); dump_mem_limit(); return 0; } static struct notifier_block arm64_panic_block = { .notifier_call = arm64_panic_block_dump }; static int __init register_arm64_panic_block(void) { atomic_notifier_chain_register(&panic_notifier_list, &arm64_panic_block); return 0; } device_initcall(register_arm64_panic_block); static int __init check_mmu_enabled_at_boot(void) { if (!efi_enabled(EFI_BOOT) && mmu_enabled_at_boot) panic("Non-EFI boot detected with MMU and caches enabled"); return 0; } device_initcall_sync(check_mmu_enabled_at_boot);
78 76 194 89 3 1 80 77 93 95 95 89 81 1 71 79 82 82 93 211 4 4 3 1 202 194 96 78 183 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 // SPDX-License-Identifier: GPL-2.0-only /* * Copyright (C) 2012 - Virtual Open Systems and Columbia University * Author: Christoffer Dall <c.dall@virtualopensystems.com> */ #include <linux/kvm_host.h> #include <asm/kvm_emulate.h> #include <trace/events/kvm.h> #include "trace.h" void kvm_mmio_write_buf(void *buf, unsigned int len, unsigned long data) { void *datap = NULL; union { u8 byte; u16 hword; u32 word; u64 dword; } tmp; switch (len) { case 1: tmp.byte = data; datap = &tmp.byte; break; case 2: tmp.hword = data; datap = &tmp.hword; break; case 4: tmp.word = data; datap = &tmp.word; break; case 8: tmp.dword = data; datap = &tmp.dword; break; } memcpy(buf, datap, len); } unsigned long kvm_mmio_read_buf(const void *buf, unsigned int len) { unsigned long data = 0; union { u16 hword; u32 word; u64 dword; } tmp; switch (len) { case 1: data = *(u8 *)buf; break; case 2: memcpy(&tmp.hword, buf, len); data = tmp.hword; break; case 4: memcpy(&tmp.word, buf, len); data = tmp.word; break; case 8: memcpy(&tmp.dword, buf, len); data = tmp.dword; break; } return data; } static bool kvm_pending_external_abort(struct kvm_vcpu *vcpu) { if (!vcpu_get_flag(vcpu, PENDING_EXCEPTION)) return false; if (vcpu_el1_is_32bit(vcpu)) { switch (vcpu_get_flag(vcpu, EXCEPT_MASK)) { case unpack_vcpu_flag(EXCEPT_AA32_UND): case unpack_vcpu_flag(EXCEPT_AA32_IABT): case unpack_vcpu_flag(EXCEPT_AA32_DABT): return true; default: return false; } } else { switch (vcpu_get_flag(vcpu, EXCEPT_MASK)) { case unpack_vcpu_flag(EXCEPT_AA64_EL1_SYNC): case unpack_vcpu_flag(EXCEPT_AA64_EL2_SYNC): case unpack_vcpu_flag(EXCEPT_AA64_EL1_SERR): case unpack_vcpu_flag(EXCEPT_AA64_EL2_SERR): return true; default: return false; } } } /** * kvm_handle_mmio_return -- Handle MMIO loads after user space emulation * or in-kernel IO emulation * * @vcpu: The VCPU pointer */ int kvm_handle_mmio_return(struct kvm_vcpu *vcpu) { unsigned long data; unsigned int len; int mask; /* * Detect if the MMIO return was already handled or if userspace aborted * the MMIO access. */ if (unlikely(!vcpu->mmio_needed || kvm_pending_external_abort(vcpu))) return 1; vcpu->mmio_needed = 0; if (!kvm_vcpu_dabt_iswrite(vcpu)) { struct kvm_run *run = vcpu->run; len = kvm_vcpu_dabt_get_as(vcpu); data = kvm_mmio_read_buf(run->mmio.data, len); if (kvm_vcpu_dabt_issext(vcpu) && len < sizeof(unsigned long)) { mask = 1U << ((len * 8) - 1); data = (data ^ mask) - mask; } if (!kvm_vcpu_dabt_issf(vcpu)) data = data & 0xffffffff; trace_kvm_mmio(KVM_TRACE_MMIO_READ, len, run->mmio.phys_addr, &data); data = vcpu_data_host_to_guest(vcpu, data, len); vcpu_set_reg(vcpu, kvm_vcpu_dabt_get_rd(vcpu), data); } /* * The MMIO instruction is emulated and should not be re-executed * in the guest. */ kvm_incr_pc(vcpu); return 1; } int io_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa) { struct kvm_run *run = vcpu->run; unsigned long data; unsigned long rt; int ret; bool is_write; int len; u8 data_buf[8]; /* * No valid syndrome? Ask userspace for help if it has * volunteered to do so, and bail out otherwise. * * In the protected VM case, there isn't much userspace can do * though, so directly deliver an exception to the guest. */ if (!kvm_vcpu_dabt_isvalid(vcpu)) { trace_kvm_mmio_nisv(*vcpu_pc(vcpu), kvm_vcpu_get_esr(vcpu), kvm_vcpu_get_hfar(vcpu), fault_ipa); if (vcpu_is_protected(vcpu)) return kvm_inject_sea_dabt(vcpu, kvm_vcpu_get_hfar(vcpu)); if (test_bit(KVM_ARCH_FLAG_RETURN_NISV_IO_ABORT_TO_USER, &vcpu->kvm->arch.flags)) { run->exit_reason = KVM_EXIT_ARM_NISV; run->arm_nisv.esr_iss = kvm_vcpu_dabt_iss_nisv_sanitized(vcpu); run->arm_nisv.fault_ipa = fault_ipa; return 0; } return -ENOSYS; } /* * Prepare MMIO operation. First decode the syndrome data we get * from the CPU. Then try if some in-kernel emulation feels * responsible, otherwise let user space do its magic. */ is_write = kvm_vcpu_dabt_iswrite(vcpu); len = kvm_vcpu_dabt_get_as(vcpu); rt = kvm_vcpu_dabt_get_rd(vcpu); if (is_write) { data = vcpu_data_guest_to_host(vcpu, vcpu_get_reg(vcpu, rt), len); trace_kvm_mmio(KVM_TRACE_MMIO_WRITE, len, fault_ipa, &data); kvm_mmio_write_buf(data_buf, len, data); ret = kvm_io_bus_write(vcpu, KVM_MMIO_BUS, fault_ipa, len, data_buf); } else { trace_kvm_mmio(KVM_TRACE_MMIO_READ_UNSATISFIED, len, fault_ipa, NULL); ret = kvm_io_bus_read(vcpu, KVM_MMIO_BUS, fault_ipa, len, data_buf); } /* Now prepare kvm_run for the potential return to userland. */ run->mmio.is_write = is_write; run->mmio.phys_addr = fault_ipa; run->mmio.len = len; vcpu->mmio_needed = 1; if (!ret) { /* We handled the access successfully in the kernel. */ if (!is_write) memcpy(run->mmio.data, data_buf, len); vcpu->stat.mmio_exit_kernel++; kvm_handle_mmio_return(vcpu); return 1; } if (is_write) memcpy(run->mmio.data, data_buf, len); vcpu->stat.mmio_exit_user++; run->exit_reason = KVM_EXIT_MMIO; return 0; }
6 6 6 6 159 159 158 159 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 // SPDX-License-Identifier: GPL-2.0 /* * Implementation of the multi-level security (MLS) policy. * * Author : Stephen Smalley, <stephen.smalley.work@gmail.com> */ /* * Updated: Trusted Computer Solutions, Inc. <dgoeddel@trustedcs.com> * Support for enhanced MLS infrastructure. * Copyright (C) 2004-2006 Trusted Computer Solutions, Inc. * * Updated: Hewlett-Packard <paul@paul-moore.com> * Added support to import/export the MLS label from NetLabel * Copyright (C) Hewlett-Packard Development Company, L.P., 2006 */ #include <linux/kernel.h> #include <linux/slab.h> #include <linux/string.h> #include <linux/errno.h> #include <net/netlabel.h> #include "sidtab.h" #include "mls.h" #include "policydb.h" #include "services.h" /* * Return the length in bytes for the MLS fields of the * security context string representation of `context'. */ int mls_compute_context_len(struct policydb *p, struct context *context) { int i, l, len, head, prev; char *nm; struct ebitmap *e; struct ebitmap_node *node; if (!p->mls_enabled) return 0; len = 1; /* for the beginning ":" */ for (l = 0; l < 2; l++) { u32 index_sens = context->range.level[l].sens; len += strlen(sym_name(p, SYM_LEVELS, index_sens - 1)); /* categories */ head = -2; prev = -2; e = &context->range.level[l].cat; ebitmap_for_each_positive_bit(e, node, i) { if (i - prev > 1) { /* one or more negative bits are skipped */ if (head != prev) { nm = sym_name(p, SYM_CATS, prev); len += strlen(nm) + 1; } nm = sym_name(p, SYM_CATS, i); len += strlen(nm) + 1; head = i; } prev = i; } if (prev != head) { nm = sym_name(p, SYM_CATS, prev); len += strlen(nm) + 1; } if (l == 0) { if (mls_level_eq(&context->range.level[0], &context->range.level[1])) break; else len++; } } return len; } /* * Write the security context string representation of * the MLS fields of `context' into the string `*scontext'. * Update `*scontext' to point to the end of the MLS fields. */ void mls_sid_to_context(struct policydb *p, struct context *context, char **scontext) { char *scontextp, *nm; int i, l, head, prev; struct ebitmap *e; struct ebitmap_node *node; if (!p->mls_enabled) return; scontextp = *scontext; *scontextp = ':'; scontextp++; for (l = 0; l < 2; l++) { strcpy(scontextp, sym_name(p, SYM_LEVELS, context->range.level[l].sens - 1)); scontextp += strlen(scontextp); /* categories */ head = -2; prev = -2; e = &context->range.level[l].cat; ebitmap_for_each_positive_bit(e, node, i) { if (i - prev > 1) { /* one or more negative bits are skipped */ if (prev != head) { if (prev - head > 1) *scontextp++ = '.'; else *scontextp++ = ','; nm = sym_name(p, SYM_CATS, prev); strcpy(scontextp, nm); scontextp += strlen(nm); } if (prev < 0) *scontextp++ = ':'; else *scontextp++ = ','; nm = sym_name(p, SYM_CATS, i); strcpy(scontextp, nm); scontextp += strlen(nm); head = i; } prev = i; } if (prev != head) { if (prev - head > 1) *scontextp++ = '.'; else *scontextp++ = ','; nm = sym_name(p, SYM_CATS, prev); strcpy(scontextp, nm); scontextp += strlen(nm); } if (l == 0) { if (mls_level_eq(&context->range.level[0], &context->range.level[1])) break; else *scontextp++ = '-'; } } *scontext = scontextp; } int mls_level_isvalid(struct policydb *p, struct mls_level *l) { struct level_datum *levdatum; if (!l->sens || l->sens > p->p_levels.nprim) return 0; levdatum = symtab_search(&p->p_levels, sym_name(p, SYM_LEVELS, l->sens - 1)); if (!levdatum) return 0; /* * Return 1 iff all the bits set in l->cat are also be set in * levdatum->level->cat and no bit in l->cat is larger than * p->p_cats.nprim. */ return ebitmap_contains(&levdatum->level.cat, &l->cat, p->p_cats.nprim); } int mls_range_isvalid(struct policydb *p, struct mls_range *r) { return (mls_level_isvalid(p, &r->level[0]) && mls_level_isvalid(p, &r->level[1]) && mls_level_dom(&r->level[1], &r->level[0])); } /* * Return 1 if the MLS fields in the security context * structure `c' are valid. Return 0 otherwise. */ int mls_context_isvalid(struct policydb *p, struct context *c) { struct user_datum *usrdatum; if (!p->mls_enabled) return 1; if (!mls_range_isvalid(p, &c->range)) return 0; if (c->role == OBJECT_R_VAL) return 1; /* * User must be authorized for the MLS range. */ if (!c->user || c->user > p->p_users.nprim) return 0; usrdatum = p->user_val_to_struct[c->user - 1]; if (!mls_range_contains(usrdatum->range, c->range)) return 0; /* user may not be associated with range */ return 1; } /* * Set the MLS fields in the security context structure * `context' based on the string representation in * the string `scontext'. * * This function modifies the string in place, inserting * NULL characters to terminate the MLS fields. * * If a def_sid is provided and no MLS field is present, * copy the MLS field of the associated default context. * Used for upgraded to MLS systems where objects may lack * MLS fields. * * Policy read-lock must be held for sidtab lookup. * */ int mls_context_to_sid(struct policydb *pol, char oldc, char *scontext, struct context *context, struct sidtab *s, u32 def_sid) { char *sensitivity, *cur_cat, *next_cat, *rngptr; struct level_datum *levdatum; struct cat_datum *catdatum, *rngdatum; u32 i; int l, rc; char *rangep[2]; if (!pol->mls_enabled) { /* * With no MLS, only return -EINVAL if there is a MLS field * and it did not come from an xattr. */ if (oldc && def_sid == SECSID_NULL) return -EINVAL; return 0; } /* * No MLS component to the security context, try and map to * default if provided. */ if (!oldc) { struct context *defcon; if (def_sid == SECSID_NULL) return -EINVAL; defcon = sidtab_search(s, def_sid); if (!defcon) return -EINVAL; return mls_context_cpy(context, defcon); } /* * If we're dealing with a range, figure out where the two parts * of the range begin. */ rangep[0] = scontext; rangep[1] = strchr(scontext, '-'); if (rangep[1]) { rangep[1][0] = '\0'; rangep[1]++; } /* For each part of the range: */ for (l = 0; l < 2; l++) { /* Split sensitivity and category set. */ sensitivity = rangep[l]; if (sensitivity == NULL) break; next_cat = strchr(sensitivity, ':'); if (next_cat) *(next_cat++) = '\0'; /* Parse sensitivity. */ levdatum = symtab_search(&pol->p_levels, sensitivity); if (!levdatum) return -EINVAL; context->range.level[l].sens = levdatum->level.sens; /* Extract category set. */ while (next_cat != NULL) { cur_cat = next_cat; next_cat = strchr(next_cat, ','); if (next_cat != NULL) *(next_cat++) = '\0'; /* Separate into range if exists */ rngptr = strchr(cur_cat, '.'); if (rngptr != NULL) { /* Remove '.' */ *rngptr++ = '\0'; } catdatum = symtab_search(&pol->p_cats, cur_cat); if (!catdatum) return -EINVAL; rc = ebitmap_set_bit(&context->range.level[l].cat, catdatum->value - 1, 1); if (rc) return rc; /* If range, set all categories in range */ if (rngptr == NULL) continue; rngdatum = symtab_search(&pol->p_cats, rngptr); if (!rngdatum) return -EINVAL; if (catdatum->value >= rngdatum->value) return -EINVAL; for (i = catdatum->value; i < rngdatum->value; i++) { rc = ebitmap_set_bit( &context->range.level[l].cat, i, 1); if (rc) return rc; } } } /* If we didn't see a '-', the range start is also the range end. */ if (rangep[1] == NULL) { context->range.level[1].sens = context->range.level[0].sens; rc = ebitmap_cpy(&context->range.level[1].cat, &context->range.level[0].cat); if (rc) return rc; } return 0; } /* * Set the MLS fields in the security context structure * `context' based on the string representation in * the string `str'. This function will allocate temporary memory with the * given constraints of gfp_mask. */ int mls_from_string(struct policydb *p, char *str, struct context *context, gfp_t gfp_mask) { char *tmpstr; int rc; if (!p->mls_enabled) return -EINVAL; tmpstr = kstrdup(str, gfp_mask); if (!tmpstr) { rc = -ENOMEM; } else { rc = mls_context_to_sid(p, ':', tmpstr, context, NULL, SECSID_NULL); kfree(tmpstr); } return rc; } /* * Copies the MLS range `range' into `context'. */ int mls_range_set(struct context *context, struct mls_range *range) { int l, rc = 0; /* Copy the MLS range into the context */ for (l = 0; l < 2; l++) { context->range.level[l].sens = range->level[l].sens; rc = ebitmap_cpy(&context->range.level[l].cat, &range->level[l].cat); if (rc) break; } return rc; } int mls_setup_user_range(struct policydb *p, struct context *fromcon, struct user_datum *user, struct context *usercon) { if (p->mls_enabled) { struct mls_level *fromcon_sen = &(fromcon->range.level[0]); struct mls_level *fromcon_clr = &(fromcon->range.level[1]); struct mls_level *user_low = &(user->range.level[0]); struct mls_level *user_clr = &(user->range.level[1]); struct mls_level *user_def = &(user->dfltlevel); struct mls_level *usercon_sen = &(usercon->range.level[0]); struct mls_level *usercon_clr = &(usercon->range.level[1]); /* Honor the user's default level if we can */ if (mls_level_between(user_def, fromcon_sen, fromcon_clr)) *usercon_sen = *user_def; else if (mls_level_between(fromcon_sen, user_def, user_clr)) *usercon_sen = *fromcon_sen; else if (mls_level_between(fromcon_clr, user_low, user_def)) *usercon_sen = *user_low; else return -EINVAL; /* Lower the clearance of available contexts if the clearance of "fromcon" is lower than that of the user's default clearance (but only if the "fromcon" clearance dominates the user's computed sensitivity level) */ if (mls_level_dom(user_clr, fromcon_clr)) *usercon_clr = *fromcon_clr; else if (mls_level_dom(fromcon_clr, user_clr)) *usercon_clr = *user_clr; else return -EINVAL; } return 0; } /* * Convert the MLS fields in the security context * structure `oldc' from the values specified in the * policy `oldp' to the values specified in the policy `newp', * storing the resulting context in `newc'. */ int mls_convert_context(struct policydb *oldp, struct policydb *newp, struct context *oldc, struct context *newc) { struct level_datum *levdatum; struct cat_datum *catdatum; struct ebitmap_node *node; u32 i; int l; if (!oldp->mls_enabled || !newp->mls_enabled) return 0; for (l = 0; l < 2; l++) { char *name = sym_name(oldp, SYM_LEVELS, oldc->range.level[l].sens - 1); levdatum = symtab_search(&newp->p_levels, name); if (!levdatum) return -EINVAL; newc->range.level[l].sens = levdatum->level.sens; ebitmap_for_each_positive_bit(&oldc->range.level[l].cat, node, i) { int rc; catdatum = symtab_search(&newp->p_cats, sym_name(oldp, SYM_CATS, i)); if (!catdatum) return -EINVAL; rc = ebitmap_set_bit(&newc->range.level[l].cat, catdatum->value - 1, 1); if (rc) return rc; } } return 0; } int mls_compute_sid(struct policydb *p, struct context *scontext, struct context *tcontext, u16 tclass, u32 specified, struct context *newcontext, bool sock) { struct range_trans rtr; struct mls_range *r; struct class_datum *cladatum; char default_range = 0; if (!p->mls_enabled) return 0; switch (specified) { case AVTAB_TRANSITION: /* Look for a range transition rule. */ rtr.source_type = scontext->type; rtr.target_type = tcontext->type; rtr.target_class = tclass; r = policydb_rangetr_search(p, &rtr); if (r) return mls_range_set(newcontext, r); if (tclass && tclass <= p->p_classes.nprim) { cladatum = p->class_val_to_struct[tclass - 1]; if (cladatum) default_range = cladatum->default_range; } switch (default_range) { case DEFAULT_SOURCE_LOW: return mls_context_cpy_low(newcontext, scontext); case DEFAULT_SOURCE_HIGH: return mls_context_cpy_high(newcontext, scontext); case DEFAULT_SOURCE_LOW_HIGH: return mls_context_cpy(newcontext, scontext); case DEFAULT_TARGET_LOW: return mls_context_cpy_low(newcontext, tcontext); case DEFAULT_TARGET_HIGH: return mls_context_cpy_high(newcontext, tcontext); case DEFAULT_TARGET_LOW_HIGH: return mls_context_cpy(newcontext, tcontext); case DEFAULT_GLBLUB: return mls_context_glblub(newcontext, scontext, tcontext); } fallthrough; case AVTAB_CHANGE: if ((tclass == p->process_class) || sock) /* Use the process MLS attributes. */ return mls_context_cpy(newcontext, scontext); else /* Use the process effective MLS attributes. */ return mls_context_cpy_low(newcontext, scontext); case AVTAB_MEMBER: /* Use the process effective MLS attributes. */ return mls_context_cpy_low(newcontext, scontext); } return -EINVAL; } #ifdef CONFIG_NETLABEL /** * mls_export_netlbl_lvl - Export the MLS sensitivity levels to NetLabel * @p: the policy * @context: the security context * @secattr: the NetLabel security attributes * * Description: * Given the security context copy the low MLS sensitivity level into the * NetLabel MLS sensitivity level field. * */ void mls_export_netlbl_lvl(struct policydb *p, struct context *context, struct netlbl_lsm_secattr *secattr) { if (!p->mls_enabled) return; secattr->attr.mls.lvl = context->range.level[0].sens - 1; secattr->flags |= NETLBL_SECATTR_MLS_LVL; } /** * mls_import_netlbl_lvl - Import the NetLabel MLS sensitivity levels * @p: the policy * @context: the security context * @secattr: the NetLabel security attributes * * Description: * Given the security context and the NetLabel security attributes, copy the * NetLabel MLS sensitivity level into the context. * */ void mls_import_netlbl_lvl(struct policydb *p, struct context *context, struct netlbl_lsm_secattr *secattr) { if (!p->mls_enabled) return; context->range.level[0].sens = secattr->attr.mls.lvl + 1; context->range.level[1].sens = context->range.level[0].sens; } /** * mls_export_netlbl_cat - Export the MLS categories to NetLabel * @p: the policy * @context: the security context * @secattr: the NetLabel security attributes * * Description: * Given the security context copy the low MLS categories into the NetLabel * MLS category field. Returns zero on success, negative values on failure. * */ int mls_export_netlbl_cat(struct policydb *p, struct context *context, struct netlbl_lsm_secattr *secattr) { int rc; if (!p->mls_enabled) return 0; rc = ebitmap_netlbl_export(&context->range.level[0].cat, &secattr->attr.mls.cat); if (rc == 0 && secattr->attr.mls.cat != NULL) secattr->flags |= NETLBL_SECATTR_MLS_CAT; return rc; } /** * mls_import_netlbl_cat - Import the MLS categories from NetLabel * @p: the policy * @context: the security context * @secattr: the NetLabel security attributes * * Description: * Copy the NetLabel security attributes into the SELinux context; since the * NetLabel security attribute only contains a single MLS category use it for * both the low and high categories of the context. Returns zero on success, * negative values on failure. * */ int mls_import_netlbl_cat(struct policydb *p, struct context *context, struct netlbl_lsm_secattr *secattr) { int rc; if (!p->mls_enabled) return 0; rc = ebitmap_netlbl_import(&context->range.level[0].cat, secattr->attr.mls.cat); if (rc) goto import_netlbl_cat_failure; memcpy(&context->range.level[1].cat, &context->range.level[0].cat, sizeof(context->range.level[0].cat)); return 0; import_netlbl_cat_failure: ebitmap_destroy(&context->range.level[0].cat); return rc; } #endif /* CONFIG_NETLABEL */
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 // SPDX-License-Identifier: GPL-2.0 /* * kobject.h - generic kernel object infrastructure. * * Copyright (c) 2002-2003 Patrick Mochel * Copyright (c) 2002-2003 Open Source Development Labs * Copyright (c) 2006-2008 Greg Kroah-Hartman <greg@kroah.com> * Copyright (c) 2006-2008 Novell Inc. * * Please read Documentation/core-api/kobject.rst before using the kobject * interface, ESPECIALLY the parts about reference counts and object * destructors. */ #ifndef _KOBJECT_H_ #define _KOBJECT_H_ #include <linux/types.h> #include <linux/list.h> #include <linux/sysfs.h> #include <linux/compiler.h> #include <linux/container_of.h> #include <linux/spinlock.h> #include <linux/kref.h> #include <linux/kobject_ns.h> #include <linux/wait.h> #include <linux/atomic.h> #include <linux/workqueue.h> #include <linux/uidgid.h> #define UEVENT_HELPER_PATH_LEN 256 #define UEVENT_NUM_ENVP 64 /* number of env pointers */ #define UEVENT_BUFFER_SIZE 2048 /* buffer for the variables */ #ifdef CONFIG_UEVENT_HELPER /* path to the userspace helper executed on an event */ extern char uevent_helper[]; #endif /* counter to tag the uevent, read only except for the kobject core */ extern atomic64_t uevent_seqnum; /* * The actions here must match the index to the string array * in lib/kobject_uevent.c * * Do not add new actions here without checking with the driver-core * maintainers. Action strings are not meant to express subsystem * or device specific properties. In most cases you want to send a * kobject_uevent_env(kobj, KOBJ_CHANGE, env) with additional event * specific variables added to the event environment. */ enum kobject_action { KOBJ_ADD, KOBJ_REMOVE, KOBJ_CHANGE, KOBJ_MOVE, KOBJ_ONLINE, KOBJ_OFFLINE, KOBJ_BIND, KOBJ_UNBIND, }; struct kobject { const char *name; struct list_head entry; struct kobject *parent; struct kset *kset; const struct kobj_type *ktype; struct kernfs_node *sd; /* sysfs directory entry */ struct kref kref; unsigned int state_initialized:1; unsigned int state_in_sysfs:1; unsigned int state_add_uevent_sent:1; unsigned int state_remove_uevent_sent:1; unsigned int uevent_suppress:1; #ifdef CONFIG_DEBUG_KOBJECT_RELEASE struct delayed_work release; #endif }; __printf(2, 3) int kobject_set_name(struct kobject *kobj, const char *name, ...); __printf(2, 0) int kobject_set_name_vargs(struct kobject *kobj, const char *fmt, va_list vargs); static inline const char *kobject_name(const struct kobject *kobj) { return kobj->name; } void kobject_init(struct kobject *kobj, const struct kobj_type *ktype); __printf(3, 4) __must_check int kobject_add(struct kobject *kobj, struct kobject *parent, const char *fmt, ...); __printf(4, 5) __must_check int kobject_init_and_add(struct kobject *kobj, const struct kobj_type *ktype, struct kobject *parent, const char *fmt, ...); void kobject_del(struct kobject *kobj); struct kobject * __must_check kobject_create_and_add(const char *name, struct kobject *parent); int __must_check kobject_rename(struct kobject *, const char *new_name); int __must_check kobject_move(struct kobject *, struct kobject *); struct kobject *kobject_get(struct kobject *kobj); struct kobject * __must_check kobject_get_unless_zero(struct kobject *kobj); void kobject_put(struct kobject *kobj); const void *kobject_namespace(const struct kobject *kobj); void kobject_get_ownership(const struct kobject *kobj, kuid_t *uid, kgid_t *gid); char *kobject_get_path(const struct kobject *kobj, gfp_t flag); struct kobj_type { void (*release)(struct kobject *kobj); const struct sysfs_ops *sysfs_ops; const struct attribute_group **default_groups; const struct kobj_ns_type_operations *(*child_ns_type)(const struct kobject *kobj); const void *(*namespace)(const struct kobject *kobj); void (*get_ownership)(const struct kobject *kobj, kuid_t *uid, kgid_t *gid); }; struct kobj_uevent_env { char *argv[3]; char *envp[UEVENT_NUM_ENVP]; int envp_idx; char buf[UEVENT_BUFFER_SIZE]; int buflen; }; struct kset_uevent_ops { int (* const filter)(const struct kobject *kobj); const char *(* const name)(const struct kobject *kobj); int (* const uevent)(const struct kobject *kobj, struct kobj_uevent_env *env); }; struct kobj_attribute { struct attribute attr; ssize_t (*show)(struct kobject *kobj, struct kobj_attribute *attr, char *buf); ssize_t (*store)(struct kobject *kobj, struct kobj_attribute *attr, const char *buf, size_t count); }; extern const struct sysfs_ops kobj_sysfs_ops; struct sock; /** * struct kset - a set of kobjects of a specific type, belonging to a specific subsystem. * * A kset defines a group of kobjects. They can be individually * different "types" but overall these kobjects all want to be grouped * together and operated on in the same manner. ksets are used to * define the attribute callbacks and other common events that happen to * a kobject. * * @list: the list of all kobjects for this kset * @list_lock: a lock for iterating over the kobjects * @kobj: the embedded kobject for this kset (recursion, isn't it fun...) * @uevent_ops: the set of uevent operations for this kset. These are * called whenever a kobject has something happen to it so that the kset * can add new environment variables, or filter out the uevents if so * desired. */ struct kset { struct list_head list; spinlock_t list_lock; struct kobject kobj; const struct kset_uevent_ops *uevent_ops; } __randomize_layout; void kset_init(struct kset *kset); int __must_check kset_register(struct kset *kset); void kset_unregister(struct kset *kset); struct kset * __must_check kset_create_and_add(const char *name, const struct kset_uevent_ops *u, struct kobject *parent_kobj); static inline struct kset *to_kset(struct kobject *kobj) { return kobj ? container_of(kobj, struct kset, kobj) : NULL; } static inline struct kset *kset_get(struct kset *k) { return k ? to_kset(kobject_get(&k->kobj)) : NULL; } static inline void kset_put(struct kset *k) { kobject_put(&k->kobj); } static inline const struct kobj_type *get_ktype(const struct kobject *kobj) { return kobj->ktype; } struct kobject *kset_find_obj(struct kset *, const char *); /* The global /sys/kernel/ kobject for people to chain off of */ extern struct kobject *kernel_kobj; /* The global /sys/kernel/mm/ kobject for people to chain off of */ extern struct kobject *mm_kobj; /* The global /sys/hypervisor/ kobject for people to chain off of */ extern struct kobject *hypervisor_kobj; /* The global /sys/power/ kobject for people to chain off of */ extern struct kobject *power_kobj; /* The global /sys/firmware/ kobject for people to chain off of */ extern struct kobject *firmware_kobj; int kobject_uevent(struct kobject *kobj, enum kobject_action action); int kobject_uevent_env(struct kobject *kobj, enum kobject_action action, char *envp[]); int kobject_synth_uevent(struct kobject *kobj, const char *buf, size_t count); __printf(2, 3) int add_uevent_var(struct kobj_uevent_env *env, const char *format, ...); #endif /* _KOBJECT_H_ */
301 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 /* SPDX-License-Identifier: GPL-2.0+ */ /* * Read-Copy Update mechanism for mutual exclusion (tree-based version) * * Copyright IBM Corporation, 2008 * * Author: Dipankar Sarma <dipankar@in.ibm.com> * Paul E. McKenney <paulmck@linux.ibm.com> Hierarchical algorithm * * Based on the original work by Paul McKenney <paulmck@linux.ibm.com> * and inputs from Rusty Russell, Andrea Arcangeli and Andi Kleen. * * For detailed explanation of Read-Copy Update mechanism see - * Documentation/RCU */ #ifndef __LINUX_RCUTREE_H #define __LINUX_RCUTREE_H void rcu_softirq_qs(void); void rcu_note_context_switch(bool preempt); int rcu_needs_cpu(void); void rcu_cpu_stall_reset(void); void rcu_request_urgent_qs_task(struct task_struct *t); /* * Note a virtualization-based context switch. This is simply a * wrapper around rcu_note_context_switch(), which allows TINY_RCU * to save a few bytes. The caller must have disabled interrupts. */ static inline void rcu_virt_note_context_switch(void) { rcu_note_context_switch(false); } void synchronize_rcu_expedited(void); void rcu_barrier(void); void rcu_momentary_eqs(void); struct rcu_gp_oldstate { unsigned long rgos_norm; unsigned long rgos_exp; }; // Maximum number of rcu_gp_oldstate values corresponding to // not-yet-completed RCU grace periods. #define NUM_ACTIVE_RCU_POLL_FULL_OLDSTATE 4 /** * same_state_synchronize_rcu_full - Are two old-state values identical? * @rgosp1: First old-state value. * @rgosp2: Second old-state value. * * The two old-state values must have been obtained from either * get_state_synchronize_rcu_full(), start_poll_synchronize_rcu_full(), * or get_completed_synchronize_rcu_full(). Returns @true if the two * values are identical and @false otherwise. This allows structures * whose lifetimes are tracked by old-state values to push these values * to a list header, allowing those structures to be slightly smaller. * * Note that equality is judged on a bitwise basis, so that an * @rcu_gp_oldstate structure with an already-completed state in one field * will compare not-equal to a structure with an already-completed state * in the other field. After all, the @rcu_gp_oldstate structure is opaque * so how did such a situation come to pass in the first place? */ static inline bool same_state_synchronize_rcu_full(struct rcu_gp_oldstate *rgosp1, struct rcu_gp_oldstate *rgosp2) { return rgosp1->rgos_norm == rgosp2->rgos_norm && rgosp1->rgos_exp == rgosp2->rgos_exp; } unsigned long start_poll_synchronize_rcu_expedited(void); void start_poll_synchronize_rcu_expedited_full(struct rcu_gp_oldstate *rgosp); void cond_synchronize_rcu_expedited(unsigned long oldstate); void cond_synchronize_rcu_expedited_full(struct rcu_gp_oldstate *rgosp); unsigned long get_state_synchronize_rcu(void); void get_state_synchronize_rcu_full(struct rcu_gp_oldstate *rgosp); unsigned long start_poll_synchronize_rcu(void); void start_poll_synchronize_rcu_full(struct rcu_gp_oldstate *rgosp); bool poll_state_synchronize_rcu(unsigned long oldstate); bool poll_state_synchronize_rcu_full(struct rcu_gp_oldstate *rgosp); void cond_synchronize_rcu(unsigned long oldstate); void cond_synchronize_rcu_full(struct rcu_gp_oldstate *rgosp); #ifdef CONFIG_PROVE_RCU void rcu_irq_exit_check_preempt(void); #else static inline void rcu_irq_exit_check_preempt(void) { } #endif struct task_struct; void rcu_preempt_deferred_qs(struct task_struct *t); void exit_rcu(void); void rcu_scheduler_starting(void); extern int rcu_scheduler_active; void rcu_end_inkernel_boot(void); bool rcu_inkernel_boot_has_ended(void); bool rcu_is_watching(void); #ifndef CONFIG_PREEMPT_RCU void rcu_all_qs(void); #endif /* RCUtree hotplug events */ int rcutree_prepare_cpu(unsigned int cpu); int rcutree_online_cpu(unsigned int cpu); void rcutree_report_cpu_starting(unsigned int cpu); #ifdef CONFIG_HOTPLUG_CPU int rcutree_dead_cpu(unsigned int cpu); int rcutree_dying_cpu(unsigned int cpu); int rcutree_offline_cpu(unsigned int cpu); #else #define rcutree_dead_cpu NULL #define rcutree_dying_cpu NULL #define rcutree_offline_cpu NULL #endif void rcutree_migrate_callbacks(int cpu); /* Called from hotplug and also arm64 early secondary boot failure */ void rcutree_report_cpu_dead(void); #endif /* __LINUX_RCUTREE_H */
27 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 /* SPDX-License-Identifier: GPL-2.0 */ /* * Authors: Thiébaud Weksteen <tweek@google.com> * Peter Enderborg <Peter.Enderborg@sony.com> */ #undef TRACE_SYSTEM #define TRACE_SYSTEM avc #if !defined(_TRACE_SELINUX_H) || defined(TRACE_HEADER_MULTI_READ) #define _TRACE_SELINUX_H #include <linux/tracepoint.h> TRACE_EVENT(selinux_audited, TP_PROTO(struct selinux_audit_data *sad, char *scontext, char *tcontext, const char *tclass ), TP_ARGS(sad, scontext, tcontext, tclass), TP_STRUCT__entry( __field(u32, requested) __field(u32, denied) __field(u32, audited) __field(int, result) __string(scontext, scontext) __string(tcontext, tcontext) __string(tclass, tclass) ), TP_fast_assign( __entry->requested = sad->requested; __entry->denied = sad->denied; __entry->audited = sad->audited; __entry->result = sad->result; __assign_str(tcontext); __assign_str(scontext); __assign_str(tclass); ), TP_printk("requested=0x%x denied=0x%x audited=0x%x result=%d scontext=%s tcontext=%s tclass=%s", __entry->requested, __entry->denied, __entry->audited, __entry->result, __get_str(scontext), __get_str(tcontext), __get_str(tclass) ) ); #endif /* This part must be outside protection */ #include <trace/define_trace.h>
105 1143 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 // SPDX-License-Identifier: GPL-2.0 #include <linux/export.h> #include <linux/bitops.h> #include <asm/types.h> /** * hweightN - returns the hamming weight of a N-bit word * @x: the word to weigh * * The Hamming Weight of a number is the total number of bits set in it. */ unsigned int __sw_hweight32(unsigned int w) { #ifdef CONFIG_ARCH_HAS_FAST_MULTIPLIER w -= (w >> 1) & 0x55555555; w = (w & 0x33333333) + ((w >> 2) & 0x33333333); w = (w + (w >> 4)) & 0x0f0f0f0f; return (w * 0x01010101) >> 24; #else unsigned int res = w - ((w >> 1) & 0x55555555); res = (res & 0x33333333) + ((res >> 2) & 0x33333333); res = (res + (res >> 4)) & 0x0F0F0F0F; res = res + (res >> 8); return (res + (res >> 16)) & 0x000000FF; #endif } EXPORT_SYMBOL(__sw_hweight32); unsigned int __sw_hweight16(unsigned int w) { unsigned int res = w - ((w >> 1) & 0x5555); res = (res & 0x3333) + ((res >> 2) & 0x3333); res = (res + (res >> 4)) & 0x0F0F; return (res + (res >> 8)) & 0x00FF; } EXPORT_SYMBOL(__sw_hweight16); unsigned int __sw_hweight8(unsigned int w) { unsigned int res = w - ((w >> 1) & 0x55); res = (res & 0x33) + ((res >> 2) & 0x33); return (res + (res >> 4)) & 0x0F; } EXPORT_SYMBOL(__sw_hweight8); unsigned long __sw_hweight64(__u64 w) { #if BITS_PER_LONG == 32 return __sw_hweight32((unsigned int)(w >> 32)) + __sw_hweight32((unsigned int)w); #elif BITS_PER_LONG == 64 #ifdef CONFIG_ARCH_HAS_FAST_MULTIPLIER w -= (w >> 1) & 0x5555555555555555ul; w = (w & 0x3333333333333333ul) + ((w >> 2) & 0x3333333333333333ul); w = (w + (w >> 4)) & 0x0f0f0f0f0f0f0f0ful; return (w * 0x0101010101010101ul) >> 56; #else __u64 res = w - ((w >> 1) & 0x5555555555555555ul); res = (res & 0x3333333333333333ul) + ((res >> 2) & 0x3333333333333333ul); res = (res + (res >> 4)) & 0x0F0F0F0F0F0F0F0Ful; res = res + (res >> 8); res = res + (res >> 16); return (res + (res >> 32)) & 0x00000000000000FFul; #endif #endif } EXPORT_SYMBOL(__sw_hweight64);
13 13 13 13 12 4 6 4 5 1 3 1 28 28 230 6 174 37 28 196 43 226 226 225 217 223 2 226 217 7 226 226 181 48 4 4 227 1 227 7 7 232 8 47 201 240 47 202 1 232 8 232 8 210 16 22 240 239 2 6 34 224 241 241 241 52 53 1 52 55 4 53 34 34 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 1517 1518 1519 1520 1521 1522 1523 1524 1525 1526 1527 1528 1529 1530 1531 1532 1533 1534 1535 1536 1537 1538 1539 1540 1541 1542 1543 1544 1545 1546 1547 1548 1549 1550 1551 1552 1553 1554 1555 1556 1557 1558 1559 1560 1561 1562 1563 1564 1565 1566 1567 1568 1569 1570 1571 1572 1573 1574 1575 1576 1577 1578 1579 1580 1581 1582 1583 1584 1585 1586 1587 1588 1589 1590 1591 1592 1593 1594 1595 1596 1597 1598 1599 1600 1601 1602 1603 1604 1605 1606 1607 1608 1609 1610 1611 1612 1613 1614 1615 1616 1617 1618 1619 1620 1621 1622 1623 1624 1625 1626 1627 1628 1629 1630 1631 1632 1633 1634 1635 1636 1637 1638 1639 1640 1641 1642 1643 1644 1645 1646 1647 1648 1649 1650 1651 1652 1653 1654 1655 1656 1657 1658 1659 1660 1661 1662 1663 // SPDX-License-Identifier: GPL-2.0-only /* * linux/fs/open.c * * Copyright (C) 1991, 1992 Linus Torvalds */ #include <linux/string.h> #include <linux/mm.h> #include <linux/file.h> #include <linux/fdtable.h> #include <linux/fsnotify.h> #include <linux/module.h> #include <linux/tty.h> #include <linux/namei.h> #include <linux/backing-dev.h> #include <linux/capability.h> #include <linux/securebits.h> #include <linux/security.h> #include <linux/mount.h> #include <linux/fcntl.h> #include <linux/slab.h> #include <linux/uaccess.h> #include <linux/fs.h> #include <linux/personality.h> #include <linux/pagemap.h> #include <linux/syscalls.h> #include <linux/rcupdate.h> #include <linux/audit.h> #include <linux/falloc.h> #include <linux/fs_struct.h> #include <linux/dnotify.h> #include <linux/compat.h> #include <linux/mnt_idmapping.h> #include <linux/filelock.h> #include "internal.h" int do_truncate(struct mnt_idmap *idmap, struct dentry *dentry, loff_t length, unsigned int time_attrs, struct file *filp) { int ret; struct iattr newattrs; /* Not pretty: "inode->i_size" shouldn't really be signed. But it is. */ if (length < 0) return -EINVAL; newattrs.ia_size = length; newattrs.ia_valid = ATTR_SIZE | time_attrs; if (filp) { newattrs.ia_file = filp; newattrs.ia_valid |= ATTR_FILE; } /* Remove suid, sgid, and file capabilities on truncate too */ ret = dentry_needs_remove_privs(idmap, dentry); if (ret < 0) return ret; if (ret) newattrs.ia_valid |= ret | ATTR_FORCE; ret = inode_lock_killable(dentry->d_inode); if (ret) return ret; /* Note any delegations or leases have already been broken: */ ret = notify_change(idmap, dentry, &newattrs, NULL); inode_unlock(dentry->d_inode); return ret; } int vfs_truncate(const struct path *path, loff_t length) { struct mnt_idmap *idmap; struct inode *inode; int error; inode = path->dentry->d_inode; /* For directories it's -EISDIR, for other non-regulars - -EINVAL */ if (S_ISDIR(inode->i_mode)) return -EISDIR; if (!S_ISREG(inode->i_mode)) return -EINVAL; idmap = mnt_idmap(path->mnt); error = inode_permission(idmap, inode, MAY_WRITE); if (error) return error; error = fsnotify_truncate_perm(path, length); if (error) return error; error = mnt_want_write(path->mnt); if (error) return error; error = -EPERM; if (IS_APPEND(inode)) goto mnt_drop_write_and_out; error = get_write_access(inode); if (error) goto mnt_drop_write_and_out; /* * Make sure that there are no leases. get_write_access() protects * against the truncate racing with a lease-granting setlease(). */ error = break_lease(inode, O_WRONLY); if (error) goto put_write_and_out; error = security_path_truncate(path); if (!error) error = do_truncate(idmap, path->dentry, length, 0, NULL); put_write_and_out: put_write_access(inode); mnt_drop_write_and_out: mnt_drop_write(path->mnt); return error; } EXPORT_SYMBOL_GPL(vfs_truncate); int do_sys_truncate(const char __user *pathname, loff_t length) { unsigned int lookup_flags = LOOKUP_FOLLOW; struct path path; int error; if (length < 0) /* sorry, but loff_t says... */ return -EINVAL; retry: error = user_path_at(AT_FDCWD, pathname, lookup_flags, &path); if (!error) { error = vfs_truncate(&path, length); path_put(&path); } if (retry_estale(error, lookup_flags)) { lookup_flags |= LOOKUP_REVAL; goto retry; } return error; } SYSCALL_DEFINE2(truncate, const char __user *, path, long, length) { return do_sys_truncate(path, length); } #ifdef CONFIG_COMPAT COMPAT_SYSCALL_DEFINE2(truncate, const char __user *, path, compat_off_t, length) { return do_sys_truncate(path, length); } #endif int do_ftruncate(struct file *file, loff_t length, int small) { struct inode *inode; struct dentry *dentry; int error; /* explicitly opened as large or we are on 64-bit box */ if (file->f_flags & O_LARGEFILE) small = 0; dentry = file->f_path.dentry; inode = dentry->d_inode; if (!S_ISREG(inode->i_mode) || !(file->f_mode & FMODE_WRITE)) return -EINVAL; /* Cannot ftruncate over 2^31 bytes without large file support */ if (small && length > MAX_NON_LFS) return -EINVAL; /* Check IS_APPEND on real upper inode */ if (IS_APPEND(file_inode(file))) return -EPERM; error = security_file_truncate(file); if (error) return error; error = fsnotify_truncate_perm(&file->f_path, length); if (error) return error; sb_start_write(inode->i_sb); error = do_truncate(file_mnt_idmap(file), dentry, length, ATTR_MTIME | ATTR_CTIME, file); sb_end_write(inode->i_sb); return error; } int do_sys_ftruncate(unsigned int fd, loff_t length, int small) { if (length < 0) return -EINVAL; CLASS(fd, f)(fd); if (fd_empty(f)) return -EBADF; return do_ftruncate(fd_file(f), length, small); } SYSCALL_DEFINE2(ftruncate, unsigned int, fd, off_t, length) { return do_sys_ftruncate(fd, length, 1); } #ifdef CONFIG_COMPAT COMPAT_SYSCALL_DEFINE2(ftruncate, unsigned int, fd, compat_off_t, length) { return do_sys_ftruncate(fd, length, 1); } #endif /* LFS versions of truncate are only needed on 32 bit machines */ #if BITS_PER_LONG == 32 SYSCALL_DEFINE2(truncate64, const char __user *, path, loff_t, length) { return do_sys_truncate(path, length); } SYSCALL_DEFINE2(ftruncate64, unsigned int, fd, loff_t, length) { return do_sys_ftruncate(fd, length, 0); } #endif /* BITS_PER_LONG == 32 */ #if defined(CONFIG_COMPAT) && defined(__ARCH_WANT_COMPAT_TRUNCATE64) COMPAT_SYSCALL_DEFINE3(truncate64, const char __user *, pathname, compat_arg_u64_dual(length)) { return ksys_truncate(pathname, compat_arg_u64_glue(length)); } #endif #if defined(CONFIG_COMPAT) && defined(__ARCH_WANT_COMPAT_FTRUNCATE64) COMPAT_SYSCALL_DEFINE3(ftruncate64, unsigned int, fd, compat_arg_u64_dual(length)) { return ksys_ftruncate(fd, compat_arg_u64_glue(length)); } #endif int vfs_fallocate(struct file *file, int mode, loff_t offset, loff_t len) { struct inode *inode = file_inode(file); int ret; loff_t sum; if (offset < 0 || len <= 0) return -EINVAL; if (mode & ~(FALLOC_FL_MODE_MASK | FALLOC_FL_KEEP_SIZE)) return -EOPNOTSUPP; /* * Modes are exclusive, even if that is not obvious from the encoding * as bit masks and the mix with the flag in the same namespace. * * To make things even more complicated, FALLOC_FL_ALLOCATE_RANGE is * encoded as no bit set. */ switch (mode & FALLOC_FL_MODE_MASK) { case FALLOC_FL_ALLOCATE_RANGE: case FALLOC_FL_UNSHARE_RANGE: case FALLOC_FL_ZERO_RANGE: break; case FALLOC_FL_PUNCH_HOLE: if (!(mode & FALLOC_FL_KEEP_SIZE)) return -EOPNOTSUPP; break; case FALLOC_FL_COLLAPSE_RANGE: case FALLOC_FL_INSERT_RANGE: if (mode & FALLOC_FL_KEEP_SIZE) return -EOPNOTSUPP; break; default: return -EOPNOTSUPP; } if (!(file->f_mode & FMODE_WRITE)) return -EBADF; /* * On append-only files only space preallocation is supported. */ if ((mode & ~FALLOC_FL_KEEP_SIZE) && IS_APPEND(inode)) return -EPERM; if (IS_IMMUTABLE(inode)) return -EPERM; /* * We cannot allow any fallocate operation on an active swapfile */ if (IS_SWAPFILE(inode)) return -ETXTBSY; /* * Revalidate the write permissions, in case security policy has * changed since the files were opened. */ ret = security_file_permission(file, MAY_WRITE); if (ret) return ret; ret = fsnotify_file_area_perm(file, MAY_WRITE, &offset, len); if (ret) return ret; if (S_ISFIFO(inode->i_mode)) return -ESPIPE; if (S_ISDIR(inode->i_mode)) return -EISDIR; if (!S_ISREG(inode->i_mode) && !S_ISBLK(inode->i_mode)) return -ENODEV; /* Check for wraparound */ if (check_add_overflow(offset, len, &sum)) return -EFBIG; if (sum > inode->i_sb->s_maxbytes) return -EFBIG; if (!file->f_op->fallocate) return -EOPNOTSUPP; file_start_write(file); ret = file->f_op->fallocate(file, mode, offset, len); /* * Create inotify and fanotify events. * * To keep the logic simple always create events if fallocate succeeds. * This implies that events are even created if the file size remains * unchanged, e.g. when using flag FALLOC_FL_KEEP_SIZE. */ if (ret == 0) fsnotify_modify(file); file_end_write(file); return ret; } EXPORT_SYMBOL_GPL(vfs_fallocate); int ksys_fallocate(int fd, int mode, loff_t offset, loff_t len) { CLASS(fd, f)(fd); if (fd_empty(f)) return -EBADF; return vfs_fallocate(fd_file(f), mode, offset, len); } SYSCALL_DEFINE4(fallocate, int, fd, int, mode, loff_t, offset, loff_t, len) { return ksys_fallocate(fd, mode, offset, len); } #if defined(CONFIG_COMPAT) && defined(__ARCH_WANT_COMPAT_FALLOCATE) COMPAT_SYSCALL_DEFINE6(fallocate, int, fd, int, mode, compat_arg_u64_dual(offset), compat_arg_u64_dual(len)) { return ksys_fallocate(fd, mode, compat_arg_u64_glue(offset), compat_arg_u64_glue(len)); } #endif /* * access() needs to use the real uid/gid, not the effective uid/gid. * We do this by temporarily clearing all FS-related capabilities and * switching the fsuid/fsgid around to the real ones. * * Creating new credentials is expensive, so we try to skip doing it, * which we can if the result would match what we already got. */ static bool access_need_override_creds(int flags) { const struct cred *cred; if (flags & AT_EACCESS) return false; cred = current_cred(); if (!uid_eq(cred->fsuid, cred->uid) || !gid_eq(cred->fsgid, cred->gid)) return true; if (!issecure(SECURE_NO_SETUID_FIXUP)) { kuid_t root_uid = make_kuid(cred->user_ns, 0); if (!uid_eq(cred->uid, root_uid)) { if (!cap_isclear(cred->cap_effective)) return true; } else { if (!cap_isidentical(cred->cap_effective, cred->cap_permitted)) return true; } } return false; } static const struct cred *access_override_creds(void) { struct cred *override_cred; override_cred = prepare_creds(); if (!override_cred) return NULL; /* * XXX access_need_override_creds performs checks in hopes of skipping * this work. Make sure it stays in sync if making any changes in this * routine. */ override_cred->fsuid = override_cred->uid; override_cred->fsgid = override_cred->gid; if (!issecure(SECURE_NO_SETUID_FIXUP)) { /* Clear the capabilities if we switch to a non-root user */ kuid_t root_uid = make_kuid(override_cred->user_ns, 0); if (!uid_eq(override_cred->uid, root_uid)) cap_clear(override_cred->cap_effective); else override_cred->cap_effective = override_cred->cap_permitted; } /* * The new set of credentials can *only* be used in * task-synchronous circumstances, and does not need * RCU freeing, unless somebody then takes a separate * reference to it. * * NOTE! This is _only_ true because this credential * is used purely for override_creds() that installs * it as the subjective cred. Other threads will be * accessing ->real_cred, not the subjective cred. * * If somebody _does_ make a copy of this (using the * 'get_current_cred()' function), that will clear the * non_rcu field, because now that other user may be * expecting RCU freeing. But normal thread-synchronous * cred accesses will keep things non-racy to avoid RCU * freeing. */ override_cred->non_rcu = 1; return override_creds(override_cred); } static int do_faccessat(int dfd, const char __user *filename, int mode, int flags) { struct path path; struct inode *inode; int res; unsigned int lookup_flags = LOOKUP_FOLLOW; const struct cred *old_cred = NULL; if (mode & ~S_IRWXO) /* where's F_OK, X_OK, W_OK, R_OK? */ return -EINVAL; if (flags & ~(AT_EACCESS | AT_SYMLINK_NOFOLLOW | AT_EMPTY_PATH)) return -EINVAL; if (flags & AT_SYMLINK_NOFOLLOW) lookup_flags &= ~LOOKUP_FOLLOW; if (flags & AT_EMPTY_PATH) lookup_flags |= LOOKUP_EMPTY; if (access_need_override_creds(flags)) { old_cred = access_override_creds(); if (!old_cred) return -ENOMEM; } retry: res = user_path_at(dfd, filename, lookup_flags, &path); if (res) goto out; inode = d_backing_inode(path.dentry); if ((mode & MAY_EXEC) && S_ISREG(inode->i_mode)) { /* * MAY_EXEC on regular files is denied if the fs is mounted * with the "noexec" flag. */ res = -EACCES; if (path_noexec(&path)) goto out_path_release; } res = inode_permission(mnt_idmap(path.mnt), inode, mode | MAY_ACCESS); /* SuS v2 requires we report a read only fs too */ if (res || !(mode & S_IWOTH) || special_file(inode->i_mode)) goto out_path_release; /* * This is a rare case where using __mnt_is_readonly() * is OK without a mnt_want/drop_write() pair. Since * no actual write to the fs is performed here, we do * not need to telegraph to that to anyone. * * By doing this, we accept that this access is * inherently racy and know that the fs may change * state before we even see this result. */ if (__mnt_is_readonly(path.mnt)) res = -EROFS; out_path_release: path_put(&path); if (retry_estale(res, lookup_flags)) { lookup_flags |= LOOKUP_REVAL; goto retry; } out: if (old_cred) put_cred(revert_creds(old_cred)); return res; } SYSCALL_DEFINE3(faccessat, int, dfd, const char __user *, filename, int, mode) { return do_faccessat(dfd, filename, mode, 0); } SYSCALL_DEFINE4(faccessat2, int, dfd, const char __user *, filename, int, mode, int, flags) { return do_faccessat(dfd, filename, mode, flags); } SYSCALL_DEFINE2(access, const char __user *, filename, int, mode) { return do_faccessat(AT_FDCWD, filename, mode, 0); } SYSCALL_DEFINE1(chdir, const char __user *, filename) { struct path path; int error; unsigned int lookup_flags = LOOKUP_FOLLOW | LOOKUP_DIRECTORY; retry: error = user_path_at(AT_FDCWD, filename, lookup_flags, &path); if (error) goto out; error = path_permission(&path, MAY_EXEC | MAY_CHDIR); if (error) goto dput_and_out; set_fs_pwd(current->fs, &path); dput_and_out: path_put(&path); if (retry_estale(error, lookup_flags)) { lookup_flags |= LOOKUP_REVAL; goto retry; } out: return error; } SYSCALL_DEFINE1(fchdir, unsigned int, fd) { CLASS(fd_raw, f)(fd); int error; if (fd_empty(f)) return -EBADF; if (!d_can_lookup(fd_file(f)->f_path.dentry)) return -ENOTDIR; error = file_permission(fd_file(f), MAY_EXEC | MAY_CHDIR); if (!error) set_fs_pwd(current->fs, &fd_file(f)->f_path); return error; } SYSCALL_DEFINE1(chroot, const char __user *, filename) { struct path path; int error; unsigned int lookup_flags = LOOKUP_FOLLOW | LOOKUP_DIRECTORY; retry: error = user_path_at(AT_FDCWD, filename, lookup_flags, &path); if (error) goto out; error = path_permission(&path, MAY_EXEC | MAY_CHDIR); if (error) goto dput_and_out; error = -EPERM; if (!ns_capable(current_user_ns(), CAP_SYS_CHROOT)) goto dput_and_out; error = security_path_chroot(&path); if (error) goto dput_and_out; set_fs_root(current->fs, &path); error = 0; dput_and_out: path_put(&path); if (retry_estale(error, lookup_flags)) { lookup_flags |= LOOKUP_REVAL; goto retry; } out: return error; } int chmod_common(const struct path *path, umode_t mode) { struct inode *inode = path->dentry->d_inode; struct inode *delegated_inode = NULL; struct iattr newattrs; int error; error = mnt_want_write(path->mnt); if (error) return error; retry_deleg: error = inode_lock_killable(inode); if (error) goto out_mnt_unlock; error = security_path_chmod(path, mode); if (error) goto out_unlock; newattrs.ia_mode = (mode & S_IALLUGO) | (inode->i_mode & ~S_IALLUGO); newattrs.ia_valid = ATTR_MODE | ATTR_CTIME; error = notify_change(mnt_idmap(path->mnt), path->dentry, &newattrs, &delegated_inode); out_unlock: inode_unlock(inode); if (delegated_inode) { error = break_deleg_wait(&delegated_inode); if (!error) goto retry_deleg; } out_mnt_unlock: mnt_drop_write(path->mnt); return error; } int vfs_fchmod(struct file *file, umode_t mode) { audit_file(file); return chmod_common(&file->f_path, mode); } SYSCALL_DEFINE2(fchmod, unsigned int, fd, umode_t, mode) { CLASS(fd, f)(fd); if (fd_empty(f)) return -EBADF; return vfs_fchmod(fd_file(f), mode); } static int do_fchmodat(int dfd, const char __user *filename, umode_t mode, unsigned int flags) { struct path path; int error; unsigned int lookup_flags; if (unlikely(flags & ~(AT_SYMLINK_NOFOLLOW | AT_EMPTY_PATH))) return -EINVAL; lookup_flags = (flags & AT_SYMLINK_NOFOLLOW) ? 0 : LOOKUP_FOLLOW; if (flags & AT_EMPTY_PATH) lookup_flags |= LOOKUP_EMPTY; retry: error = user_path_at(dfd, filename, lookup_flags, &path); if (!error) { error = chmod_common(&path, mode); path_put(&path); if (retry_estale(error, lookup_flags)) { lookup_flags |= LOOKUP_REVAL; goto retry; } } return error; } SYSCALL_DEFINE4(fchmodat2, int, dfd, const char __user *, filename, umode_t, mode, unsigned int, flags) { return do_fchmodat(dfd, filename, mode, flags); } SYSCALL_DEFINE3(fchmodat, int, dfd, const char __user *, filename, umode_t, mode) { return do_fchmodat(dfd, filename, mode, 0); } SYSCALL_DEFINE2(chmod, const char __user *, filename, umode_t, mode) { return do_fchmodat(AT_FDCWD, filename, mode, 0); } /* * Check whether @kuid is valid and if so generate and set vfsuid_t in * ia_vfsuid. * * Return: true if @kuid is valid, false if not. */ static inline bool setattr_vfsuid(struct iattr *attr, kuid_t kuid) { if (!uid_valid(kuid)) return false; attr->ia_valid |= ATTR_UID; attr->ia_vfsuid = VFSUIDT_INIT(kuid); return true; } /* * Check whether @kgid is valid and if so generate and set vfsgid_t in * ia_vfsgid. * * Return: true if @kgid is valid, false if not. */ static inline bool setattr_vfsgid(struct iattr *attr, kgid_t kgid) { if (!gid_valid(kgid)) return false; attr->ia_valid |= ATTR_GID; attr->ia_vfsgid = VFSGIDT_INIT(kgid); return true; } int chown_common(const struct path *path, uid_t user, gid_t group) { struct mnt_idmap *idmap; struct user_namespace *fs_userns; struct inode *inode = path->dentry->d_inode; struct inode *delegated_inode = NULL; int error; struct iattr newattrs; kuid_t uid; kgid_t gid; uid = make_kuid(current_user_ns(), user); gid = make_kgid(current_user_ns(), group); idmap = mnt_idmap(path->mnt); fs_userns = i_user_ns(inode); retry_deleg: newattrs.ia_vfsuid = INVALID_VFSUID; newattrs.ia_vfsgid = INVALID_VFSGID; newattrs.ia_valid = ATTR_CTIME; if ((user != (uid_t)-1) && !setattr_vfsuid(&newattrs, uid)) return -EINVAL; if ((group != (gid_t)-1) && !setattr_vfsgid(&newattrs, gid)) return -EINVAL; error = inode_lock_killable(inode); if (error) return error; if (!S_ISDIR(inode->i_mode)) newattrs.ia_valid |= ATTR_KILL_SUID | ATTR_KILL_PRIV | setattr_should_drop_sgid(idmap, inode); /* Continue to send actual fs values, not the mount values. */ error = security_path_chown( path, from_vfsuid(idmap, fs_userns, newattrs.ia_vfsuid), from_vfsgid(idmap, fs_userns, newattrs.ia_vfsgid)); if (!error) error = notify_change(idmap, path->dentry, &newattrs, &delegated_inode); inode_unlock(inode); if (delegated_inode) { error = break_deleg_wait(&delegated_inode); if (!error) goto retry_deleg; } return error; } int do_fchownat(int dfd, const char __user *filename, uid_t user, gid_t group, int flag) { struct path path; int error = -EINVAL; int lookup_flags; if ((flag & ~(AT_SYMLINK_NOFOLLOW | AT_EMPTY_PATH)) != 0) goto out; lookup_flags = (flag & AT_SYMLINK_NOFOLLOW) ? 0 : LOOKUP_FOLLOW; if (flag & AT_EMPTY_PATH) lookup_flags |= LOOKUP_EMPTY; retry: error = user_path_at(dfd, filename, lookup_flags, &path); if (error) goto out; error = mnt_want_write(path.mnt); if (error) goto out_release; error = chown_common(&path, user, group); mnt_drop_write(path.mnt); out_release: path_put(&path); if (retry_estale(error, lookup_flags)) { lookup_flags |= LOOKUP_REVAL; goto retry; } out: return error; } SYSCALL_DEFINE5(fchownat, int, dfd, const char __user *, filename, uid_t, user, gid_t, group, int, flag) { return do_fchownat(dfd, filename, user, group, flag); } SYSCALL_DEFINE3(chown, const char __user *, filename, uid_t, user, gid_t, group) { return do_fchownat(AT_FDCWD, filename, user, group, 0); } SYSCALL_DEFINE3(lchown, const char __user *, filename, uid_t, user, gid_t, group) { return do_fchownat(AT_FDCWD, filename, user, group, AT_SYMLINK_NOFOLLOW); } int vfs_fchown(struct file *file, uid_t user, gid_t group) { int error; error = mnt_want_write_file(file); if (error) return error; audit_file(file); error = chown_common(&file->f_path, user, group); mnt_drop_write_file(file); return error; } int ksys_fchown(unsigned int fd, uid_t user, gid_t group) { CLASS(fd, f)(fd); if (fd_empty(f)) return -EBADF; return vfs_fchown(fd_file(f), user, group); } SYSCALL_DEFINE3(fchown, unsigned int, fd, uid_t, user, gid_t, group) { return ksys_fchown(fd, user, group); } static inline int file_get_write_access(struct file *f) { int error; error = get_write_access(f->f_inode); if (unlikely(error)) return error; error = mnt_get_write_access(f->f_path.mnt); if (unlikely(error)) goto cleanup_inode; if (unlikely(f->f_mode & FMODE_BACKING)) { error = mnt_get_write_access(backing_file_user_path(f)->mnt); if (unlikely(error)) goto cleanup_mnt; } return 0; cleanup_mnt: mnt_put_write_access(f->f_path.mnt); cleanup_inode: put_write_access(f->f_inode); return error; } static int do_dentry_open(struct file *f, int (*open)(struct inode *, struct file *)) { static const struct file_operations empty_fops = {}; struct inode *inode = f->f_path.dentry->d_inode; int error; path_get(&f->f_path); f->f_inode = inode; f->f_mapping = inode->i_mapping; f->f_wb_err = filemap_sample_wb_err(f->f_mapping); f->f_sb_err = file_sample_sb_err(f); if (unlikely(f->f_flags & O_PATH)) { f->f_mode = FMODE_PATH | FMODE_OPENED; file_set_fsnotify_mode(f, FMODE_NONOTIFY); f->f_op = &empty_fops; return 0; } if ((f->f_mode & (FMODE_READ | FMODE_WRITE)) == FMODE_READ) { i_readcount_inc(inode); } else if (f->f_mode & FMODE_WRITE && !special_file(inode->i_mode)) { error = file_get_write_access(f); if (unlikely(error)) goto cleanup_file; f->f_mode |= FMODE_WRITER; } /* POSIX.1-2008/SUSv4 Section XSI 2.9.7 */ if (S_ISREG(inode->i_mode) || S_ISDIR(inode->i_mode)) f->f_mode |= FMODE_ATOMIC_POS; f->f_op = fops_get(inode->i_fop); if (WARN_ON(!f->f_op)) { error = -ENODEV; goto cleanup_all; } error = security_file_open(f); if (error) goto cleanup_all; /* * Set FMODE_NONOTIFY_* bits according to existing permission watches. * If FMODE_NONOTIFY mode was already set for an fanotify fd or for a * pseudo file, this call will not change the mode. */ file_set_fsnotify_mode_from_watchers(f); error = fsnotify_open_perm(f); if (error) goto cleanup_all; error = break_lease(file_inode(f), f->f_flags); if (error) goto cleanup_all; /* normally all 3 are set; ->open() can clear them if needed */ f->f_mode |= FMODE_LSEEK | FMODE_PREAD | FMODE_PWRITE; if (!open) open = f->f_op->open; if (open) { error = open(inode, f); if (error) goto cleanup_all; } f->f_mode |= FMODE_OPENED; if ((f->f_mode & FMODE_READ) && likely(f->f_op->read || f->f_op->read_iter)) f->f_mode |= FMODE_CAN_READ; if ((f->f_mode & FMODE_WRITE) && likely(f->f_op->write || f->f_op->write_iter)) f->f_mode |= FMODE_CAN_WRITE; if ((f->f_mode & FMODE_LSEEK) && !f->f_op->llseek) f->f_mode &= ~FMODE_LSEEK; if (f->f_mapping->a_ops && f->f_mapping->a_ops->direct_IO) f->f_mode |= FMODE_CAN_ODIRECT; f->f_flags &= ~(O_CREAT | O_EXCL | O_NOCTTY | O_TRUNC); f->f_iocb_flags = iocb_flags(f); file_ra_state_init(&f->f_ra, f->f_mapping->host->i_mapping); if ((f->f_flags & O_DIRECT) && !(f->f_mode & FMODE_CAN_ODIRECT)) return -EINVAL; /* * XXX: Huge page cache doesn't support writing yet. Drop all page * cache for this file before processing writes. */ if (f->f_mode & FMODE_WRITE) { /* * Depends on full fence from get_write_access() to synchronize * against collapse_file() regarding i_writecount and nr_thps * updates. Ensures subsequent insertion of THPs into the page * cache will fail. */ if (filemap_nr_thps(inode->i_mapping)) { struct address_space *mapping = inode->i_mapping; filemap_invalidate_lock(inode->i_mapping); /* * unmap_mapping_range just need to be called once * here, because the private pages is not need to be * unmapped mapping (e.g. data segment of dynamic * shared libraries here). */ unmap_mapping_range(mapping, 0, 0, 0); truncate_inode_pages(mapping, 0); filemap_invalidate_unlock(inode->i_mapping); } } return 0; cleanup_all: if (WARN_ON_ONCE(error > 0)) error = -EINVAL; fops_put(f->f_op); put_file_access(f); cleanup_file: path_put(&f->f_path); f->f_path.mnt = NULL; f->f_path.dentry = NULL; f->f_inode = NULL; return error; } /** * finish_open - finish opening a file * @file: file pointer * @dentry: pointer to dentry * @open: open callback * * This can be used to finish opening a file passed to i_op->atomic_open(). * * If the open callback is set to NULL, then the standard f_op->open() * filesystem callback is substituted. * * NB: the dentry reference is _not_ consumed. If, for example, the dentry is * the return value of d_splice_alias(), then the caller needs to perform dput() * on it after finish_open(). * * Returns zero on success or -errno if the open failed. */ int finish_open(struct file *file, struct dentry *dentry, int (*open)(struct inode *, struct file *)) { BUG_ON(file->f_mode & FMODE_OPENED); /* once it's opened, it's opened */ file->f_path.dentry = dentry; return do_dentry_open(file, open); } EXPORT_SYMBOL(finish_open); /** * finish_no_open - finish ->atomic_open() without opening the file * * @file: file pointer * @dentry: dentry or NULL (as returned from ->lookup()) * * This can be used to set the result of a successful lookup in ->atomic_open(). * * NB: unlike finish_open() this function does consume the dentry reference and * the caller need not dput() it. * * Returns "0" which must be the return value of ->atomic_open() after having * called this function. */ int finish_no_open(struct file *file, struct dentry *dentry) { file->f_path.dentry = dentry; return 0; } EXPORT_SYMBOL(finish_no_open); char *file_path(struct file *filp, char *buf, int buflen) { return d_path(&filp->f_path, buf, buflen); } EXPORT_SYMBOL(file_path); /** * vfs_open - open the file at the given path * @path: path to open * @file: newly allocated file with f_flag initialized */ int vfs_open(const struct path *path, struct file *file) { int ret; file->f_path = *path; ret = do_dentry_open(file, NULL); if (!ret) { /* * Once we return a file with FMODE_OPENED, __fput() will call * fsnotify_close(), so we need fsnotify_open() here for * symmetry. */ fsnotify_open(file); } return ret; } struct file *dentry_open(const struct path *path, int flags, const struct cred *cred) { int error; struct file *f; /* We must always pass in a valid mount pointer. */ BUG_ON(!path->mnt); f = alloc_empty_file(flags, cred); if (!IS_ERR(f)) { error = vfs_open(path, f); if (error) { fput(f); f = ERR_PTR(error); } } return f; } EXPORT_SYMBOL(dentry_open); struct file *dentry_open_nonotify(const struct path *path, int flags, const struct cred *cred) { struct file *f = alloc_empty_file(flags, cred); if (!IS_ERR(f)) { int error; file_set_fsnotify_mode(f, FMODE_NONOTIFY); error = vfs_open(path, f); if (error) { fput(f); f = ERR_PTR(error); } } return f; } /** * dentry_create - Create and open a file * @path: path to create * @flags: O_ flags * @mode: mode bits for new file * @cred: credentials to use * * Caller must hold the parent directory's lock, and have prepared * a negative dentry, placed in @path->dentry, for the new file. * * Caller sets @path->mnt to the vfsmount of the filesystem where * the new file is to be created. The parent directory and the * negative dentry must reside on the same filesystem instance. * * On success, returns a "struct file *". Otherwise a ERR_PTR * is returned. */ struct file *dentry_create(const struct path *path, int flags, umode_t mode, const struct cred *cred) { struct file *f; int error; f = alloc_empty_file(flags, cred); if (IS_ERR(f)) return f; error = vfs_create(mnt_idmap(path->mnt), d_inode(path->dentry->d_parent), path->dentry, mode, true); if (!error) error = vfs_open(path, f); if (unlikely(error)) { fput(f); return ERR_PTR(error); } return f; } EXPORT_SYMBOL(dentry_create); /** * kernel_file_open - open a file for kernel internal use * @path: path of the file to open * @flags: open flags * @cred: credentials for open * * Open a file for use by in-kernel consumers. The file is not accounted * against nr_files and must not be installed into the file descriptor * table. * * Return: Opened file on success, an error pointer on failure. */ struct file *kernel_file_open(const struct path *path, int flags, const struct cred *cred) { struct file *f; int error; f = alloc_empty_file_noaccount(flags, cred); if (IS_ERR(f)) return f; f->f_path = *path; error = do_dentry_open(f, NULL); if (error) { fput(f); return ERR_PTR(error); } fsnotify_open(f); return f; } EXPORT_SYMBOL_GPL(kernel_file_open); #define WILL_CREATE(flags) (flags & (O_CREAT | __O_TMPFILE)) #define O_PATH_FLAGS (O_DIRECTORY | O_NOFOLLOW | O_PATH | O_CLOEXEC) inline struct open_how build_open_how(int flags, umode_t mode) { struct open_how how = { .flags = flags & VALID_OPEN_FLAGS, .mode = mode & S_IALLUGO, }; /* O_PATH beats everything else. */ if (how.flags & O_PATH) how.flags &= O_PATH_FLAGS; /* Modes should only be set for create-like flags. */ if (!WILL_CREATE(how.flags)) how.mode = 0; return how; } inline int build_open_flags(const struct open_how *how, struct open_flags *op) { u64 flags = how->flags; u64 strip = O_CLOEXEC; int lookup_flags = 0; int acc_mode = ACC_MODE(flags); BUILD_BUG_ON_MSG(upper_32_bits(VALID_OPEN_FLAGS), "struct open_flags doesn't yet handle flags > 32 bits"); /* * Strip flags that aren't relevant in determining struct open_flags. */ flags &= ~strip; /* * Older syscalls implicitly clear all of the invalid flags or argument * values before calling build_open_flags(), but openat2(2) checks all * of its arguments. */ if (flags & ~VALID_OPEN_FLAGS) return -EINVAL; if (how->resolve & ~VALID_RESOLVE_FLAGS) return -EINVAL; /* Scoping flags are mutually exclusive. */ if ((how->resolve & RESOLVE_BENEATH) && (how->resolve & RESOLVE_IN_ROOT)) return -EINVAL; /* Deal with the mode. */ if (WILL_CREATE(flags)) { if (how->mode & ~S_IALLUGO) return -EINVAL; op->mode = how->mode | S_IFREG; } else { if (how->mode != 0) return -EINVAL; op->mode = 0; } /* * Block bugs where O_DIRECTORY | O_CREAT created regular files. * Note, that blocking O_DIRECTORY | O_CREAT here also protects * O_TMPFILE below which requires O_DIRECTORY being raised. */ if ((flags & (O_DIRECTORY | O_CREAT)) == (O_DIRECTORY | O_CREAT)) return -EINVAL; /* Now handle the creative implementation of O_TMPFILE. */ if (flags & __O_TMPFILE) { /* * In order to ensure programs get explicit errors when trying * to use O_TMPFILE on old kernels we enforce that O_DIRECTORY * is raised alongside __O_TMPFILE. */ if (!(flags & O_DIRECTORY)) return -EINVAL; if (!(acc_mode & MAY_WRITE)) return -EINVAL; } if (flags & O_PATH) { /* O_PATH only permits certain other flags to be set. */ if (flags & ~O_PATH_FLAGS) return -EINVAL; acc_mode = 0; } /* * O_SYNC is implemented as __O_SYNC|O_DSYNC. As many places only * check for O_DSYNC if the need any syncing at all we enforce it's * always set instead of having to deal with possibly weird behaviour * for malicious applications setting only __O_SYNC. */ if (flags & __O_SYNC) flags |= O_DSYNC; op->open_flag = flags; /* O_TRUNC implies we need access checks for write permissions */ if (flags & O_TRUNC) acc_mode |= MAY_WRITE; /* Allow the LSM permission hook to distinguish append access from general write access. */ if (flags & O_APPEND) acc_mode |= MAY_APPEND; op->acc_mode = acc_mode; op->intent = flags & O_PATH ? 0 : LOOKUP_OPEN; if (flags & O_CREAT) { op->intent |= LOOKUP_CREATE; if (flags & O_EXCL) { op->intent |= LOOKUP_EXCL; flags |= O_NOFOLLOW; } } if (flags & O_DIRECTORY) lookup_flags |= LOOKUP_DIRECTORY; if (!(flags & O_NOFOLLOW)) lookup_flags |= LOOKUP_FOLLOW; if (how->resolve & RESOLVE_NO_XDEV) lookup_flags |= LOOKUP_NO_XDEV; if (how->resolve & RESOLVE_NO_MAGICLINKS) lookup_flags |= LOOKUP_NO_MAGICLINKS; if (how->resolve & RESOLVE_NO_SYMLINKS) lookup_flags |= LOOKUP_NO_SYMLINKS; if (how->resolve & RESOLVE_BENEATH) lookup_flags |= LOOKUP_BENEATH; if (how->resolve & RESOLVE_IN_ROOT) lookup_flags |= LOOKUP_IN_ROOT; if (how->resolve & RESOLVE_CACHED) { /* Don't bother even trying for create/truncate/tmpfile open */ if (flags & (O_TRUNC | O_CREAT | __O_TMPFILE)) return -EAGAIN; lookup_flags |= LOOKUP_CACHED; } op->lookup_flags = lookup_flags; return 0; } /** * file_open_name - open file and return file pointer * * @name: struct filename containing path to open * @flags: open flags as per the open(2) second argument * @mode: mode for the new file if O_CREAT is set, else ignored * * This is the helper to open a file from kernelspace if you really * have to. But in generally you should not do this, so please move * along, nothing to see here.. */ struct file *file_open_name(struct filename *name, int flags, umode_t mode) { struct open_flags op; struct open_how how = build_open_how(flags, mode); int err = build_open_flags(&how, &op); if (err) return ERR_PTR(err); return do_filp_open(AT_FDCWD, name, &op); } /** * filp_open - open file and return file pointer * * @filename: path to open * @flags: open flags as per the open(2) second argument * @mode: mode for the new file if O_CREAT is set, else ignored * * This is the helper to open a file from kernelspace if you really * have to. But in generally you should not do this, so please move * along, nothing to see here.. */ struct file *filp_open(const char *filename, int flags, umode_t mode) { struct filename *name = getname_kernel(filename); struct file *file = ERR_CAST(name); if (!IS_ERR(name)) { file = file_open_name(name, flags, mode); putname(name); } return file; } EXPORT_SYMBOL(filp_open); struct file *file_open_root(const struct path *root, const char *filename, int flags, umode_t mode) { struct open_flags op; struct open_how how = build_open_how(flags, mode); int err = build_open_flags(&how, &op); if (err) return ERR_PTR(err); return do_file_open_root(root, filename, &op); } EXPORT_SYMBOL(file_open_root); static int do_sys_openat2(int dfd, const char __user *filename, struct open_how *how) { struct open_flags op; struct filename *tmp; int err, fd; err = build_open_flags(how, &op); if (unlikely(err)) return err; tmp = getname(filename); if (IS_ERR(tmp)) return PTR_ERR(tmp); fd = get_unused_fd_flags(how->flags); if (likely(fd >= 0)) { struct file *f = do_filp_open(dfd, tmp, &op); if (IS_ERR(f)) { put_unused_fd(fd); fd = PTR_ERR(f); } else { fd_install(fd, f); } } putname(tmp); return fd; } int do_sys_open(int dfd, const char __user *filename, int flags, umode_t mode) { struct open_how how = build_open_how(flags, mode); return do_sys_openat2(dfd, filename, &how); } SYSCALL_DEFINE3(open, const char __user *, filename, int, flags, umode_t, mode) { if (force_o_largefile()) flags |= O_LARGEFILE; return do_sys_open(AT_FDCWD, filename, flags, mode); } SYSCALL_DEFINE4(openat, int, dfd, const char __user *, filename, int, flags, umode_t, mode) { if (force_o_largefile()) flags |= O_LARGEFILE; return do_sys_open(dfd, filename, flags, mode); } SYSCALL_DEFINE4(openat2, int, dfd, const char __user *, filename, struct open_how __user *, how, size_t, usize) { int err; struct open_how tmp; BUILD_BUG_ON(sizeof(struct open_how) < OPEN_HOW_SIZE_VER0); BUILD_BUG_ON(sizeof(struct open_how) != OPEN_HOW_SIZE_LATEST); if (unlikely(usize < OPEN_HOW_SIZE_VER0)) return -EINVAL; if (unlikely(usize > PAGE_SIZE)) return -E2BIG; err = copy_struct_from_user(&tmp, sizeof(tmp), how, usize); if (err) return err; audit_openat2_how(&tmp); /* O_LARGEFILE is only allowed for non-O_PATH. */ if (!(tmp.flags & O_PATH) && force_o_largefile()) tmp.flags |= O_LARGEFILE; return do_sys_openat2(dfd, filename, &tmp); } #ifdef CONFIG_COMPAT /* * Exactly like sys_open(), except that it doesn't set the * O_LARGEFILE flag. */ COMPAT_SYSCALL_DEFINE3(open, const char __user *, filename, int, flags, umode_t, mode) { return do_sys_open(AT_FDCWD, filename, flags, mode); } /* * Exactly like sys_openat(), except that it doesn't set the * O_LARGEFILE flag. */ COMPAT_SYSCALL_DEFINE4(openat, int, dfd, const char __user *, filename, int, flags, umode_t, mode) { return do_sys_open(dfd, filename, flags, mode); } #endif #ifndef __alpha__ /* * For backward compatibility? Maybe this should be moved * into arch/i386 instead? */ SYSCALL_DEFINE2(creat, const char __user *, pathname, umode_t, mode) { int flags = O_CREAT | O_WRONLY | O_TRUNC; if (force_o_largefile()) flags |= O_LARGEFILE; return do_sys_open(AT_FDCWD, pathname, flags, mode); } #endif /* * "id" is the POSIX thread ID. We use the * files pointer for this.. */ static int filp_flush(struct file *filp, fl_owner_t id) { int retval = 0; if (CHECK_DATA_CORRUPTION(file_count(filp) == 0, filp, "VFS: Close: file count is 0 (f_op=%ps)", filp->f_op)) { return 0; } if (filp->f_op->flush) retval = filp->f_op->flush(filp, id); if (likely(!(filp->f_mode & FMODE_PATH))) { dnotify_flush(filp, id); locks_remove_posix(filp, id); } return retval; } int filp_close(struct file *filp, fl_owner_t id) { int retval; retval = filp_flush(filp, id); fput_close(filp); return retval; } EXPORT_SYMBOL(filp_close); /* * Careful here! We test whether the file pointer is NULL before * releasing the fd. This ensures that one clone task can't release * an fd while another clone is opening it. */ SYSCALL_DEFINE1(close, unsigned int, fd) { int retval; struct file *file; file = file_close_fd(fd); if (!file) return -EBADF; retval = filp_flush(file, current->files); /* * We're returning to user space. Don't bother * with any delayed fput() cases. */ fput_close_sync(file); if (likely(retval == 0)) return 0; /* can't restart close syscall because file table entry was cleared */ if (retval == -ERESTARTSYS || retval == -ERESTARTNOINTR || retval == -ERESTARTNOHAND || retval == -ERESTART_RESTARTBLOCK) retval = -EINTR; return retval; } /* * This routine simulates a hangup on the tty, to arrange that users * are given clean terminals at login time. */ SYSCALL_DEFINE0(vhangup) { if (capable(CAP_SYS_TTY_CONFIG)) { tty_vhangup_self(); return 0; } return -EPERM; } /* * Called when an inode is about to be open. * We use this to disallow opening large files on 32bit systems if * the caller didn't specify O_LARGEFILE. On 64bit systems we force * on this flag in sys_open. */ int generic_file_open(struct inode * inode, struct file * filp) { if (!(filp->f_flags & O_LARGEFILE) && i_size_read(inode) > MAX_NON_LFS) return -EOVERFLOW; return 0; } EXPORT_SYMBOL(generic_file_open); /* * This is used by subsystems that don't want seekable * file descriptors. The function is not supposed to ever fail, the only * reason it returns an 'int' and not 'void' is so that it can be plugged * directly into file_operations structure. */ int nonseekable_open(struct inode *inode, struct file *filp) { filp->f_mode &= ~(FMODE_LSEEK | FMODE_PREAD | FMODE_PWRITE); return 0; } EXPORT_SYMBOL(nonseekable_open); /* * stream_open is used by subsystems that want stream-like file descriptors. * Such file descriptors are not seekable and don't have notion of position * (file.f_pos is always 0 and ppos passed to .read()/.write() is always NULL). * Contrary to file descriptors of other regular files, .read() and .write() * can run simultaneously. * * stream_open never fails and is marked to return int so that it could be * directly used as file_operations.open . */ int stream_open(struct inode *inode, struct file *filp) { filp->f_mode &= ~(FMODE_LSEEK | FMODE_PREAD | FMODE_PWRITE | FMODE_ATOMIC_POS); filp->f_mode |= FMODE_STREAM; return 0; } EXPORT_SYMBOL(stream_open);
6 6 6 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 // SPDX-License-Identifier: GPL-2.0 /* * linux/lib/kasprintf.c * * Copyright (C) 1991, 1992 Linus Torvalds */ #include <linux/stdarg.h> #include <linux/export.h> #include <linux/slab.h> #include <linux/types.h> #include <linux/string.h> /* Simplified asprintf. */ char *kvasprintf(gfp_t gfp, const char *fmt, va_list ap) { unsigned int first, second; char *p; va_list aq; va_copy(aq, ap); first = vsnprintf(NULL, 0, fmt, aq); va_end(aq); p = kmalloc_track_caller(first+1, gfp); if (!p) return NULL; second = vsnprintf(p, first+1, fmt, ap); WARN(first != second, "different return values (%u and %u) from vsnprintf(\"%s\", ...)", first, second, fmt); return p; } EXPORT_SYMBOL(kvasprintf); /* * If fmt contains no % (or is exactly %s), use kstrdup_const. If fmt * (or the sole vararg) points to rodata, we will then save a memory * allocation and string copy. In any case, the return value should be * freed using kfree_const(). */ const char *kvasprintf_const(gfp_t gfp, const char *fmt, va_list ap) { if (!strchr(fmt, '%')) return kstrdup_const(fmt, gfp); if (!strcmp(fmt, "%s")) return kstrdup_const(va_arg(ap, const char*), gfp); return kvasprintf(gfp, fmt, ap); } EXPORT_SYMBOL(kvasprintf_const); char *kasprintf(gfp_t gfp, const char *fmt, ...) { va_list ap; char *p; va_start(ap, fmt); p = kvasprintf(gfp, fmt, ap); va_end(ap); return p; } EXPORT_SYMBOL(kasprintf);
1746 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 /* SPDX-License-Identifier: GPL-2.0 */ #ifndef __LINUX_PREEMPT_H #define __LINUX_PREEMPT_H /* * include/linux/preempt.h - macros for accessing and manipulating * preempt_count (used for kernel preemption, interrupt count, etc.) */ #include <linux/linkage.h> #include <linux/cleanup.h> #include <linux/types.h> /* * We put the hardirq and softirq counter into the preemption * counter. The bitmask has the following meaning: * * - bits 0-7 are the preemption count (max preemption depth: 256) * - bits 8-15 are the softirq count (max # of softirqs: 256) * * The hardirq count could in theory be the same as the number of * interrupts in the system, but we run all interrupt handlers with * interrupts disabled, so we cannot have nesting interrupts. Though * there are a few palaeontologic drivers which reenable interrupts in * the handler, so we need more than one bit here. * * PREEMPT_MASK: 0x000000ff * SOFTIRQ_MASK: 0x0000ff00 * HARDIRQ_MASK: 0x000f0000 * NMI_MASK: 0x00f00000 * PREEMPT_NEED_RESCHED: 0x80000000 */ #define PREEMPT_BITS 8 #define SOFTIRQ_BITS 8 #define HARDIRQ_BITS 4 #define NMI_BITS 4 #define PREEMPT_SHIFT 0 #define SOFTIRQ_SHIFT (PREEMPT_SHIFT + PREEMPT_BITS) #define HARDIRQ_SHIFT (SOFTIRQ_SHIFT + SOFTIRQ_BITS) #define NMI_SHIFT (HARDIRQ_SHIFT + HARDIRQ_BITS) #define __IRQ_MASK(x) ((1UL << (x))-1) #define PREEMPT_MASK (__IRQ_MASK(PREEMPT_BITS) << PREEMPT_SHIFT) #define SOFTIRQ_MASK (__IRQ_MASK(SOFTIRQ_BITS) << SOFTIRQ_SHIFT) #define HARDIRQ_MASK (__IRQ_MASK(HARDIRQ_BITS) << HARDIRQ_SHIFT) #define NMI_MASK (__IRQ_MASK(NMI_BITS) << NMI_SHIFT) #define PREEMPT_OFFSET (1UL << PREEMPT_SHIFT) #define SOFTIRQ_OFFSET (1UL << SOFTIRQ_SHIFT) #define HARDIRQ_OFFSET (1UL << HARDIRQ_SHIFT) #define NMI_OFFSET (1UL << NMI_SHIFT) #define SOFTIRQ_DISABLE_OFFSET (2 * SOFTIRQ_OFFSET) #define PREEMPT_DISABLED (PREEMPT_DISABLE_OFFSET + PREEMPT_ENABLED) /* * Disable preemption until the scheduler is running -- use an unconditional * value so that it also works on !PREEMPT_COUNT kernels. * * Reset by start_kernel()->sched_init()->init_idle()->init_idle_preempt_count(). */ #define INIT_PREEMPT_COUNT PREEMPT_OFFSET /* * Initial preempt_count value; reflects the preempt_count schedule invariant * which states that during context switches: * * preempt_count() == 2*PREEMPT_DISABLE_OFFSET * * Note: PREEMPT_DISABLE_OFFSET is 0 for !PREEMPT_COUNT kernels. * Note: See finish_task_switch(). */ #define FORK_PREEMPT_COUNT (2*PREEMPT_DISABLE_OFFSET + PREEMPT_ENABLED) /* preempt_count() and related functions, depends on PREEMPT_NEED_RESCHED */ #include <asm/preempt.h> /** * interrupt_context_level - return interrupt context level * * Returns the current interrupt context level. * 0 - normal context * 1 - softirq context * 2 - hardirq context * 3 - NMI context */ static __always_inline unsigned char interrupt_context_level(void) { unsigned long pc = preempt_count(); unsigned char level = 0; level += !!(pc & (NMI_MASK)); level += !!(pc & (NMI_MASK | HARDIRQ_MASK)); level += !!(pc & (NMI_MASK | HARDIRQ_MASK | SOFTIRQ_OFFSET)); return level; } /* * These macro definitions avoid redundant invocations of preempt_count() * because such invocations would result in redundant loads given that * preempt_count() is commonly implemented with READ_ONCE(). */ #define nmi_count() (preempt_count() & NMI_MASK) #define hardirq_count() (preempt_count() & HARDIRQ_MASK) #ifdef CONFIG_PREEMPT_RT # define softirq_count() (current->softirq_disable_cnt & SOFTIRQ_MASK) # define irq_count() ((preempt_count() & (NMI_MASK | HARDIRQ_MASK)) | softirq_count()) #else # define softirq_count() (preempt_count() & SOFTIRQ_MASK) # define irq_count() (preempt_count() & (NMI_MASK | HARDIRQ_MASK | SOFTIRQ_MASK)) #endif /* * Macros to retrieve the current execution context: * * in_nmi() - We're in NMI context * in_hardirq() - We're in hard IRQ context * in_serving_softirq() - We're in softirq context * in_task() - We're in task context */ #define in_nmi() (nmi_count()) #define in_hardirq() (hardirq_count()) #define in_serving_softirq() (softirq_count() & SOFTIRQ_OFFSET) #ifdef CONFIG_PREEMPT_RT # define in_task() (!((preempt_count() & (NMI_MASK | HARDIRQ_MASK)) | in_serving_softirq())) #else # define in_task() (!(preempt_count() & (NMI_MASK | HARDIRQ_MASK | SOFTIRQ_OFFSET))) #endif /* * The following macros are deprecated and should not be used in new code: * in_irq() - Obsolete version of in_hardirq() * in_softirq() - We have BH disabled, or are processing softirqs * in_interrupt() - We're in NMI,IRQ,SoftIRQ context or have BH disabled */ #define in_irq() (hardirq_count()) #define in_softirq() (softirq_count()) #define in_interrupt() (irq_count()) /* * The preempt_count offset after preempt_disable(); */ #if defined(CONFIG_PREEMPT_COUNT) # define PREEMPT_DISABLE_OFFSET PREEMPT_OFFSET #else # define PREEMPT_DISABLE_OFFSET 0 #endif /* * The preempt_count offset after spin_lock() */ #if !defined(CONFIG_PREEMPT_RT) #define PREEMPT_LOCK_OFFSET PREEMPT_DISABLE_OFFSET #else /* Locks on RT do not disable preemption */ #define PREEMPT_LOCK_OFFSET 0 #endif /* * The preempt_count offset needed for things like: * * spin_lock_bh() * * Which need to disable both preemption (CONFIG_PREEMPT_COUNT) and * softirqs, such that unlock sequences of: * * spin_unlock(); * local_bh_enable(); * * Work as expected. */ #define SOFTIRQ_LOCK_OFFSET (SOFTIRQ_DISABLE_OFFSET + PREEMPT_LOCK_OFFSET) /* * Are we running in atomic context? WARNING: this macro cannot * always detect atomic context; in particular, it cannot know about * held spinlocks in non-preemptible kernels. Thus it should not be * used in the general case to determine whether sleeping is possible. * Do not use in_atomic() in driver code. */ #define in_atomic() (preempt_count() != 0) /* * Check whether we were atomic before we did preempt_disable(): * (used by the scheduler) */ #define in_atomic_preempt_off() (preempt_count() != PREEMPT_DISABLE_OFFSET) #if defined(CONFIG_DEBUG_PREEMPT) || defined(CONFIG_TRACE_PREEMPT_TOGGLE) extern void preempt_count_add(int val); extern void preempt_count_sub(int val); #define preempt_count_dec_and_test() \ ({ preempt_count_sub(1); should_resched(0); }) #else #define preempt_count_add(val) __preempt_count_add(val) #define preempt_count_sub(val) __preempt_count_sub(val) #define preempt_count_dec_and_test() __preempt_count_dec_and_test() #endif #define __preempt_count_inc() __preempt_count_add(1) #define __preempt_count_dec() __preempt_count_sub(1) #define preempt_count_inc() preempt_count_add(1) #define preempt_count_dec() preempt_count_sub(1) #ifdef CONFIG_PREEMPT_COUNT #define preempt_disable() \ do { \ preempt_count_inc(); \ barrier(); \ } while (0) #define sched_preempt_enable_no_resched() \ do { \ barrier(); \ preempt_count_dec(); \ } while (0) #define preempt_enable_no_resched() sched_preempt_enable_no_resched() #define preemptible() (preempt_count() == 0 && !irqs_disabled()) #ifdef CONFIG_PREEMPTION #define preempt_enable() \ do { \ barrier(); \ if (unlikely(preempt_count_dec_and_test())) \ __preempt_schedule(); \ } while (0) #define preempt_enable_notrace() \ do { \ barrier(); \ if (unlikely(__preempt_count_dec_and_test())) \ __preempt_schedule_notrace(); \ } while (0) #define preempt_check_resched() \ do { \ if (should_resched(0)) \ __preempt_schedule(); \ } while (0) #else /* !CONFIG_PREEMPTION */ #define preempt_enable() \ do { \ barrier(); \ preempt_count_dec(); \ } while (0) #define preempt_enable_notrace() \ do { \ barrier(); \ __preempt_count_dec(); \ } while (0) #define preempt_check_resched() do { } while (0) #endif /* CONFIG_PREEMPTION */ #define preempt_disable_notrace() \ do { \ __preempt_count_inc(); \ barrier(); \ } while (0) #define preempt_enable_no_resched_notrace() \ do { \ barrier(); \ __preempt_count_dec(); \ } while (0) #else /* !CONFIG_PREEMPT_COUNT */ /* * Even if we don't have any preemption, we need preempt disable/enable * to be barriers, so that we don't have things like get_user/put_user * that can cause faults and scheduling migrate into our preempt-protected * region. */ #define preempt_disable() barrier() #define sched_preempt_enable_no_resched() barrier() #define preempt_enable_no_resched() barrier() #define preempt_enable() barrier() #define preempt_check_resched() do { } while (0) #define preempt_disable_notrace() barrier() #define preempt_enable_no_resched_notrace() barrier() #define preempt_enable_notrace() barrier() #define preemptible() 0 #endif /* CONFIG_PREEMPT_COUNT */ #ifdef MODULE /* * Modules have no business playing preemption tricks. */ #undef sched_preempt_enable_no_resched #undef preempt_enable_no_resched #undef preempt_enable_no_resched_notrace #undef preempt_check_resched #endif #define preempt_set_need_resched() \ do { \ set_preempt_need_resched(); \ } while (0) #define preempt_fold_need_resched() \ do { \ if (tif_need_resched()) \ set_preempt_need_resched(); \ } while (0) #ifdef CONFIG_PREEMPT_NOTIFIERS struct preempt_notifier; struct task_struct; /** * preempt_ops - notifiers called when a task is preempted and rescheduled * @sched_in: we're about to be rescheduled: * notifier: struct preempt_notifier for the task being scheduled * cpu: cpu we're scheduled on * @sched_out: we've just been preempted * notifier: struct preempt_notifier for the task being preempted * next: the task that's kicking us out * * Please note that sched_in and out are called under different * contexts. sched_out is called with rq lock held and irq disabled * while sched_in is called without rq lock and irq enabled. This * difference is intentional and depended upon by its users. */ struct preempt_ops { void (*sched_in)(struct preempt_notifier *notifier, int cpu); void (*sched_out)(struct preempt_notifier *notifier, struct task_struct *next); }; /** * preempt_notifier - key for installing preemption notifiers * @link: internal use * @ops: defines the notifier functions to be called * * Usually used in conjunction with container_of(). */ struct preempt_notifier { struct hlist_node link; struct preempt_ops *ops; }; void preempt_notifier_inc(void); void preempt_notifier_dec(void); void preempt_notifier_register(struct preempt_notifier *notifier); void preempt_notifier_unregister(struct preempt_notifier *notifier); static inline void preempt_notifier_init(struct preempt_notifier *notifier, struct preempt_ops *ops) { /* INIT_HLIST_NODE() open coded, to avoid dependency on list.h */ notifier->link.next = NULL; notifier->link.pprev = NULL; notifier->ops = ops; } #endif #ifdef CONFIG_SMP /* * Migrate-Disable and why it is undesired. * * When a preempted task becomes elegible to run under the ideal model (IOW it * becomes one of the M highest priority tasks), it might still have to wait * for the preemptee's migrate_disable() section to complete. Thereby suffering * a reduction in bandwidth in the exact duration of the migrate_disable() * section. * * Per this argument, the change from preempt_disable() to migrate_disable() * gets us: * * - a higher priority tasks gains reduced wake-up latency; with preempt_disable() * it would have had to wait for the lower priority task. * * - a lower priority tasks; which under preempt_disable() could've instantly * migrated away when another CPU becomes available, is now constrained * by the ability to push the higher priority task away, which might itself be * in a migrate_disable() section, reducing it's available bandwidth. * * IOW it trades latency / moves the interference term, but it stays in the * system, and as long as it remains unbounded, the system is not fully * deterministic. * * * The reason we have it anyway. * * PREEMPT_RT breaks a number of assumptions traditionally held. By forcing a * number of primitives into becoming preemptible, they would also allow * migration. This turns out to break a bunch of per-cpu usage. To this end, * all these primitives employ migirate_disable() to restore this implicit * assumption. * * This is a 'temporary' work-around at best. The correct solution is getting * rid of the above assumptions and reworking the code to employ explicit * per-cpu locking or short preempt-disable regions. * * The end goal must be to get rid of migrate_disable(), alternatively we need * a schedulability theory that does not depend on abritrary migration. * * * Notes on the implementation. * * The implementation is particularly tricky since existing code patterns * dictate neither migrate_disable() nor migrate_enable() is allowed to block. * This means that it cannot use cpus_read_lock() to serialize against hotplug, * nor can it easily migrate itself into a pending affinity mask change on * migrate_enable(). * * * Note: even non-work-conserving schedulers like semi-partitioned depends on * migration, so migrate_disable() is not only a problem for * work-conserving schedulers. * */ extern void migrate_disable(void); extern void migrate_enable(void); #else static inline void migrate_disable(void) { } static inline void migrate_enable(void) { } #endif /* CONFIG_SMP */ /** * preempt_disable_nested - Disable preemption inside a normally preempt disabled section * * Use for code which requires preemption protection inside a critical * section which has preemption disabled implicitly on non-PREEMPT_RT * enabled kernels, by e.g.: * - holding a spinlock/rwlock * - soft interrupt context * - regular interrupt handlers * * On PREEMPT_RT enabled kernels spinlock/rwlock held sections, soft * interrupt context and regular interrupt handlers are preemptible and * only prevent migration. preempt_disable_nested() ensures that preemption * is disabled for cases which require CPU local serialization even on * PREEMPT_RT. For non-PREEMPT_RT kernels this is a NOP. * * The use cases are code sequences which are not serialized by a * particular lock instance, e.g.: * - seqcount write side critical sections where the seqcount is not * associated to a particular lock and therefore the automatic * protection mechanism does not work. This prevents a live lock * against a preempting high priority reader. * - RMW per CPU variable updates like vmstat. */ /* Macro to avoid header recursion hell vs. lockdep */ #define preempt_disable_nested() \ do { \ if (IS_ENABLED(CONFIG_PREEMPT_RT)) \ preempt_disable(); \ else \ lockdep_assert_preemption_disabled(); \ } while (0) /** * preempt_enable_nested - Undo the effect of preempt_disable_nested() */ static __always_inline void preempt_enable_nested(void) { if (IS_ENABLED(CONFIG_PREEMPT_RT)) preempt_enable(); } DEFINE_LOCK_GUARD_0(preempt, preempt_disable(), preempt_enable()) DEFINE_LOCK_GUARD_0(preempt_notrace, preempt_disable_notrace(), preempt_enable_notrace()) DEFINE_LOCK_GUARD_0(migrate, migrate_disable(), migrate_enable()) #ifdef CONFIG_PREEMPT_DYNAMIC extern bool preempt_model_none(void); extern bool preempt_model_voluntary(void); extern bool preempt_model_full(void); extern bool preempt_model_lazy(void); #else static inline bool preempt_model_none(void) { return IS_ENABLED(CONFIG_PREEMPT_NONE); } static inline bool preempt_model_voluntary(void) { return IS_ENABLED(CONFIG_PREEMPT_VOLUNTARY); } static inline bool preempt_model_full(void) { return IS_ENABLED(CONFIG_PREEMPT); } static inline bool preempt_model_lazy(void) { return IS_ENABLED(CONFIG_PREEMPT_LAZY); } #endif static inline bool preempt_model_rt(void) { return IS_ENABLED(CONFIG_PREEMPT_RT); } extern const char *preempt_model_str(void); /* * Does the preemption model allow non-cooperative preemption? * * For !CONFIG_PREEMPT_DYNAMIC kernels this is an exact match with * CONFIG_PREEMPTION; for CONFIG_PREEMPT_DYNAMIC this doesn't work as the * kernel is *built* with CONFIG_PREEMPTION=y but may run with e.g. the * PREEMPT_NONE model. */ static inline bool preempt_model_preemptible(void) { return preempt_model_full() || preempt_model_lazy() || preempt_model_rt(); } #endif /* __LINUX_PREEMPT_H */
149 144 10 5 148 146 2 2 103 2 63 19 63 2 1 2 2 2 2 2 2 2 2 2 36 1 2 112 90 18 1 11 2 4 84 13 13 1 2 2 2 1 1 1 2 1 1 10 4 4 2 4 3 3 1 1 1 5 3 1 1 1 1 4 4 4 4 4 2 4 2 2 13 2 2 7 1 6 6 2 2 1 1 1 1 1 1 1 1 4 4 2 2 2 2 99 1 11 40 8 6 6 33 220 1 30 115 10 7 7 120 1 1 2 1 1 2 6 7 2 5 3 4 4 2 2 1 1 1 4 4 1 1 1 1 56 1 38 6 11 14 1 5 4 4 5 1 2 1 1 1 1 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 // SPDX-License-Identifier: GPL-2.0-only /* * Copyright (C) 2012,2013 - ARM Ltd * Author: Marc Zyngier <marc.zyngier@arm.com> * * Derived from arch/arm/kvm/guest.c: * Copyright (C) 2012 - Virtual Open Systems and Columbia University * Author: Christoffer Dall <c.dall@virtualopensystems.com> */ #include <linux/bits.h> #include <linux/errno.h> #include <linux/err.h> #include <linux/nospec.h> #include <linux/kvm_host.h> #include <linux/module.h> #include <linux/stddef.h> #include <linux/string.h> #include <linux/vmalloc.h> #include <linux/fs.h> #include <kvm/arm_hypercalls.h> #include <asm/cputype.h> #include <linux/uaccess.h> #include <asm/fpsimd.h> #include <asm/kvm.h> #include <asm/kvm_emulate.h> #include <asm/kvm_nested.h> #include <asm/sigcontext.h> #include "trace.h" const struct _kvm_stats_desc kvm_vm_stats_desc[] = { KVM_GENERIC_VM_STATS() }; const struct kvm_stats_header kvm_vm_stats_header = { .name_size = KVM_STATS_NAME_SIZE, .num_desc = ARRAY_SIZE(kvm_vm_stats_desc), .id_offset = sizeof(struct kvm_stats_header), .desc_offset = sizeof(struct kvm_stats_header) + KVM_STATS_NAME_SIZE, .data_offset = sizeof(struct kvm_stats_header) + KVM_STATS_NAME_SIZE + sizeof(kvm_vm_stats_desc), }; const struct _kvm_stats_desc kvm_vcpu_stats_desc[] = { KVM_GENERIC_VCPU_STATS(), STATS_DESC_COUNTER(VCPU, hvc_exit_stat), STATS_DESC_COUNTER(VCPU, wfe_exit_stat), STATS_DESC_COUNTER(VCPU, wfi_exit_stat), STATS_DESC_COUNTER(VCPU, mmio_exit_user), STATS_DESC_COUNTER(VCPU, mmio_exit_kernel), STATS_DESC_COUNTER(VCPU, signal_exits), STATS_DESC_COUNTER(VCPU, exits) }; const struct kvm_stats_header kvm_vcpu_stats_header = { .name_size = KVM_STATS_NAME_SIZE, .num_desc = ARRAY_SIZE(kvm_vcpu_stats_desc), .id_offset = sizeof(struct kvm_stats_header), .desc_offset = sizeof(struct kvm_stats_header) + KVM_STATS_NAME_SIZE, .data_offset = sizeof(struct kvm_stats_header) + KVM_STATS_NAME_SIZE + sizeof(kvm_vcpu_stats_desc), }; static bool core_reg_offset_is_vreg(u64 off) { return off >= KVM_REG_ARM_CORE_REG(fp_regs.vregs) && off < KVM_REG_ARM_CORE_REG(fp_regs.fpsr); } static u64 core_reg_offset_from_id(u64 id) { return id & ~(KVM_REG_ARCH_MASK | KVM_REG_SIZE_MASK | KVM_REG_ARM_CORE); } static int core_reg_size_from_offset(const struct kvm_vcpu *vcpu, u64 off) { int size; switch (off) { case KVM_REG_ARM_CORE_REG(regs.regs[0]) ... KVM_REG_ARM_CORE_REG(regs.regs[30]): case KVM_REG_ARM_CORE_REG(regs.sp): case KVM_REG_ARM_CORE_REG(regs.pc): case KVM_REG_ARM_CORE_REG(regs.pstate): case KVM_REG_ARM_CORE_REG(sp_el1): case KVM_REG_ARM_CORE_REG(elr_el1): case KVM_REG_ARM_CORE_REG(spsr[0]) ... KVM_REG_ARM_CORE_REG(spsr[KVM_NR_SPSR - 1]): size = sizeof(__u64); break; case KVM_REG_ARM_CORE_REG(fp_regs.vregs[0]) ... KVM_REG_ARM_CORE_REG(fp_regs.vregs[31]): size = sizeof(__uint128_t); break; case KVM_REG_ARM_CORE_REG(fp_regs.fpsr): case KVM_REG_ARM_CORE_REG(fp_regs.fpcr): size = sizeof(__u32); break; default: return -EINVAL; } if (!IS_ALIGNED(off, size / sizeof(__u32))) return -EINVAL; /* * The KVM_REG_ARM64_SVE regs must be used instead of * KVM_REG_ARM_CORE for accessing the FPSIMD V-registers on * SVE-enabled vcpus: */ if (vcpu_has_sve(vcpu) && core_reg_offset_is_vreg(off)) return -EINVAL; return size; } static void *core_reg_addr(struct kvm_vcpu *vcpu, const struct kvm_one_reg *reg) { u64 off = core_reg_offset_from_id(reg->id); int size = core_reg_size_from_offset(vcpu, off); if (size < 0) return NULL; if (KVM_REG_SIZE(reg->id) != size) return NULL; switch (off) { case KVM_REG_ARM_CORE_REG(regs.regs[0]) ... KVM_REG_ARM_CORE_REG(regs.regs[30]): off -= KVM_REG_ARM_CORE_REG(regs.regs[0]); off /= 2; return &vcpu->arch.ctxt.regs.regs[off]; case KVM_REG_ARM_CORE_REG(regs.sp): return &vcpu->arch.ctxt.regs.sp; case KVM_REG_ARM_CORE_REG(regs.pc): return &vcpu->arch.ctxt.regs.pc; case KVM_REG_ARM_CORE_REG(regs.pstate): return &vcpu->arch.ctxt.regs.pstate; case KVM_REG_ARM_CORE_REG(sp_el1): return __ctxt_sys_reg(&vcpu->arch.ctxt, SP_EL1); case KVM_REG_ARM_CORE_REG(elr_el1): return __ctxt_sys_reg(&vcpu->arch.ctxt, ELR_EL1); case KVM_REG_ARM_CORE_REG(spsr[KVM_SPSR_EL1]): return __ctxt_sys_reg(&vcpu->arch.ctxt, SPSR_EL1); case KVM_REG_ARM_CORE_REG(spsr[KVM_SPSR_ABT]): return &vcpu->arch.ctxt.spsr_abt; case KVM_REG_ARM_CORE_REG(spsr[KVM_SPSR_UND]): return &vcpu->arch.ctxt.spsr_und; case KVM_REG_ARM_CORE_REG(spsr[KVM_SPSR_IRQ]): return &vcpu->arch.ctxt.spsr_irq; case KVM_REG_ARM_CORE_REG(spsr[KVM_SPSR_FIQ]): return &vcpu->arch.ctxt.spsr_fiq; case KVM_REG_ARM_CORE_REG(fp_regs.vregs[0]) ... KVM_REG_ARM_CORE_REG(fp_regs.vregs[31]): off -= KVM_REG_ARM_CORE_REG(fp_regs.vregs[0]); off /= 4; return &vcpu->arch.ctxt.fp_regs.vregs[off]; case KVM_REG_ARM_CORE_REG(fp_regs.fpsr): return &vcpu->arch.ctxt.fp_regs.fpsr; case KVM_REG_ARM_CORE_REG(fp_regs.fpcr): return &vcpu->arch.ctxt.fp_regs.fpcr; default: return NULL; } } static int get_core_reg(struct kvm_vcpu *vcpu, const struct kvm_one_reg *reg) { /* * Because the kvm_regs structure is a mix of 32, 64 and * 128bit fields, we index it as if it was a 32bit * array. Hence below, nr_regs is the number of entries, and * off the index in the "array". */ __u32 __user *uaddr = (__u32 __user *)(unsigned long)reg->addr; int nr_regs = sizeof(struct kvm_regs) / sizeof(__u32); void *addr; u32 off; /* Our ID is an index into the kvm_regs struct. */ off = core_reg_offset_from_id(reg->id); if (off >= nr_regs || (off + (KVM_REG_SIZE(reg->id) / sizeof(__u32))) >= nr_regs) return -ENOENT; addr = core_reg_addr(vcpu, reg); if (!addr) return -EINVAL; if (copy_to_user(uaddr, addr, KVM_REG_SIZE(reg->id))) return -EFAULT; return 0; } static int set_core_reg(struct kvm_vcpu *vcpu, const struct kvm_one_reg *reg) { __u32 __user *uaddr = (__u32 __user *)(unsigned long)reg->addr; int nr_regs = sizeof(struct kvm_regs) / sizeof(__u32); __uint128_t tmp; void *valp = &tmp, *addr; u64 off; int err = 0; /* Our ID is an index into the kvm_regs struct. */ off = core_reg_offset_from_id(reg->id); if (off >= nr_regs || (off + (KVM_REG_SIZE(reg->id) / sizeof(__u32))) >= nr_regs) return -ENOENT; addr = core_reg_addr(vcpu, reg); if (!addr) return -EINVAL; if (KVM_REG_SIZE(reg->id) > sizeof(tmp)) return -EINVAL; if (copy_from_user(valp, uaddr, KVM_REG_SIZE(reg->id))) { err = -EFAULT; goto out; } if (off == KVM_REG_ARM_CORE_REG(regs.pstate)) { u64 mode = (*(u64 *)valp) & PSR_AA32_MODE_MASK; switch (mode) { case PSR_AA32_MODE_USR: if (!kvm_supports_32bit_el0()) return -EINVAL; break; case PSR_AA32_MODE_FIQ: case PSR_AA32_MODE_IRQ: case PSR_AA32_MODE_SVC: case PSR_AA32_MODE_ABT: case PSR_AA32_MODE_UND: case PSR_AA32_MODE_SYS: if (!vcpu_el1_is_32bit(vcpu)) return -EINVAL; break; case PSR_MODE_EL2h: case PSR_MODE_EL2t: if (!vcpu_has_nv(vcpu)) return -EINVAL; fallthrough; case PSR_MODE_EL0t: case PSR_MODE_EL1t: case PSR_MODE_EL1h: if (vcpu_el1_is_32bit(vcpu)) return -EINVAL; break; default: err = -EINVAL; goto out; } } memcpy(addr, valp, KVM_REG_SIZE(reg->id)); if (*vcpu_cpsr(vcpu) & PSR_MODE32_BIT) { int i, nr_reg; switch (*vcpu_cpsr(vcpu) & PSR_AA32_MODE_MASK) { /* * Either we are dealing with user mode, and only the * first 15 registers (+ PC) must be narrowed to 32bit. * AArch32 r0-r14 conveniently map to AArch64 x0-x14. */ case PSR_AA32_MODE_USR: case PSR_AA32_MODE_SYS: nr_reg = 15; break; /* * Otherwise, this is a privileged mode, and *all* the * registers must be narrowed to 32bit. */ default: nr_reg = 31; break; } for (i = 0; i < nr_reg; i++) vcpu_set_reg(vcpu, i, (u32)vcpu_get_reg(vcpu, i)); *vcpu_pc(vcpu) = (u32)*vcpu_pc(vcpu); } out: return err; } #define vq_word(vq) (((vq) - SVE_VQ_MIN) / 64) #define vq_mask(vq) ((u64)1 << ((vq) - SVE_VQ_MIN) % 64) #define vq_present(vqs, vq) (!!((vqs)[vq_word(vq)] & vq_mask(vq))) static int get_sve_vls(struct kvm_vcpu *vcpu, const struct kvm_one_reg *reg) { unsigned int max_vq, vq; u64 vqs[KVM_ARM64_SVE_VLS_WORDS]; if (!vcpu_has_sve(vcpu)) return -ENOENT; if (WARN_ON(!sve_vl_valid(vcpu->arch.sve_max_vl))) return -EINVAL; memset(vqs, 0, sizeof(vqs)); max_vq = vcpu_sve_max_vq(vcpu); for (vq = SVE_VQ_MIN; vq <= max_vq; ++vq) if (sve_vq_available(vq)) vqs[vq_word(vq)] |= vq_mask(vq); if (copy_to_user((void __user *)reg->addr, vqs, sizeof(vqs))) return -EFAULT; return 0; } static int set_sve_vls(struct kvm_vcpu *vcpu, const struct kvm_one_reg *reg) { unsigned int max_vq, vq; u64 vqs[KVM_ARM64_SVE_VLS_WORDS]; if (!vcpu_has_sve(vcpu)) return -ENOENT; if (kvm_arm_vcpu_sve_finalized(vcpu)) return -EPERM; /* too late! */ if (WARN_ON(vcpu->arch.sve_state)) return -EINVAL; if (copy_from_user(vqs, (const void __user *)reg->addr, sizeof(vqs))) return -EFAULT; max_vq = 0; for (vq = SVE_VQ_MIN; vq <= SVE_VQ_MAX; ++vq) if (vq_present(vqs, vq)) max_vq = vq; if (max_vq > sve_vq_from_vl(kvm_sve_max_vl)) return -EINVAL; /* * Vector lengths supported by the host can't currently be * hidden from the guest individually: instead we can only set a * maximum via ZCR_EL2.LEN. So, make sure the available vector * lengths match the set requested exactly up to the requested * maximum: */ for (vq = SVE_VQ_MIN; vq <= max_vq; ++vq) if (vq_present(vqs, vq) != sve_vq_available(vq)) return -EINVAL; /* Can't run with no vector lengths at all: */ if (max_vq < SVE_VQ_MIN) return -EINVAL; /* vcpu->arch.sve_state will be alloc'd by kvm_vcpu_finalize_sve() */ vcpu->arch.sve_max_vl = sve_vl_from_vq(max_vq); return 0; } #define SVE_REG_SLICE_SHIFT 0 #define SVE_REG_SLICE_BITS 5 #define SVE_REG_ID_SHIFT (SVE_REG_SLICE_SHIFT + SVE_REG_SLICE_BITS) #define SVE_REG_ID_BITS 5 #define SVE_REG_SLICE_MASK \ GENMASK(SVE_REG_SLICE_SHIFT + SVE_REG_SLICE_BITS - 1, \ SVE_REG_SLICE_SHIFT) #define SVE_REG_ID_MASK \ GENMASK(SVE_REG_ID_SHIFT + SVE_REG_ID_BITS - 1, SVE_REG_ID_SHIFT) #define SVE_NUM_SLICES (1 << SVE_REG_SLICE_BITS) #define KVM_SVE_ZREG_SIZE KVM_REG_SIZE(KVM_REG_ARM64_SVE_ZREG(0, 0)) #define KVM_SVE_PREG_SIZE KVM_REG_SIZE(KVM_REG_ARM64_SVE_PREG(0, 0)) /* * Number of register slices required to cover each whole SVE register. * NOTE: Only the first slice every exists, for now. * If you are tempted to modify this, you must also rework sve_reg_to_region() * to match: */ #define vcpu_sve_slices(vcpu) 1 /* Bounds of a single SVE register slice within vcpu->arch.sve_state */ struct sve_state_reg_region { unsigned int koffset; /* offset into sve_state in kernel memory */ unsigned int klen; /* length in kernel memory */ unsigned int upad; /* extra trailing padding in user memory */ }; /* * Validate SVE register ID and get sanitised bounds for user/kernel SVE * register copy */ static int sve_reg_to_region(struct sve_state_reg_region *region, struct kvm_vcpu *vcpu, const struct kvm_one_reg *reg) { /* reg ID ranges for Z- registers */ const u64 zreg_id_min = KVM_REG_ARM64_SVE_ZREG(0, 0); const u64 zreg_id_max = KVM_REG_ARM64_SVE_ZREG(SVE_NUM_ZREGS - 1, SVE_NUM_SLICES - 1); /* reg ID ranges for P- registers and FFR (which are contiguous) */ const u64 preg_id_min = KVM_REG_ARM64_SVE_PREG(0, 0); const u64 preg_id_max = KVM_REG_ARM64_SVE_FFR(SVE_NUM_SLICES - 1); unsigned int vq; unsigned int reg_num; unsigned int reqoffset, reqlen; /* User-requested offset and length */ unsigned int maxlen; /* Maximum permitted length */ size_t sve_state_size; const u64 last_preg_id = KVM_REG_ARM64_SVE_PREG(SVE_NUM_PREGS - 1, SVE_NUM_SLICES - 1); /* Verify that the P-regs and FFR really do have contiguous IDs: */ BUILD_BUG_ON(KVM_REG_ARM64_SVE_FFR(0) != last_preg_id + 1); /* Verify that we match the UAPI header: */ BUILD_BUG_ON(SVE_NUM_SLICES != KVM_ARM64_SVE_MAX_SLICES); reg_num = (reg->id & SVE_REG_ID_MASK) >> SVE_REG_ID_SHIFT; if (reg->id >= zreg_id_min && reg->id <= zreg_id_max) { if (!vcpu_has_sve(vcpu) || (reg->id & SVE_REG_SLICE_MASK) > 0) return -ENOENT; vq = vcpu_sve_max_vq(vcpu); reqoffset = SVE_SIG_ZREG_OFFSET(vq, reg_num) - SVE_SIG_REGS_OFFSET; reqlen = KVM_SVE_ZREG_SIZE; maxlen = SVE_SIG_ZREG_SIZE(vq); } else if (reg->id >= preg_id_min && reg->id <= preg_id_max) { if (!vcpu_has_sve(vcpu) || (reg->id & SVE_REG_SLICE_MASK) > 0) return -ENOENT; vq = vcpu_sve_max_vq(vcpu); reqoffset = SVE_SIG_PREG_OFFSET(vq, reg_num) - SVE_SIG_REGS_OFFSET; reqlen = KVM_SVE_PREG_SIZE; maxlen = SVE_SIG_PREG_SIZE(vq); } else { return -EINVAL; } sve_state_size = vcpu_sve_state_size(vcpu); if (WARN_ON(!sve_state_size)) return -EINVAL; region->koffset = array_index_nospec(reqoffset, sve_state_size); region->klen = min(maxlen, reqlen); region->upad = reqlen - region->klen; return 0; } static int get_sve_reg(struct kvm_vcpu *vcpu, const struct kvm_one_reg *reg) { int ret; struct sve_state_reg_region region; char __user *uptr = (char __user *)reg->addr; /* Handle the KVM_REG_ARM64_SVE_VLS pseudo-reg as a special case: */ if (reg->id == KVM_REG_ARM64_SVE_VLS) return get_sve_vls(vcpu, reg); /* Try to interpret reg ID as an architectural SVE register... */ ret = sve_reg_to_region(&region, vcpu, reg); if (ret) return ret; if (!kvm_arm_vcpu_sve_finalized(vcpu)) return -EPERM; if (copy_to_user(uptr, vcpu->arch.sve_state + region.koffset, region.klen) || clear_user(uptr + region.klen, region.upad)) return -EFAULT; return 0; } static int set_sve_reg(struct kvm_vcpu *vcpu, const struct kvm_one_reg *reg) { int ret; struct sve_state_reg_region region; const char __user *uptr = (const char __user *)reg->addr; /* Handle the KVM_REG_ARM64_SVE_VLS pseudo-reg as a special case: */ if (reg->id == KVM_REG_ARM64_SVE_VLS) return set_sve_vls(vcpu, reg); /* Try to interpret reg ID as an architectural SVE register... */ ret = sve_reg_to_region(&region, vcpu, reg); if (ret) return ret; if (!kvm_arm_vcpu_sve_finalized(vcpu)) return -EPERM; if (copy_from_user(vcpu->arch.sve_state + region.koffset, uptr, region.klen)) return -EFAULT; return 0; } int kvm_arch_vcpu_ioctl_get_regs(struct kvm_vcpu *vcpu, struct kvm_regs *regs) { return -EINVAL; } int kvm_arch_vcpu_ioctl_set_regs(struct kvm_vcpu *vcpu, struct kvm_regs *regs) { return -EINVAL; } static int copy_core_reg_indices(const struct kvm_vcpu *vcpu, u64 __user *uindices) { unsigned int i; int n = 0; for (i = 0; i < sizeof(struct kvm_regs) / sizeof(__u32); i++) { u64 reg = KVM_REG_ARM64 | KVM_REG_ARM_CORE | i; int size = core_reg_size_from_offset(vcpu, i); if (size < 0) continue; switch (size) { case sizeof(__u32): reg |= KVM_REG_SIZE_U32; break; case sizeof(__u64): reg |= KVM_REG_SIZE_U64; break; case sizeof(__uint128_t): reg |= KVM_REG_SIZE_U128; break; default: WARN_ON(1); continue; } if (uindices) { if (put_user(reg, uindices)) return -EFAULT; uindices++; } n++; } return n; } static unsigned long num_core_regs(const struct kvm_vcpu *vcpu) { return copy_core_reg_indices(vcpu, NULL); } static const u64 timer_reg_list[] = { KVM_REG_ARM_TIMER_CTL, KVM_REG_ARM_TIMER_CNT, KVM_REG_ARM_TIMER_CVAL, KVM_REG_ARM_PTIMER_CTL, KVM_REG_ARM_PTIMER_CNT, KVM_REG_ARM_PTIMER_CVAL, }; #define NUM_TIMER_REGS ARRAY_SIZE(timer_reg_list) static bool is_timer_reg(u64 index) { switch (index) { case KVM_REG_ARM_TIMER_CTL: case KVM_REG_ARM_TIMER_CNT: case KVM_REG_ARM_TIMER_CVAL: case KVM_REG_ARM_PTIMER_CTL: case KVM_REG_ARM_PTIMER_CNT: case KVM_REG_ARM_PTIMER_CVAL: return true; } return false; } static int copy_timer_indices(struct kvm_vcpu *vcpu, u64 __user *uindices) { for (int i = 0; i < NUM_TIMER_REGS; i++) { if (put_user(timer_reg_list[i], uindices)) return -EFAULT; uindices++; } return 0; } static int set_timer_reg(struct kvm_vcpu *vcpu, const struct kvm_one_reg *reg) { void __user *uaddr = (void __user *)(long)reg->addr; u64 val; int ret; ret = copy_from_user(&val, uaddr, KVM_REG_SIZE(reg->id)); if (ret != 0) return -EFAULT; return kvm_arm_timer_set_reg(vcpu, reg->id, val); } static int get_timer_reg(struct kvm_vcpu *vcpu, const struct kvm_one_reg *reg) { void __user *uaddr = (void __user *)(long)reg->addr; u64 val; val = kvm_arm_timer_get_reg(vcpu, reg->id); return copy_to_user(uaddr, &val, KVM_REG_SIZE(reg->id)) ? -EFAULT : 0; } static unsigned long num_sve_regs(const struct kvm_vcpu *vcpu) { const unsigned int slices = vcpu_sve_slices(vcpu); if (!vcpu_has_sve(vcpu)) return 0; /* Policed by KVM_GET_REG_LIST: */ WARN_ON(!kvm_arm_vcpu_sve_finalized(vcpu)); return slices * (SVE_NUM_PREGS + SVE_NUM_ZREGS + 1 /* FFR */) + 1; /* KVM_REG_ARM64_SVE_VLS */ } static int copy_sve_reg_indices(const struct kvm_vcpu *vcpu, u64 __user *uindices) { const unsigned int slices = vcpu_sve_slices(vcpu); u64 reg; unsigned int i, n; int num_regs = 0; if (!vcpu_has_sve(vcpu)) return 0; /* Policed by KVM_GET_REG_LIST: */ WARN_ON(!kvm_arm_vcpu_sve_finalized(vcpu)); /* * Enumerate this first, so that userspace can save/restore in * the order reported by KVM_GET_REG_LIST: */ reg = KVM_REG_ARM64_SVE_VLS; if (put_user(reg, uindices++)) return -EFAULT; ++num_regs; for (i = 0; i < slices; i++) { for (n = 0; n < SVE_NUM_ZREGS; n++) { reg = KVM_REG_ARM64_SVE_ZREG(n, i); if (put_user(reg, uindices++)) return -EFAULT; num_regs++; } for (n = 0; n < SVE_NUM_PREGS; n++) { reg = KVM_REG_ARM64_SVE_PREG(n, i); if (put_user(reg, uindices++)) return -EFAULT; num_regs++; } reg = KVM_REG_ARM64_SVE_FFR(i); if (put_user(reg, uindices++)) return -EFAULT; num_regs++; } return num_regs; } /** * kvm_arm_num_regs - how many registers do we present via KVM_GET_ONE_REG * @vcpu: the vCPU pointer * * This is for all registers. */ unsigned long kvm_arm_num_regs(struct kvm_vcpu *vcpu) { unsigned long res = 0; res += num_core_regs(vcpu); res += num_sve_regs(vcpu); res += kvm_arm_num_sys_reg_descs(vcpu); res += kvm_arm_get_fw_num_regs(vcpu); res += NUM_TIMER_REGS; return res; } /** * kvm_arm_copy_reg_indices - get indices of all registers. * @vcpu: the vCPU pointer * @uindices: register list to copy * * We do core registers right here, then we append system regs. */ int kvm_arm_copy_reg_indices(struct kvm_vcpu *vcpu, u64 __user *uindices) { int ret; ret = copy_core_reg_indices(vcpu, uindices); if (ret < 0) return ret; uindices += ret; ret = copy_sve_reg_indices(vcpu, uindices); if (ret < 0) return ret; uindices += ret; ret = kvm_arm_copy_fw_reg_indices(vcpu, uindices); if (ret < 0) return ret; uindices += kvm_arm_get_fw_num_regs(vcpu); ret = copy_timer_indices(vcpu, uindices); if (ret < 0) return ret; uindices += NUM_TIMER_REGS; return kvm_arm_copy_sys_reg_indices(vcpu, uindices); } int kvm_arm_get_reg(struct kvm_vcpu *vcpu, const struct kvm_one_reg *reg) { /* We currently use nothing arch-specific in upper 32 bits */ if ((reg->id & ~KVM_REG_SIZE_MASK) >> 32 != KVM_REG_ARM64 >> 32) return -EINVAL; switch (reg->id & KVM_REG_ARM_COPROC_MASK) { case KVM_REG_ARM_CORE: return get_core_reg(vcpu, reg); case KVM_REG_ARM_FW: case KVM_REG_ARM_FW_FEAT_BMAP: return kvm_arm_get_fw_reg(vcpu, reg); case KVM_REG_ARM64_SVE: return get_sve_reg(vcpu, reg); } if (is_timer_reg(reg->id)) return get_timer_reg(vcpu, reg); return kvm_arm_sys_reg_get_reg(vcpu, reg); } int kvm_arm_set_reg(struct kvm_vcpu *vcpu, const struct kvm_one_reg *reg) { /* We currently use nothing arch-specific in upper 32 bits */ if ((reg->id & ~KVM_REG_SIZE_MASK) >> 32 != KVM_REG_ARM64 >> 32) return -EINVAL; switch (reg->id & KVM_REG_ARM_COPROC_MASK) { case KVM_REG_ARM_CORE: return set_core_reg(vcpu, reg); case KVM_REG_ARM_FW: case KVM_REG_ARM_FW_FEAT_BMAP: return kvm_arm_set_fw_reg(vcpu, reg); case KVM_REG_ARM64_SVE: return set_sve_reg(vcpu, reg); } if (is_timer_reg(reg->id)) return set_timer_reg(vcpu, reg); return kvm_arm_sys_reg_set_reg(vcpu, reg); } int kvm_arch_vcpu_ioctl_get_sregs(struct kvm_vcpu *vcpu, struct kvm_sregs *sregs) { return -EINVAL; } int kvm_arch_vcpu_ioctl_set_sregs(struct kvm_vcpu *vcpu, struct kvm_sregs *sregs) { return -EINVAL; } int __kvm_arm_vcpu_get_events(struct kvm_vcpu *vcpu, struct kvm_vcpu_events *events) { events->exception.serror_has_esr = cpus_have_final_cap(ARM64_HAS_RAS_EXTN); events->exception.serror_pending = (vcpu->arch.hcr_el2 & HCR_VSE) || vcpu_get_flag(vcpu, NESTED_SERROR_PENDING); if (events->exception.serror_pending && events->exception.serror_has_esr) events->exception.serror_esr = vcpu_get_vsesr(vcpu); /* * We never return a pending ext_dabt here because we deliver it to * the virtual CPU directly when setting the event and it's no longer * 'pending' at this point. */ return 0; } static void commit_pending_events(struct kvm_vcpu *vcpu) { if (!vcpu_get_flag(vcpu, PENDING_EXCEPTION)) return; /* * Reset the MMIO emulation state to avoid stepping PC after emulating * the exception entry. */ vcpu->mmio_needed = false; kvm_call_hyp(__kvm_adjust_pc, vcpu); } int __kvm_arm_vcpu_set_events(struct kvm_vcpu *vcpu, struct kvm_vcpu_events *events) { bool serror_pending = events->exception.serror_pending; bool has_esr = events->exception.serror_has_esr; bool ext_dabt_pending = events->exception.ext_dabt_pending; u64 esr = events->exception.serror_esr; int ret = 0; /* * Immediately commit the pending SEA to the vCPU's architectural * state which is necessary since we do not return a pending SEA * to userspace via KVM_GET_VCPU_EVENTS. */ if (ext_dabt_pending) { ret = kvm_inject_sea_dabt(vcpu, kvm_vcpu_get_hfar(vcpu)); commit_pending_events(vcpu); } if (ret < 0) return ret; if (!serror_pending) return 0; if (!cpus_have_final_cap(ARM64_HAS_RAS_EXTN) && has_esr) return -EINVAL; if (has_esr && (esr & ~ESR_ELx_ISS_MASK)) return -EINVAL; if (has_esr) ret = kvm_inject_serror_esr(vcpu, esr); else ret = kvm_inject_serror(vcpu); /* * We could've decided that the SError is due for immediate software * injection; commit the exception in case userspace decides it wants * to inject more exceptions for some strange reason. */ commit_pending_events(vcpu); return (ret < 0) ? ret : 0; } u32 __attribute_const__ kvm_target_cpu(void) { unsigned long implementor = read_cpuid_implementor(); unsigned long part_number = read_cpuid_part_number(); switch (implementor) { case ARM_CPU_IMP_ARM: switch (part_number) { case ARM_CPU_PART_AEM_V8: return KVM_ARM_TARGET_AEM_V8; case ARM_CPU_PART_FOUNDATION: return KVM_ARM_TARGET_FOUNDATION_V8; case ARM_CPU_PART_CORTEX_A53: return KVM_ARM_TARGET_CORTEX_A53; case ARM_CPU_PART_CORTEX_A57: return KVM_ARM_TARGET_CORTEX_A57; } break; case ARM_CPU_IMP_APM: switch (part_number) { case APM_CPU_PART_XGENE: return KVM_ARM_TARGET_XGENE_POTENZA; } break; } /* Return a default generic target */ return KVM_ARM_TARGET_GENERIC_V8; } int kvm_arch_vcpu_ioctl_get_fpu(struct kvm_vcpu *vcpu, struct kvm_fpu *fpu) { return -EINVAL; } int kvm_arch_vcpu_ioctl_set_fpu(struct kvm_vcpu *vcpu, struct kvm_fpu *fpu) { return -EINVAL; } int kvm_arch_vcpu_ioctl_translate(struct kvm_vcpu *vcpu, struct kvm_translation *tr) { return -EINVAL; } /** * kvm_arch_vcpu_ioctl_set_guest_debug - set up guest debugging * @vcpu: the vCPU pointer * @dbg: the ioctl data buffer * * This sets up and enables the VM for guest debugging. Userspace * passes in a control flag to enable different debug types and * potentially other architecture specific information in the rest of * the structure. */ int kvm_arch_vcpu_ioctl_set_guest_debug(struct kvm_vcpu *vcpu, struct kvm_guest_debug *dbg) { trace_kvm_set_guest_debug(vcpu, dbg->control); if (dbg->control & ~KVM_GUESTDBG_VALID_MASK) return -EINVAL; if (!(dbg->control & KVM_GUESTDBG_ENABLE)) { vcpu->guest_debug = 0; vcpu_clear_flag(vcpu, HOST_SS_ACTIVE_PENDING); return 0; } vcpu->guest_debug = dbg->control; /* Hardware assisted Break and Watch points */ if (vcpu->guest_debug & KVM_GUESTDBG_USE_HW) vcpu->arch.external_debug_state = dbg->arch; return 0; } int kvm_arm_vcpu_arch_set_attr(struct kvm_vcpu *vcpu, struct kvm_device_attr *attr) { int ret; switch (attr->group) { case KVM_ARM_VCPU_PMU_V3_CTRL: mutex_lock(&vcpu->kvm->arch.config_lock); ret = kvm_arm_pmu_v3_set_attr(vcpu, attr); mutex_unlock(&vcpu->kvm->arch.config_lock); break; case KVM_ARM_VCPU_TIMER_CTRL: ret = kvm_arm_timer_set_attr(vcpu, attr); break; case KVM_ARM_VCPU_PVTIME_CTRL: ret = kvm_arm_pvtime_set_attr(vcpu, attr); break; default: ret = -ENXIO; break; } return ret; } int kvm_arm_vcpu_arch_get_attr(struct kvm_vcpu *vcpu, struct kvm_device_attr *attr) { int ret; switch (attr->group) { case KVM_ARM_VCPU_PMU_V3_CTRL: ret = kvm_arm_pmu_v3_get_attr(vcpu, attr); break; case KVM_ARM_VCPU_TIMER_CTRL: ret = kvm_arm_timer_get_attr(vcpu, attr); break; case KVM_ARM_VCPU_PVTIME_CTRL: ret = kvm_arm_pvtime_get_attr(vcpu, attr); break; default: ret = -ENXIO; break; } return ret; } int kvm_arm_vcpu_arch_has_attr(struct kvm_vcpu *vcpu, struct kvm_device_attr *attr) { int ret; switch (attr->group) { case KVM_ARM_VCPU_PMU_V3_CTRL: ret = kvm_arm_pmu_v3_has_attr(vcpu, attr); break; case KVM_ARM_VCPU_TIMER_CTRL: ret = kvm_arm_timer_has_attr(vcpu, attr); break; case KVM_ARM_VCPU_PVTIME_CTRL: ret = kvm_arm_pvtime_has_attr(vcpu, attr); break; default: ret = -ENXIO; break; } return ret; } int kvm_vm_ioctl_mte_copy_tags(struct kvm *kvm, struct kvm_arm_copy_mte_tags *copy_tags) { gpa_t guest_ipa = copy_tags->guest_ipa; size_t length = copy_tags->length; void __user *tags = copy_tags->addr; gpa_t gfn; bool write = !(copy_tags->flags & KVM_ARM_TAGS_FROM_GUEST); int ret = 0; if (!kvm_has_mte(kvm)) return -EINVAL; if (copy_tags->reserved[0] || copy_tags->reserved[1]) return -EINVAL; if (copy_tags->flags & ~KVM_ARM_TAGS_FROM_GUEST) return -EINVAL; if (length & ~PAGE_MASK || guest_ipa & ~PAGE_MASK) return -EINVAL; /* Lengths above INT_MAX cannot be represented in the return value */ if (length > INT_MAX) return -EINVAL; gfn = gpa_to_gfn(guest_ipa); mutex_lock(&kvm->slots_lock); if (write && atomic_read(&kvm->nr_memslots_dirty_logging)) { ret = -EBUSY; goto out; } while (length > 0) { struct page *page = __gfn_to_page(kvm, gfn, write); void *maddr; unsigned long num_tags; struct folio *folio; if (!page) { ret = -EFAULT; goto out; } if (!pfn_to_online_page(page_to_pfn(page))) { /* Reject ZONE_DEVICE memory */ kvm_release_page_unused(page); ret = -EFAULT; goto out; } folio = page_folio(page); maddr = page_address(page); if (!write) { if ((folio_test_hugetlb(folio) && folio_test_hugetlb_mte_tagged(folio)) || page_mte_tagged(page)) num_tags = mte_copy_tags_to_user(tags, maddr, MTE_GRANULES_PER_PAGE); else /* No tags in memory, so write zeros */ num_tags = MTE_GRANULES_PER_PAGE - clear_user(tags, MTE_GRANULES_PER_PAGE); kvm_release_page_clean(page); } else { /* * Only locking to serialise with a concurrent * __set_ptes() in the VMM but still overriding the * tags, hence ignoring the return value. */ if (folio_test_hugetlb(folio)) folio_try_hugetlb_mte_tagging(folio); else try_page_mte_tagging(page); num_tags = mte_copy_tags_from_user(maddr, tags, MTE_GRANULES_PER_PAGE); /* uaccess failed, don't leave stale tags */ if (num_tags != MTE_GRANULES_PER_PAGE) mte_clear_page_tags(maddr); if (folio_test_hugetlb(folio)) folio_set_hugetlb_mte_tagged(folio); else set_page_mte_tagged(page); kvm_release_page_dirty(page); } if (num_tags != MTE_GRANULES_PER_PAGE) { ret = -EFAULT; goto out; } gfn++; tags += num_tags; length -= PAGE_SIZE; } out: mutex_unlock(&kvm->slots_lock); /* If some data has been copied report the number of bytes copied */ if (length != copy_tags->length) return copy_tags->length - length; return ret; }
226 13 13 13 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 /* SPDX-License-Identifier: GPL-2.0 */ #ifndef _LINUX_FILELOCK_H #define _LINUX_FILELOCK_H #include <linux/fs.h> #define FL_POSIX 1 #define FL_FLOCK 2 #define FL_DELEG 4 /* NFSv4 delegation */ #define FL_ACCESS 8 /* not trying to lock, just looking */ #define FL_EXISTS 16 /* when unlocking, test for existence */ #define FL_LEASE 32 /* lease held on this file */ #define FL_CLOSE 64 /* unlock on close */ #define FL_SLEEP 128 /* A blocking lock */ #define FL_DOWNGRADE_PENDING 256 /* Lease is being downgraded */ #define FL_UNLOCK_PENDING 512 /* Lease is being broken */ #define FL_OFDLCK 1024 /* lock is "owned" by struct file */ #define FL_LAYOUT 2048 /* outstanding pNFS layout */ #define FL_RECLAIM 4096 /* reclaiming from a reboot server */ #define FL_CLOSE_POSIX (FL_POSIX | FL_CLOSE) /* * Special return value from posix_lock_file() and vfs_lock_file() for * asynchronous locking. */ #define FILE_LOCK_DEFERRED 1 struct file_lock; struct file_lease; struct file_lock_operations { void (*fl_copy_lock)(struct file_lock *, struct file_lock *); void (*fl_release_private)(struct file_lock *); }; struct lock_manager_operations { void *lm_mod_owner; fl_owner_t (*lm_get_owner)(fl_owner_t); void (*lm_put_owner)(fl_owner_t); void (*lm_notify)(struct file_lock *); /* unblock callback */ int (*lm_grant)(struct file_lock *, int); bool (*lm_lock_expirable)(struct file_lock *cfl); void (*lm_expire_lock)(void); }; struct lease_manager_operations { bool (*lm_break)(struct file_lease *); int (*lm_change)(struct file_lease *, int, struct list_head *); void (*lm_setup)(struct file_lease *, void **); bool (*lm_breaker_owns_lease)(struct file_lease *); }; struct lock_manager { struct list_head list; /* * NFSv4 and up also want opens blocked during the grace period; * NLM doesn't care: */ bool block_opens; }; struct net; void locks_start_grace(struct net *, struct lock_manager *); void locks_end_grace(struct lock_manager *); bool locks_in_grace(struct net *); bool opens_in_grace(struct net *); /* * struct file_lock has a union that some filesystems use to track * their own private info. The NFS side of things is defined here: */ #include <linux/nfs_fs_i.h> /* * struct file_lock represents a generic "file lock". It's used to represent * POSIX byte range locks, BSD (flock) locks, and leases. It's important to * note that the same struct is used to represent both a request for a lock and * the lock itself, but the same object is never used for both. * * FIXME: should we create a separate "struct lock_request" to help distinguish * these two uses? * * The varous i_flctx lists are ordered by: * * 1) lock owner * 2) lock range start * 3) lock range end * * Obviously, the last two criteria only matter for POSIX locks. */ struct file_lock_core { struct file_lock_core *flc_blocker; /* The lock that is blocking us */ struct list_head flc_list; /* link into file_lock_context */ struct hlist_node flc_link; /* node in global lists */ struct list_head flc_blocked_requests; /* list of requests with * ->fl_blocker pointing here */ struct list_head flc_blocked_member; /* node in * ->fl_blocker->fl_blocked_requests */ fl_owner_t flc_owner; unsigned int flc_flags; unsigned char flc_type; pid_t flc_pid; int flc_link_cpu; /* what cpu's list is this on? */ wait_queue_head_t flc_wait; struct file *flc_file; }; struct file_lock { struct file_lock_core c; loff_t fl_start; loff_t fl_end; const struct file_lock_operations *fl_ops; /* Callbacks for filesystems */ const struct lock_manager_operations *fl_lmops; /* Callbacks for lockmanagers */ union { struct nfs_lock_info nfs_fl; struct nfs4_lock_info nfs4_fl; struct { struct list_head link; /* link in AFS vnode's pending_locks list */ int state; /* state of grant or error if -ve */ unsigned int debug_id; } afs; struct { struct inode *inode; } ceph; } fl_u; } __randomize_layout; struct file_lease { struct file_lock_core c; struct fasync_struct * fl_fasync; /* for lease break notifications */ /* for lease breaks: */ unsigned long fl_break_time; unsigned long fl_downgrade_time; const struct lease_manager_operations *fl_lmops; /* Callbacks for lease managers */ } __randomize_layout; struct file_lock_context { spinlock_t flc_lock; struct list_head flc_flock; struct list_head flc_posix; struct list_head flc_lease; }; #ifdef CONFIG_FILE_LOCKING int fcntl_getlk(struct file *, unsigned int, struct flock *); int fcntl_setlk(unsigned int, struct file *, unsigned int, struct flock *); #if BITS_PER_LONG == 32 int fcntl_getlk64(struct file *, unsigned int, struct flock64 *); int fcntl_setlk64(unsigned int, struct file *, unsigned int, struct flock64 *); #endif int fcntl_setlease(unsigned int fd, struct file *filp, int arg); int fcntl_getlease(struct file *filp); static inline bool lock_is_unlock(struct file_lock *fl) { return fl->c.flc_type == F_UNLCK; } static inline bool lock_is_read(struct file_lock *fl) { return fl->c.flc_type == F_RDLCK; } static inline bool lock_is_write(struct file_lock *fl) { return fl->c.flc_type == F_WRLCK; } static inline void locks_wake_up(struct file_lock *fl) { wake_up(&fl->c.flc_wait); } static inline bool locks_can_async_lock(const struct file_operations *fops) { return !fops->lock || fops->fop_flags & FOP_ASYNC_LOCK; } /* fs/locks.c */ void locks_free_lock_context(struct inode *inode); void locks_free_lock(struct file_lock *fl); void locks_init_lock(struct file_lock *); struct file_lock *locks_alloc_lock(void); void locks_copy_lock(struct file_lock *, struct file_lock *); void locks_copy_conflock(struct file_lock *, struct file_lock *); void locks_remove_posix(struct file *, fl_owner_t); void locks_remove_file(struct file *); void locks_release_private(struct file_lock *); void posix_test_lock(struct file *, struct file_lock *); int posix_lock_file(struct file *, struct file_lock *, struct file_lock *); int locks_delete_block(struct file_lock *); int vfs_test_lock(struct file *, struct file_lock *); int vfs_lock_file(struct file *, unsigned int, struct file_lock *, struct file_lock *); int vfs_cancel_lock(struct file *filp, struct file_lock *fl); bool vfs_inode_has_locks(struct inode *inode); int locks_lock_inode_wait(struct inode *inode, struct file_lock *fl); void locks_init_lease(struct file_lease *); void locks_free_lease(struct file_lease *fl); struct file_lease *locks_alloc_lease(void); int __break_lease(struct inode *inode, unsigned int flags, unsigned int type); void lease_get_mtime(struct inode *, struct timespec64 *time); int generic_setlease(struct file *, int, struct file_lease **, void **priv); int kernel_setlease(struct file *, int, struct file_lease **, void **); int vfs_setlease(struct file *, int, struct file_lease **, void **); int lease_modify(struct file_lease *, int, struct list_head *); struct notifier_block; int lease_register_notifier(struct notifier_block *); void lease_unregister_notifier(struct notifier_block *); struct files_struct; void show_fd_locks(struct seq_file *f, struct file *filp, struct files_struct *files); bool locks_owner_has_blockers(struct file_lock_context *flctx, fl_owner_t owner); static inline struct file_lock_context * locks_inode_context(const struct inode *inode) { return smp_load_acquire(&inode->i_flctx); } #else /* !CONFIG_FILE_LOCKING */ static inline int fcntl_getlk(struct file *file, unsigned int cmd, struct flock __user *user) { return -EINVAL; } static inline int fcntl_setlk(unsigned int fd, struct file *file, unsigned int cmd, struct flock __user *user) { return -EACCES; } #if BITS_PER_LONG == 32 static inline int fcntl_getlk64(struct file *file, unsigned int cmd, struct flock64 *user) { return -EINVAL; } static inline int fcntl_setlk64(unsigned int fd, struct file *file, unsigned int cmd, struct flock64 *user) { return -EACCES; } #endif static inline int fcntl_setlease(unsigned int fd, struct file *filp, int arg) { return -EINVAL; } static inline int fcntl_getlease(struct file *filp) { return F_UNLCK; } static inline bool lock_is_unlock(struct file_lock *fl) { return false; } static inline bool lock_is_read(struct file_lock *fl) { return false; } static inline bool lock_is_write(struct file_lock *fl) { return false; } static inline void locks_wake_up(struct file_lock *fl) { } static inline void locks_free_lock_context(struct inode *inode) { } static inline void locks_init_lock(struct file_lock *fl) { return; } static inline void locks_init_lease(struct file_lease *fl) { return; } static inline void locks_copy_conflock(struct file_lock *new, struct file_lock *fl) { return; } static inline void locks_copy_lock(struct file_lock *new, struct file_lock *fl) { return; } static inline void locks_remove_posix(struct file *filp, fl_owner_t owner) { return; } static inline void locks_remove_file(struct file *filp) { return; } static inline void posix_test_lock(struct file *filp, struct file_lock *fl) { return; } static inline int posix_lock_file(struct file *filp, struct file_lock *fl, struct file_lock *conflock) { return -ENOLCK; } static inline int locks_delete_block(struct file_lock *waiter) { return -ENOENT; } static inline int vfs_test_lock(struct file *filp, struct file_lock *fl) { return 0; } static inline int vfs_lock_file(struct file *filp, unsigned int cmd, struct file_lock *fl, struct file_lock *conf) { return -ENOLCK; } static inline int vfs_cancel_lock(struct file *filp, struct file_lock *fl) { return 0; } static inline bool vfs_inode_has_locks(struct inode *inode) { return false; } static inline int locks_lock_inode_wait(struct inode *inode, struct file_lock *fl) { return -ENOLCK; } static inline int __break_lease(struct inode *inode, unsigned int mode, unsigned int type) { return 0; } static inline void lease_get_mtime(struct inode *inode, struct timespec64 *time) { return; } static inline int generic_setlease(struct file *filp, int arg, struct file_lease **flp, void **priv) { return -EINVAL; } static inline int kernel_setlease(struct file *filp, int arg, struct file_lease **lease, void **priv) { return -EINVAL; } static inline int vfs_setlease(struct file *filp, int arg, struct file_lease **lease, void **priv) { return -EINVAL; } static inline int lease_modify(struct file_lease *fl, int arg, struct list_head *dispose) { return -EINVAL; } struct files_struct; static inline void show_fd_locks(struct seq_file *f, struct file *filp, struct files_struct *files) {} static inline bool locks_owner_has_blockers(struct file_lock_context *flctx, fl_owner_t owner) { return false; } static inline struct file_lock_context * locks_inode_context(const struct inode *inode) { return NULL; } #endif /* !CONFIG_FILE_LOCKING */ /* for walking lists of file_locks linked by fl_list */ #define for_each_file_lock(_fl, _head) list_for_each_entry(_fl, _head, c.flc_list) static inline int locks_lock_file_wait(struct file *filp, struct file_lock *fl) { return locks_lock_inode_wait(file_inode(filp), fl); } #ifdef CONFIG_FILE_LOCKING static inline int break_lease(struct inode *inode, unsigned int mode) { struct file_lock_context *flctx; /* * Since this check is lockless, we must ensure that any refcounts * taken are done before checking i_flctx->flc_lease. Otherwise, we * could end up racing with tasks trying to set a new lease on this * file. */ flctx = READ_ONCE(inode->i_flctx); if (!flctx) return 0; smp_mb(); if (!list_empty_careful(&flctx->flc_lease)) return __break_lease(inode, mode, FL_LEASE); return 0; } static inline int break_deleg(struct inode *inode, unsigned int mode) { struct file_lock_context *flctx; /* * Since this check is lockless, we must ensure that any refcounts * taken are done before checking i_flctx->flc_lease. Otherwise, we * could end up racing with tasks trying to set a new lease on this * file. */ flctx = READ_ONCE(inode->i_flctx); if (!flctx) return 0; smp_mb(); if (!list_empty_careful(&flctx->flc_lease)) return __break_lease(inode, mode, FL_DELEG); return 0; } static inline int try_break_deleg(struct inode *inode, struct inode **delegated_inode) { int ret; ret = break_deleg(inode, O_WRONLY|O_NONBLOCK); if (ret == -EWOULDBLOCK && delegated_inode) { *delegated_inode = inode; ihold(inode); } return ret; } static inline int break_deleg_wait(struct inode **delegated_inode) { int ret; ret = break_deleg(*delegated_inode, O_WRONLY); iput(*delegated_inode); *delegated_inode = NULL; return ret; } static inline int break_layout(struct inode *inode, bool wait) { smp_mb(); if (inode->i_flctx && !list_empty_careful(&inode->i_flctx->flc_lease)) return __break_lease(inode, wait ? O_WRONLY : O_WRONLY | O_NONBLOCK, FL_LAYOUT); return 0; } #else /* !CONFIG_FILE_LOCKING */ static inline int break_lease(struct inode *inode, unsigned int mode) { return 0; } static inline int break_deleg(struct inode *inode, unsigned int mode) { return 0; } static inline int try_break_deleg(struct inode *inode, struct inode **delegated_inode) { return 0; } static inline int break_deleg_wait(struct inode **delegated_inode) { BUG(); return 0; } static inline int break_layout(struct inode *inode, bool wait) { return 0; } #endif /* CONFIG_FILE_LOCKING */ #endif /* _LINUX_FILELOCK_H */
1070 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 /* SPDX-License-Identifier: GPL-2.0-only */ /* * Based on arch/arm/include/asm/memory.h * * Copyright (C) 2000-2002 Russell King * Copyright (C) 2012 ARM Ltd. * * Note: this file should not be included by non-asm/.h files */ #ifndef __ASM_MEMORY_H #define __ASM_MEMORY_H #include <linux/const.h> #include <linux/sizes.h> #include <asm/page-def.h> /* * Size of the PCI I/O space. This must remain a power of two so that * IO_SPACE_LIMIT acts as a mask for the low bits of I/O addresses. */ #define PCI_IO_SIZE SZ_16M /* * VMEMMAP_SIZE - allows the whole linear region to be covered by * a struct page array * * If we are configured with a 52-bit kernel VA then our VMEMMAP_SIZE * needs to cover the memory region from the beginning of the 52-bit * PAGE_OFFSET all the way to PAGE_END for 48-bit. This allows us to * keep a constant PAGE_OFFSET and "fallback" to using the higher end * of the VMEMMAP where 52-bit support is not available in hardware. */ #define VMEMMAP_RANGE (_PAGE_END(VA_BITS_MIN) - PAGE_OFFSET) #define VMEMMAP_SIZE ((VMEMMAP_RANGE >> PAGE_SHIFT) * sizeof(struct page)) /* * PAGE_OFFSET - the virtual address of the start of the linear map, at the * start of the TTBR1 address space. * PAGE_END - the end of the linear map, where all other kernel mappings begin. * KIMAGE_VADDR - the virtual address of the start of the kernel image. * VA_BITS - the maximum number of bits for virtual addresses. */ #define VA_BITS (CONFIG_ARM64_VA_BITS) #define _PAGE_OFFSET(va) (-(UL(1) << (va))) #define PAGE_OFFSET (_PAGE_OFFSET(VA_BITS)) #define KIMAGE_VADDR (MODULES_END) #define MODULES_END (MODULES_VADDR + MODULES_VSIZE) #define MODULES_VADDR (_PAGE_END(VA_BITS_MIN)) #define MODULES_VSIZE (SZ_2G) #define VMEMMAP_START (VMEMMAP_END - VMEMMAP_SIZE) #define VMEMMAP_END (-UL(SZ_1G)) #define PCI_IO_START (VMEMMAP_END + SZ_8M) #define PCI_IO_END (PCI_IO_START + PCI_IO_SIZE) #define FIXADDR_TOP (-UL(SZ_8M)) #if VA_BITS > 48 #ifdef CONFIG_ARM64_16K_PAGES #define VA_BITS_MIN (47) #else #define VA_BITS_MIN (48) #endif #else #define VA_BITS_MIN (VA_BITS) #endif #define _PAGE_END(va) (-(UL(1) << ((va) - 1))) #define KERNEL_START _text #define KERNEL_END _end /* * Generic and Software Tag-Based KASAN modes require 1/8th and 1/16th of the * kernel virtual address space for storing the shadow memory respectively. * * The mapping between a virtual memory address and its corresponding shadow * memory address is defined based on the formula: * * shadow_addr = (addr >> KASAN_SHADOW_SCALE_SHIFT) + KASAN_SHADOW_OFFSET * * where KASAN_SHADOW_SCALE_SHIFT is the order of the number of bits that map * to a single shadow byte and KASAN_SHADOW_OFFSET is a constant that offsets * the mapping. Note that KASAN_SHADOW_OFFSET does not point to the start of * the shadow memory region. * * Based on this mapping, we define two constants: * * KASAN_SHADOW_START: the start of the shadow memory region; * KASAN_SHADOW_END: the end of the shadow memory region. * * KASAN_SHADOW_END is defined first as the shadow address that corresponds to * the upper bound of possible virtual kernel memory addresses UL(1) << 64 * according to the mapping formula. * * KASAN_SHADOW_START is defined second based on KASAN_SHADOW_END. The shadow * memory start must map to the lowest possible kernel virtual memory address * and thus it depends on the actual bitness of the address space. * * As KASAN inserts redzones between stack variables, this increases the stack * memory usage significantly. Thus, we double the (minimum) stack size. */ #if defined(CONFIG_KASAN_GENERIC) || defined(CONFIG_KASAN_SW_TAGS) #define KASAN_SHADOW_OFFSET _AC(CONFIG_KASAN_SHADOW_OFFSET, UL) #define KASAN_SHADOW_END ((UL(1) << (64 - KASAN_SHADOW_SCALE_SHIFT)) + KASAN_SHADOW_OFFSET) #define _KASAN_SHADOW_START(va) (KASAN_SHADOW_END - (UL(1) << ((va) - KASAN_SHADOW_SCALE_SHIFT))) #define KASAN_SHADOW_START _KASAN_SHADOW_START(vabits_actual) #define PAGE_END KASAN_SHADOW_START #define KASAN_THREAD_SHIFT 1 #else #define KASAN_THREAD_SHIFT 0 #define PAGE_END (_PAGE_END(VA_BITS_MIN)) #endif /* CONFIG_KASAN */ #define DIRECT_MAP_PHYSMEM_END __pa(PAGE_END - 1) #define MIN_THREAD_SHIFT (14 + KASAN_THREAD_SHIFT) /* * VMAP'd stacks are allocated at page granularity, so we must ensure that such * stacks are a multiple of page size. */ #if defined(CONFIG_VMAP_STACK) && (MIN_THREAD_SHIFT < PAGE_SHIFT) #define THREAD_SHIFT PAGE_SHIFT #else #define THREAD_SHIFT MIN_THREAD_SHIFT #endif #if THREAD_SHIFT >= PAGE_SHIFT #define THREAD_SIZE_ORDER (THREAD_SHIFT - PAGE_SHIFT) #endif #define THREAD_SIZE (UL(1) << THREAD_SHIFT) /* * By aligning VMAP'd stacks to 2 * THREAD_SIZE, we can detect overflow by * checking sp & (1 << THREAD_SHIFT), which we can do cheaply in the entry * assembly. */ #ifdef CONFIG_VMAP_STACK #define THREAD_ALIGN (2 * THREAD_SIZE) #else #define THREAD_ALIGN THREAD_SIZE #endif #define IRQ_STACK_SIZE THREAD_SIZE #define OVERFLOW_STACK_SIZE SZ_4K #define NVHE_STACK_SHIFT PAGE_SHIFT #define NVHE_STACK_SIZE (UL(1) << NVHE_STACK_SHIFT) /* * With the minimum frame size of [x29, x30], exactly half the combined * sizes of the hyp and overflow stacks is the maximum size needed to * save the unwinded stacktrace; plus an additional entry to delimit the * end. */ #define NVHE_STACKTRACE_SIZE ((OVERFLOW_STACK_SIZE + NVHE_STACK_SIZE) / 2 + sizeof(long)) /* * Alignment of kernel segments (e.g. .text, .data). * * 4 KB granule: 16 level 3 entries, with contiguous bit * 16 KB granule: 4 level 3 entries, without contiguous bit * 64 KB granule: 1 level 3 entry */ #define SEGMENT_ALIGN SZ_64K /* * Memory types available. * * IMPORTANT: MT_NORMAL must be index 0 since vm_get_page_prot() may 'or' in * the MT_NORMAL_TAGGED memory type for PROT_MTE mappings. Note * that protection_map[] only contains MT_NORMAL attributes. */ #define MT_NORMAL 0 #define MT_NORMAL_TAGGED 1 #define MT_NORMAL_NC 2 #define MT_DEVICE_nGnRnE 3 #define MT_DEVICE_nGnRE 4 /* * Memory types for Stage-2 translation */ #define MT_S2_NORMAL 0xf #define MT_S2_NORMAL_NC 0x5 #define MT_S2_DEVICE_nGnRE 0x1 /* * Memory types for Stage-2 translation when ID_AA64MMFR2_EL1.FWB is 0001 * Stage-2 enforces Normal-WB and Device-nGnRE */ #define MT_S2_FWB_NORMAL 6 #define MT_S2_FWB_NORMAL_NC 5 #define MT_S2_FWB_DEVICE_nGnRE 1 #ifdef CONFIG_ARM64_4K_PAGES #define IOREMAP_MAX_ORDER (PUD_SHIFT) #else #define IOREMAP_MAX_ORDER (PMD_SHIFT) #endif /* * Open-coded (swapper_pg_dir - reserved_pg_dir) as this cannot be calculated * until link time. */ #define RESERVED_SWAPPER_OFFSET (PAGE_SIZE) /* * Open-coded (swapper_pg_dir - tramp_pg_dir) as this cannot be calculated * until link time. */ #define TRAMP_SWAPPER_OFFSET (2 * PAGE_SIZE) #ifndef __ASSEMBLY__ #include <linux/bitops.h> #include <linux/compiler.h> #include <linux/mmdebug.h> #include <linux/types.h> #include <asm/boot.h> #include <asm/bug.h> #include <asm/sections.h> #include <asm/sysreg.h> static inline u64 __pure read_tcr(void) { u64 tcr; // read_sysreg() uses asm volatile, so avoid it here asm("mrs %0, tcr_el1" : "=r"(tcr)); return tcr; } #if VA_BITS > 48 // For reasons of #include hell, we can't use TCR_T1SZ_OFFSET/TCR_T1SZ_MASK here #define vabits_actual (64 - ((read_tcr() >> 16) & 63)) #else #define vabits_actual ((u64)VA_BITS) #endif extern s64 memstart_addr; /* PHYS_OFFSET - the physical address of the start of memory. */ #define PHYS_OFFSET ({ VM_BUG_ON(memstart_addr & 1); memstart_addr; }) /* the offset between the kernel virtual and physical mappings */ extern u64 kimage_voffset; static inline unsigned long kaslr_offset(void) { return (u64)&_text - KIMAGE_VADDR; } #ifdef CONFIG_RANDOMIZE_BASE void kaslr_init(void); static inline bool kaslr_enabled(void) { extern bool __kaslr_is_enabled; return __kaslr_is_enabled; } #else static inline void kaslr_init(void) { } static inline bool kaslr_enabled(void) { return false; } #endif /* * Allow all memory at the discovery stage. We will clip it later. */ #define MIN_MEMBLOCK_ADDR 0 #define MAX_MEMBLOCK_ADDR U64_MAX /* * PFNs are used to describe any physical page; this means * PFN 0 == physical address 0. * * This is the PFN of the first RAM page in the kernel * direct-mapped view. We assume this is the first page * of RAM in the mem_map as well. */ #define PHYS_PFN_OFFSET (PHYS_OFFSET >> PAGE_SHIFT) /* * When dealing with data aborts, watchpoints, or instruction traps we may end * up with a tagged userland pointer. Clear the tag to get a sane pointer to * pass on to access_ok(), for instance. */ #define __untagged_addr(addr) \ ((__force __typeof__(addr))sign_extend64((__force u64)(addr), 55)) #define untagged_addr(addr) ({ \ u64 __addr = (__force u64)(addr); \ __addr &= __untagged_addr(__addr); \ (__force __typeof__(addr))__addr; \ }) #if defined(CONFIG_KASAN_SW_TAGS) || defined(CONFIG_KASAN_HW_TAGS) #define __tag_shifted(tag) ((u64)(tag) << 56) #define __tag_reset(addr) __untagged_addr(addr) #define __tag_get(addr) (__u8)((u64)(addr) >> 56) #else #define __tag_shifted(tag) 0UL #define __tag_reset(addr) (addr) #define __tag_get(addr) 0 #endif /* CONFIG_KASAN_SW_TAGS || CONFIG_KASAN_HW_TAGS */ static inline const void *__tag_set(const void *addr, u8 tag) { u64 __addr = (u64)addr & ~__tag_shifted(0xff); return (const void *)(__addr | __tag_shifted(tag)); } #ifdef CONFIG_KASAN_HW_TAGS #define arch_enable_tag_checks_sync() mte_enable_kernel_sync() #define arch_enable_tag_checks_async() mte_enable_kernel_async() #define arch_enable_tag_checks_asymm() mte_enable_kernel_asymm() #define arch_suppress_tag_checks_start() mte_enable_tco() #define arch_suppress_tag_checks_stop() mte_disable_tco() #define arch_force_async_tag_fault() mte_check_tfsr_exit() #define arch_get_random_tag() mte_get_random_tag() #define arch_get_mem_tag(addr) mte_get_mem_tag(addr) #define arch_set_mem_tag_range(addr, size, tag, init) \ mte_set_mem_tag_range((addr), (size), (tag), (init)) #endif /* CONFIG_KASAN_HW_TAGS */ /* * Physical vs virtual RAM address space conversion. These are * private definitions which should NOT be used outside memory.h * files. Use virt_to_phys/phys_to_virt/__pa/__va instead. */ /* * Check whether an arbitrary address is within the linear map, which * lives in the [PAGE_OFFSET, PAGE_END) interval at the bottom of the * kernel's TTBR1 address range. */ #define __is_lm_address(addr) (((u64)(addr) - PAGE_OFFSET) < (PAGE_END - PAGE_OFFSET)) #define __lm_to_phys(addr) (((addr) - PAGE_OFFSET) + PHYS_OFFSET) #define __kimg_to_phys(addr) ((addr) - kimage_voffset) #define __virt_to_phys_nodebug(x) ({ \ phys_addr_t __x = (phys_addr_t)(__tag_reset(x)); \ __is_lm_address(__x) ? __lm_to_phys(__x) : __kimg_to_phys(__x); \ }) #define __pa_symbol_nodebug(x) __kimg_to_phys((phys_addr_t)(x)) #ifdef CONFIG_DEBUG_VIRTUAL extern phys_addr_t __virt_to_phys(unsigned long x); extern phys_addr_t __phys_addr_symbol(unsigned long x); #else #define __virt_to_phys(x) __virt_to_phys_nodebug(x) #define __phys_addr_symbol(x) __pa_symbol_nodebug(x) #endif /* CONFIG_DEBUG_VIRTUAL */ #define __phys_to_virt(x) ((unsigned long)((x) - PHYS_OFFSET) | PAGE_OFFSET) #define __phys_to_kimg(x) ((unsigned long)((x) + kimage_voffset)) /* * Note: Drivers should NOT use these. They are the wrong * translation for translating DMA addresses. Use the driver * DMA support - see dma-mapping.h. */ #define virt_to_phys virt_to_phys static inline phys_addr_t virt_to_phys(const volatile void *x) { return __virt_to_phys((unsigned long)(x)); } #define phys_to_virt phys_to_virt static inline void *phys_to_virt(phys_addr_t x) { return (void *)(__phys_to_virt(x)); } /* Needed already here for resolving __phys_to_pfn() in virt_to_pfn() */ #include <asm-generic/memory_model.h> static inline unsigned long virt_to_pfn(const void *kaddr) { return __phys_to_pfn(virt_to_phys(kaddr)); } /* * Drivers should NOT use these either. */ #define __pa(x) __virt_to_phys((unsigned long)(x)) #define __pa_symbol(x) __phys_addr_symbol(RELOC_HIDE((unsigned long)(x), 0)) #define __pa_nodebug(x) __virt_to_phys_nodebug((unsigned long)(x)) #define __va(x) ((void *)__phys_to_virt((phys_addr_t)(x))) #define pfn_to_kaddr(pfn) __va((pfn) << PAGE_SHIFT) #define sym_to_pfn(x) __phys_to_pfn(__pa_symbol(x)) /* * virt_to_page(x) convert a _valid_ virtual address to struct page * * virt_addr_valid(x) indicates whether a virtual address is valid */ #define ARCH_PFN_OFFSET ((unsigned long)PHYS_PFN_OFFSET) #if defined(CONFIG_DEBUG_VIRTUAL) #define page_to_virt(x) ({ \ __typeof__(x) __page = x; \ void *__addr = __va(page_to_phys(__page)); \ (void *)__tag_set((const void *)__addr, page_kasan_tag(__page));\ }) #define virt_to_page(x) pfn_to_page(virt_to_pfn(x)) #else #define page_to_virt(x) ({ \ __typeof__(x) __page = x; \ u64 __idx = ((u64)__page - VMEMMAP_START) / sizeof(struct page);\ u64 __addr = PAGE_OFFSET + (__idx * PAGE_SIZE); \ (void *)__tag_set((const void *)__addr, page_kasan_tag(__page));\ }) #define virt_to_page(x) ({ \ u64 __idx = (__tag_reset((u64)x) - PAGE_OFFSET) / PAGE_SIZE; \ u64 __addr = VMEMMAP_START + (__idx * sizeof(struct page)); \ (struct page *)__addr; \ }) #endif /* CONFIG_DEBUG_VIRTUAL */ #define virt_addr_valid(addr) ({ \ __typeof__(addr) __addr = __tag_reset(addr); \ __is_lm_address(__addr) && pfn_is_map_memory(virt_to_pfn(__addr)); \ }) void dump_mem_limit(void); #endif /* !ASSEMBLY */ /* * Given that the GIC architecture permits ITS implementations that can only be * configured with a LPI table address once, GICv3 systems with many CPUs may * end up reserving a lot of different regions after a kexec for their LPI * tables (one per CPU), as we are forced to reuse the same memory after kexec * (and thus reserve it persistently with EFI beforehand) */ #if defined(CONFIG_EFI) && defined(CONFIG_ARM_GIC_V3_ITS) # define INIT_MEMBLOCK_RESERVED_REGIONS (INIT_MEMBLOCK_REGIONS + NR_CPUS + 1) #endif /* * memory regions which marked with flag MEMBLOCK_NOMAP(for example, the memory * of the EFI_UNUSABLE_MEMORY type) may divide a continuous memory block into * multiple parts. As a result, the number of memory regions is large. */ #ifdef CONFIG_EFI #define INIT_MEMBLOCK_MEMORY_REGIONS (INIT_MEMBLOCK_REGIONS * 8) #endif #endif /* __ASM_MEMORY_H */
347 385 384 118 347 346 347 100 139 241 277 277 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 // SPDX-License-Identifier: GPL-2.0-only /* * Based on arch/arm/mm/flush.c * * Copyright (C) 1995-2002 Russell King * Copyright (C) 2012 ARM Ltd. */ #include <linux/export.h> #include <linux/mm.h> #include <linux/libnvdimm.h> #include <linux/pagemap.h> #include <asm/cacheflush.h> #include <asm/cache.h> #include <asm/tlbflush.h> void sync_icache_aliases(unsigned long start, unsigned long end) { if (icache_is_aliasing()) { dcache_clean_pou(start, end); icache_inval_all_pou(); } else { /* * Don't issue kick_all_cpus_sync() after I-cache invalidation * for user mappings. */ caches_clean_inval_pou(start, end); } } static void flush_ptrace_access(struct vm_area_struct *vma, unsigned long start, unsigned long end) { if (vma->vm_flags & VM_EXEC) sync_icache_aliases(start, end); } /* * Copy user data from/to a page which is mapped into a different processes * address space. Really, we want to allow our "user space" model to handle * this. */ void copy_to_user_page(struct vm_area_struct *vma, struct page *page, unsigned long uaddr, void *dst, const void *src, unsigned long len) { memcpy(dst, src, len); flush_ptrace_access(vma, (unsigned long)dst, (unsigned long)dst + len); } void __sync_icache_dcache(pte_t pte) { struct folio *folio = page_folio(pte_page(pte)); if (!test_bit(PG_dcache_clean, &folio->flags)) { sync_icache_aliases((unsigned long)folio_address(folio), (unsigned long)folio_address(folio) + folio_size(folio)); set_bit(PG_dcache_clean, &folio->flags); } } EXPORT_SYMBOL_GPL(__sync_icache_dcache); /* * This function is called when a page has been modified by the kernel. Mark * it as dirty for later flushing when mapped in user space (if executable, * see __sync_icache_dcache). */ void flush_dcache_folio(struct folio *folio) { if (test_bit(PG_dcache_clean, &folio->flags)) clear_bit(PG_dcache_clean, &folio->flags); } EXPORT_SYMBOL(flush_dcache_folio); void flush_dcache_page(struct page *page) { flush_dcache_folio(page_folio(page)); } EXPORT_SYMBOL(flush_dcache_page); /* * Additional functions defined in assembly. */ EXPORT_SYMBOL(caches_clean_inval_pou); #ifdef CONFIG_ARCH_HAS_PMEM_API void arch_wb_cache_pmem(void *addr, size_t size) { /* Ensure order against any prior non-cacheable writes */ dmb(osh); dcache_clean_pop((unsigned long)addr, (unsigned long)addr + size); } EXPORT_SYMBOL_GPL(arch_wb_cache_pmem); void arch_invalidate_pmem(void *addr, size_t size) { dcache_inval_poc((unsigned long)addr, (unsigned long)addr + size); } EXPORT_SYMBOL_GPL(arch_invalidate_pmem); #endif
196 92 309 310 250 250 19 19 19 19 19 19 19 19 19 19 19 98 97 98 98 269 275 249 268 268 269 269 274 1 133 134 4 257 258 258 27 27 10 195 188 28 44 44 44 44 195 44 4 44 44 44 44 29 44 195 195 195 44 44 44 44 195 195 195 20 195 27 27 19 10 21 21 27 27 8 1 26 294 2 292 292 26 298 5 294 5 5 5 5 5 27 8 27 27 27 27 27 26 4 27 27 27 27 27 27 91 144 37 37 27 28 164 164 162 3 8 162 163 162 104 13 27 106 87 85 21 26 123 123 104 27 26 3 27 13 123 104 26 8 108 123 123 123 4 13 92 103 2 1 100 2 4 121 121 11 13 104 101 103 110 13 9 2 123 24 24 104 13 8 3 126 109 24 125 3 2 104 13 8 3 121 113 113 11 136 136 1 1 1 1 2 130 1 1 2 13 107 107 9 1 11 5 113 110 4 137 1 136 1 2 7 4 2 1 1 2 2 3 10 10 1 2 2 12 11 2 2 2 2 4 1 1 1 1 2 1 17 17 299 336 84 84 84 276 248 2 4 9 3 10 10 298 245 276 251 251 30 263 266 266 266 250 251 266 235 257 271 260 32 253 237 236 2 249 268 271 254 10 10 10 259 239 271 271 87 6 86 88 88 86 85 10 17 3 15 4 10 18 18 18 11 7 256 256 251 6 243 2 15 2 2 310 304 4 249 247 2 3 9 3 9 9 9 9 9 4 3 2 8 2 9 9 9 1 9 2 3 3 2 3 2 2 3 3 1 28 1 8 20 27 50 103 91 11 7 96 52 52 52 4 91 98 1 3 95 1 95 1 6 91 1 1 1 1 1 717 718 2 2 717 1 22 12 300 1 1 1 1 1 1 1 4 2 1 1 4 2 2 1 1 1 444 2 2 1 5 245 253 1 186 47 28 2 9 8 1 65 1 4 6 57 57 57 57 115 17 1 1 1 2 7 86 1 1 1 1 1 1 3 1 1 1 1 6 6 6 27 7 2 1 3 1 6 2 1 1 11 1 1 1 545 1 160 98 2 28 5 133 138 138 2 10 2 17 2 5 2 12 2 24 2 42 1 13 2 14 8 3 2 1 3 15 1 2 8 60 43 3 1 124 298 5 294 390 1 298 65 2 26 294 294 27 4 12 27 5 35 7 138 116 116 86 187 202 195 23 4 91 91 202 202 202 83 15 6 1 84 88 88 88 150 150 150 54 123 150 17 16 17 4 5 10 1 10 11 11 10 8 11 314 27 294 294 27 2 313 306 388 400 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 1517 1518 1519 1520 1521 1522 1523 1524 1525 1526 1527 1528 1529 1530 1531 1532 1533 1534 1535 1536 1537 1538 1539 1540 1541 1542 1543 1544 1545 1546 1547 1548 1549 1550 1551 1552 1553 1554 1555 1556 1557 1558 1559 1560 1561 1562 1563 1564 1565 1566 1567 1568 1569 1570 1571 1572 1573 1574 1575 1576 1577 1578 1579 1580 1581 1582 1583 1584 1585 1586 1587 1588 1589 1590 1591 1592 1593 1594 1595 1596 1597 1598 1599 1600 1601 1602 1603 1604 1605 1606 1607 1608 1609 1610 1611 1612 1613 1614 1615 1616 1617 1618 1619 1620 1621 1622 1623 1624 1625 1626 1627 1628 1629 1630 1631 1632 1633 1634 1635 1636 1637 1638 1639 1640 1641 1642 1643 1644 1645 1646 1647 1648 1649 1650 1651 1652 1653 1654 1655 1656 1657 1658 1659 1660 1661 1662 1663 1664 1665 1666 1667 1668 1669 1670 1671 1672 1673 1674 1675 1676 1677 1678 1679 1680 1681 1682 1683 1684 1685 1686 1687 1688 1689 1690 1691 1692 1693 1694 1695 1696 1697 1698 1699 1700 1701 1702 1703 1704 1705 1706 1707 1708 1709 1710 1711 1712 1713 1714 1715 1716 1717 1718 1719 1720 1721 1722 1723 1724 1725 1726 1727 1728 1729 1730 1731 1732 1733 1734 1735 1736 1737 1738 1739 1740 1741 1742 1743 1744 1745 1746 1747 1748 1749 1750 1751 1752 1753 1754 1755 1756 1757 1758 1759 1760 1761 1762 1763 1764 1765 1766 1767 1768 1769 1770 1771 1772 1773 1774 1775 1776 1777 1778 1779 1780 1781 1782 1783 1784 1785 1786 1787 1788 1789 1790 1791 1792 1793 1794 1795 1796 1797 1798 1799 1800 1801 1802 1803 1804 1805 1806 1807 1808 1809 1810 1811 1812 1813 1814 1815 1816 1817 1818 1819 1820 1821 1822 1823 1824 1825 1826 1827 1828 1829 1830 1831 1832 1833 1834 1835 1836 1837 1838 1839 1840 1841 1842 1843 1844 1845 1846 1847 1848 1849 1850 1851 1852 1853 1854 1855 1856 1857 1858 1859 1860 1861 1862 1863 1864 1865 1866 1867 1868 1869 1870 1871 1872 1873 1874 1875 1876 1877 1878 1879 1880 1881 1882 1883 1884 1885 1886 1887 1888 1889 1890 1891 1892 1893 1894 1895 1896 1897 1898 1899 1900 1901 1902 1903 1904 1905 1906 1907 1908 1909 1910 1911 1912 1913 1914 1915 1916 1917 1918 1919 1920 1921 1922 1923 1924 1925 1926 1927 1928 1929 1930 1931 1932 1933 1934 1935 1936 1937 1938 1939 1940 1941 1942 1943 1944 1945 1946 1947 1948 1949 1950 1951 1952 1953 1954 1955 1956 1957 1958 1959 1960 1961 1962 1963 1964 1965 1966 1967 1968 1969 1970 1971 1972 1973 1974 1975 1976 1977 1978 1979 1980 1981 1982 1983 1984 1985 1986 1987 1988 1989 1990 1991 1992 1993 1994 1995 1996 1997 1998 1999 2000 2001 2002 2003 2004 2005 2006 2007 2008 2009 2010 2011 2012 2013 2014 2015 2016 2017 2018 2019 2020 2021 2022 2023 2024 2025 2026 2027 2028 2029 2030 2031 2032 2033 2034 2035 2036 2037 2038 2039 2040 2041 2042 2043 2044 2045 2046 2047 2048 2049 2050 2051 2052 2053 2054 2055 2056 2057 2058 2059 2060 2061 2062 2063 2064 2065 2066 2067 2068 2069 2070 2071 2072 2073 2074 2075 2076 2077 2078 2079 2080 2081 2082 2083 2084 2085 2086 2087 2088 2089 2090 2091 2092 2093 2094 2095 2096 2097 2098 2099 2100 2101 2102 2103 2104 2105 2106 2107 2108 2109 2110 2111 2112 2113 2114 2115 2116 2117 2118 2119 2120 2121 2122 2123 2124 2125 2126 2127 2128 2129 2130 2131 2132 2133 2134 2135 2136 2137 2138 2139 2140 2141 2142 2143 2144 2145 2146 2147 2148 2149 2150 2151 2152 2153 2154 2155 2156 2157 2158 2159 2160 2161 2162 2163 2164 2165 2166 2167 2168 2169 2170 2171 2172 2173 2174 2175 2176 2177 2178 2179 2180 2181 2182 2183 2184 2185 2186 2187 2188 2189 2190 2191 2192 2193 2194 2195 2196 2197 2198 2199 2200 2201 2202 2203 2204 2205 2206 2207 2208 2209 2210 2211 2212 2213 2214 2215 2216 2217 2218 2219 2220 2221 2222 2223 2224 2225 2226 2227 2228 2229 2230 2231 2232 2233 2234 2235 2236 2237 2238 2239 2240 2241 2242 2243 2244 2245 2246 2247 2248 2249 2250 2251 2252 2253 2254 2255 2256 2257 2258 2259 2260 2261 2262 2263 2264 2265 2266 2267 2268 2269 2270 2271 2272 2273 2274 2275 2276 2277 2278 2279 2280 2281 2282 2283 2284 2285 2286 2287 2288 2289 2290 2291 2292 2293 2294 2295 2296 2297 2298 2299 2300 2301 2302 2303 2304 2305 2306 2307 2308 2309 2310 2311 2312 2313 2314 2315 2316 2317 2318 2319 2320 2321 2322 2323 2324 2325 2326 2327 2328 2329 2330 2331 2332 2333 2334 2335 2336 2337 2338 2339 2340 2341 2342 2343 2344 2345 2346 2347 2348 2349 2350 2351 2352 2353 2354 2355 2356 2357 2358 2359 2360 2361 2362 2363 2364 2365 2366 2367 2368 2369 2370 2371 2372 2373 2374 2375 2376 2377 2378 2379 2380 2381 2382 2383 2384 2385 2386 2387 2388 2389 2390 2391 2392 2393 2394 2395 2396 2397 2398 2399 2400 2401 2402 2403 2404 2405 2406 2407 2408 2409 2410 2411 2412 2413 2414 2415 2416 2417 2418 2419 2420 2421 2422 2423 2424 2425 2426 2427 2428 2429 2430 2431 2432 2433 2434 2435 2436 2437 2438 2439 2440 2441 2442 2443 2444 2445 2446 2447 2448 2449 2450 2451 2452 2453 2454 2455 2456 2457 2458 2459 2460 2461 2462 2463 2464 2465 2466 2467 2468 2469 2470 2471 2472 2473 2474 2475 2476 2477 2478 2479 2480 2481 2482 2483 2484 2485 2486 2487 2488 2489 2490 2491 2492 2493 2494 2495 2496 2497 2498 2499 2500 2501 2502 2503 2504 2505 2506 2507 2508 2509 2510 2511 2512 2513 2514 2515 2516 2517 2518 2519 2520 2521 2522 2523 2524 2525 2526 2527 2528 2529 2530 2531 2532 2533 2534 2535 2536 2537 2538 2539 2540 2541 2542 2543 2544 2545 2546 2547 2548 2549 2550 2551 2552 2553 2554 2555 2556 2557 2558 2559 2560 2561 2562 2563 2564 2565 2566 2567 2568 2569 2570 2571 2572 2573 2574 2575 2576 2577 2578 2579 2580 2581 2582 2583 2584 2585 2586 2587 2588 2589 2590 2591 2592 2593 2594 2595 2596 2597 2598 2599 2600 2601 2602 2603 2604 2605 2606 2607 2608 2609 2610 2611 2612 2613 2614 2615 2616 2617 2618 2619 2620 2621 2622 2623 2624 2625 2626 2627 2628 2629 2630 2631 2632 2633 2634 2635 2636 2637 2638 2639 2640 2641 2642 2643 2644 2645 2646 2647 2648 2649 2650 2651 2652 2653 2654 2655 2656 2657 2658 2659 2660 2661 2662 2663 2664 2665 2666 2667 2668 2669 2670 2671 2672 2673 2674 2675 2676 2677 2678 2679 2680 2681 2682 2683 2684 2685 2686 2687 2688 2689 2690 2691 2692 2693 2694 2695 2696 2697 2698 2699 2700 2701 2702 2703 2704 2705 2706 2707 2708 2709 2710 2711 2712 2713 2714 2715 2716 2717 2718 2719 2720 2721 2722 2723 2724 2725 2726 2727 2728 2729 2730 2731 2732 2733 2734 2735 2736 2737 2738 2739 2740 2741 2742 2743 2744 2745 2746 2747 2748 2749 2750 2751 2752 2753 2754 2755 2756 2757 2758 2759 2760 2761 2762 2763 2764 2765 2766 2767 2768 2769 2770 2771 2772 2773 2774 2775 2776 2777 2778 2779 2780 2781 2782 2783 2784 2785 2786 2787 2788 2789 2790 2791 2792 2793 2794 2795 2796 2797 2798 2799 2800 2801 2802 2803 2804 2805 2806 2807 2808 2809 2810 2811 2812 2813 2814 2815 2816 2817 2818 2819 2820 2821 2822 2823 2824 2825 2826 2827 2828 2829 2830 2831 2832 2833 2834 2835 2836 2837 2838 2839 2840 2841 2842 2843 2844 2845 2846 2847 2848 2849 2850 2851 2852 2853 2854 2855 2856 2857 2858 2859 2860 2861 2862 2863 2864 2865 2866 2867 2868 2869 2870 2871 2872 2873 2874 2875 2876 2877 2878 2879 2880 2881 2882 2883 2884 2885 2886 2887 2888 2889 2890 2891 2892 2893 2894 2895 2896 2897 2898 2899 2900 2901 2902 2903 2904 2905 2906 2907 2908 2909 2910 2911 2912 2913 2914 2915 2916 2917 2918 2919 2920 2921 2922 2923 2924 2925 2926 2927 2928 2929 2930 2931 2932 2933 2934 2935 2936 2937 2938 2939 2940 2941 2942 2943 2944 2945 2946 2947 2948 2949 2950 2951 2952 2953 2954 2955 2956 2957 2958 2959 2960 2961 2962 2963 2964 2965 2966 2967 2968 2969 2970 2971 2972 2973 2974 2975 2976 2977 2978 2979 2980 2981 2982 2983 2984 2985 2986 2987 2988 2989 2990 2991 2992 2993 2994 2995 2996 2997 2998 2999 3000 3001 3002 3003 3004 3005 3006 3007 3008 3009 3010 3011 3012 3013 3014 3015 3016 3017 3018 3019 3020 3021 3022 3023 3024 3025 3026 3027 3028 3029 3030 3031 3032 3033 3034 3035 3036 3037 3038 3039 3040 3041 3042 3043 3044 3045 3046 3047 3048 3049 3050 3051 3052 3053 3054 3055 3056 3057 3058 3059 3060 3061 3062 3063 3064 3065 3066 3067 3068 3069 3070 3071 3072 3073 3074 3075 3076 3077 3078 3079 3080 3081 3082 3083 3084 3085 3086 3087 3088 3089 3090 3091 3092 3093 3094 3095 3096 3097 3098 3099 3100 3101 3102 3103 3104 3105 3106 3107 3108 3109 3110 3111 3112 3113 3114 3115 3116 3117 3118 3119 3120 3121 3122 3123 3124 3125 3126 3127 3128 3129 3130 3131 3132 3133 3134 3135 3136 3137 3138 3139 3140 3141 3142 3143 3144 3145 3146 3147 3148 3149 3150 3151 3152 3153 3154 3155 3156 3157 3158 3159 3160 3161 3162 3163 3164 3165 3166 3167 3168 3169 3170 3171 3172 3173 3174 3175 3176 3177 3178 3179 3180 3181 3182 3183 3184 3185 3186 3187 3188 3189 3190 3191 3192 3193 3194 3195 3196 3197 3198 3199 3200 3201 3202 3203 3204 3205 3206 3207 3208 3209 3210 3211 3212 3213 3214 3215 3216 3217 3218 3219 3220 3221 3222 3223 3224 3225 3226 3227 3228 3229 3230 3231 3232 3233 3234 3235 3236 3237 3238 3239 3240 3241 3242 3243 3244 3245 3246 3247 3248 3249 3250 3251 3252 3253 3254 3255 3256 3257 3258 3259 3260 3261 3262 3263 3264 3265 3266 3267 3268 3269 3270 3271 3272 3273 3274 3275 3276 3277 3278 3279 3280 3281 3282 3283 3284 3285 3286 3287 3288 3289 3290 3291 3292 3293 3294 3295 3296 3297 3298 3299 3300 3301 3302 3303 3304 3305 3306 3307 3308 3309 3310 3311 3312 3313 3314 3315 3316 3317 3318 3319 3320 3321 3322 3323 3324 3325 3326 3327 3328 3329 3330 3331 3332 3333 3334 3335 3336 3337 3338 3339 3340 3341 3342 3343 3344 3345 3346 3347 3348 3349 3350 3351 3352 3353 3354 3355 3356 3357 3358 3359 3360 3361 3362 3363 3364 3365 3366 3367 3368 3369 3370 3371 3372 3373 3374 3375 3376 3377 3378 3379 3380 3381 3382 3383 3384 3385 3386 3387 3388 3389 3390 3391 3392 3393 3394 3395 3396 3397 3398 3399 3400 3401 3402 3403 3404 3405 3406 3407 3408 3409 3410 3411 3412 3413 3414 3415 3416 3417 3418 3419 3420 3421 3422 3423 3424 3425 3426 3427 3428 3429 3430 3431 3432 3433 3434 3435 3436 3437 3438 3439 3440 3441 3442 3443 3444 3445 3446 3447 3448 3449 3450 3451 3452 3453 3454 3455 3456 3457 3458 3459 3460 3461 3462 3463 3464 3465 3466 3467 3468 3469 3470 3471 3472 3473 3474 3475 3476 3477 3478 3479 3480 3481 3482 3483 3484 3485 3486 3487 3488 3489 3490 3491 3492 3493 3494 3495 3496 3497 3498 3499 3500 3501 3502 3503 3504 3505 3506 3507 3508 3509 3510 3511 3512 3513 3514 3515 3516 3517 3518 3519 3520 3521 3522 3523 3524 3525 3526 3527 3528 3529 3530 3531 3532 3533 3534 3535 3536 3537 3538 3539 3540 3541 3542 3543 3544 3545 3546 3547 3548 3549 3550 3551 3552 3553 3554 3555 3556 3557 3558 3559 3560 3561 3562 3563 3564 3565 3566 3567 3568 3569 3570 3571 3572 3573 3574 3575 3576 3577 3578 3579 3580 3581 3582 3583 3584 3585 3586 3587 3588 3589 3590 3591 3592 3593 3594 3595 3596 3597 3598 3599 3600 3601 3602 3603 3604 3605 3606 3607 3608 3609 3610 3611 3612 3613 3614 3615 3616 3617 3618 3619 3620 3621 3622 3623 3624 3625 3626 3627 3628 3629 3630 3631 3632 3633 3634 3635 3636 3637 3638 3639 3640 3641 3642 3643 3644 3645 3646 3647 3648 3649 3650 3651 3652 3653 3654 3655 3656 3657 3658 3659 3660 3661 3662 3663 3664 3665 3666 3667 3668 3669 3670 3671 3672 3673 3674 3675 3676 3677 3678 3679 3680 3681 3682 3683 3684 3685 3686 3687 3688 3689 3690 3691 3692 3693 3694 3695 3696 3697 3698 3699 3700 3701 3702 3703 3704 3705 3706 3707 3708 3709 3710 3711 3712 3713 3714 3715 3716 3717 3718 3719 3720 3721 3722 3723 3724 3725 3726 3727 3728 3729 3730 3731 3732 3733 3734 3735 3736 3737 3738 3739 3740 3741 3742 3743 3744 3745 3746 3747 3748 3749 3750 3751 3752 3753 3754 3755 3756 3757 3758 3759 3760 3761 3762 3763 3764 3765 3766 3767 3768 3769 3770 3771 3772 3773 3774 3775 3776 3777 3778 3779 3780 3781 3782 3783 3784 3785 3786 3787 3788 3789 3790 3791 3792 3793 3794 3795 3796 3797 3798 3799 3800 3801 3802 3803 3804 3805 3806 3807 3808 3809 3810 3811 3812 3813 3814 3815 3816 3817 3818 3819 3820 3821 3822 3823 3824 3825 3826 3827 3828 3829 3830 3831 3832 3833 3834 3835 3836 3837 3838 3839 3840 3841 3842 3843 3844 3845 3846 3847 3848 3849 3850 3851 3852 3853 3854 3855 3856 3857 3858 3859 3860 3861 3862 3863 3864 3865 3866 3867 3868 3869 3870 3871 3872 3873 3874 3875 3876 3877 3878 3879 3880 3881 3882 3883 3884 3885 3886 3887 3888 3889 3890 3891 3892 3893 3894 3895 3896 3897 3898 3899 3900 3901 3902 3903 3904 3905 3906 3907 3908 3909 3910 3911 3912 3913 3914 3915 3916 3917 3918 3919 3920 3921 3922 3923 3924 3925 3926 3927 3928 3929 3930 3931 3932 3933 3934 3935 3936 3937 3938 3939 3940 3941 3942 3943 3944 3945 3946 3947 3948 3949 3950 3951 3952 3953 3954 3955 3956 3957 3958 3959 3960 3961 3962 3963 3964 3965 3966 3967 3968 3969 3970 3971 3972 3973 3974 3975 3976 3977 3978 3979 3980 3981 3982 3983 3984 3985 3986 3987 3988 3989 3990 3991 3992 3993 3994 3995 3996 3997 3998 3999 4000 4001 4002 4003 4004 4005 4006 4007 4008 4009 4010 4011 4012 4013 4014 4015 4016 4017 4018 4019 4020 4021 4022 4023 4024 4025 4026 4027 4028 4029 4030 4031 4032 4033 4034 4035 4036 4037 4038 4039 4040 4041 4042 4043 4044 4045 4046 4047 4048 4049 4050 4051 4052 4053 4054 4055 4056 4057 4058 4059 4060 4061 4062 4063 4064 4065 4066 4067 4068 4069 4070 4071 4072 4073 4074 4075 4076 4077 4078 4079 4080 4081 4082 4083 4084 4085 4086 4087 4088 4089 4090 4091 4092 4093 4094 4095 4096 4097 4098 4099 4100 4101 4102 4103 4104 4105 4106 4107 4108 4109 4110 4111 4112 4113 4114 4115 4116 4117 4118 4119 4120 4121 4122 4123 4124 4125 4126 4127 4128 4129 4130 4131 4132 4133 4134 4135 4136 4137 4138 4139 4140 4141 4142 4143 4144 4145 4146 4147 4148 4149 4150 4151 4152 4153 4154 4155 4156 4157 4158 4159 4160 4161 4162 4163 4164 4165 4166 4167 4168 4169 4170 4171 4172 4173 4174 4175 4176 4177 4178 4179 4180 4181 4182 4183 4184 4185 4186 4187 4188 4189 4190 4191 4192 4193 4194 4195 4196 4197 4198 4199 4200 4201 4202 4203 4204 4205 4206 4207 4208 4209 4210 4211 4212 4213 4214 4215 4216 4217 4218 4219 4220 4221 4222 4223 4224 4225 4226 4227 4228 4229 4230 4231 4232 4233 4234 4235 4236 4237 4238 4239 4240 4241 4242 4243 4244 4245 4246 4247 4248 4249 4250 4251 4252 4253 4254 4255 4256 4257 4258 4259 4260 4261 4262 4263 4264 4265 4266 4267 4268 4269 4270 4271 4272 4273 4274 4275 4276 4277 4278 4279 4280 4281 4282 4283 4284 4285 4286 4287 4288 4289 4290 4291 4292 4293 4294 4295 4296 4297 4298 4299 4300 4301 4302 4303 4304 4305 4306 4307 4308 4309 4310 4311 4312 4313 4314 4315 4316 4317 4318 4319 4320 4321 4322 4323 4324 4325 4326 4327 4328 4329 4330 4331 4332 4333 4334 4335 4336 4337 4338 4339 4340 4341 4342 4343 4344 4345 4346 4347 4348 4349 4350 4351 4352 4353 4354 4355 4356 4357 4358 4359 4360 4361 4362 4363 4364 4365 4366 4367 4368 4369 4370 4371 4372 4373 4374 4375 4376 4377 4378 4379 4380 4381 4382 4383 4384 4385 4386 4387 4388 4389 4390 4391 4392 4393 4394 4395 4396 4397 4398 4399 4400 4401 4402 4403 4404 4405 4406 4407 4408 4409 4410 4411 4412 4413 4414 4415 4416 4417 4418 4419 4420 4421 4422 4423 4424 4425 4426 4427 4428 4429 4430 4431 4432 4433 4434 4435 4436 4437 4438 4439 4440 4441 4442 4443 4444 4445 4446 4447 4448 4449 4450 4451 4452 4453 4454 4455 4456 4457 4458 4459 4460 4461 4462 4463 4464 4465 4466 4467 4468 4469 4470 4471 4472 4473 4474 4475 4476 4477 4478 4479 4480 4481 4482 4483 4484 4485 4486 4487 4488 4489 4490 4491 4492 4493 4494 4495 4496 4497 4498 4499 4500 4501 4502 4503 4504 4505 4506 4507 4508 4509 4510 4511 4512 4513 4514 4515 4516 4517 4518 4519 4520 4521 4522 4523 4524 4525 4526 4527 4528 4529 4530 4531 4532 4533 4534 4535 4536 4537 4538 4539 4540 4541 4542 4543 4544 4545 4546 4547 4548 4549 4550 4551 4552 4553 4554 4555 4556 4557 4558 4559 4560 4561 4562 4563 4564 4565 4566 4567 4568 4569 4570 4571 4572 4573 4574 4575 4576 4577 4578 4579 4580 4581 4582 4583 4584 4585 4586 4587 4588 4589 4590 4591 4592 4593 4594 4595 4596 4597 4598 4599 4600 4601 4602 4603 4604 4605 4606 4607 4608 4609 4610 4611 4612 4613 4614 4615 4616 4617 4618 4619 4620 4621 4622 4623 4624 4625 4626 4627 4628 4629 4630 4631 4632 4633 4634 4635 4636 4637 4638 4639 4640 4641 4642 4643 4644 4645 4646 4647 4648 4649 4650 4651 4652 4653 4654 4655 4656 4657 4658 4659 4660 4661 4662 4663 4664 4665 4666 4667 4668 4669 4670 4671 4672 4673 4674 4675 4676 4677 4678 4679 4680 4681 4682 4683 4684 4685 4686 4687 4688 4689 4690 4691 4692 4693 4694 4695 4696 4697 4698 4699 4700 4701 4702 4703 4704 4705 4706 4707 4708 4709 4710 4711 4712 4713 4714 4715 4716 4717 4718 4719 4720 4721 4722 4723 4724 4725 4726 4727 4728 4729 4730 4731 4732 4733 4734 4735 4736 4737 4738 4739 4740 4741 4742 4743 4744 4745 4746 4747 4748 4749 4750 4751 4752 4753 4754 4755 4756 4757 4758 4759 4760 4761 4762 4763 4764 4765 4766 4767 4768 4769 4770 4771 4772 4773 4774 4775 4776 4777 4778 4779 4780 4781 4782 4783 4784 4785 4786 4787 4788 4789 4790 4791 4792 4793 4794 4795 4796 4797 4798 4799 4800 4801 4802 4803 4804 4805 4806 4807 4808 4809 4810 4811 4812 4813 4814 4815 4816 4817 4818 4819 4820 4821 4822 4823 4824 4825 4826 4827 4828 4829 4830 4831 4832 4833 4834 4835 4836 4837 4838 4839 4840 4841 4842 4843 4844 4845 4846 4847 4848 4849 4850 4851 4852 4853 4854 4855 4856 4857 4858 4859 4860 4861 4862 4863 4864 4865 4866 4867 4868 4869 4870 4871 4872 4873 4874 4875 4876 4877 4878 4879 4880 4881 4882 4883 4884 4885 4886 4887 4888 4889 4890 4891 4892 4893 4894 4895 4896 4897 4898 4899 4900 4901 4902 4903 4904 4905 4906 4907 4908 4909 4910 4911 4912 4913 4914 4915 4916 4917 4918 4919 4920 4921 4922 4923 4924 4925 4926 4927 4928 4929 4930 4931 4932 4933 4934 4935 4936 4937 4938 4939 4940 4941 4942 4943 4944 4945 4946 4947 4948 4949 4950 4951 4952 4953 4954 4955 4956 4957 4958 4959 4960 4961 4962 4963 4964 4965 4966 4967 4968 4969 4970 4971 4972 4973 4974 4975 4976 4977 4978 4979 4980 4981 4982 4983 4984 4985 4986 4987 4988 4989 4990 4991 4992 4993 4994 4995 4996 4997 4998 4999 5000 5001 5002 5003 5004 5005 5006 5007 5008 5009 5010 5011 5012 5013 5014 5015 5016 5017 5018 5019 5020 5021 5022 5023 5024 5025 5026 5027 5028 5029 5030 5031 5032 5033 5034 5035 5036 5037 5038 5039 5040 5041 5042 5043 5044 5045 5046 5047 5048 5049 5050 5051 5052 5053 5054 5055 5056 5057 5058 5059 5060 5061 5062 5063 5064 5065 5066 5067 5068 5069 5070 5071 5072 5073 5074 5075 5076 5077 5078 5079 5080 5081 5082 5083 5084 5085 5086 5087 5088 5089 5090 5091 5092 5093 5094 5095 5096 5097 5098 5099 5100 5101 5102 5103 5104 5105 5106 5107 5108 5109 5110 5111 5112 5113 5114 5115 5116 5117 5118 5119 5120 5121 5122 5123 5124 5125 5126 5127 5128 5129 5130 5131 5132 5133 5134 5135 5136 5137 5138 5139 5140 5141 5142 5143 5144 5145 5146 5147 5148 5149 5150 5151 5152 5153 5154 5155 5156 5157 5158 5159 5160 5161 5162 5163 5164 5165 5166 5167 5168 5169 5170 5171 5172 5173 5174 5175 5176 5177 5178 5179 5180 5181 5182 5183 5184 5185 5186 5187 5188 5189 5190 5191 5192 5193 5194 5195 5196 5197 5198 5199 5200 5201 5202 5203 5204 5205 5206 5207 5208 5209 5210 5211 5212 5213 5214 5215 5216 5217 5218 5219 5220 5221 5222 5223 5224 5225 5226 5227 5228 5229 5230 5231 5232 5233 5234 5235 5236 5237 5238 5239 5240 5241 5242 5243 5244 5245 5246 5247 5248 5249 5250 5251 5252 5253 5254 5255 5256 5257 5258 5259 5260 5261 5262 5263 5264 5265 5266 5267 5268 5269 5270 5271 5272 5273 5274 5275 5276 5277 5278 5279 5280 5281 5282 5283 5284 5285 5286 5287 5288 5289 5290 5291 5292 5293 5294 5295 5296 5297 5298 5299 5300 5301 5302 5303 5304 5305 5306 5307 5308 5309 5310 5311 5312 5313 5314 5315 5316 5317 5318 5319 5320 5321 5322 5323 5324 5325 5326 5327 5328 5329 5330 5331 5332 5333 5334 5335 5336 5337 5338 5339 5340 5341 5342 5343 5344 5345 5346 5347 5348 5349 5350 5351 5352 5353 5354 5355 5356 5357 5358 5359 5360 5361 5362 5363 5364 5365 5366 5367 5368 5369 5370 5371 5372 5373 5374 5375 5376 5377 5378 5379 5380 5381 5382 5383 5384 5385 5386 5387 5388 5389 5390 5391 5392 5393 5394 5395 5396 5397 5398 5399 5400 5401 5402 5403 5404 5405 5406 5407 5408 5409 5410 5411 5412 5413 5414 5415 5416 5417 5418 5419 5420 5421 5422 5423 5424 5425 5426 5427 5428 5429 5430 5431 5432 5433 5434 5435 5436 5437 5438 5439 5440 5441 5442 5443 5444 5445 5446 5447 5448 5449 5450 5451 5452 5453 5454 5455 5456 5457 5458 5459 5460 5461 5462 5463 5464 5465 5466 5467 5468 5469 5470 5471 5472 5473 5474 5475 5476 5477 5478 5479 5480 5481 5482 5483 5484 5485 5486 5487 5488 5489 5490 5491 5492 5493 5494 5495 5496 5497 5498 5499 5500 5501 5502 5503 5504 5505 5506 5507 5508 5509 5510 5511 5512 5513 5514 5515 5516 5517 5518 5519 5520 5521 5522 5523 5524 5525 5526 5527 5528 5529 5530 5531 5532 5533 5534 5535 5536 5537 5538 5539 5540 5541 5542 5543 5544 5545 5546 5547 5548 5549 5550 5551 5552 5553 5554 5555 5556 5557 5558 5559 5560 5561 5562 5563 5564 5565 5566 5567 5568 5569 5570 5571 5572 5573 5574 5575 5576 5577 5578 5579 5580 5581 5582 5583 5584 5585 5586 5587 5588 5589 5590 5591 5592 5593 5594 5595 5596 5597 5598 5599 5600 5601 5602 5603 5604 5605 5606 5607 5608 5609 5610 5611 5612 5613 5614 5615 5616 5617 5618 5619 5620 5621 5622 5623 5624 5625 5626 5627 5628 5629 5630 5631 5632 5633 5634 5635 5636 5637 5638 5639 5640 5641 5642 5643 5644 5645 5646 5647 5648 5649 5650 5651 5652 5653 5654 5655 5656 5657 5658 5659 5660 5661 5662 5663 5664 5665 5666 5667 5668 5669 5670 5671 5672 5673 5674 5675 5676 5677 5678 5679 5680 5681 5682 5683 5684 5685 5686 5687 5688 5689 5690 5691 5692 5693 5694 5695 5696 5697 5698 5699 5700 5701 5702 5703 5704 5705 5706 5707 5708 5709 5710 5711 5712 5713 5714 5715 5716 5717 5718 5719 5720 5721 5722 5723 5724 5725 5726 5727 5728 5729 5730 5731 5732 5733 5734 5735 5736 5737 5738 5739 5740 5741 5742 5743 5744 5745 5746 5747 5748 5749 5750 5751 5752 5753 5754 5755 5756 5757 5758 5759 5760 5761 5762 5763 5764 5765 5766 5767 5768 5769 5770 5771 5772 5773 5774 5775 5776 5777 5778 5779 5780 5781 5782 5783 5784 5785 5786 5787 5788 5789 5790 5791 5792 5793 5794 5795 5796 5797 5798 5799 5800 5801 5802 5803 5804 5805 5806 5807 5808 5809 5810 5811 5812 5813 5814 5815 5816 5817 5818 5819 5820 5821 5822 5823 5824 5825 5826 5827 5828 5829 5830 5831 5832 5833 5834 5835 5836 5837 5838 5839 5840 5841 5842 5843 5844 5845 5846 5847 5848 5849 5850 5851 5852 5853 5854 5855 5856 5857 5858 5859 5860 5861 5862 5863 5864 5865 5866 5867 5868 5869 5870 5871 5872 5873 5874 5875 5876 5877 5878 5879 5880 5881 5882 5883 5884 5885 5886 5887 5888 5889 5890 5891 5892 5893 5894 5895 5896 5897 5898 5899 5900 5901 5902 5903 5904 5905 5906 5907 5908 5909 5910 5911 5912 5913 5914 5915 5916 5917 5918 5919 5920 5921 5922 5923 5924 5925 5926 5927 5928 5929 5930 5931 5932 5933 5934 5935 5936 5937 5938 5939 5940 5941 5942 5943 5944 5945 5946 5947 5948 5949 5950 5951 5952 5953 5954 5955 5956 5957 5958 5959 5960 5961 5962 5963 5964 5965 5966 5967 5968 5969 5970 5971 5972 5973 5974 5975 5976 5977 5978 5979 5980 5981 5982 5983 5984 5985 5986 5987 5988 5989 5990 5991 5992 5993 5994 5995 5996 5997 5998 5999 6000 6001 6002 6003 6004 6005 6006 6007 6008 6009 6010 6011 6012 6013 6014 6015 6016 6017 6018 6019 6020 6021 6022 6023 6024 6025 6026 6027 6028 6029 6030 6031 6032 6033 6034 6035 6036 6037 6038 6039 6040 6041 6042 6043 6044 6045 6046 6047 6048 6049 6050 6051 6052 6053 6054 6055 6056 6057 6058 6059 6060 6061 6062 6063 6064 6065 6066 6067 6068 6069 6070 6071 6072 6073 6074 6075 6076 6077 6078 6079 6080 6081 6082 6083 6084 6085 6086 6087 6088 6089 6090 6091 6092 6093 6094 6095 6096 6097 6098 6099 6100 6101 6102 6103 6104 6105 6106 6107 6108 6109 6110 6111 6112 6113 6114 6115 6116 6117 6118 6119 6120 6121 6122 6123 6124 6125 6126 6127 6128 6129 6130 6131 6132 6133 6134 6135 6136 6137 6138 6139 6140 6141 6142 6143 6144 6145 6146 6147 6148 6149 6150 6151 6152 6153 6154 6155 6156 6157 6158 6159 6160 6161 6162 6163 6164 6165 6166 6167 6168 6169 6170 6171 6172 6173 6174 6175 6176 6177 6178 6179 6180 6181 6182 6183 6184 6185 6186 6187 6188 6189 6190 6191 6192 6193 6194 6195 6196 6197 6198 6199 6200 6201 6202 6203 6204 6205 6206 6207 6208 6209 6210 6211 6212 6213 6214 6215 6216 6217 6218 6219 6220 6221 6222 6223 6224 6225 6226 6227 6228 6229 6230 6231 6232 6233 6234 6235 6236 6237 6238 6239 6240 6241 6242 6243 6244 6245 6246 6247 6248 6249 6250 6251 6252 6253 6254 6255 6256 6257 6258 6259 6260 6261 6262 6263 6264 6265 6266 6267 6268 6269 6270 6271 6272 6273 6274 6275 6276 6277 6278 6279 6280 6281 6282 6283 6284 6285 6286 6287 6288 6289 6290 6291 6292 6293 6294 6295 6296 6297 6298 6299 6300 6301 6302 6303 6304 6305 6306 6307 6308 6309 6310 6311 6312 6313 6314 6315 6316 6317 6318 6319 6320 6321 6322 6323 6324 6325 6326 6327 6328 6329 6330 6331 6332 6333 6334 6335 6336 6337 6338 6339 6340 6341 6342 6343 6344 6345 6346 6347 6348 6349 6350 6351 6352 6353 6354 6355 6356 6357 6358 6359 6360 6361 6362 6363 6364 6365 6366 6367 6368 6369 6370 6371 6372 6373 6374 6375 6376 6377 6378 6379 6380 6381 6382 6383 6384 6385 6386 6387 6388 6389 6390 6391 6392 6393 6394 6395 6396 6397 6398 6399 6400 6401 6402 6403 6404 6405 6406 6407 6408 6409 6410 6411 6412 6413 6414 6415 6416 6417 6418 6419 6420 6421 6422 6423 6424 6425 6426 6427 6428 6429 6430 6431 6432 6433 6434 6435 6436 6437 6438 6439 6440 6441 6442 6443 6444 6445 6446 6447 6448 6449 6450 6451 6452 6453 6454 6455 6456 6457 6458 6459 6460 6461 6462 6463 6464 6465 6466 6467 6468 6469 6470 6471 6472 6473 6474 6475 6476 6477 6478 6479 6480 6481 6482 6483 6484 6485 6486 6487 6488 6489 6490 6491 6492 6493 6494 6495 6496 6497 6498 6499 6500 6501 6502 6503 6504 6505 6506 6507 6508 6509 6510 6511 6512 6513 6514 6515 6516 6517 6518 6519 6520 6521 6522 6523 6524 6525 6526 6527 6528 6529 6530 6531 6532 6533 6534 6535 6536 6537 6538 // SPDX-License-Identifier: GPL-2.0-only /* * Kernel-based Virtual Machine (KVM) Hypervisor * * Copyright (C) 2006 Qumranet, Inc. * Copyright 2010 Red Hat, Inc. and/or its affiliates. * * Authors: * Avi Kivity <avi@qumranet.com> * Yaniv Kamay <yaniv@qumranet.com> */ #include <kvm/iodev.h> #include <linux/kvm_host.h> #include <linux/kvm.h> #include <linux/module.h> #include <linux/errno.h> #include <linux/percpu.h> #include <linux/mm.h> #include <linux/miscdevice.h> #include <linux/vmalloc.h> #include <linux/reboot.h> #include <linux/debugfs.h> #include <linux/highmem.h> #include <linux/file.h> #include <linux/syscore_ops.h> #include <linux/cpu.h> #include <linux/sched/signal.h> #include <linux/sched/mm.h> #include <linux/sched/stat.h> #include <linux/cpumask.h> #include <linux/smp.h> #include <linux/anon_inodes.h> #include <linux/profile.h> #include <linux/kvm_para.h> #include <linux/pagemap.h> #include <linux/mman.h> #include <linux/swap.h> #include <linux/bitops.h> #include <linux/spinlock.h> #include <linux/compat.h> #include <linux/srcu.h> #include <linux/hugetlb.h> #include <linux/slab.h> #include <linux/sort.h> #include <linux/bsearch.h> #include <linux/io.h> #include <linux/lockdep.h> #include <linux/kthread.h> #include <linux/suspend.h> #include <asm/processor.h> #include <asm/ioctl.h> #include <linux/uaccess.h> #include "coalesced_mmio.h" #include "async_pf.h" #include "kvm_mm.h" #include "vfio.h" #include <trace/events/ipi.h> #define CREATE_TRACE_POINTS #include <trace/events/kvm.h> #include <linux/kvm_dirty_ring.h> /* Worst case buffer size needed for holding an integer. */ #define ITOA_MAX_LEN 12 MODULE_AUTHOR("Qumranet"); MODULE_DESCRIPTION("Kernel-based Virtual Machine (KVM) Hypervisor"); MODULE_LICENSE("GPL"); /* Architectures should define their poll value according to the halt latency */ unsigned int halt_poll_ns = KVM_HALT_POLL_NS_DEFAULT; module_param(halt_poll_ns, uint, 0644); EXPORT_SYMBOL_GPL(halt_poll_ns); /* Default doubles per-vcpu halt_poll_ns. */ unsigned int halt_poll_ns_grow = 2; module_param(halt_poll_ns_grow, uint, 0644); EXPORT_SYMBOL_GPL(halt_poll_ns_grow); /* The start value to grow halt_poll_ns from */ unsigned int halt_poll_ns_grow_start = 10000; /* 10us */ module_param(halt_poll_ns_grow_start, uint, 0644); EXPORT_SYMBOL_GPL(halt_poll_ns_grow_start); /* Default halves per-vcpu halt_poll_ns. */ unsigned int halt_poll_ns_shrink = 2; module_param(halt_poll_ns_shrink, uint, 0644); EXPORT_SYMBOL_GPL(halt_poll_ns_shrink); /* * Allow direct access (from KVM or the CPU) without MMU notifier protection * to unpinned pages. */ static bool allow_unsafe_mappings; module_param(allow_unsafe_mappings, bool, 0444); /* * Ordering of locks: * * kvm->lock --> kvm->slots_lock --> kvm->irq_lock */ DEFINE_MUTEX(kvm_lock); LIST_HEAD(vm_list); static struct kmem_cache *kvm_vcpu_cache; static __read_mostly struct preempt_ops kvm_preempt_ops; static DEFINE_PER_CPU(struct kvm_vcpu *, kvm_running_vcpu); static struct dentry *kvm_debugfs_dir; static const struct file_operations stat_fops_per_vm; static long kvm_vcpu_ioctl(struct file *file, unsigned int ioctl, unsigned long arg); #ifdef CONFIG_KVM_COMPAT static long kvm_vcpu_compat_ioctl(struct file *file, unsigned int ioctl, unsigned long arg); #define KVM_COMPAT(c) .compat_ioctl = (c) #else /* * For architectures that don't implement a compat infrastructure, * adopt a double line of defense: * - Prevent a compat task from opening /dev/kvm * - If the open has been done by a 64bit task, and the KVM fd * passed to a compat task, let the ioctls fail. */ static long kvm_no_compat_ioctl(struct file *file, unsigned int ioctl, unsigned long arg) { return -EINVAL; } static int kvm_no_compat_open(struct inode *inode, struct file *file) { return is_compat_task() ? -ENODEV : 0; } #define KVM_COMPAT(c) .compat_ioctl = kvm_no_compat_ioctl, \ .open = kvm_no_compat_open #endif static void kvm_io_bus_destroy(struct kvm_io_bus *bus); #define KVM_EVENT_CREATE_VM 0 #define KVM_EVENT_DESTROY_VM 1 static void kvm_uevent_notify_change(unsigned int type, struct kvm *kvm); static unsigned long long kvm_createvm_count; static unsigned long long kvm_active_vms; static DEFINE_PER_CPU(cpumask_var_t, cpu_kick_mask); __weak void kvm_arch_guest_memory_reclaimed(struct kvm *kvm) { } /* * Switches to specified vcpu, until a matching vcpu_put() */ void vcpu_load(struct kvm_vcpu *vcpu) { int cpu = get_cpu(); __this_cpu_write(kvm_running_vcpu, vcpu); preempt_notifier_register(&vcpu->preempt_notifier); kvm_arch_vcpu_load(vcpu, cpu); put_cpu(); } EXPORT_SYMBOL_GPL(vcpu_load); void vcpu_put(struct kvm_vcpu *vcpu) { preempt_disable(); kvm_arch_vcpu_put(vcpu); preempt_notifier_unregister(&vcpu->preempt_notifier); __this_cpu_write(kvm_running_vcpu, NULL); preempt_enable(); } EXPORT_SYMBOL_GPL(vcpu_put); /* TODO: merge with kvm_arch_vcpu_should_kick */ static bool kvm_request_needs_ipi(struct kvm_vcpu *vcpu, unsigned req) { int mode = kvm_vcpu_exiting_guest_mode(vcpu); /* * We need to wait for the VCPU to reenable interrupts and get out of * READING_SHADOW_PAGE_TABLES mode. */ if (req & KVM_REQUEST_WAIT) return mode != OUTSIDE_GUEST_MODE; /* * Need to kick a running VCPU, but otherwise there is nothing to do. */ return mode == IN_GUEST_MODE; } static void ack_kick(void *_completed) { } static inline bool kvm_kick_many_cpus(struct cpumask *cpus, bool wait) { if (cpumask_empty(cpus)) return false; smp_call_function_many(cpus, ack_kick, NULL, wait); return true; } static void kvm_make_vcpu_request(struct kvm_vcpu *vcpu, unsigned int req, struct cpumask *tmp, int current_cpu) { int cpu; if (likely(!(req & KVM_REQUEST_NO_ACTION))) __kvm_make_request(req, vcpu); if (!(req & KVM_REQUEST_NO_WAKEUP) && kvm_vcpu_wake_up(vcpu)) return; /* * Note, the vCPU could get migrated to a different pCPU at any point * after kvm_request_needs_ipi(), which could result in sending an IPI * to the previous pCPU. But, that's OK because the purpose of the IPI * is to ensure the vCPU returns to OUTSIDE_GUEST_MODE, which is * satisfied if the vCPU migrates. Entering READING_SHADOW_PAGE_TABLES * after this point is also OK, as the requirement is only that KVM wait * for vCPUs that were reading SPTEs _before_ any changes were * finalized. See kvm_vcpu_kick() for more details on handling requests. */ if (kvm_request_needs_ipi(vcpu, req)) { cpu = READ_ONCE(vcpu->cpu); if (cpu != -1 && cpu != current_cpu) __cpumask_set_cpu(cpu, tmp); } } bool kvm_make_vcpus_request_mask(struct kvm *kvm, unsigned int req, unsigned long *vcpu_bitmap) { struct kvm_vcpu *vcpu; struct cpumask *cpus; int i, me; bool called; me = get_cpu(); cpus = this_cpu_cpumask_var_ptr(cpu_kick_mask); cpumask_clear(cpus); for_each_set_bit(i, vcpu_bitmap, KVM_MAX_VCPUS) { vcpu = kvm_get_vcpu(kvm, i); if (!vcpu) continue; kvm_make_vcpu_request(vcpu, req, cpus, me); } called = kvm_kick_many_cpus(cpus, !!(req & KVM_REQUEST_WAIT)); put_cpu(); return called; } bool kvm_make_all_cpus_request(struct kvm *kvm, unsigned int req) { struct kvm_vcpu *vcpu; struct cpumask *cpus; unsigned long i; bool called; int me; me = get_cpu(); cpus = this_cpu_cpumask_var_ptr(cpu_kick_mask); cpumask_clear(cpus); kvm_for_each_vcpu(i, vcpu, kvm) kvm_make_vcpu_request(vcpu, req, cpus, me); called = kvm_kick_many_cpus(cpus, !!(req & KVM_REQUEST_WAIT)); put_cpu(); return called; } EXPORT_SYMBOL_GPL(kvm_make_all_cpus_request); void kvm_flush_remote_tlbs(struct kvm *kvm) { ++kvm->stat.generic.remote_tlb_flush_requests; /* * We want to publish modifications to the page tables before reading * mode. Pairs with a memory barrier in arch-specific code. * - x86: smp_mb__after_srcu_read_unlock in vcpu_enter_guest * and smp_mb in walk_shadow_page_lockless_begin/end. * - powerpc: smp_mb in kvmppc_prepare_to_enter. * * There is already an smp_mb__after_atomic() before * kvm_make_all_cpus_request() reads vcpu->mode. We reuse that * barrier here. */ if (!kvm_arch_flush_remote_tlbs(kvm) || kvm_make_all_cpus_request(kvm, KVM_REQ_TLB_FLUSH)) ++kvm->stat.generic.remote_tlb_flush; } EXPORT_SYMBOL_GPL(kvm_flush_remote_tlbs); void kvm_flush_remote_tlbs_range(struct kvm *kvm, gfn_t gfn, u64 nr_pages) { if (!kvm_arch_flush_remote_tlbs_range(kvm, gfn, nr_pages)) return; /* * Fall back to a flushing entire TLBs if the architecture range-based * TLB invalidation is unsupported or can't be performed for whatever * reason. */ kvm_flush_remote_tlbs(kvm); } void kvm_flush_remote_tlbs_memslot(struct kvm *kvm, const struct kvm_memory_slot *memslot) { /* * All current use cases for flushing the TLBs for a specific memslot * are related to dirty logging, and many do the TLB flush out of * mmu_lock. The interaction between the various operations on memslot * must be serialized by slots_locks to ensure the TLB flush from one * operation is observed by any other operation on the same memslot. */ lockdep_assert_held(&kvm->slots_lock); kvm_flush_remote_tlbs_range(kvm, memslot->base_gfn, memslot->npages); } static void kvm_flush_shadow_all(struct kvm *kvm) { kvm_arch_flush_shadow_all(kvm); kvm_arch_guest_memory_reclaimed(kvm); } #ifdef KVM_ARCH_NR_OBJS_PER_MEMORY_CACHE static inline void *mmu_memory_cache_alloc_obj(struct kvm_mmu_memory_cache *mc, gfp_t gfp_flags) { void *page; gfp_flags |= mc->gfp_zero; if (mc->kmem_cache) return kmem_cache_alloc(mc->kmem_cache, gfp_flags); page = (void *)__get_free_page(gfp_flags); if (page && mc->init_value) memset64(page, mc->init_value, PAGE_SIZE / sizeof(u64)); return page; } int __kvm_mmu_topup_memory_cache(struct kvm_mmu_memory_cache *mc, int capacity, int min) { gfp_t gfp = mc->gfp_custom ? mc->gfp_custom : GFP_KERNEL_ACCOUNT; void *obj; if (mc->nobjs >= min) return 0; if (unlikely(!mc->objects)) { if (WARN_ON_ONCE(!capacity)) return -EIO; /* * Custom init values can be used only for page allocations, * and obviously conflict with __GFP_ZERO. */ if (WARN_ON_ONCE(mc->init_value && (mc->kmem_cache || mc->gfp_zero))) return -EIO; mc->objects = kvmalloc_array(capacity, sizeof(void *), gfp); if (!mc->objects) return -ENOMEM; mc->capacity = capacity; } /* It is illegal to request a different capacity across topups. */ if (WARN_ON_ONCE(mc->capacity != capacity)) return -EIO; while (mc->nobjs < mc->capacity) { obj = mmu_memory_cache_alloc_obj(mc, gfp); if (!obj) return mc->nobjs >= min ? 0 : -ENOMEM; mc->objects[mc->nobjs++] = obj; } return 0; } int kvm_mmu_topup_memory_cache(struct kvm_mmu_memory_cache *mc, int min) { return __kvm_mmu_topup_memory_cache(mc, KVM_ARCH_NR_OBJS_PER_MEMORY_CACHE, min); } int kvm_mmu_memory_cache_nr_free_objects(struct kvm_mmu_memory_cache *mc) { return mc->nobjs; } void kvm_mmu_free_memory_cache(struct kvm_mmu_memory_cache *mc) { while (mc->nobjs) { if (mc->kmem_cache) kmem_cache_free(mc->kmem_cache, mc->objects[--mc->nobjs]); else free_page((unsigned long)mc->objects[--mc->nobjs]); } kvfree(mc->objects); mc->objects = NULL; mc->capacity = 0; } void *kvm_mmu_memory_cache_alloc(struct kvm_mmu_memory_cache *mc) { void *p; if (WARN_ON(!mc->nobjs)) p = mmu_memory_cache_alloc_obj(mc, GFP_ATOMIC | __GFP_ACCOUNT); else p = mc->objects[--mc->nobjs]; BUG_ON(!p); return p; } #endif static void kvm_vcpu_init(struct kvm_vcpu *vcpu, struct kvm *kvm, unsigned id) { mutex_init(&vcpu->mutex); vcpu->cpu = -1; vcpu->kvm = kvm; vcpu->vcpu_id = id; vcpu->pid = NULL; rwlock_init(&vcpu->pid_lock); #ifndef __KVM_HAVE_ARCH_WQP rcuwait_init(&vcpu->wait); #endif kvm_async_pf_vcpu_init(vcpu); kvm_vcpu_set_in_spin_loop(vcpu, false); kvm_vcpu_set_dy_eligible(vcpu, false); vcpu->preempted = false; vcpu->ready = false; preempt_notifier_init(&vcpu->preempt_notifier, &kvm_preempt_ops); vcpu->last_used_slot = NULL; /* Fill the stats id string for the vcpu */ snprintf(vcpu->stats_id, sizeof(vcpu->stats_id), "kvm-%d/vcpu-%d", task_pid_nr(current), id); } static void kvm_vcpu_destroy(struct kvm_vcpu *vcpu) { kvm_arch_vcpu_destroy(vcpu); kvm_dirty_ring_free(&vcpu->dirty_ring); /* * No need for rcu_read_lock as VCPU_RUN is the only place that changes * the vcpu->pid pointer, and at destruction time all file descriptors * are already gone. */ put_pid(vcpu->pid); free_page((unsigned long)vcpu->run); kmem_cache_free(kvm_vcpu_cache, vcpu); } void kvm_destroy_vcpus(struct kvm *kvm) { unsigned long i; struct kvm_vcpu *vcpu; kvm_for_each_vcpu(i, vcpu, kvm) { kvm_vcpu_destroy(vcpu); xa_erase(&kvm->vcpu_array, i); /* * Assert that the vCPU isn't visible in any way, to ensure KVM * doesn't trigger a use-after-free if destroying vCPUs results * in VM-wide request, e.g. to flush remote TLBs when tearing * down MMUs, or to mark the VM dead if a KVM_BUG_ON() fires. */ WARN_ON_ONCE(xa_load(&kvm->vcpu_array, i) || kvm_get_vcpu(kvm, i)); } atomic_set(&kvm->online_vcpus, 0); } EXPORT_SYMBOL_GPL(kvm_destroy_vcpus); #ifdef CONFIG_KVM_GENERIC_MMU_NOTIFIER static inline struct kvm *mmu_notifier_to_kvm(struct mmu_notifier *mn) { return container_of(mn, struct kvm, mmu_notifier); } typedef bool (*gfn_handler_t)(struct kvm *kvm, struct kvm_gfn_range *range); typedef void (*on_lock_fn_t)(struct kvm *kvm); struct kvm_mmu_notifier_range { /* * 64-bit addresses, as KVM notifiers can operate on host virtual * addresses (unsigned long) and guest physical addresses (64-bit). */ u64 start; u64 end; union kvm_mmu_notifier_arg arg; gfn_handler_t handler; on_lock_fn_t on_lock; bool flush_on_ret; bool may_block; bool lockless; }; /* * The inner-most helper returns a tuple containing the return value from the * arch- and action-specific handler, plus a flag indicating whether or not at * least one memslot was found, i.e. if the handler found guest memory. * * Note, most notifiers are averse to booleans, so even though KVM tracks the * return from arch code as a bool, outer helpers will cast it to an int. :-( */ typedef struct kvm_mmu_notifier_return { bool ret; bool found_memslot; } kvm_mn_ret_t; /* * Use a dedicated stub instead of NULL to indicate that there is no callback * function/handler. The compiler technically can't guarantee that a real * function will have a non-zero address, and so it will generate code to * check for !NULL, whereas comparing against a stub will be elided at compile * time (unless the compiler is getting long in the tooth, e.g. gcc 4.9). */ static void kvm_null_fn(void) { } #define IS_KVM_NULL_FN(fn) ((fn) == (void *)kvm_null_fn) /* Iterate over each memslot intersecting [start, last] (inclusive) range */ #define kvm_for_each_memslot_in_hva_range(node, slots, start, last) \ for (node = interval_tree_iter_first(&slots->hva_tree, start, last); \ node; \ node = interval_tree_iter_next(node, start, last)) \ static __always_inline kvm_mn_ret_t kvm_handle_hva_range(struct kvm *kvm, const struct kvm_mmu_notifier_range *range) { struct kvm_mmu_notifier_return r = { .ret = false, .found_memslot = false, }; struct kvm_gfn_range gfn_range; struct kvm_memory_slot *slot; struct kvm_memslots *slots; int i, idx; if (WARN_ON_ONCE(range->end <= range->start)) return r; /* A null handler is allowed if and only if on_lock() is provided. */ if (WARN_ON_ONCE(IS_KVM_NULL_FN(range->on_lock) && IS_KVM_NULL_FN(range->handler))) return r; /* on_lock will never be called for lockless walks */ if (WARN_ON_ONCE(range->lockless && !IS_KVM_NULL_FN(range->on_lock))) return r; idx = srcu_read_lock(&kvm->srcu); for (i = 0; i < kvm_arch_nr_memslot_as_ids(kvm); i++) { struct interval_tree_node *node; slots = __kvm_memslots(kvm, i); kvm_for_each_memslot_in_hva_range(node, slots, range->start, range->end - 1) { unsigned long hva_start, hva_end; slot = container_of(node, struct kvm_memory_slot, hva_node[slots->node_idx]); hva_start = max_t(unsigned long, range->start, slot->userspace_addr); hva_end = min_t(unsigned long, range->end, slot->userspace_addr + (slot->npages << PAGE_SHIFT)); /* * To optimize for the likely case where the address * range is covered by zero or one memslots, don't * bother making these conditional (to avoid writes on * the second or later invocation of the handler). */ gfn_range.arg = range->arg; gfn_range.may_block = range->may_block; /* * HVA-based notifications aren't relevant to private * mappings as they don't have a userspace mapping. */ gfn_range.attr_filter = KVM_FILTER_SHARED; /* * {gfn(page) | page intersects with [hva_start, hva_end)} = * {gfn_start, gfn_start+1, ..., gfn_end-1}. */ gfn_range.start = hva_to_gfn_memslot(hva_start, slot); gfn_range.end = hva_to_gfn_memslot(hva_end + PAGE_SIZE - 1, slot); gfn_range.slot = slot; gfn_range.lockless = range->lockless; if (!r.found_memslot) { r.found_memslot = true; if (!range->lockless) { KVM_MMU_LOCK(kvm); if (!IS_KVM_NULL_FN(range->on_lock)) range->on_lock(kvm); if (IS_KVM_NULL_FN(range->handler)) goto mmu_unlock; } } r.ret |= range->handler(kvm, &gfn_range); } } if (range->flush_on_ret && r.ret) kvm_flush_remote_tlbs(kvm); mmu_unlock: if (r.found_memslot && !range->lockless) KVM_MMU_UNLOCK(kvm); srcu_read_unlock(&kvm->srcu, idx); return r; } static __always_inline int kvm_age_hva_range(struct mmu_notifier *mn, unsigned long start, unsigned long end, gfn_handler_t handler, bool flush_on_ret) { struct kvm *kvm = mmu_notifier_to_kvm(mn); const struct kvm_mmu_notifier_range range = { .start = start, .end = end, .handler = handler, .on_lock = (void *)kvm_null_fn, .flush_on_ret = flush_on_ret, .may_block = false, .lockless = IS_ENABLED(CONFIG_KVM_MMU_LOCKLESS_AGING), }; return kvm_handle_hva_range(kvm, &range).ret; } static __always_inline int kvm_age_hva_range_no_flush(struct mmu_notifier *mn, unsigned long start, unsigned long end, gfn_handler_t handler) { return kvm_age_hva_range(mn, start, end, handler, false); } void kvm_mmu_invalidate_begin(struct kvm *kvm) { lockdep_assert_held_write(&kvm->mmu_lock); /* * The count increase must become visible at unlock time as no * spte can be established without taking the mmu_lock and * count is also read inside the mmu_lock critical section. */ kvm->mmu_invalidate_in_progress++; if (likely(kvm->mmu_invalidate_in_progress == 1)) { kvm->mmu_invalidate_range_start = INVALID_GPA; kvm->mmu_invalidate_range_end = INVALID_GPA; } } void kvm_mmu_invalidate_range_add(struct kvm *kvm, gfn_t start, gfn_t end) { lockdep_assert_held_write(&kvm->mmu_lock); WARN_ON_ONCE(!kvm->mmu_invalidate_in_progress); if (likely(kvm->mmu_invalidate_range_start == INVALID_GPA)) { kvm->mmu_invalidate_range_start = start; kvm->mmu_invalidate_range_end = end; } else { /* * Fully tracking multiple concurrent ranges has diminishing * returns. Keep things simple and just find the minimal range * which includes the current and new ranges. As there won't be * enough information to subtract a range after its invalidate * completes, any ranges invalidated concurrently will * accumulate and persist until all outstanding invalidates * complete. */ kvm->mmu_invalidate_range_start = min(kvm->mmu_invalidate_range_start, start); kvm->mmu_invalidate_range_end = max(kvm->mmu_invalidate_range_end, end); } } bool kvm_mmu_unmap_gfn_range(struct kvm *kvm, struct kvm_gfn_range *range) { kvm_mmu_invalidate_range_add(kvm, range->start, range->end); return kvm_unmap_gfn_range(kvm, range); } static int kvm_mmu_notifier_invalidate_range_start(struct mmu_notifier *mn, const struct mmu_notifier_range *range) { struct kvm *kvm = mmu_notifier_to_kvm(mn); const struct kvm_mmu_notifier_range hva_range = { .start = range->start, .end = range->end, .handler = kvm_mmu_unmap_gfn_range, .on_lock = kvm_mmu_invalidate_begin, .flush_on_ret = true, .may_block = mmu_notifier_range_blockable(range), }; trace_kvm_unmap_hva_range(range->start, range->end); /* * Prevent memslot modification between range_start() and range_end() * so that conditionally locking provides the same result in both * functions. Without that guarantee, the mmu_invalidate_in_progress * adjustments will be imbalanced. * * Pairs with the decrement in range_end(). */ spin_lock(&kvm->mn_invalidate_lock); kvm->mn_active_invalidate_count++; spin_unlock(&kvm->mn_invalidate_lock); /* * Invalidate pfn caches _before_ invalidating the secondary MMUs, i.e. * before acquiring mmu_lock, to avoid holding mmu_lock while acquiring * each cache's lock. There are relatively few caches in existence at * any given time, and the caches themselves can check for hva overlap, * i.e. don't need to rely on memslot overlap checks for performance. * Because this runs without holding mmu_lock, the pfn caches must use * mn_active_invalidate_count (see above) instead of * mmu_invalidate_in_progress. */ gfn_to_pfn_cache_invalidate_start(kvm, range->start, range->end); /* * If one or more memslots were found and thus zapped, notify arch code * that guest memory has been reclaimed. This needs to be done *after* * dropping mmu_lock, as x86's reclaim path is slooooow. */ if (kvm_handle_hva_range(kvm, &hva_range).found_memslot) kvm_arch_guest_memory_reclaimed(kvm); return 0; } void kvm_mmu_invalidate_end(struct kvm *kvm) { lockdep_assert_held_write(&kvm->mmu_lock); /* * This sequence increase will notify the kvm page fault that * the page that is going to be mapped in the spte could have * been freed. */ kvm->mmu_invalidate_seq++; smp_wmb(); /* * The above sequence increase must be visible before the * below count decrease, which is ensured by the smp_wmb above * in conjunction with the smp_rmb in mmu_invalidate_retry(). */ kvm->mmu_invalidate_in_progress--; KVM_BUG_ON(kvm->mmu_invalidate_in_progress < 0, kvm); /* * Assert that at least one range was added between start() and end(). * Not adding a range isn't fatal, but it is a KVM bug. */ WARN_ON_ONCE(kvm->mmu_invalidate_range_start == INVALID_GPA); } static void kvm_mmu_notifier_invalidate_range_end(struct mmu_notifier *mn, const struct mmu_notifier_range *range) { struct kvm *kvm = mmu_notifier_to_kvm(mn); const struct kvm_mmu_notifier_range hva_range = { .start = range->start, .end = range->end, .handler = (void *)kvm_null_fn, .on_lock = kvm_mmu_invalidate_end, .flush_on_ret = false, .may_block = mmu_notifier_range_blockable(range), }; bool wake; kvm_handle_hva_range(kvm, &hva_range); /* Pairs with the increment in range_start(). */ spin_lock(&kvm->mn_invalidate_lock); if (!WARN_ON_ONCE(!kvm->mn_active_invalidate_count)) --kvm->mn_active_invalidate_count; wake = !kvm->mn_active_invalidate_count; spin_unlock(&kvm->mn_invalidate_lock); /* * There can only be one waiter, since the wait happens under * slots_lock. */ if (wake) rcuwait_wake_up(&kvm->mn_memslots_update_rcuwait); } static int kvm_mmu_notifier_clear_flush_young(struct mmu_notifier *mn, struct mm_struct *mm, unsigned long start, unsigned long end) { trace_kvm_age_hva(start, end); return kvm_age_hva_range(mn, start, end, kvm_age_gfn, !IS_ENABLED(CONFIG_KVM_ELIDE_TLB_FLUSH_IF_YOUNG)); } static int kvm_mmu_notifier_clear_young(struct mmu_notifier *mn, struct mm_struct *mm, unsigned long start, unsigned long end) { trace_kvm_age_hva(start, end); /* * Even though we do not flush TLB, this will still adversely * affect performance on pre-Haswell Intel EPT, where there is * no EPT Access Bit to clear so that we have to tear down EPT * tables instead. If we find this unacceptable, we can always * add a parameter to kvm_age_hva so that it effectively doesn't * do anything on clear_young. * * Also note that currently we never issue secondary TLB flushes * from clear_young, leaving this job up to the regular system * cadence. If we find this inaccurate, we might come up with a * more sophisticated heuristic later. */ return kvm_age_hva_range_no_flush(mn, start, end, kvm_age_gfn); } static int kvm_mmu_notifier_test_young(struct mmu_notifier *mn, struct mm_struct *mm, unsigned long address) { trace_kvm_test_age_hva(address); return kvm_age_hva_range_no_flush(mn, address, address + 1, kvm_test_age_gfn); } static void kvm_mmu_notifier_release(struct mmu_notifier *mn, struct mm_struct *mm) { struct kvm *kvm = mmu_notifier_to_kvm(mn); int idx; idx = srcu_read_lock(&kvm->srcu); kvm_flush_shadow_all(kvm); srcu_read_unlock(&kvm->srcu, idx); } static const struct mmu_notifier_ops kvm_mmu_notifier_ops = { .invalidate_range_start = kvm_mmu_notifier_invalidate_range_start, .invalidate_range_end = kvm_mmu_notifier_invalidate_range_end, .clear_flush_young = kvm_mmu_notifier_clear_flush_young, .clear_young = kvm_mmu_notifier_clear_young, .test_young = kvm_mmu_notifier_test_young, .release = kvm_mmu_notifier_release, }; static int kvm_init_mmu_notifier(struct kvm *kvm) { kvm->mmu_notifier.ops = &kvm_mmu_notifier_ops; return mmu_notifier_register(&kvm->mmu_notifier, current->mm); } #else /* !CONFIG_KVM_GENERIC_MMU_NOTIFIER */ static int kvm_init_mmu_notifier(struct kvm *kvm) { return 0; } #endif /* CONFIG_KVM_GENERIC_MMU_NOTIFIER */ #ifdef CONFIG_HAVE_KVM_PM_NOTIFIER static int kvm_pm_notifier_call(struct notifier_block *bl, unsigned long state, void *unused) { struct kvm *kvm = container_of(bl, struct kvm, pm_notifier); return kvm_arch_pm_notifier(kvm, state); } static void kvm_init_pm_notifier(struct kvm *kvm) { kvm->pm_notifier.notifier_call = kvm_pm_notifier_call; /* Suspend KVM before we suspend ftrace, RCU, etc. */ kvm->pm_notifier.priority = INT_MAX; register_pm_notifier(&kvm->pm_notifier); } static void kvm_destroy_pm_notifier(struct kvm *kvm) { unregister_pm_notifier(&kvm->pm_notifier); } #else /* !CONFIG_HAVE_KVM_PM_NOTIFIER */ static void kvm_init_pm_notifier(struct kvm *kvm) { } static void kvm_destroy_pm_notifier(struct kvm *kvm) { } #endif /* CONFIG_HAVE_KVM_PM_NOTIFIER */ static void kvm_destroy_dirty_bitmap(struct kvm_memory_slot *memslot) { if (!memslot->dirty_bitmap) return; vfree(memslot->dirty_bitmap); memslot->dirty_bitmap = NULL; } /* This does not remove the slot from struct kvm_memslots data structures */ static void kvm_free_memslot(struct kvm *kvm, struct kvm_memory_slot *slot) { if (slot->flags & KVM_MEM_GUEST_MEMFD) kvm_gmem_unbind(slot); kvm_destroy_dirty_bitmap(slot); kvm_arch_free_memslot(kvm, slot); kfree(slot); } static void kvm_free_memslots(struct kvm *kvm, struct kvm_memslots *slots) { struct hlist_node *idnode; struct kvm_memory_slot *memslot; int bkt; /* * The same memslot objects live in both active and inactive sets, * arbitrarily free using index '1' so the second invocation of this * function isn't operating over a structure with dangling pointers * (even though this function isn't actually touching them). */ if (!slots->node_idx) return; hash_for_each_safe(slots->id_hash, bkt, idnode, memslot, id_node[1]) kvm_free_memslot(kvm, memslot); } static umode_t kvm_stats_debugfs_mode(const struct _kvm_stats_desc *pdesc) { switch (pdesc->desc.flags & KVM_STATS_TYPE_MASK) { case KVM_STATS_TYPE_INSTANT: return 0444; case KVM_STATS_TYPE_CUMULATIVE: case KVM_STATS_TYPE_PEAK: default: return 0644; } } static void kvm_destroy_vm_debugfs(struct kvm *kvm) { int i; int kvm_debugfs_num_entries = kvm_vm_stats_header.num_desc + kvm_vcpu_stats_header.num_desc; if (IS_ERR(kvm->debugfs_dentry)) return; debugfs_remove_recursive(kvm->debugfs_dentry); if (kvm->debugfs_stat_data) { for (i = 0; i < kvm_debugfs_num_entries; i++) kfree(kvm->debugfs_stat_data[i]); kfree(kvm->debugfs_stat_data); } } static int kvm_create_vm_debugfs(struct kvm *kvm, const char *fdname) { static DEFINE_MUTEX(kvm_debugfs_lock); struct dentry *dent; char dir_name[ITOA_MAX_LEN * 2]; struct kvm_stat_data *stat_data; const struct _kvm_stats_desc *pdesc; int i, ret = -ENOMEM; int kvm_debugfs_num_entries = kvm_vm_stats_header.num_desc + kvm_vcpu_stats_header.num_desc; if (!debugfs_initialized()) return 0; snprintf(dir_name, sizeof(dir_name), "%d-%s", task_pid_nr(current), fdname); mutex_lock(&kvm_debugfs_lock); dent = debugfs_lookup(dir_name, kvm_debugfs_dir); if (dent) { pr_warn_ratelimited("KVM: debugfs: duplicate directory %s\n", dir_name); dput(dent); mutex_unlock(&kvm_debugfs_lock); return 0; } dent = debugfs_create_dir(dir_name, kvm_debugfs_dir); mutex_unlock(&kvm_debugfs_lock); if (IS_ERR(dent)) return 0; kvm->debugfs_dentry = dent; kvm->debugfs_stat_data = kcalloc(kvm_debugfs_num_entries, sizeof(*kvm->debugfs_stat_data), GFP_KERNEL_ACCOUNT); if (!kvm->debugfs_stat_data) goto out_err; for (i = 0; i < kvm_vm_stats_header.num_desc; ++i) { pdesc = &kvm_vm_stats_desc[i]; stat_data = kzalloc(sizeof(*stat_data), GFP_KERNEL_ACCOUNT); if (!stat_data) goto out_err; stat_data->kvm = kvm; stat_data->desc = pdesc; stat_data->kind = KVM_STAT_VM; kvm->debugfs_stat_data[i] = stat_data; debugfs_create_file(pdesc->name, kvm_stats_debugfs_mode(pdesc), kvm->debugfs_dentry, stat_data, &stat_fops_per_vm); } for (i = 0; i < kvm_vcpu_stats_header.num_desc; ++i) { pdesc = &kvm_vcpu_stats_desc[i]; stat_data = kzalloc(sizeof(*stat_data), GFP_KERNEL_ACCOUNT); if (!stat_data) goto out_err; stat_data->kvm = kvm; stat_data->desc = pdesc; stat_data->kind = KVM_STAT_VCPU; kvm->debugfs_stat_data[i + kvm_vm_stats_header.num_desc] = stat_data; debugfs_create_file(pdesc->name, kvm_stats_debugfs_mode(pdesc), kvm->debugfs_dentry, stat_data, &stat_fops_per_vm); } kvm_arch_create_vm_debugfs(kvm); return 0; out_err: kvm_destroy_vm_debugfs(kvm); return ret; } /* * Called just after removing the VM from the vm_list, but before doing any * other destruction. */ void __weak kvm_arch_pre_destroy_vm(struct kvm *kvm) { } /* * Called after per-vm debugfs created. When called kvm->debugfs_dentry should * be setup already, so we can create arch-specific debugfs entries under it. * Cleanup should be automatic done in kvm_destroy_vm_debugfs() recursively, so * a per-arch destroy interface is not needed. */ void __weak kvm_arch_create_vm_debugfs(struct kvm *kvm) { } static struct kvm *kvm_create_vm(unsigned long type, const char *fdname) { struct kvm *kvm = kvm_arch_alloc_vm(); struct kvm_memslots *slots; int r, i, j; if (!kvm) return ERR_PTR(-ENOMEM); KVM_MMU_LOCK_INIT(kvm); mmgrab(current->mm); kvm->mm = current->mm; kvm_eventfd_init(kvm); mutex_init(&kvm->lock); mutex_init(&kvm->irq_lock); mutex_init(&kvm->slots_lock); mutex_init(&kvm->slots_arch_lock); spin_lock_init(&kvm->mn_invalidate_lock); rcuwait_init(&kvm->mn_memslots_update_rcuwait); xa_init(&kvm->vcpu_array); #ifdef CONFIG_KVM_GENERIC_MEMORY_ATTRIBUTES xa_init(&kvm->mem_attr_array); #endif INIT_LIST_HEAD(&kvm->gpc_list); spin_lock_init(&kvm->gpc_lock); INIT_LIST_HEAD(&kvm->devices); kvm->max_vcpus = KVM_MAX_VCPUS; BUILD_BUG_ON(KVM_MEM_SLOTS_NUM > SHRT_MAX); /* * Force subsequent debugfs file creations to fail if the VM directory * is not created (by kvm_create_vm_debugfs()). */ kvm->debugfs_dentry = ERR_PTR(-ENOENT); snprintf(kvm->stats_id, sizeof(kvm->stats_id), "kvm-%d", task_pid_nr(current)); r = -ENOMEM; if (init_srcu_struct(&kvm->srcu)) goto out_err_no_srcu; if (init_srcu_struct(&kvm->irq_srcu)) goto out_err_no_irq_srcu; r = kvm_init_irq_routing(kvm); if (r) goto out_err_no_irq_routing; refcount_set(&kvm->users_count, 1); for (i = 0; i < kvm_arch_nr_memslot_as_ids(kvm); i++) { for (j = 0; j < 2; j++) { slots = &kvm->__memslots[i][j]; atomic_long_set(&slots->last_used_slot, (unsigned long)NULL); slots->hva_tree = RB_ROOT_CACHED; slots->gfn_tree = RB_ROOT; hash_init(slots->id_hash); slots->node_idx = j; /* Generations must be different for each address space. */ slots->generation = i; } rcu_assign_pointer(kvm->memslots[i], &kvm->__memslots[i][0]); } r = -ENOMEM; for (i = 0; i < KVM_NR_BUSES; i++) { rcu_assign_pointer(kvm->buses[i], kzalloc(sizeof(struct kvm_io_bus), GFP_KERNEL_ACCOUNT)); if (!kvm->buses[i]) goto out_err_no_arch_destroy_vm; } r = kvm_arch_init_vm(kvm, type); if (r) goto out_err_no_arch_destroy_vm; r = kvm_enable_virtualization(); if (r) goto out_err_no_disable; #ifdef CONFIG_HAVE_KVM_IRQCHIP INIT_HLIST_HEAD(&kvm->irq_ack_notifier_list); #endif r = kvm_init_mmu_notifier(kvm); if (r) goto out_err_no_mmu_notifier; r = kvm_coalesced_mmio_init(kvm); if (r < 0) goto out_no_coalesced_mmio; r = kvm_create_vm_debugfs(kvm, fdname); if (r) goto out_err_no_debugfs; mutex_lock(&kvm_lock); list_add(&kvm->vm_list, &vm_list); mutex_unlock(&kvm_lock); preempt_notifier_inc(); kvm_init_pm_notifier(kvm); return kvm; out_err_no_debugfs: kvm_coalesced_mmio_free(kvm); out_no_coalesced_mmio: #ifdef CONFIG_KVM_GENERIC_MMU_NOTIFIER if (kvm->mmu_notifier.ops) mmu_notifier_unregister(&kvm->mmu_notifier, current->mm); #endif out_err_no_mmu_notifier: kvm_disable_virtualization(); out_err_no_disable: kvm_arch_destroy_vm(kvm); out_err_no_arch_destroy_vm: WARN_ON_ONCE(!refcount_dec_and_test(&kvm->users_count)); for (i = 0; i < KVM_NR_BUSES; i++) kfree(kvm_get_bus(kvm, i)); kvm_free_irq_routing(kvm); out_err_no_irq_routing: cleanup_srcu_struct(&kvm->irq_srcu); out_err_no_irq_srcu: cleanup_srcu_struct(&kvm->srcu); out_err_no_srcu: kvm_arch_free_vm(kvm); mmdrop(current->mm); return ERR_PTR(r); } static void kvm_destroy_devices(struct kvm *kvm) { struct kvm_device *dev, *tmp; /* * We do not need to take the kvm->lock here, because nobody else * has a reference to the struct kvm at this point and therefore * cannot access the devices list anyhow. * * The device list is generally managed as an rculist, but list_del() * is used intentionally here. If a bug in KVM introduced a reader that * was not backed by a reference on the kvm struct, the hope is that * it'd consume the poisoned forward pointer instead of suffering a * use-after-free, even though this cannot be guaranteed. */ list_for_each_entry_safe(dev, tmp, &kvm->devices, vm_node) { list_del(&dev->vm_node); dev->ops->destroy(dev); } } static void kvm_destroy_vm(struct kvm *kvm) { int i; struct mm_struct *mm = kvm->mm; kvm_destroy_pm_notifier(kvm); kvm_uevent_notify_change(KVM_EVENT_DESTROY_VM, kvm); kvm_destroy_vm_debugfs(kvm); mutex_lock(&kvm_lock); list_del(&kvm->vm_list); mutex_unlock(&kvm_lock); kvm_arch_pre_destroy_vm(kvm); kvm_free_irq_routing(kvm); for (i = 0; i < KVM_NR_BUSES; i++) { struct kvm_io_bus *bus = kvm_get_bus(kvm, i); if (bus) kvm_io_bus_destroy(bus); kvm->buses[i] = NULL; } kvm_coalesced_mmio_free(kvm); #ifdef CONFIG_KVM_GENERIC_MMU_NOTIFIER mmu_notifier_unregister(&kvm->mmu_notifier, kvm->mm); /* * At this point, pending calls to invalidate_range_start() * have completed but no more MMU notifiers will run, so * mn_active_invalidate_count may remain unbalanced. * No threads can be waiting in kvm_swap_active_memslots() as the * last reference on KVM has been dropped, but freeing * memslots would deadlock without this manual intervention. * * If the count isn't unbalanced, i.e. KVM did NOT unregister its MMU * notifier between a start() and end(), then there shouldn't be any * in-progress invalidations. */ WARN_ON(rcuwait_active(&kvm->mn_memslots_update_rcuwait)); if (kvm->mn_active_invalidate_count) kvm->mn_active_invalidate_count = 0; else WARN_ON(kvm->mmu_invalidate_in_progress); #else kvm_flush_shadow_all(kvm); #endif kvm_arch_destroy_vm(kvm); kvm_destroy_devices(kvm); for (i = 0; i < kvm_arch_nr_memslot_as_ids(kvm); i++) { kvm_free_memslots(kvm, &kvm->__memslots[i][0]); kvm_free_memslots(kvm, &kvm->__memslots[i][1]); } cleanup_srcu_struct(&kvm->irq_srcu); cleanup_srcu_struct(&kvm->srcu); #ifdef CONFIG_KVM_GENERIC_MEMORY_ATTRIBUTES xa_destroy(&kvm->mem_attr_array); #endif kvm_arch_free_vm(kvm); preempt_notifier_dec(); kvm_disable_virtualization(); mmdrop(mm); } void kvm_get_kvm(struct kvm *kvm) { refcount_inc(&kvm->users_count); } EXPORT_SYMBOL_GPL(kvm_get_kvm); /* * Make sure the vm is not during destruction, which is a safe version of * kvm_get_kvm(). Return true if kvm referenced successfully, false otherwise. */ bool kvm_get_kvm_safe(struct kvm *kvm) { return refcount_inc_not_zero(&kvm->users_count); } EXPORT_SYMBOL_GPL(kvm_get_kvm_safe); void kvm_put_kvm(struct kvm *kvm) { if (refcount_dec_and_test(&kvm->users_count)) kvm_destroy_vm(kvm); } EXPORT_SYMBOL_GPL(kvm_put_kvm); /* * Used to put a reference that was taken on behalf of an object associated * with a user-visible file descriptor, e.g. a vcpu or device, if installation * of the new file descriptor fails and the reference cannot be transferred to * its final owner. In such cases, the caller is still actively using @kvm and * will fail miserably if the refcount unexpectedly hits zero. */ void kvm_put_kvm_no_destroy(struct kvm *kvm) { WARN_ON(refcount_dec_and_test(&kvm->users_count)); } EXPORT_SYMBOL_GPL(kvm_put_kvm_no_destroy); static int kvm_vm_release(struct inode *inode, struct file *filp) { struct kvm *kvm = filp->private_data; kvm_irqfd_release(kvm); kvm_put_kvm(kvm); return 0; } int kvm_trylock_all_vcpus(struct kvm *kvm) { struct kvm_vcpu *vcpu; unsigned long i, j; lockdep_assert_held(&kvm->lock); kvm_for_each_vcpu(i, vcpu, kvm) if (!mutex_trylock_nest_lock(&vcpu->mutex, &kvm->lock)) goto out_unlock; return 0; out_unlock: kvm_for_each_vcpu(j, vcpu, kvm) { if (i == j) break; mutex_unlock(&vcpu->mutex); } return -EINTR; } EXPORT_SYMBOL_GPL(kvm_trylock_all_vcpus); int kvm_lock_all_vcpus(struct kvm *kvm) { struct kvm_vcpu *vcpu; unsigned long i, j; int r; lockdep_assert_held(&kvm->lock); kvm_for_each_vcpu(i, vcpu, kvm) { r = mutex_lock_killable_nest_lock(&vcpu->mutex, &kvm->lock); if (r) goto out_unlock; } return 0; out_unlock: kvm_for_each_vcpu(j, vcpu, kvm) { if (i == j) break; mutex_unlock(&vcpu->mutex); } return r; } EXPORT_SYMBOL_GPL(kvm_lock_all_vcpus); void kvm_unlock_all_vcpus(struct kvm *kvm) { struct kvm_vcpu *vcpu; unsigned long i; lockdep_assert_held(&kvm->lock); kvm_for_each_vcpu(i, vcpu, kvm) mutex_unlock(&vcpu->mutex); } EXPORT_SYMBOL_GPL(kvm_unlock_all_vcpus); /* * Allocation size is twice as large as the actual dirty bitmap size. * See kvm_vm_ioctl_get_dirty_log() why this is needed. */ static int kvm_alloc_dirty_bitmap(struct kvm_memory_slot *memslot) { unsigned long dirty_bytes = kvm_dirty_bitmap_bytes(memslot); memslot->dirty_bitmap = __vcalloc(2, dirty_bytes, GFP_KERNEL_ACCOUNT); if (!memslot->dirty_bitmap) return -ENOMEM; return 0; } static struct kvm_memslots *kvm_get_inactive_memslots(struct kvm *kvm, int as_id) { struct kvm_memslots *active = __kvm_memslots(kvm, as_id); int node_idx_inactive = active->node_idx ^ 1; return &kvm->__memslots[as_id][node_idx_inactive]; } /* * Helper to get the address space ID when one of memslot pointers may be NULL. * This also serves as a sanity that at least one of the pointers is non-NULL, * and that their address space IDs don't diverge. */ static int kvm_memslots_get_as_id(struct kvm_memory_slot *a, struct kvm_memory_slot *b) { if (WARN_ON_ONCE(!a && !b)) return 0; if (!a) return b->as_id; if (!b) return a->as_id; WARN_ON_ONCE(a->as_id != b->as_id); return a->as_id; } static void kvm_insert_gfn_node(struct kvm_memslots *slots, struct kvm_memory_slot *slot) { struct rb_root *gfn_tree = &slots->gfn_tree; struct rb_node **node, *parent; int idx = slots->node_idx; parent = NULL; for (node = &gfn_tree->rb_node; *node; ) { struct kvm_memory_slot *tmp; tmp = container_of(*node, struct kvm_memory_slot, gfn_node[idx]); parent = *node; if (slot->base_gfn < tmp->base_gfn) node = &(*node)->rb_left; else if (slot->base_gfn > tmp->base_gfn) node = &(*node)->rb_right; else BUG(); } rb_link_node(&slot->gfn_node[idx], parent, node); rb_insert_color(&slot->gfn_node[idx], gfn_tree); } static void kvm_erase_gfn_node(struct kvm_memslots *slots, struct kvm_memory_slot *slot) { rb_erase(&slot->gfn_node[slots->node_idx], &slots->gfn_tree); } static void kvm_replace_gfn_node(struct kvm_memslots *slots, struct kvm_memory_slot *old, struct kvm_memory_slot *new) { int idx = slots->node_idx; WARN_ON_ONCE(old->base_gfn != new->base_gfn); rb_replace_node(&old->gfn_node[idx], &new->gfn_node[idx], &slots->gfn_tree); } /* * Replace @old with @new in the inactive memslots. * * With NULL @old this simply adds @new. * With NULL @new this simply removes @old. * * If @new is non-NULL its hva_node[slots_idx] range has to be set * appropriately. */ static void kvm_replace_memslot(struct kvm *kvm, struct kvm_memory_slot *old, struct kvm_memory_slot *new) { int as_id = kvm_memslots_get_as_id(old, new); struct kvm_memslots *slots = kvm_get_inactive_memslots(kvm, as_id); int idx = slots->node_idx; if (old) { hash_del(&old->id_node[idx]); interval_tree_remove(&old->hva_node[idx], &slots->hva_tree); if ((long)old == atomic_long_read(&slots->last_used_slot)) atomic_long_set(&slots->last_used_slot, (long)new); if (!new) { kvm_erase_gfn_node(slots, old); return; } } /* * Initialize @new's hva range. Do this even when replacing an @old * slot, kvm_copy_memslot() deliberately does not touch node data. */ new->hva_node[idx].start = new->userspace_addr; new->hva_node[idx].last = new->userspace_addr + (new->npages << PAGE_SHIFT) - 1; /* * (Re)Add the new memslot. There is no O(1) interval_tree_replace(), * hva_node needs to be swapped with remove+insert even though hva can't * change when replacing an existing slot. */ hash_add(slots->id_hash, &new->id_node[idx], new->id); interval_tree_insert(&new->hva_node[idx], &slots->hva_tree); /* * If the memslot gfn is unchanged, rb_replace_node() can be used to * switch the node in the gfn tree instead of removing the old and * inserting the new as two separate operations. Replacement is a * single O(1) operation versus two O(log(n)) operations for * remove+insert. */ if (old && old->base_gfn == new->base_gfn) { kvm_replace_gfn_node(slots, old, new); } else { if (old) kvm_erase_gfn_node(slots, old); kvm_insert_gfn_node(slots, new); } } /* * Flags that do not access any of the extra space of struct * kvm_userspace_memory_region2. KVM_SET_USER_MEMORY_REGION_V1_FLAGS * only allows these. */ #define KVM_SET_USER_MEMORY_REGION_V1_FLAGS \ (KVM_MEM_LOG_DIRTY_PAGES | KVM_MEM_READONLY) static int check_memory_region_flags(struct kvm *kvm, const struct kvm_userspace_memory_region2 *mem) { u32 valid_flags = KVM_MEM_LOG_DIRTY_PAGES; if (kvm_arch_has_private_mem(kvm)) valid_flags |= KVM_MEM_GUEST_MEMFD; /* Dirty logging private memory is not currently supported. */ if (mem->flags & KVM_MEM_GUEST_MEMFD) valid_flags &= ~KVM_MEM_LOG_DIRTY_PAGES; /* * GUEST_MEMFD is incompatible with read-only memslots, as writes to * read-only memslots have emulated MMIO, not page fault, semantics, * and KVM doesn't allow emulated MMIO for private memory. */ if (kvm_arch_has_readonly_mem(kvm) && !(mem->flags & KVM_MEM_GUEST_MEMFD)) valid_flags |= KVM_MEM_READONLY; if (mem->flags & ~valid_flags) return -EINVAL; return 0; } static void kvm_swap_active_memslots(struct kvm *kvm, int as_id) { struct kvm_memslots *slots = kvm_get_inactive_memslots(kvm, as_id); /* Grab the generation from the activate memslots. */ u64 gen = __kvm_memslots(kvm, as_id)->generation; WARN_ON(gen & KVM_MEMSLOT_GEN_UPDATE_IN_PROGRESS); slots->generation = gen | KVM_MEMSLOT_GEN_UPDATE_IN_PROGRESS; /* * Do not store the new memslots while there are invalidations in * progress, otherwise the locking in invalidate_range_start and * invalidate_range_end will be unbalanced. */ spin_lock(&kvm->mn_invalidate_lock); prepare_to_rcuwait(&kvm->mn_memslots_update_rcuwait); while (kvm->mn_active_invalidate_count) { set_current_state(TASK_UNINTERRUPTIBLE); spin_unlock(&kvm->mn_invalidate_lock); schedule(); spin_lock(&kvm->mn_invalidate_lock); } finish_rcuwait(&kvm->mn_memslots_update_rcuwait); rcu_assign_pointer(kvm->memslots[as_id], slots); spin_unlock(&kvm->mn_invalidate_lock); /* * Acquired in kvm_set_memslot. Must be released before synchronize * SRCU below in order to avoid deadlock with another thread * acquiring the slots_arch_lock in an srcu critical section. */ mutex_unlock(&kvm->slots_arch_lock); synchronize_srcu_expedited(&kvm->srcu); /* * Increment the new memslot generation a second time, dropping the * update in-progress flag and incrementing the generation based on * the number of address spaces. This provides a unique and easily * identifiable generation number while the memslots are in flux. */ gen = slots->generation & ~KVM_MEMSLOT_GEN_UPDATE_IN_PROGRESS; /* * Generations must be unique even across address spaces. We do not need * a global counter for that, instead the generation space is evenly split * across address spaces. For example, with two address spaces, address * space 0 will use generations 0, 2, 4, ... while address space 1 will * use generations 1, 3, 5, ... */ gen += kvm_arch_nr_memslot_as_ids(kvm); kvm_arch_memslots_updated(kvm, gen); slots->generation = gen; } static int kvm_prepare_memory_region(struct kvm *kvm, const struct kvm_memory_slot *old, struct kvm_memory_slot *new, enum kvm_mr_change change) { int r; /* * If dirty logging is disabled, nullify the bitmap; the old bitmap * will be freed on "commit". If logging is enabled in both old and * new, reuse the existing bitmap. If logging is enabled only in the * new and KVM isn't using a ring buffer, allocate and initialize a * new bitmap. */ if (change != KVM_MR_DELETE) { if (!(new->flags & KVM_MEM_LOG_DIRTY_PAGES)) new->dirty_bitmap = NULL; else if (old && old->dirty_bitmap) new->dirty_bitmap = old->dirty_bitmap; else if (kvm_use_dirty_bitmap(kvm)) { r = kvm_alloc_dirty_bitmap(new); if (r) return r; if (kvm_dirty_log_manual_protect_and_init_set(kvm)) bitmap_set(new->dirty_bitmap, 0, new->npages); } } r = kvm_arch_prepare_memory_region(kvm, old, new, change); /* Free the bitmap on failure if it was allocated above. */ if (r && new && new->dirty_bitmap && (!old || !old->dirty_bitmap)) kvm_destroy_dirty_bitmap(new); return r; } static void kvm_commit_memory_region(struct kvm *kvm, struct kvm_memory_slot *old, const struct kvm_memory_slot *new, enum kvm_mr_change change) { int old_flags = old ? old->flags : 0; int new_flags = new ? new->flags : 0; /* * Update the total number of memslot pages before calling the arch * hook so that architectures can consume the result directly. */ if (change == KVM_MR_DELETE) kvm->nr_memslot_pages -= old->npages; else if (change == KVM_MR_CREATE) kvm->nr_memslot_pages += new->npages; if ((old_flags ^ new_flags) & KVM_MEM_LOG_DIRTY_PAGES) { int change = (new_flags & KVM_MEM_LOG_DIRTY_PAGES) ? 1 : -1; atomic_set(&kvm->nr_memslots_dirty_logging, atomic_read(&kvm->nr_memslots_dirty_logging) + change); } kvm_arch_commit_memory_region(kvm, old, new, change); switch (change) { case KVM_MR_CREATE: /* Nothing more to do. */ break; case KVM_MR_DELETE: /* Free the old memslot and all its metadata. */ kvm_free_memslot(kvm, old); break; case KVM_MR_MOVE: case KVM_MR_FLAGS_ONLY: /* * Free the dirty bitmap as needed; the below check encompasses * both the flags and whether a ring buffer is being used) */ if (old->dirty_bitmap && !new->dirty_bitmap) kvm_destroy_dirty_bitmap(old); /* * The final quirk. Free the detached, old slot, but only its * memory, not any metadata. Metadata, including arch specific * data, may be reused by @new. */ kfree(old); break; default: BUG(); } } /* * Activate @new, which must be installed in the inactive slots by the caller, * by swapping the active slots and then propagating @new to @old once @old is * unreachable and can be safely modified. * * With NULL @old this simply adds @new to @active (while swapping the sets). * With NULL @new this simply removes @old from @active and frees it * (while also swapping the sets). */ static void kvm_activate_memslot(struct kvm *kvm, struct kvm_memory_slot *old, struct kvm_memory_slot *new) { int as_id = kvm_memslots_get_as_id(old, new); kvm_swap_active_memslots(kvm, as_id); /* Propagate the new memslot to the now inactive memslots. */ kvm_replace_memslot(kvm, old, new); } static void kvm_copy_memslot(struct kvm_memory_slot *dest, const struct kvm_memory_slot *src) { dest->base_gfn = src->base_gfn; dest->npages = src->npages; dest->dirty_bitmap = src->dirty_bitmap; dest->arch = src->arch; dest->userspace_addr = src->userspace_addr; dest->flags = src->flags; dest->id = src->id; dest->as_id = src->as_id; } static void kvm_invalidate_memslot(struct kvm *kvm, struct kvm_memory_slot *old, struct kvm_memory_slot *invalid_slot) { /* * Mark the current slot INVALID. As with all memslot modifications, * this must be done on an unreachable slot to avoid modifying the * current slot in the active tree. */ kvm_copy_memslot(invalid_slot, old); invalid_slot->flags |= KVM_MEMSLOT_INVALID; kvm_replace_memslot(kvm, old, invalid_slot); /* * Activate the slot that is now marked INVALID, but don't propagate * the slot to the now inactive slots. The slot is either going to be * deleted or recreated as a new slot. */ kvm_swap_active_memslots(kvm, old->as_id); /* * From this point no new shadow pages pointing to a deleted, or moved, * memslot will be created. Validation of sp->gfn happens in: * - gfn_to_hva (kvm_read_guest, gfn_to_pfn) * - kvm_is_visible_gfn (mmu_check_root) */ kvm_arch_flush_shadow_memslot(kvm, old); kvm_arch_guest_memory_reclaimed(kvm); /* Was released by kvm_swap_active_memslots(), reacquire. */ mutex_lock(&kvm->slots_arch_lock); /* * Copy the arch-specific field of the newly-installed slot back to the * old slot as the arch data could have changed between releasing * slots_arch_lock in kvm_swap_active_memslots() and re-acquiring the lock * above. Writers are required to retrieve memslots *after* acquiring * slots_arch_lock, thus the active slot's data is guaranteed to be fresh. */ old->arch = invalid_slot->arch; } static void kvm_create_memslot(struct kvm *kvm, struct kvm_memory_slot *new) { /* Add the new memslot to the inactive set and activate. */ kvm_replace_memslot(kvm, NULL, new); kvm_activate_memslot(kvm, NULL, new); } static void kvm_delete_memslot(struct kvm *kvm, struct kvm_memory_slot *old, struct kvm_memory_slot *invalid_slot) { /* * Remove the old memslot (in the inactive memslots) by passing NULL as * the "new" slot, and for the invalid version in the active slots. */ kvm_replace_memslot(kvm, old, NULL); kvm_activate_memslot(kvm, invalid_slot, NULL); } static void kvm_move_memslot(struct kvm *kvm, struct kvm_memory_slot *old, struct kvm_memory_slot *new, struct kvm_memory_slot *invalid_slot) { /* * Replace the old memslot in the inactive slots, and then swap slots * and replace the current INVALID with the new as well. */ kvm_replace_memslot(kvm, old, new); kvm_activate_memslot(kvm, invalid_slot, new); } static void kvm_update_flags_memslot(struct kvm *kvm, struct kvm_memory_slot *old, struct kvm_memory_slot *new) { /* * Similar to the MOVE case, but the slot doesn't need to be zapped as * an intermediate step. Instead, the old memslot is simply replaced * with a new, updated copy in both memslot sets. */ kvm_replace_memslot(kvm, old, new); kvm_activate_memslot(kvm, old, new); } static int kvm_set_memslot(struct kvm *kvm, struct kvm_memory_slot *old, struct kvm_memory_slot *new, enum kvm_mr_change change) { struct kvm_memory_slot *invalid_slot; int r; /* * Released in kvm_swap_active_memslots(). * * Must be held from before the current memslots are copied until after * the new memslots are installed with rcu_assign_pointer, then * released before the synchronize srcu in kvm_swap_active_memslots(). * * When modifying memslots outside of the slots_lock, must be held * before reading the pointer to the current memslots until after all * changes to those memslots are complete. * * These rules ensure that installing new memslots does not lose * changes made to the previous memslots. */ mutex_lock(&kvm->slots_arch_lock); /* * Invalidate the old slot if it's being deleted or moved. This is * done prior to actually deleting/moving the memslot to allow vCPUs to * continue running by ensuring there are no mappings or shadow pages * for the memslot when it is deleted/moved. Without pre-invalidation * (and without a lock), a window would exist between effecting the * delete/move and committing the changes in arch code where KVM or a * guest could access a non-existent memslot. * * Modifications are done on a temporary, unreachable slot. The old * slot needs to be preserved in case a later step fails and the * invalidation needs to be reverted. */ if (change == KVM_MR_DELETE || change == KVM_MR_MOVE) { invalid_slot = kzalloc(sizeof(*invalid_slot), GFP_KERNEL_ACCOUNT); if (!invalid_slot) { mutex_unlock(&kvm->slots_arch_lock); return -ENOMEM; } kvm_invalidate_memslot(kvm, old, invalid_slot); } r = kvm_prepare_memory_region(kvm, old, new, change); if (r) { /* * For DELETE/MOVE, revert the above INVALID change. No * modifications required since the original slot was preserved * in the inactive slots. Changing the active memslots also * release slots_arch_lock. */ if (change == KVM_MR_DELETE || change == KVM_MR_MOVE) { kvm_activate_memslot(kvm, invalid_slot, old); kfree(invalid_slot); } else { mutex_unlock(&kvm->slots_arch_lock); } return r; } /* * For DELETE and MOVE, the working slot is now active as the INVALID * version of the old slot. MOVE is particularly special as it reuses * the old slot and returns a copy of the old slot (in working_slot). * For CREATE, there is no old slot. For DELETE and FLAGS_ONLY, the * old slot is detached but otherwise preserved. */ if (change == KVM_MR_CREATE) kvm_create_memslot(kvm, new); else if (change == KVM_MR_DELETE) kvm_delete_memslot(kvm, old, invalid_slot); else if (change == KVM_MR_MOVE) kvm_move_memslot(kvm, old, new, invalid_slot); else if (change == KVM_MR_FLAGS_ONLY) kvm_update_flags_memslot(kvm, old, new); else BUG(); /* Free the temporary INVALID slot used for DELETE and MOVE. */ if (change == KVM_MR_DELETE || change == KVM_MR_MOVE) kfree(invalid_slot); /* * No need to refresh new->arch, changes after dropping slots_arch_lock * will directly hit the final, active memslot. Architectures are * responsible for knowing that new->arch may be stale. */ kvm_commit_memory_region(kvm, old, new, change); return 0; } static bool kvm_check_memslot_overlap(struct kvm_memslots *slots, int id, gfn_t start, gfn_t end) { struct kvm_memslot_iter iter; kvm_for_each_memslot_in_gfn_range(&iter, slots, start, end) { if (iter.slot->id != id) return true; } return false; } static int kvm_set_memory_region(struct kvm *kvm, const struct kvm_userspace_memory_region2 *mem) { struct kvm_memory_slot *old, *new; struct kvm_memslots *slots; enum kvm_mr_change change; unsigned long npages; gfn_t base_gfn; int as_id, id; int r; lockdep_assert_held(&kvm->slots_lock); r = check_memory_region_flags(kvm, mem); if (r) return r; as_id = mem->slot >> 16; id = (u16)mem->slot; /* General sanity checks */ if ((mem->memory_size & (PAGE_SIZE - 1)) || (mem->memory_size != (unsigned long)mem->memory_size)) return -EINVAL; if (mem->guest_phys_addr & (PAGE_SIZE - 1)) return -EINVAL; /* We can read the guest memory with __xxx_user() later on. */ if ((mem->userspace_addr & (PAGE_SIZE - 1)) || (mem->userspace_addr != untagged_addr(mem->userspace_addr)) || !access_ok((void __user *)(unsigned long)mem->userspace_addr, mem->memory_size)) return -EINVAL; if (mem->flags & KVM_MEM_GUEST_MEMFD && (mem->guest_memfd_offset & (PAGE_SIZE - 1) || mem->guest_memfd_offset + mem->memory_size < mem->guest_memfd_offset)) return -EINVAL; if (as_id >= kvm_arch_nr_memslot_as_ids(kvm) || id >= KVM_MEM_SLOTS_NUM) return -EINVAL; if (mem->guest_phys_addr + mem->memory_size < mem->guest_phys_addr) return -EINVAL; /* * The size of userspace-defined memory regions is restricted in order * to play nice with dirty bitmap operations, which are indexed with an * "unsigned int". KVM's internal memory regions don't support dirty * logging, and so are exempt. */ if (id < KVM_USER_MEM_SLOTS && (mem->memory_size >> PAGE_SHIFT) > KVM_MEM_MAX_NR_PAGES) return -EINVAL; slots = __kvm_memslots(kvm, as_id); /* * Note, the old memslot (and the pointer itself!) may be invalidated * and/or destroyed by kvm_set_memslot(). */ old = id_to_memslot(slots, id); if (!mem->memory_size) { if (!old || !old->npages) return -EINVAL; if (WARN_ON_ONCE(kvm->nr_memslot_pages < old->npages)) return -EIO; return kvm_set_memslot(kvm, old, NULL, KVM_MR_DELETE); } base_gfn = (mem->guest_phys_addr >> PAGE_SHIFT); npages = (mem->memory_size >> PAGE_SHIFT); if (!old || !old->npages) { change = KVM_MR_CREATE; /* * To simplify KVM internals, the total number of pages across * all memslots must fit in an unsigned long. */ if ((kvm->nr_memslot_pages + npages) < kvm->nr_memslot_pages) return -EINVAL; } else { /* Modify an existing slot. */ /* Private memslots are immutable, they can only be deleted. */ if (mem->flags & KVM_MEM_GUEST_MEMFD) return -EINVAL; if ((mem->userspace_addr != old->userspace_addr) || (npages != old->npages) || ((mem->flags ^ old->flags) & KVM_MEM_READONLY)) return -EINVAL; if (base_gfn != old->base_gfn) change = KVM_MR_MOVE; else if (mem->flags != old->flags) change = KVM_MR_FLAGS_ONLY; else /* Nothing to change. */ return 0; } if ((change == KVM_MR_CREATE || change == KVM_MR_MOVE) && kvm_check_memslot_overlap(slots, id, base_gfn, base_gfn + npages)) return -EEXIST; /* Allocate a slot that will persist in the memslot. */ new = kzalloc(sizeof(*new), GFP_KERNEL_ACCOUNT); if (!new) return -ENOMEM; new->as_id = as_id; new->id = id; new->base_gfn = base_gfn; new->npages = npages; new->flags = mem->flags; new->userspace_addr = mem->userspace_addr; if (mem->flags & KVM_MEM_GUEST_MEMFD) { r = kvm_gmem_bind(kvm, new, mem->guest_memfd, mem->guest_memfd_offset); if (r) goto out; } r = kvm_set_memslot(kvm, old, new, change); if (r) goto out_unbind; return 0; out_unbind: if (mem->flags & KVM_MEM_GUEST_MEMFD) kvm_gmem_unbind(new); out: kfree(new); return r; } int kvm_set_internal_memslot(struct kvm *kvm, const struct kvm_userspace_memory_region2 *mem) { if (WARN_ON_ONCE(mem->slot < KVM_USER_MEM_SLOTS)) return -EINVAL; if (WARN_ON_ONCE(mem->flags)) return -EINVAL; return kvm_set_memory_region(kvm, mem); } EXPORT_SYMBOL_GPL(kvm_set_internal_memslot); static int kvm_vm_ioctl_set_memory_region(struct kvm *kvm, struct kvm_userspace_memory_region2 *mem) { if ((u16)mem->slot >= KVM_USER_MEM_SLOTS) return -EINVAL; guard(mutex)(&kvm->slots_lock); return kvm_set_memory_region(kvm, mem); } #ifndef CONFIG_KVM_GENERIC_DIRTYLOG_READ_PROTECT /** * kvm_get_dirty_log - get a snapshot of dirty pages * @kvm: pointer to kvm instance * @log: slot id and address to which we copy the log * @is_dirty: set to '1' if any dirty pages were found * @memslot: set to the associated memslot, always valid on success */ int kvm_get_dirty_log(struct kvm *kvm, struct kvm_dirty_log *log, int *is_dirty, struct kvm_memory_slot **memslot) { struct kvm_memslots *slots; int i, as_id, id; unsigned long n; unsigned long any = 0; /* Dirty ring tracking may be exclusive to dirty log tracking */ if (!kvm_use_dirty_bitmap(kvm)) return -ENXIO; *memslot = NULL; *is_dirty = 0; as_id = log->slot >> 16; id = (u16)log->slot; if (as_id >= kvm_arch_nr_memslot_as_ids(kvm) || id >= KVM_USER_MEM_SLOTS) return -EINVAL; slots = __kvm_memslots(kvm, as_id); *memslot = id_to_memslot(slots, id); if (!(*memslot) || !(*memslot)->dirty_bitmap) return -ENOENT; kvm_arch_sync_dirty_log(kvm, *memslot); n = kvm_dirty_bitmap_bytes(*memslot); for (i = 0; !any && i < n/sizeof(long); ++i) any = (*memslot)->dirty_bitmap[i]; if (copy_to_user(log->dirty_bitmap, (*memslot)->dirty_bitmap, n)) return -EFAULT; if (any) *is_dirty = 1; return 0; } EXPORT_SYMBOL_GPL(kvm_get_dirty_log); #else /* CONFIG_KVM_GENERIC_DIRTYLOG_READ_PROTECT */ /** * kvm_get_dirty_log_protect - get a snapshot of dirty pages * and reenable dirty page tracking for the corresponding pages. * @kvm: pointer to kvm instance * @log: slot id and address to which we copy the log * * We need to keep it in mind that VCPU threads can write to the bitmap * concurrently. So, to avoid losing track of dirty pages we keep the * following order: * * 1. Take a snapshot of the bit and clear it if needed. * 2. Write protect the corresponding page. * 3. Copy the snapshot to the userspace. * 4. Upon return caller flushes TLB's if needed. * * Between 2 and 4, the guest may write to the page using the remaining TLB * entry. This is not a problem because the page is reported dirty using * the snapshot taken before and step 4 ensures that writes done after * exiting to userspace will be logged for the next call. * */ static int kvm_get_dirty_log_protect(struct kvm *kvm, struct kvm_dirty_log *log) { struct kvm_memslots *slots; struct kvm_memory_slot *memslot; int i, as_id, id; unsigned long n; unsigned long *dirty_bitmap; unsigned long *dirty_bitmap_buffer; bool flush; /* Dirty ring tracking may be exclusive to dirty log tracking */ if (!kvm_use_dirty_bitmap(kvm)) return -ENXIO; as_id = log->slot >> 16; id = (u16)log->slot; if (as_id >= kvm_arch_nr_memslot_as_ids(kvm) || id >= KVM_USER_MEM_SLOTS) return -EINVAL; slots = __kvm_memslots(kvm, as_id); memslot = id_to_memslot(slots, id); if (!memslot || !memslot->dirty_bitmap) return -ENOENT; dirty_bitmap = memslot->dirty_bitmap; kvm_arch_sync_dirty_log(kvm, memslot); n = kvm_dirty_bitmap_bytes(memslot); flush = false; if (kvm->manual_dirty_log_protect) { /* * Unlike kvm_get_dirty_log, we always return false in *flush, * because no flush is needed until KVM_CLEAR_DIRTY_LOG. There * is some code duplication between this function and * kvm_get_dirty_log, but hopefully all architecture * transition to kvm_get_dirty_log_protect and kvm_get_dirty_log * can be eliminated. */ dirty_bitmap_buffer = dirty_bitmap; } else { dirty_bitmap_buffer = kvm_second_dirty_bitmap(memslot); memset(dirty_bitmap_buffer, 0, n); KVM_MMU_LOCK(kvm); for (i = 0; i < n / sizeof(long); i++) { unsigned long mask; gfn_t offset; if (!dirty_bitmap[i]) continue; flush = true; mask = xchg(&dirty_bitmap[i], 0); dirty_bitmap_buffer[i] = mask; offset = i * BITS_PER_LONG; kvm_arch_mmu_enable_log_dirty_pt_masked(kvm, memslot, offset, mask); } KVM_MMU_UNLOCK(kvm); } if (flush) kvm_flush_remote_tlbs_memslot(kvm, memslot); if (copy_to_user(log->dirty_bitmap, dirty_bitmap_buffer, n)) return -EFAULT; return 0; } /** * kvm_vm_ioctl_get_dirty_log - get and clear the log of dirty pages in a slot * @kvm: kvm instance * @log: slot id and address to which we copy the log * * Steps 1-4 below provide general overview of dirty page logging. See * kvm_get_dirty_log_protect() function description for additional details. * * We call kvm_get_dirty_log_protect() to handle steps 1-3, upon return we * always flush the TLB (step 4) even if previous step failed and the dirty * bitmap may be corrupt. Regardless of previous outcome the KVM logging API * does not preclude user space subsequent dirty log read. Flushing TLB ensures * writes will be marked dirty for next log read. * * 1. Take a snapshot of the bit and clear it if needed. * 2. Write protect the corresponding page. * 3. Copy the snapshot to the userspace. * 4. Flush TLB's if needed. */ static int kvm_vm_ioctl_get_dirty_log(struct kvm *kvm, struct kvm_dirty_log *log) { int r; mutex_lock(&kvm->slots_lock); r = kvm_get_dirty_log_protect(kvm, log); mutex_unlock(&kvm->slots_lock); return r; } /** * kvm_clear_dirty_log_protect - clear dirty bits in the bitmap * and reenable dirty page tracking for the corresponding pages. * @kvm: pointer to kvm instance * @log: slot id and address from which to fetch the bitmap of dirty pages */ static int kvm_clear_dirty_log_protect(struct kvm *kvm, struct kvm_clear_dirty_log *log) { struct kvm_memslots *slots; struct kvm_memory_slot *memslot; int as_id, id; gfn_t offset; unsigned long i, n; unsigned long *dirty_bitmap; unsigned long *dirty_bitmap_buffer; bool flush; /* Dirty ring tracking may be exclusive to dirty log tracking */ if (!kvm_use_dirty_bitmap(kvm)) return -ENXIO; as_id = log->slot >> 16; id = (u16)log->slot; if (as_id >= kvm_arch_nr_memslot_as_ids(kvm) || id >= KVM_USER_MEM_SLOTS) return -EINVAL; if (log->first_page & 63) return -EINVAL; slots = __kvm_memslots(kvm, as_id); memslot = id_to_memslot(slots, id); if (!memslot || !memslot->dirty_bitmap) return -ENOENT; dirty_bitmap = memslot->dirty_bitmap; n = ALIGN(log->num_pages, BITS_PER_LONG) / 8; if (log->first_page > memslot->npages || log->num_pages > memslot->npages - log->first_page || (log->num_pages < memslot->npages - log->first_page && (log->num_pages & 63))) return -EINVAL; kvm_arch_sync_dirty_log(kvm, memslot); flush = false; dirty_bitmap_buffer = kvm_second_dirty_bitmap(memslot); if (copy_from_user(dirty_bitmap_buffer, log->dirty_bitmap, n)) return -EFAULT; KVM_MMU_LOCK(kvm); for (offset = log->first_page, i = offset / BITS_PER_LONG, n = DIV_ROUND_UP(log->num_pages, BITS_PER_LONG); n--; i++, offset += BITS_PER_LONG) { unsigned long mask = *dirty_bitmap_buffer++; atomic_long_t *p = (atomic_long_t *) &dirty_bitmap[i]; if (!mask) continue; mask &= atomic_long_fetch_andnot(mask, p); /* * mask contains the bits that really have been cleared. This * never includes any bits beyond the length of the memslot (if * the length is not aligned to 64 pages), therefore it is not * a problem if userspace sets them in log->dirty_bitmap. */ if (mask) { flush = true; kvm_arch_mmu_enable_log_dirty_pt_masked(kvm, memslot, offset, mask); } } KVM_MMU_UNLOCK(kvm); if (flush) kvm_flush_remote_tlbs_memslot(kvm, memslot); return 0; } static int kvm_vm_ioctl_clear_dirty_log(struct kvm *kvm, struct kvm_clear_dirty_log *log) { int r; mutex_lock(&kvm->slots_lock); r = kvm_clear_dirty_log_protect(kvm, log); mutex_unlock(&kvm->slots_lock); return r; } #endif /* CONFIG_KVM_GENERIC_DIRTYLOG_READ_PROTECT */ #ifdef CONFIG_KVM_GENERIC_MEMORY_ATTRIBUTES static u64 kvm_supported_mem_attributes(struct kvm *kvm) { if (!kvm || kvm_arch_has_private_mem(kvm)) return KVM_MEMORY_ATTRIBUTE_PRIVATE; return 0; } /* * Returns true if _all_ gfns in the range [@start, @end) have attributes * such that the bits in @mask match @attrs. */ bool kvm_range_has_memory_attributes(struct kvm *kvm, gfn_t start, gfn_t end, unsigned long mask, unsigned long attrs) { XA_STATE(xas, &kvm->mem_attr_array, start); unsigned long index; void *entry; mask &= kvm_supported_mem_attributes(kvm); if (attrs & ~mask) return false; if (end == start + 1) return (kvm_get_memory_attributes(kvm, start) & mask) == attrs; guard(rcu)(); if (!attrs) return !xas_find(&xas, end - 1); for (index = start; index < end; index++) { do { entry = xas_next(&xas); } while (xas_retry(&xas, entry)); if (xas.xa_index != index || (xa_to_value(entry) & mask) != attrs) return false; } return true; } static __always_inline void kvm_handle_gfn_range(struct kvm *kvm, struct kvm_mmu_notifier_range *range) { struct kvm_gfn_range gfn_range; struct kvm_memory_slot *slot; struct kvm_memslots *slots; struct kvm_memslot_iter iter; bool found_memslot = false; bool ret = false; int i; gfn_range.arg = range->arg; gfn_range.may_block = range->may_block; /* * If/when KVM supports more attributes beyond private .vs shared, this * _could_ set KVM_FILTER_{SHARED,PRIVATE} appropriately if the entire target * range already has the desired private vs. shared state (it's unclear * if that is a net win). For now, KVM reaches this point if and only * if the private flag is being toggled, i.e. all mappings are in play. */ for (i = 0; i < kvm_arch_nr_memslot_as_ids(kvm); i++) { slots = __kvm_memslots(kvm, i); kvm_for_each_memslot_in_gfn_range(&iter, slots, range->start, range->end) { slot = iter.slot; gfn_range.slot = slot; gfn_range.start = max(range->start, slot->base_gfn); gfn_range.end = min(range->end, slot->base_gfn + slot->npages); if (gfn_range.start >= gfn_range.end) continue; if (!found_memslot) { found_memslot = true; KVM_MMU_LOCK(kvm); if (!IS_KVM_NULL_FN(range->on_lock)) range->on_lock(kvm); } ret |= range->handler(kvm, &gfn_range); } } if (range->flush_on_ret && ret) kvm_flush_remote_tlbs(kvm); if (found_memslot) KVM_MMU_UNLOCK(kvm); } static bool kvm_pre_set_memory_attributes(struct kvm *kvm, struct kvm_gfn_range *range) { /* * Unconditionally add the range to the invalidation set, regardless of * whether or not the arch callback actually needs to zap SPTEs. E.g. * if KVM supports RWX attributes in the future and the attributes are * going from R=>RW, zapping isn't strictly necessary. Unconditionally * adding the range allows KVM to require that MMU invalidations add at * least one range between begin() and end(), e.g. allows KVM to detect * bugs where the add() is missed. Relaxing the rule *might* be safe, * but it's not obvious that allowing new mappings while the attributes * are in flux is desirable or worth the complexity. */ kvm_mmu_invalidate_range_add(kvm, range->start, range->end); return kvm_arch_pre_set_memory_attributes(kvm, range); } /* Set @attributes for the gfn range [@start, @end). */ static int kvm_vm_set_mem_attributes(struct kvm *kvm, gfn_t start, gfn_t end, unsigned long attributes) { struct kvm_mmu_notifier_range pre_set_range = { .start = start, .end = end, .arg.attributes = attributes, .handler = kvm_pre_set_memory_attributes, .on_lock = kvm_mmu_invalidate_begin, .flush_on_ret = true, .may_block = true, }; struct kvm_mmu_notifier_range post_set_range = { .start = start, .end = end, .arg.attributes = attributes, .handler = kvm_arch_post_set_memory_attributes, .on_lock = kvm_mmu_invalidate_end, .may_block = true, }; unsigned long i; void *entry; int r = 0; entry = attributes ? xa_mk_value(attributes) : NULL; mutex_lock(&kvm->slots_lock); /* Nothing to do if the entire range as the desired attributes. */ if (kvm_range_has_memory_attributes(kvm, start, end, ~0, attributes)) goto out_unlock; /* * Reserve memory ahead of time to avoid having to deal with failures * partway through setting the new attributes. */ for (i = start; i < end; i++) { r = xa_reserve(&kvm->mem_attr_array, i, GFP_KERNEL_ACCOUNT); if (r) goto out_unlock; } kvm_handle_gfn_range(kvm, &pre_set_range); for (i = start; i < end; i++) { r = xa_err(xa_store(&kvm->mem_attr_array, i, entry, GFP_KERNEL_ACCOUNT)); KVM_BUG_ON(r, kvm); } kvm_handle_gfn_range(kvm, &post_set_range); out_unlock: mutex_unlock(&kvm->slots_lock); return r; } static int kvm_vm_ioctl_set_mem_attributes(struct kvm *kvm, struct kvm_memory_attributes *attrs) { gfn_t start, end; /* flags is currently not used. */ if (attrs->flags) return -EINVAL; if (attrs->attributes & ~kvm_supported_mem_attributes(kvm)) return -EINVAL; if (attrs->size == 0 || attrs->address + attrs->size < attrs->address) return -EINVAL; if (!PAGE_ALIGNED(attrs->address) || !PAGE_ALIGNED(attrs->size)) return -EINVAL; start = attrs->address >> PAGE_SHIFT; end = (attrs->address + attrs->size) >> PAGE_SHIFT; /* * xarray tracks data using "unsigned long", and as a result so does * KVM. For simplicity, supports generic attributes only on 64-bit * architectures. */ BUILD_BUG_ON(sizeof(attrs->attributes) != sizeof(unsigned long)); return kvm_vm_set_mem_attributes(kvm, start, end, attrs->attributes); } #endif /* CONFIG_KVM_GENERIC_MEMORY_ATTRIBUTES */ struct kvm_memory_slot *gfn_to_memslot(struct kvm *kvm, gfn_t gfn) { return __gfn_to_memslot(kvm_memslots(kvm), gfn); } EXPORT_SYMBOL_GPL(gfn_to_memslot); struct kvm_memory_slot *kvm_vcpu_gfn_to_memslot(struct kvm_vcpu *vcpu, gfn_t gfn) { struct kvm_memslots *slots = kvm_vcpu_memslots(vcpu); u64 gen = slots->generation; struct kvm_memory_slot *slot; /* * This also protects against using a memslot from a different address space, * since different address spaces have different generation numbers. */ if (unlikely(gen != vcpu->last_used_slot_gen)) { vcpu->last_used_slot = NULL; vcpu->last_used_slot_gen = gen; } slot = try_get_memslot(vcpu->last_used_slot, gfn); if (slot) return slot; /* * Fall back to searching all memslots. We purposely use * search_memslots() instead of __gfn_to_memslot() to avoid * thrashing the VM-wide last_used_slot in kvm_memslots. */ slot = search_memslots(slots, gfn, false); if (slot) { vcpu->last_used_slot = slot; return slot; } return NULL; } bool kvm_is_visible_gfn(struct kvm *kvm, gfn_t gfn) { struct kvm_memory_slot *memslot = gfn_to_memslot(kvm, gfn); return kvm_is_visible_memslot(memslot); } EXPORT_SYMBOL_GPL(kvm_is_visible_gfn); bool kvm_vcpu_is_visible_gfn(struct kvm_vcpu *vcpu, gfn_t gfn) { struct kvm_memory_slot *memslot = kvm_vcpu_gfn_to_memslot(vcpu, gfn); return kvm_is_visible_memslot(memslot); } EXPORT_SYMBOL_GPL(kvm_vcpu_is_visible_gfn); unsigned long kvm_host_page_size(struct kvm_vcpu *vcpu, gfn_t gfn) { struct vm_area_struct *vma; unsigned long addr, size; size = PAGE_SIZE; addr = kvm_vcpu_gfn_to_hva_prot(vcpu, gfn, NULL); if (kvm_is_error_hva(addr)) return PAGE_SIZE; mmap_read_lock(current->mm); vma = find_vma(current->mm, addr); if (!vma) goto out; size = vma_kernel_pagesize(vma); out: mmap_read_unlock(current->mm); return size; } static bool memslot_is_readonly(const struct kvm_memory_slot *slot) { return slot->flags & KVM_MEM_READONLY; } static unsigned long __gfn_to_hva_many(const struct kvm_memory_slot *slot, gfn_t gfn, gfn_t *nr_pages, bool write) { if (!slot || slot->flags & KVM_MEMSLOT_INVALID) return KVM_HVA_ERR_BAD; if (memslot_is_readonly(slot) && write) return KVM_HVA_ERR_RO_BAD; if (nr_pages) *nr_pages = slot->npages - (gfn - slot->base_gfn); return __gfn_to_hva_memslot(slot, gfn); } static unsigned long gfn_to_hva_many(struct kvm_memory_slot *slot, gfn_t gfn, gfn_t *nr_pages) { return __gfn_to_hva_many(slot, gfn, nr_pages, true); } unsigned long gfn_to_hva_memslot(struct kvm_memory_slot *slot, gfn_t gfn) { return gfn_to_hva_many(slot, gfn, NULL); } EXPORT_SYMBOL_GPL(gfn_to_hva_memslot); unsigned long gfn_to_hva(struct kvm *kvm, gfn_t gfn) { return gfn_to_hva_many(gfn_to_memslot(kvm, gfn), gfn, NULL); } EXPORT_SYMBOL_GPL(gfn_to_hva); unsigned long kvm_vcpu_gfn_to_hva(struct kvm_vcpu *vcpu, gfn_t gfn) { return gfn_to_hva_many(kvm_vcpu_gfn_to_memslot(vcpu, gfn), gfn, NULL); } EXPORT_SYMBOL_GPL(kvm_vcpu_gfn_to_hva); /* * Return the hva of a @gfn and the R/W attribute if possible. * * @slot: the kvm_memory_slot which contains @gfn * @gfn: the gfn to be translated * @writable: used to return the read/write attribute of the @slot if the hva * is valid and @writable is not NULL */ unsigned long gfn_to_hva_memslot_prot(struct kvm_memory_slot *slot, gfn_t gfn, bool *writable) { unsigned long hva = __gfn_to_hva_many(slot, gfn, NULL, false); if (!kvm_is_error_hva(hva) && writable) *writable = !memslot_is_readonly(slot); return hva; } unsigned long gfn_to_hva_prot(struct kvm *kvm, gfn_t gfn, bool *writable) { struct kvm_memory_slot *slot = gfn_to_memslot(kvm, gfn); return gfn_to_hva_memslot_prot(slot, gfn, writable); } unsigned long kvm_vcpu_gfn_to_hva_prot(struct kvm_vcpu *vcpu, gfn_t gfn, bool *writable) { struct kvm_memory_slot *slot = kvm_vcpu_gfn_to_memslot(vcpu, gfn); return gfn_to_hva_memslot_prot(slot, gfn, writable); } static bool kvm_is_ad_tracked_page(struct page *page) { /* * Per page-flags.h, pages tagged PG_reserved "should in general not be * touched (e.g. set dirty) except by its owner". */ return !PageReserved(page); } static void kvm_set_page_dirty(struct page *page) { if (kvm_is_ad_tracked_page(page)) SetPageDirty(page); } static void kvm_set_page_accessed(struct page *page) { if (kvm_is_ad_tracked_page(page)) mark_page_accessed(page); } void kvm_release_page_clean(struct page *page) { if (!page) return; kvm_set_page_accessed(page); put_page(page); } EXPORT_SYMBOL_GPL(kvm_release_page_clean); void kvm_release_page_dirty(struct page *page) { if (!page) return; kvm_set_page_dirty(page); kvm_release_page_clean(page); } EXPORT_SYMBOL_GPL(kvm_release_page_dirty); static kvm_pfn_t kvm_resolve_pfn(struct kvm_follow_pfn *kfp, struct page *page, struct follow_pfnmap_args *map, bool writable) { kvm_pfn_t pfn; WARN_ON_ONCE(!!page == !!map); if (kfp->map_writable) *kfp->map_writable = writable; if (map) pfn = map->pfn; else pfn = page_to_pfn(page); *kfp->refcounted_page = page; return pfn; } /* * The fast path to get the writable pfn which will be stored in @pfn, * true indicates success, otherwise false is returned. */ static bool hva_to_pfn_fast(struct kvm_follow_pfn *kfp, kvm_pfn_t *pfn) { struct page *page; bool r; /* * Try the fast-only path when the caller wants to pin/get the page for * writing. If the caller only wants to read the page, KVM must go * down the full, slow path in order to avoid racing an operation that * breaks Copy-on-Write (CoW), e.g. so that KVM doesn't end up pointing * at the old, read-only page while mm/ points at a new, writable page. */ if (!((kfp->flags & FOLL_WRITE) || kfp->map_writable)) return false; if (kfp->pin) r = pin_user_pages_fast(kfp->hva, 1, FOLL_WRITE, &page) == 1; else r = get_user_page_fast_only(kfp->hva, FOLL_WRITE, &page); if (r) { *pfn = kvm_resolve_pfn(kfp, page, NULL, true); return true; } return false; } /* * The slow path to get the pfn of the specified host virtual address, * 1 indicates success, -errno is returned if error is detected. */ static int hva_to_pfn_slow(struct kvm_follow_pfn *kfp, kvm_pfn_t *pfn) { /* * When a VCPU accesses a page that is not mapped into the secondary * MMU, we lookup the page using GUP to map it, so the guest VCPU can * make progress. We always want to honor NUMA hinting faults in that * case, because GUP usage corresponds to memory accesses from the VCPU. * Otherwise, we'd not trigger NUMA hinting faults once a page is * mapped into the secondary MMU and gets accessed by a VCPU. * * Note that get_user_page_fast_only() and FOLL_WRITE for now * implicitly honor NUMA hinting faults and don't need this flag. */ unsigned int flags = FOLL_HWPOISON | FOLL_HONOR_NUMA_FAULT | kfp->flags; struct page *page, *wpage; int npages; if (kfp->pin) npages = pin_user_pages_unlocked(kfp->hva, 1, &page, flags); else npages = get_user_pages_unlocked(kfp->hva, 1, &page, flags); if (npages != 1) return npages; /* * Pinning is mutually exclusive with opportunistically mapping a read * fault as writable, as KVM should never pin pages when mapping memory * into the guest (pinning is only for direct accesses from KVM). */ if (WARN_ON_ONCE(kfp->map_writable && kfp->pin)) goto out; /* map read fault as writable if possible */ if (!(flags & FOLL_WRITE) && kfp->map_writable && get_user_page_fast_only(kfp->hva, FOLL_WRITE, &wpage)) { put_page(page); page = wpage; flags |= FOLL_WRITE; } out: *pfn = kvm_resolve_pfn(kfp, page, NULL, flags & FOLL_WRITE); return npages; } static bool vma_is_valid(struct vm_area_struct *vma, bool write_fault) { if (unlikely(!(vma->vm_flags & VM_READ))) return false; if (write_fault && (unlikely(!(vma->vm_flags & VM_WRITE)))) return false; return true; } static int hva_to_pfn_remapped(struct vm_area_struct *vma, struct kvm_follow_pfn *kfp, kvm_pfn_t *p_pfn) { struct follow_pfnmap_args args = { .vma = vma, .address = kfp->hva }; bool write_fault = kfp->flags & FOLL_WRITE; int r; /* * Remapped memory cannot be pinned in any meaningful sense. Bail if * the caller wants to pin the page, i.e. access the page outside of * MMU notifier protection, and unsafe umappings are disallowed. */ if (kfp->pin && !allow_unsafe_mappings) return -EINVAL; r = follow_pfnmap_start(&args); if (r) { /* * get_user_pages fails for VM_IO and VM_PFNMAP vmas and does * not call the fault handler, so do it here. */ bool unlocked = false; r = fixup_user_fault(current->mm, kfp->hva, (write_fault ? FAULT_FLAG_WRITE : 0), &unlocked); if (unlocked) return -EAGAIN; if (r) return r; r = follow_pfnmap_start(&args); if (r) return r; } if (write_fault && !args.writable) { *p_pfn = KVM_PFN_ERR_RO_FAULT; goto out; } *p_pfn = kvm_resolve_pfn(kfp, NULL, &args, args.writable); out: follow_pfnmap_end(&args); return r; } kvm_pfn_t hva_to_pfn(struct kvm_follow_pfn *kfp) { struct vm_area_struct *vma; kvm_pfn_t pfn; int npages, r; might_sleep(); if (WARN_ON_ONCE(!kfp->refcounted_page)) return KVM_PFN_ERR_FAULT; if (hva_to_pfn_fast(kfp, &pfn)) return pfn; npages = hva_to_pfn_slow(kfp, &pfn); if (npages == 1) return pfn; if (npages == -EINTR || npages == -EAGAIN) return KVM_PFN_ERR_SIGPENDING; if (npages == -EHWPOISON) return KVM_PFN_ERR_HWPOISON; mmap_read_lock(current->mm); retry: vma = vma_lookup(current->mm, kfp->hva); if (vma == NULL) pfn = KVM_PFN_ERR_FAULT; else if (vma->vm_flags & (VM_IO | VM_PFNMAP)) { r = hva_to_pfn_remapped(vma, kfp, &pfn); if (r == -EAGAIN) goto retry; if (r < 0) pfn = KVM_PFN_ERR_FAULT; } else { if ((kfp->flags & FOLL_NOWAIT) && vma_is_valid(vma, kfp->flags & FOLL_WRITE)) pfn = KVM_PFN_ERR_NEEDS_IO; else pfn = KVM_PFN_ERR_FAULT; } mmap_read_unlock(current->mm); return pfn; } static kvm_pfn_t kvm_follow_pfn(struct kvm_follow_pfn *kfp) { kfp->hva = __gfn_to_hva_many(kfp->slot, kfp->gfn, NULL, kfp->flags & FOLL_WRITE); if (kfp->hva == KVM_HVA_ERR_RO_BAD) return KVM_PFN_ERR_RO_FAULT; if (kvm_is_error_hva(kfp->hva)) return KVM_PFN_NOSLOT; if (memslot_is_readonly(kfp->slot) && kfp->map_writable) { *kfp->map_writable = false; kfp->map_writable = NULL; } return hva_to_pfn(kfp); } kvm_pfn_t __kvm_faultin_pfn(const struct kvm_memory_slot *slot, gfn_t gfn, unsigned int foll, bool *writable, struct page **refcounted_page) { struct kvm_follow_pfn kfp = { .slot = slot, .gfn = gfn, .flags = foll, .map_writable = writable, .refcounted_page = refcounted_page, }; if (WARN_ON_ONCE(!writable || !refcounted_page)) return KVM_PFN_ERR_FAULT; *writable = false; *refcounted_page = NULL; return kvm_follow_pfn(&kfp); } EXPORT_SYMBOL_GPL(__kvm_faultin_pfn); int kvm_prefetch_pages(struct kvm_memory_slot *slot, gfn_t gfn, struct page **pages, int nr_pages) { unsigned long addr; gfn_t entry = 0; addr = gfn_to_hva_many(slot, gfn, &entry); if (kvm_is_error_hva(addr)) return -1; if (entry < nr_pages) return 0; return get_user_pages_fast_only(addr, nr_pages, FOLL_WRITE, pages); } EXPORT_SYMBOL_GPL(kvm_prefetch_pages); /* * Don't use this API unless you are absolutely, positively certain that KVM * needs to get a struct page, e.g. to pin the page for firmware DMA. * * FIXME: Users of this API likely need to FOLL_PIN the page, not just elevate * its refcount. */ struct page *__gfn_to_page(struct kvm *kvm, gfn_t gfn, bool write) { struct page *refcounted_page = NULL; struct kvm_follow_pfn kfp = { .slot = gfn_to_memslot(kvm, gfn), .gfn = gfn, .flags = write ? FOLL_WRITE : 0, .refcounted_page = &refcounted_page, }; (void)kvm_follow_pfn(&kfp); return refcounted_page; } EXPORT_SYMBOL_GPL(__gfn_to_page); int __kvm_vcpu_map(struct kvm_vcpu *vcpu, gfn_t gfn, struct kvm_host_map *map, bool writable) { struct kvm_follow_pfn kfp = { .slot = gfn_to_memslot(vcpu->kvm, gfn), .gfn = gfn, .flags = writable ? FOLL_WRITE : 0, .refcounted_page = &map->pinned_page, .pin = true, }; map->pinned_page = NULL; map->page = NULL; map->hva = NULL; map->gfn = gfn; map->writable = writable; map->pfn = kvm_follow_pfn(&kfp); if (is_error_noslot_pfn(map->pfn)) return -EINVAL; if (pfn_valid(map->pfn)) { map->page = pfn_to_page(map->pfn); map->hva = kmap(map->page); #ifdef CONFIG_HAS_IOMEM } else { map->hva = memremap(pfn_to_hpa(map->pfn), PAGE_SIZE, MEMREMAP_WB); #endif } return map->hva ? 0 : -EFAULT; } EXPORT_SYMBOL_GPL(__kvm_vcpu_map); void kvm_vcpu_unmap(struct kvm_vcpu *vcpu, struct kvm_host_map *map) { if (!map->hva) return; if (map->page) kunmap(map->page); #ifdef CONFIG_HAS_IOMEM else memunmap(map->hva); #endif if (map->writable) kvm_vcpu_mark_page_dirty(vcpu, map->gfn); if (map->pinned_page) { if (map->writable) kvm_set_page_dirty(map->pinned_page); kvm_set_page_accessed(map->pinned_page); unpin_user_page(map->pinned_page); } map->hva = NULL; map->page = NULL; map->pinned_page = NULL; } EXPORT_SYMBOL_GPL(kvm_vcpu_unmap); static int next_segment(unsigned long len, int offset) { if (len > PAGE_SIZE - offset) return PAGE_SIZE - offset; else return len; } /* Copy @len bytes from guest memory at '(@gfn * PAGE_SIZE) + @offset' to @data */ static int __kvm_read_guest_page(struct kvm_memory_slot *slot, gfn_t gfn, void *data, int offset, int len) { int r; unsigned long addr; if (WARN_ON_ONCE(offset + len > PAGE_SIZE)) return -EFAULT; addr = gfn_to_hva_memslot_prot(slot, gfn, NULL); if (kvm_is_error_hva(addr)) return -EFAULT; r = __copy_from_user(data, (void __user *)addr + offset, len); if (r) return -EFAULT; return 0; } int kvm_read_guest_page(struct kvm *kvm, gfn_t gfn, void *data, int offset, int len) { struct kvm_memory_slot *slot = gfn_to_memslot(kvm, gfn); return __kvm_read_guest_page(slot, gfn, data, offset, len); } EXPORT_SYMBOL_GPL(kvm_read_guest_page); int kvm_vcpu_read_guest_page(struct kvm_vcpu *vcpu, gfn_t gfn, void *data, int offset, int len) { struct kvm_memory_slot *slot = kvm_vcpu_gfn_to_memslot(vcpu, gfn); return __kvm_read_guest_page(slot, gfn, data, offset, len); } EXPORT_SYMBOL_GPL(kvm_vcpu_read_guest_page); int kvm_read_guest(struct kvm *kvm, gpa_t gpa, void *data, unsigned long len) { gfn_t gfn = gpa >> PAGE_SHIFT; int seg; int offset = offset_in_page(gpa); int ret; while ((seg = next_segment(len, offset)) != 0) { ret = kvm_read_guest_page(kvm, gfn, data, offset, seg); if (ret < 0) return ret; offset = 0; len -= seg; data += seg; ++gfn; } return 0; } EXPORT_SYMBOL_GPL(kvm_read_guest); int kvm_vcpu_read_guest(struct kvm_vcpu *vcpu, gpa_t gpa, void *data, unsigned long len) { gfn_t gfn = gpa >> PAGE_SHIFT; int seg; int offset = offset_in_page(gpa); int ret; while ((seg = next_segment(len, offset)) != 0) { ret = kvm_vcpu_read_guest_page(vcpu, gfn, data, offset, seg); if (ret < 0) return ret; offset = 0; len -= seg; data += seg; ++gfn; } return 0; } EXPORT_SYMBOL_GPL(kvm_vcpu_read_guest); static int __kvm_read_guest_atomic(struct kvm_memory_slot *slot, gfn_t gfn, void *data, int offset, unsigned long len) { int r; unsigned long addr; if (WARN_ON_ONCE(offset + len > PAGE_SIZE)) return -EFAULT; addr = gfn_to_hva_memslot_prot(slot, gfn, NULL); if (kvm_is_error_hva(addr)) return -EFAULT; pagefault_disable(); r = __copy_from_user_inatomic(data, (void __user *)addr + offset, len); pagefault_enable(); if (r) return -EFAULT; return 0; } int kvm_vcpu_read_guest_atomic(struct kvm_vcpu *vcpu, gpa_t gpa, void *data, unsigned long len) { gfn_t gfn = gpa >> PAGE_SHIFT; struct kvm_memory_slot *slot = kvm_vcpu_gfn_to_memslot(vcpu, gfn); int offset = offset_in_page(gpa); return __kvm_read_guest_atomic(slot, gfn, data, offset, len); } EXPORT_SYMBOL_GPL(kvm_vcpu_read_guest_atomic); /* Copy @len bytes from @data into guest memory at '(@gfn * PAGE_SIZE) + @offset' */ static int __kvm_write_guest_page(struct kvm *kvm, struct kvm_memory_slot *memslot, gfn_t gfn, const void *data, int offset, int len) { int r; unsigned long addr; if (WARN_ON_ONCE(offset + len > PAGE_SIZE)) return -EFAULT; addr = gfn_to_hva_memslot(memslot, gfn); if (kvm_is_error_hva(addr)) return -EFAULT; r = __copy_to_user((void __user *)addr + offset, data, len); if (r) return -EFAULT; mark_page_dirty_in_slot(kvm, memslot, gfn); return 0; } int kvm_write_guest_page(struct kvm *kvm, gfn_t gfn, const void *data, int offset, int len) { struct kvm_memory_slot *slot = gfn_to_memslot(kvm, gfn); return __kvm_write_guest_page(kvm, slot, gfn, data, offset, len); } EXPORT_SYMBOL_GPL(kvm_write_guest_page); int kvm_vcpu_write_guest_page(struct kvm_vcpu *vcpu, gfn_t gfn, const void *data, int offset, int len) { struct kvm_memory_slot *slot = kvm_vcpu_gfn_to_memslot(vcpu, gfn); return __kvm_write_guest_page(vcpu->kvm, slot, gfn, data, offset, len); } EXPORT_SYMBOL_GPL(kvm_vcpu_write_guest_page); int kvm_write_guest(struct kvm *kvm, gpa_t gpa, const void *data, unsigned long len) { gfn_t gfn = gpa >> PAGE_SHIFT; int seg; int offset = offset_in_page(gpa); int ret; while ((seg = next_segment(len, offset)) != 0) { ret = kvm_write_guest_page(kvm, gfn, data, offset, seg); if (ret < 0) return ret; offset = 0; len -= seg; data += seg; ++gfn; } return 0; } EXPORT_SYMBOL_GPL(kvm_write_guest); int kvm_vcpu_write_guest(struct kvm_vcpu *vcpu, gpa_t gpa, const void *data, unsigned long len) { gfn_t gfn = gpa >> PAGE_SHIFT; int seg; int offset = offset_in_page(gpa); int ret; while ((seg = next_segment(len, offset)) != 0) { ret = kvm_vcpu_write_guest_page(vcpu, gfn, data, offset, seg); if (ret < 0) return ret; offset = 0; len -= seg; data += seg; ++gfn; } return 0; } EXPORT_SYMBOL_GPL(kvm_vcpu_write_guest); static int __kvm_gfn_to_hva_cache_init(struct kvm_memslots *slots, struct gfn_to_hva_cache *ghc, gpa_t gpa, unsigned long len) { int offset = offset_in_page(gpa); gfn_t start_gfn = gpa >> PAGE_SHIFT; gfn_t end_gfn = (gpa + len - 1) >> PAGE_SHIFT; gfn_t nr_pages_needed = end_gfn - start_gfn + 1; gfn_t nr_pages_avail; /* Update ghc->generation before performing any error checks. */ ghc->generation = slots->generation; if (start_gfn > end_gfn) { ghc->hva = KVM_HVA_ERR_BAD; return -EINVAL; } /* * If the requested region crosses two memslots, we still * verify that the entire region is valid here. */ for ( ; start_gfn <= end_gfn; start_gfn += nr_pages_avail) { ghc->memslot = __gfn_to_memslot(slots, start_gfn); ghc->hva = gfn_to_hva_many(ghc->memslot, start_gfn, &nr_pages_avail); if (kvm_is_error_hva(ghc->hva)) return -EFAULT; } /* Use the slow path for cross page reads and writes. */ if (nr_pages_needed == 1) ghc->hva += offset; else ghc->memslot = NULL; ghc->gpa = gpa; ghc->len = len; return 0; } int kvm_gfn_to_hva_cache_init(struct kvm *kvm, struct gfn_to_hva_cache *ghc, gpa_t gpa, unsigned long len) { struct kvm_memslots *slots = kvm_memslots(kvm); return __kvm_gfn_to_hva_cache_init(slots, ghc, gpa, len); } EXPORT_SYMBOL_GPL(kvm_gfn_to_hva_cache_init); int kvm_write_guest_offset_cached(struct kvm *kvm, struct gfn_to_hva_cache *ghc, void *data, unsigned int offset, unsigned long len) { struct kvm_memslots *slots = kvm_memslots(kvm); int r; gpa_t gpa = ghc->gpa + offset; if (WARN_ON_ONCE(len + offset > ghc->len)) return -EINVAL; if (slots->generation != ghc->generation) { if (__kvm_gfn_to_hva_cache_init(slots, ghc, ghc->gpa, ghc->len)) return -EFAULT; } if (kvm_is_error_hva(ghc->hva)) return -EFAULT; if (unlikely(!ghc->memslot)) return kvm_write_guest(kvm, gpa, data, len); r = __copy_to_user((void __user *)ghc->hva + offset, data, len); if (r) return -EFAULT; mark_page_dirty_in_slot(kvm, ghc->memslot, gpa >> PAGE_SHIFT); return 0; } EXPORT_SYMBOL_GPL(kvm_write_guest_offset_cached); int kvm_write_guest_cached(struct kvm *kvm, struct gfn_to_hva_cache *ghc, void *data, unsigned long len) { return kvm_write_guest_offset_cached(kvm, ghc, data, 0, len); } EXPORT_SYMBOL_GPL(kvm_write_guest_cached); int kvm_read_guest_offset_cached(struct kvm *kvm, struct gfn_to_hva_cache *ghc, void *data, unsigned int offset, unsigned long len) { struct kvm_memslots *slots = kvm_memslots(kvm); int r; gpa_t gpa = ghc->gpa + offset; if (WARN_ON_ONCE(len + offset > ghc->len)) return -EINVAL; if (slots->generation != ghc->generation) { if (__kvm_gfn_to_hva_cache_init(slots, ghc, ghc->gpa, ghc->len)) return -EFAULT; } if (kvm_is_error_hva(ghc->hva)) return -EFAULT; if (unlikely(!ghc->memslot)) return kvm_read_guest(kvm, gpa, data, len); r = __copy_from_user(data, (void __user *)ghc->hva + offset, len); if (r) return -EFAULT; return 0; } EXPORT_SYMBOL_GPL(kvm_read_guest_offset_cached); int kvm_read_guest_cached(struct kvm *kvm, struct gfn_to_hva_cache *ghc, void *data, unsigned long len) { return kvm_read_guest_offset_cached(kvm, ghc, data, 0, len); } EXPORT_SYMBOL_GPL(kvm_read_guest_cached); int kvm_clear_guest(struct kvm *kvm, gpa_t gpa, unsigned long len) { const void *zero_page = (const void *) __va(page_to_phys(ZERO_PAGE(0))); gfn_t gfn = gpa >> PAGE_SHIFT; int seg; int offset = offset_in_page(gpa); int ret; while ((seg = next_segment(len, offset)) != 0) { ret = kvm_write_guest_page(kvm, gfn, zero_page, offset, seg); if (ret < 0) return ret; offset = 0; len -= seg; ++gfn; } return 0; } EXPORT_SYMBOL_GPL(kvm_clear_guest); void mark_page_dirty_in_slot(struct kvm *kvm, const struct kvm_memory_slot *memslot, gfn_t gfn) { struct kvm_vcpu *vcpu = kvm_get_running_vcpu(); #ifdef CONFIG_HAVE_KVM_DIRTY_RING if (WARN_ON_ONCE(vcpu && vcpu->kvm != kvm)) return; WARN_ON_ONCE(!vcpu && !kvm_arch_allow_write_without_running_vcpu(kvm)); #endif if (memslot && kvm_slot_dirty_track_enabled(memslot)) { unsigned long rel_gfn = gfn - memslot->base_gfn; u32 slot = (memslot->as_id << 16) | memslot->id; if (kvm->dirty_ring_size && vcpu) kvm_dirty_ring_push(vcpu, slot, rel_gfn); else if (memslot->dirty_bitmap) set_bit_le(rel_gfn, memslot->dirty_bitmap); } } EXPORT_SYMBOL_GPL(mark_page_dirty_in_slot); void mark_page_dirty(struct kvm *kvm, gfn_t gfn) { struct kvm_memory_slot *memslot; memslot = gfn_to_memslot(kvm, gfn); mark_page_dirty_in_slot(kvm, memslot, gfn); } EXPORT_SYMBOL_GPL(mark_page_dirty); void kvm_vcpu_mark_page_dirty(struct kvm_vcpu *vcpu, gfn_t gfn) { struct kvm_memory_slot *memslot; memslot = kvm_vcpu_gfn_to_memslot(vcpu, gfn); mark_page_dirty_in_slot(vcpu->kvm, memslot, gfn); } EXPORT_SYMBOL_GPL(kvm_vcpu_mark_page_dirty); void kvm_sigset_activate(struct kvm_vcpu *vcpu) { if (!vcpu->sigset_active) return; /* * This does a lockless modification of ->real_blocked, which is fine * because, only current can change ->real_blocked and all readers of * ->real_blocked don't care as long ->real_blocked is always a subset * of ->blocked. */ sigprocmask(SIG_SETMASK, &vcpu->sigset, &current->real_blocked); } void kvm_sigset_deactivate(struct kvm_vcpu *vcpu) { if (!vcpu->sigset_active) return; sigprocmask(SIG_SETMASK, &current->real_blocked, NULL); sigemptyset(&current->real_blocked); } static void grow_halt_poll_ns(struct kvm_vcpu *vcpu) { unsigned int old, val, grow, grow_start; old = val = vcpu->halt_poll_ns; grow_start = READ_ONCE(halt_poll_ns_grow_start); grow = READ_ONCE(halt_poll_ns_grow); if (!grow) goto out; val *= grow; if (val < grow_start) val = grow_start; vcpu->halt_poll_ns = val; out: trace_kvm_halt_poll_ns_grow(vcpu->vcpu_id, val, old); } static void shrink_halt_poll_ns(struct kvm_vcpu *vcpu) { unsigned int old, val, shrink, grow_start; old = val = vcpu->halt_poll_ns; shrink = READ_ONCE(halt_poll_ns_shrink); grow_start = READ_ONCE(halt_poll_ns_grow_start); if (shrink == 0) val = 0; else val /= shrink; if (val < grow_start) val = 0; vcpu->halt_poll_ns = val; trace_kvm_halt_poll_ns_shrink(vcpu->vcpu_id, val, old); } static int kvm_vcpu_check_block(struct kvm_vcpu *vcpu) { int ret = -EINTR; int idx = srcu_read_lock(&vcpu->kvm->srcu); if (kvm_arch_vcpu_runnable(vcpu)) goto out; if (kvm_cpu_has_pending_timer(vcpu)) goto out; if (signal_pending(current)) goto out; if (kvm_check_request(KVM_REQ_UNBLOCK, vcpu)) goto out; ret = 0; out: srcu_read_unlock(&vcpu->kvm->srcu, idx); return ret; } /* * Block the vCPU until the vCPU is runnable, an event arrives, or a signal is * pending. This is mostly used when halting a vCPU, but may also be used * directly for other vCPU non-runnable states, e.g. x86's Wait-For-SIPI. */ bool kvm_vcpu_block(struct kvm_vcpu *vcpu) { struct rcuwait *wait = kvm_arch_vcpu_get_wait(vcpu); bool waited = false; vcpu->stat.generic.blocking = 1; preempt_disable(); kvm_arch_vcpu_blocking(vcpu); prepare_to_rcuwait(wait); preempt_enable(); for (;;) { set_current_state(TASK_INTERRUPTIBLE); if (kvm_vcpu_check_block(vcpu) < 0) break; waited = true; schedule(); } preempt_disable(); finish_rcuwait(wait); kvm_arch_vcpu_unblocking(vcpu); preempt_enable(); vcpu->stat.generic.blocking = 0; return waited; } static inline void update_halt_poll_stats(struct kvm_vcpu *vcpu, ktime_t start, ktime_t end, bool success) { struct kvm_vcpu_stat_generic *stats = &vcpu->stat.generic; u64 poll_ns = ktime_to_ns(ktime_sub(end, start)); ++vcpu->stat.generic.halt_attempted_poll; if (success) { ++vcpu->stat.generic.halt_successful_poll; if (!vcpu_valid_wakeup(vcpu)) ++vcpu->stat.generic.halt_poll_invalid; stats->halt_poll_success_ns += poll_ns; KVM_STATS_LOG_HIST_UPDATE(stats->halt_poll_success_hist, poll_ns); } else { stats->halt_poll_fail_ns += poll_ns; KVM_STATS_LOG_HIST_UPDATE(stats->halt_poll_fail_hist, poll_ns); } } static unsigned int kvm_vcpu_max_halt_poll_ns(struct kvm_vcpu *vcpu) { struct kvm *kvm = vcpu->kvm; if (kvm->override_halt_poll_ns) { /* * Ensure kvm->max_halt_poll_ns is not read before * kvm->override_halt_poll_ns. * * Pairs with the smp_wmb() when enabling KVM_CAP_HALT_POLL. */ smp_rmb(); return READ_ONCE(kvm->max_halt_poll_ns); } return READ_ONCE(halt_poll_ns); } /* * Emulate a vCPU halt condition, e.g. HLT on x86, WFI on arm, etc... If halt * polling is enabled, busy wait for a short time before blocking to avoid the * expensive block+unblock sequence if a wake event arrives soon after the vCPU * is halted. */ void kvm_vcpu_halt(struct kvm_vcpu *vcpu) { unsigned int max_halt_poll_ns = kvm_vcpu_max_halt_poll_ns(vcpu); bool halt_poll_allowed = !kvm_arch_no_poll(vcpu); ktime_t start, cur, poll_end; bool waited = false; bool do_halt_poll; u64 halt_ns; if (vcpu->halt_poll_ns > max_halt_poll_ns) vcpu->halt_poll_ns = max_halt_poll_ns; do_halt_poll = halt_poll_allowed && vcpu->halt_poll_ns; start = cur = poll_end = ktime_get(); if (do_halt_poll) { ktime_t stop = ktime_add_ns(start, vcpu->halt_poll_ns); do { if (kvm_vcpu_check_block(vcpu) < 0) goto out; cpu_relax(); poll_end = cur = ktime_get(); } while (kvm_vcpu_can_poll(cur, stop)); } waited = kvm_vcpu_block(vcpu); cur = ktime_get(); if (waited) { vcpu->stat.generic.halt_wait_ns += ktime_to_ns(cur) - ktime_to_ns(poll_end); KVM_STATS_LOG_HIST_UPDATE(vcpu->stat.generic.halt_wait_hist, ktime_to_ns(cur) - ktime_to_ns(poll_end)); } out: /* The total time the vCPU was "halted", including polling time. */ halt_ns = ktime_to_ns(cur) - ktime_to_ns(start); /* * Note, halt-polling is considered successful so long as the vCPU was * never actually scheduled out, i.e. even if the wake event arrived * after of the halt-polling loop itself, but before the full wait. */ if (do_halt_poll) update_halt_poll_stats(vcpu, start, poll_end, !waited); if (halt_poll_allowed) { /* Recompute the max halt poll time in case it changed. */ max_halt_poll_ns = kvm_vcpu_max_halt_poll_ns(vcpu); if (!vcpu_valid_wakeup(vcpu)) { shrink_halt_poll_ns(vcpu); } else if (max_halt_poll_ns) { if (halt_ns <= vcpu->halt_poll_ns) ; /* we had a long block, shrink polling */ else if (vcpu->halt_poll_ns && halt_ns > max_halt_poll_ns) shrink_halt_poll_ns(vcpu); /* we had a short halt and our poll time is too small */ else if (vcpu->halt_poll_ns < max_halt_poll_ns && halt_ns < max_halt_poll_ns) grow_halt_poll_ns(vcpu); } else { vcpu->halt_poll_ns = 0; } } trace_kvm_vcpu_wakeup(halt_ns, waited, vcpu_valid_wakeup(vcpu)); } EXPORT_SYMBOL_GPL(kvm_vcpu_halt); bool kvm_vcpu_wake_up(struct kvm_vcpu *vcpu) { if (__kvm_vcpu_wake_up(vcpu)) { WRITE_ONCE(vcpu->ready, true); ++vcpu->stat.generic.halt_wakeup; return true; } return false; } EXPORT_SYMBOL_GPL(kvm_vcpu_wake_up); #ifndef CONFIG_S390 /* * Kick a sleeping VCPU, or a guest VCPU in guest mode, into host kernel mode. */ void __kvm_vcpu_kick(struct kvm_vcpu *vcpu, bool wait) { int me, cpu; if (kvm_vcpu_wake_up(vcpu)) return; me = get_cpu(); /* * The only state change done outside the vcpu mutex is IN_GUEST_MODE * to EXITING_GUEST_MODE. Therefore the moderately expensive "should * kick" check does not need atomic operations if kvm_vcpu_kick is used * within the vCPU thread itself. */ if (vcpu == __this_cpu_read(kvm_running_vcpu)) { if (vcpu->mode == IN_GUEST_MODE) WRITE_ONCE(vcpu->mode, EXITING_GUEST_MODE); goto out; } /* * Note, the vCPU could get migrated to a different pCPU at any point * after kvm_arch_vcpu_should_kick(), which could result in sending an * IPI to the previous pCPU. But, that's ok because the purpose of the * IPI is to force the vCPU to leave IN_GUEST_MODE, and migrating the * vCPU also requires it to leave IN_GUEST_MODE. */ if (kvm_arch_vcpu_should_kick(vcpu)) { cpu = READ_ONCE(vcpu->cpu); if (cpu != me && (unsigned int)cpu < nr_cpu_ids && cpu_online(cpu)) { /* * Use a reschedule IPI to kick the vCPU if the caller * doesn't need to wait for a response, as KVM allows * kicking vCPUs while IRQs are disabled, but using the * SMP function call framework with IRQs disabled can * deadlock due to taking cross-CPU locks. */ if (wait) smp_call_function_single(cpu, ack_kick, NULL, wait); else smp_send_reschedule(cpu); } } out: put_cpu(); } EXPORT_SYMBOL_GPL(__kvm_vcpu_kick); #endif /* !CONFIG_S390 */ int kvm_vcpu_yield_to(struct kvm_vcpu *target) { struct task_struct *task = NULL; int ret; if (!read_trylock(&target->pid_lock)) return 0; if (target->pid) task = get_pid_task(target->pid, PIDTYPE_PID); read_unlock(&target->pid_lock); if (!task) return 0; ret = yield_to(task, 1); put_task_struct(task); return ret; } EXPORT_SYMBOL_GPL(kvm_vcpu_yield_to); /* * Helper that checks whether a VCPU is eligible for directed yield. * Most eligible candidate to yield is decided by following heuristics: * * (a) VCPU which has not done pl-exit or cpu relax intercepted recently * (preempted lock holder), indicated by @in_spin_loop. * Set at the beginning and cleared at the end of interception/PLE handler. * * (b) VCPU which has done pl-exit/ cpu relax intercepted but did not get * chance last time (mostly it has become eligible now since we have probably * yielded to lockholder in last iteration. This is done by toggling * @dy_eligible each time a VCPU checked for eligibility.) * * Yielding to a recently pl-exited/cpu relax intercepted VCPU before yielding * to preempted lock-holder could result in wrong VCPU selection and CPU * burning. Giving priority for a potential lock-holder increases lock * progress. * * Since algorithm is based on heuristics, accessing another VCPU data without * locking does not harm. It may result in trying to yield to same VCPU, fail * and continue with next VCPU and so on. */ static bool kvm_vcpu_eligible_for_directed_yield(struct kvm_vcpu *vcpu) { #ifdef CONFIG_HAVE_KVM_CPU_RELAX_INTERCEPT bool eligible; eligible = !vcpu->spin_loop.in_spin_loop || vcpu->spin_loop.dy_eligible; if (vcpu->spin_loop.in_spin_loop) kvm_vcpu_set_dy_eligible(vcpu, !vcpu->spin_loop.dy_eligible); return eligible; #else return true; #endif } /* * Unlike kvm_arch_vcpu_runnable, this function is called outside * a vcpu_load/vcpu_put pair. However, for most architectures * kvm_arch_vcpu_runnable does not require vcpu_load. */ bool __weak kvm_arch_dy_runnable(struct kvm_vcpu *vcpu) { return kvm_arch_vcpu_runnable(vcpu); } static bool vcpu_dy_runnable(struct kvm_vcpu *vcpu) { if (kvm_arch_dy_runnable(vcpu)) return true; #ifdef CONFIG_KVM_ASYNC_PF if (!list_empty_careful(&vcpu->async_pf.done)) return true; #endif return false; } /* * By default, simply query the target vCPU's current mode when checking if a * vCPU was preempted in kernel mode. All architectures except x86 (or more * specifical, except VMX) allow querying whether or not a vCPU is in kernel * mode even if the vCPU is NOT loaded, i.e. using kvm_arch_vcpu_in_kernel() * directly for cross-vCPU checks is functionally correct and accurate. */ bool __weak kvm_arch_vcpu_preempted_in_kernel(struct kvm_vcpu *vcpu) { return kvm_arch_vcpu_in_kernel(vcpu); } bool __weak kvm_arch_dy_has_pending_interrupt(struct kvm_vcpu *vcpu) { return false; } void kvm_vcpu_on_spin(struct kvm_vcpu *me, bool yield_to_kernel_mode) { int nr_vcpus, start, i, idx, yielded; struct kvm *kvm = me->kvm; struct kvm_vcpu *vcpu; int try = 3; nr_vcpus = atomic_read(&kvm->online_vcpus); if (nr_vcpus < 2) return; /* Pairs with the smp_wmb() in kvm_vm_ioctl_create_vcpu(). */ smp_rmb(); kvm_vcpu_set_in_spin_loop(me, true); /* * The current vCPU ("me") is spinning in kernel mode, i.e. is likely * waiting for a resource to become available. Attempt to yield to a * vCPU that is runnable, but not currently running, e.g. because the * vCPU was preempted by a higher priority task. With luck, the vCPU * that was preempted is holding a lock or some other resource that the * current vCPU is waiting to acquire, and yielding to the other vCPU * will allow it to make forward progress and release the lock (or kick * the spinning vCPU, etc). * * Since KVM has no insight into what exactly the guest is doing, * approximate a round-robin selection by iterating over all vCPUs, * starting at the last boosted vCPU. I.e. if N=kvm->last_boosted_vcpu, * iterate over vCPU[N+1]..vCPU[N-1], wrapping as needed. * * Note, this is inherently racy, e.g. if multiple vCPUs are spinning, * they may all try to yield to the same vCPU(s). But as above, this * is all best effort due to KVM's lack of visibility into the guest. */ start = READ_ONCE(kvm->last_boosted_vcpu) + 1; for (i = 0; i < nr_vcpus; i++) { idx = (start + i) % nr_vcpus; if (idx == me->vcpu_idx) continue; vcpu = xa_load(&kvm->vcpu_array, idx); if (!READ_ONCE(vcpu->ready)) continue; if (kvm_vcpu_is_blocking(vcpu) && !vcpu_dy_runnable(vcpu)) continue; /* * Treat the target vCPU as being in-kernel if it has a pending * interrupt, as the vCPU trying to yield may be spinning * waiting on IPI delivery, i.e. the target vCPU is in-kernel * for the purposes of directed yield. */ if (READ_ONCE(vcpu->preempted) && yield_to_kernel_mode && !kvm_arch_dy_has_pending_interrupt(vcpu) && !kvm_arch_vcpu_preempted_in_kernel(vcpu)) continue; if (!kvm_vcpu_eligible_for_directed_yield(vcpu)) continue; yielded = kvm_vcpu_yield_to(vcpu); if (yielded > 0) { WRITE_ONCE(kvm->last_boosted_vcpu, i); break; } else if (yielded < 0 && !--try) { break; } } kvm_vcpu_set_in_spin_loop(me, false); /* Ensure vcpu is not eligible during next spinloop */ kvm_vcpu_set_dy_eligible(me, false); } EXPORT_SYMBOL_GPL(kvm_vcpu_on_spin); static bool kvm_page_in_dirty_ring(struct kvm *kvm, unsigned long pgoff) { #ifdef CONFIG_HAVE_KVM_DIRTY_RING return (pgoff >= KVM_DIRTY_LOG_PAGE_OFFSET) && (pgoff < KVM_DIRTY_LOG_PAGE_OFFSET + kvm->dirty_ring_size / PAGE_SIZE); #else return false; #endif } static vm_fault_t kvm_vcpu_fault(struct vm_fault *vmf) { struct kvm_vcpu *vcpu = vmf->vma->vm_file->private_data; struct page *page; if (vmf->pgoff == 0) page = virt_to_page(vcpu->run); #ifdef CONFIG_X86 else if (vmf->pgoff == KVM_PIO_PAGE_OFFSET) page = virt_to_page(vcpu->arch.pio_data); #endif #ifdef CONFIG_KVM_MMIO else if (vmf->pgoff == KVM_COALESCED_MMIO_PAGE_OFFSET) page = virt_to_page(vcpu->kvm->coalesced_mmio_ring); #endif else if (kvm_page_in_dirty_ring(vcpu->kvm, vmf->pgoff)) page = kvm_dirty_ring_get_page( &vcpu->dirty_ring, vmf->pgoff - KVM_DIRTY_LOG_PAGE_OFFSET); else return kvm_arch_vcpu_fault(vcpu, vmf); get_page(page); vmf->page = page; return 0; } static const struct vm_operations_struct kvm_vcpu_vm_ops = { .fault = kvm_vcpu_fault, }; static int kvm_vcpu_mmap(struct file *file, struct vm_area_struct *vma) { struct kvm_vcpu *vcpu = file->private_data; unsigned long pages = vma_pages(vma); if ((kvm_page_in_dirty_ring(vcpu->kvm, vma->vm_pgoff) || kvm_page_in_dirty_ring(vcpu->kvm, vma->vm_pgoff + pages - 1)) && ((vma->vm_flags & VM_EXEC) || !(vma->vm_flags & VM_SHARED))) return -EINVAL; vma->vm_ops = &kvm_vcpu_vm_ops; return 0; } static int kvm_vcpu_release(struct inode *inode, struct file *filp) { struct kvm_vcpu *vcpu = filp->private_data; kvm_put_kvm(vcpu->kvm); return 0; } static struct file_operations kvm_vcpu_fops = { .release = kvm_vcpu_release, .unlocked_ioctl = kvm_vcpu_ioctl, .mmap = kvm_vcpu_mmap, .llseek = noop_llseek, KVM_COMPAT(kvm_vcpu_compat_ioctl), }; /* * Allocates an inode for the vcpu. */ static int create_vcpu_fd(struct kvm_vcpu *vcpu) { char name[8 + 1 + ITOA_MAX_LEN + 1]; snprintf(name, sizeof(name), "kvm-vcpu:%d", vcpu->vcpu_id); return anon_inode_getfd(name, &kvm_vcpu_fops, vcpu, O_RDWR | O_CLOEXEC); } #ifdef __KVM_HAVE_ARCH_VCPU_DEBUGFS static int vcpu_get_pid(void *data, u64 *val) { struct kvm_vcpu *vcpu = data; read_lock(&vcpu->pid_lock); *val = pid_nr(vcpu->pid); read_unlock(&vcpu->pid_lock); return 0; } DEFINE_SIMPLE_ATTRIBUTE(vcpu_get_pid_fops, vcpu_get_pid, NULL, "%llu\n"); static void kvm_create_vcpu_debugfs(struct kvm_vcpu *vcpu) { struct dentry *debugfs_dentry; char dir_name[ITOA_MAX_LEN * 2]; if (!debugfs_initialized()) return; snprintf(dir_name, sizeof(dir_name), "vcpu%d", vcpu->vcpu_id); debugfs_dentry = debugfs_create_dir(dir_name, vcpu->kvm->debugfs_dentry); debugfs_create_file("pid", 0444, debugfs_dentry, vcpu, &vcpu_get_pid_fops); kvm_arch_create_vcpu_debugfs(vcpu, debugfs_dentry); } #endif /* * Creates some virtual cpus. Good luck creating more than one. */ static int kvm_vm_ioctl_create_vcpu(struct kvm *kvm, unsigned long id) { int r; struct kvm_vcpu *vcpu; struct page *page; /* * KVM tracks vCPU IDs as 'int', be kind to userspace and reject * too-large values instead of silently truncating. * * Ensure KVM_MAX_VCPU_IDS isn't pushed above INT_MAX without first * changing the storage type (at the very least, IDs should be tracked * as unsigned ints). */ BUILD_BUG_ON(KVM_MAX_VCPU_IDS > INT_MAX); if (id >= KVM_MAX_VCPU_IDS) return -EINVAL; mutex_lock(&kvm->lock); if (kvm->created_vcpus >= kvm->max_vcpus) { mutex_unlock(&kvm->lock); return -EINVAL; } r = kvm_arch_vcpu_precreate(kvm, id); if (r) { mutex_unlock(&kvm->lock); return r; } kvm->created_vcpus++; mutex_unlock(&kvm->lock); vcpu = kmem_cache_zalloc(kvm_vcpu_cache, GFP_KERNEL_ACCOUNT); if (!vcpu) { r = -ENOMEM; goto vcpu_decrement; } BUILD_BUG_ON(sizeof(struct kvm_run) > PAGE_SIZE); page = alloc_page(GFP_KERNEL_ACCOUNT | __GFP_ZERO); if (!page) { r = -ENOMEM; goto vcpu_free; } vcpu->run = page_address(page); kvm_vcpu_init(vcpu, kvm, id); r = kvm_arch_vcpu_create(vcpu); if (r) goto vcpu_free_run_page; if (kvm->dirty_ring_size) { r = kvm_dirty_ring_alloc(kvm, &vcpu->dirty_ring, id, kvm->dirty_ring_size); if (r) goto arch_vcpu_destroy; } mutex_lock(&kvm->lock); if (kvm_get_vcpu_by_id(kvm, id)) { r = -EEXIST; goto unlock_vcpu_destroy; } vcpu->vcpu_idx = atomic_read(&kvm->online_vcpus); r = xa_insert(&kvm->vcpu_array, vcpu->vcpu_idx, vcpu, GFP_KERNEL_ACCOUNT); WARN_ON_ONCE(r == -EBUSY); if (r) goto unlock_vcpu_destroy; /* * Now it's all set up, let userspace reach it. Grab the vCPU's mutex * so that userspace can't invoke vCPU ioctl()s until the vCPU is fully * visible (per online_vcpus), e.g. so that KVM doesn't get tricked * into a NULL-pointer dereference because KVM thinks the _current_ * vCPU doesn't exist. As a bonus, taking vcpu->mutex ensures lockdep * knows it's taken *inside* kvm->lock. */ mutex_lock(&vcpu->mutex); kvm_get_kvm(kvm); r = create_vcpu_fd(vcpu); if (r < 0) goto kvm_put_xa_erase; /* * Pairs with smp_rmb() in kvm_get_vcpu. Store the vcpu * pointer before kvm->online_vcpu's incremented value. */ smp_wmb(); atomic_inc(&kvm->online_vcpus); mutex_unlock(&vcpu->mutex); mutex_unlock(&kvm->lock); kvm_arch_vcpu_postcreate(vcpu); kvm_create_vcpu_debugfs(vcpu); return r; kvm_put_xa_erase: mutex_unlock(&vcpu->mutex); kvm_put_kvm_no_destroy(kvm); xa_erase(&kvm->vcpu_array, vcpu->vcpu_idx); unlock_vcpu_destroy: mutex_unlock(&kvm->lock); kvm_dirty_ring_free(&vcpu->dirty_ring); arch_vcpu_destroy: kvm_arch_vcpu_destroy(vcpu); vcpu_free_run_page: free_page((unsigned long)vcpu->run); vcpu_free: kmem_cache_free(kvm_vcpu_cache, vcpu); vcpu_decrement: mutex_lock(&kvm->lock); kvm->created_vcpus--; mutex_unlock(&kvm->lock); return r; } static int kvm_vcpu_ioctl_set_sigmask(struct kvm_vcpu *vcpu, sigset_t *sigset) { if (sigset) { sigdelsetmask(sigset, sigmask(SIGKILL)|sigmask(SIGSTOP)); vcpu->sigset_active = 1; vcpu->sigset = *sigset; } else vcpu->sigset_active = 0; return 0; } static ssize_t kvm_vcpu_stats_read(struct file *file, char __user *user_buffer, size_t size, loff_t *offset) { struct kvm_vcpu *vcpu = file->private_data; return kvm_stats_read(vcpu->stats_id, &kvm_vcpu_stats_header, &kvm_vcpu_stats_desc[0], &vcpu->stat, sizeof(vcpu->stat), user_buffer, size, offset); } static int kvm_vcpu_stats_release(struct inode *inode, struct file *file) { struct kvm_vcpu *vcpu = file->private_data; kvm_put_kvm(vcpu->kvm); return 0; } static const struct file_operations kvm_vcpu_stats_fops = { .owner = THIS_MODULE, .read = kvm_vcpu_stats_read, .release = kvm_vcpu_stats_release, .llseek = noop_llseek, }; static int kvm_vcpu_ioctl_get_stats_fd(struct kvm_vcpu *vcpu) { int fd; struct file *file; char name[15 + ITOA_MAX_LEN + 1]; snprintf(name, sizeof(name), "kvm-vcpu-stats:%d", vcpu->vcpu_id); fd = get_unused_fd_flags(O_CLOEXEC); if (fd < 0) return fd; file = anon_inode_getfile_fmode(name, &kvm_vcpu_stats_fops, vcpu, O_RDONLY, FMODE_PREAD); if (IS_ERR(file)) { put_unused_fd(fd); return PTR_ERR(file); } kvm_get_kvm(vcpu->kvm); fd_install(fd, file); return fd; } #ifdef CONFIG_KVM_GENERIC_PRE_FAULT_MEMORY static int kvm_vcpu_pre_fault_memory(struct kvm_vcpu *vcpu, struct kvm_pre_fault_memory *range) { int idx; long r; u64 full_size; if (range->flags) return -EINVAL; if (!PAGE_ALIGNED(range->gpa) || !PAGE_ALIGNED(range->size) || range->gpa + range->size <= range->gpa) return -EINVAL; vcpu_load(vcpu); idx = srcu_read_lock(&vcpu->kvm->srcu); full_size = range->size; do { if (signal_pending(current)) { r = -EINTR; break; } r = kvm_arch_vcpu_pre_fault_memory(vcpu, range); if (WARN_ON_ONCE(r == 0 || r == -EIO)) break; if (r < 0) break; range->size -= r; range->gpa += r; cond_resched(); } while (range->size); srcu_read_unlock(&vcpu->kvm->srcu, idx); vcpu_put(vcpu); /* Return success if at least one page was mapped successfully. */ return full_size == range->size ? r : 0; } #endif static int kvm_wait_for_vcpu_online(struct kvm_vcpu *vcpu) { struct kvm *kvm = vcpu->kvm; /* * In practice, this happy path will always be taken, as a well-behaved * VMM will never invoke a vCPU ioctl() before KVM_CREATE_VCPU returns. */ if (likely(vcpu->vcpu_idx < atomic_read(&kvm->online_vcpus))) return 0; /* * Acquire and release the vCPU's mutex to wait for vCPU creation to * complete (kvm_vm_ioctl_create_vcpu() holds the mutex until the vCPU * is fully online). */ if (mutex_lock_killable(&vcpu->mutex)) return -EINTR; mutex_unlock(&vcpu->mutex); if (WARN_ON_ONCE(!kvm_get_vcpu(kvm, vcpu->vcpu_idx))) return -EIO; return 0; } static long kvm_vcpu_ioctl(struct file *filp, unsigned int ioctl, unsigned long arg) { struct kvm_vcpu *vcpu = filp->private_data; void __user *argp = (void __user *)arg; int r; struct kvm_fpu *fpu = NULL; struct kvm_sregs *kvm_sregs = NULL; if (vcpu->kvm->mm != current->mm || vcpu->kvm->vm_dead) return -EIO; if (unlikely(_IOC_TYPE(ioctl) != KVMIO)) return -EINVAL; /* * Wait for the vCPU to be online before handling the ioctl(), as KVM * assumes the vCPU is reachable via vcpu_array, i.e. may dereference * a NULL pointer if userspace invokes an ioctl() before KVM is ready. */ r = kvm_wait_for_vcpu_online(vcpu); if (r) return r; /* * Some architectures have vcpu ioctls that are asynchronous to vcpu * execution; mutex_lock() would break them. */ r = kvm_arch_vcpu_async_ioctl(filp, ioctl, arg); if (r != -ENOIOCTLCMD) return r; if (mutex_lock_killable(&vcpu->mutex)) return -EINTR; switch (ioctl) { case KVM_RUN: { struct pid *oldpid; r = -EINVAL; if (arg) goto out; /* * Note, vcpu->pid is primarily protected by vcpu->mutex. The * dedicated r/w lock allows other tasks, e.g. other vCPUs, to * read vcpu->pid while this vCPU is in KVM_RUN, e.g. to yield * directly to this vCPU */ oldpid = vcpu->pid; if (unlikely(oldpid != task_pid(current))) { /* The thread running this VCPU changed. */ struct pid *newpid; r = kvm_arch_vcpu_run_pid_change(vcpu); if (r) break; newpid = get_task_pid(current, PIDTYPE_PID); write_lock(&vcpu->pid_lock); vcpu->pid = newpid; write_unlock(&vcpu->pid_lock); put_pid(oldpid); } vcpu->wants_to_run = !READ_ONCE(vcpu->run->immediate_exit__unsafe); r = kvm_arch_vcpu_ioctl_run(vcpu); vcpu->wants_to_run = false; trace_kvm_userspace_exit(vcpu->run->exit_reason, r); break; } case KVM_GET_REGS: { struct kvm_regs *kvm_regs; r = -ENOMEM; kvm_regs = kzalloc(sizeof(struct kvm_regs), GFP_KERNEL); if (!kvm_regs) goto out; r = kvm_arch_vcpu_ioctl_get_regs(vcpu, kvm_regs); if (r) goto out_free1; r = -EFAULT; if (copy_to_user(argp, kvm_regs, sizeof(struct kvm_regs))) goto out_free1; r = 0; out_free1: kfree(kvm_regs); break; } case KVM_SET_REGS: { struct kvm_regs *kvm_regs; kvm_regs = memdup_user(argp, sizeof(*kvm_regs)); if (IS_ERR(kvm_regs)) { r = PTR_ERR(kvm_regs); goto out; } r = kvm_arch_vcpu_ioctl_set_regs(vcpu, kvm_regs); kfree(kvm_regs); break; } case KVM_GET_SREGS: { kvm_sregs = kzalloc(sizeof(struct kvm_sregs), GFP_KERNEL); r = -ENOMEM; if (!kvm_sregs) goto out; r = kvm_arch_vcpu_ioctl_get_sregs(vcpu, kvm_sregs); if (r) goto out; r = -EFAULT; if (copy_to_user(argp, kvm_sregs, sizeof(struct kvm_sregs))) goto out; r = 0; break; } case KVM_SET_SREGS: { kvm_sregs = memdup_user(argp, sizeof(*kvm_sregs)); if (IS_ERR(kvm_sregs)) { r = PTR_ERR(kvm_sregs); kvm_sregs = NULL; goto out; } r = kvm_arch_vcpu_ioctl_set_sregs(vcpu, kvm_sregs); break; } case KVM_GET_MP_STATE: { struct kvm_mp_state mp_state; r = kvm_arch_vcpu_ioctl_get_mpstate(vcpu, &mp_state); if (r) goto out; r = -EFAULT; if (copy_to_user(argp, &mp_state, sizeof(mp_state))) goto out; r = 0; break; } case KVM_SET_MP_STATE: { struct kvm_mp_state mp_state; r = -EFAULT; if (copy_from_user(&mp_state, argp, sizeof(mp_state))) goto out; r = kvm_arch_vcpu_ioctl_set_mpstate(vcpu, &mp_state); break; } case KVM_TRANSLATE: { struct kvm_translation tr; r = -EFAULT; if (copy_from_user(&tr, argp, sizeof(tr))) goto out; r = kvm_arch_vcpu_ioctl_translate(vcpu, &tr); if (r) goto out; r = -EFAULT; if (copy_to_user(argp, &tr, sizeof(tr))) goto out; r = 0; break; } case KVM_SET_GUEST_DEBUG: { struct kvm_guest_debug dbg; r = -EFAULT; if (copy_from_user(&dbg, argp, sizeof(dbg))) goto out; r = kvm_arch_vcpu_ioctl_set_guest_debug(vcpu, &dbg); break; } case KVM_SET_SIGNAL_MASK: { struct kvm_signal_mask __user *sigmask_arg = argp; struct kvm_signal_mask kvm_sigmask; sigset_t sigset, *p; p = NULL; if (argp) { r = -EFAULT; if (copy_from_user(&kvm_sigmask, argp, sizeof(kvm_sigmask))) goto out; r = -EINVAL; if (kvm_sigmask.len != sizeof(sigset)) goto out; r = -EFAULT; if (copy_from_user(&sigset, sigmask_arg->sigset, sizeof(sigset))) goto out; p = &sigset; } r = kvm_vcpu_ioctl_set_sigmask(vcpu, p); break; } case KVM_GET_FPU: { fpu = kzalloc(sizeof(struct kvm_fpu), GFP_KERNEL); r = -ENOMEM; if (!fpu) goto out; r = kvm_arch_vcpu_ioctl_get_fpu(vcpu, fpu); if (r) goto out; r = -EFAULT; if (copy_to_user(argp, fpu, sizeof(struct kvm_fpu))) goto out; r = 0; break; } case KVM_SET_FPU: { fpu = memdup_user(argp, sizeof(*fpu)); if (IS_ERR(fpu)) { r = PTR_ERR(fpu); fpu = NULL; goto out; } r = kvm_arch_vcpu_ioctl_set_fpu(vcpu, fpu); break; } case KVM_GET_STATS_FD: { r = kvm_vcpu_ioctl_get_stats_fd(vcpu); break; } #ifdef CONFIG_KVM_GENERIC_PRE_FAULT_MEMORY case KVM_PRE_FAULT_MEMORY: { struct kvm_pre_fault_memory range; r = -EFAULT; if (copy_from_user(&range, argp, sizeof(range))) break; r = kvm_vcpu_pre_fault_memory(vcpu, &range); /* Pass back leftover range. */ if (copy_to_user(argp, &range, sizeof(range))) r = -EFAULT; break; } #endif default: r = kvm_arch_vcpu_ioctl(filp, ioctl, arg); } out: mutex_unlock(&vcpu->mutex); kfree(fpu); kfree(kvm_sregs); return r; } #ifdef CONFIG_KVM_COMPAT static long kvm_vcpu_compat_ioctl(struct file *filp, unsigned int ioctl, unsigned long arg) { struct kvm_vcpu *vcpu = filp->private_data; void __user *argp = compat_ptr(arg); int r; if (vcpu->kvm->mm != current->mm || vcpu->kvm->vm_dead) return -EIO; switch (ioctl) { case KVM_SET_SIGNAL_MASK: { struct kvm_signal_mask __user *sigmask_arg = argp; struct kvm_signal_mask kvm_sigmask; sigset_t sigset; if (argp) { r = -EFAULT; if (copy_from_user(&kvm_sigmask, argp, sizeof(kvm_sigmask))) goto out; r = -EINVAL; if (kvm_sigmask.len != sizeof(compat_sigset_t)) goto out; r = -EFAULT; if (get_compat_sigset(&sigset, (compat_sigset_t __user *)sigmask_arg->sigset)) goto out; r = kvm_vcpu_ioctl_set_sigmask(vcpu, &sigset); } else r = kvm_vcpu_ioctl_set_sigmask(vcpu, NULL); break; } default: r = kvm_vcpu_ioctl(filp, ioctl, arg); } out: return r; } #endif static int kvm_device_mmap(struct file *filp, struct vm_area_struct *vma) { struct kvm_device *dev = filp->private_data; if (dev->ops->mmap) return dev->ops->mmap(dev, vma); return -ENODEV; } static int kvm_device_ioctl_attr(struct kvm_device *dev, int (*accessor)(struct kvm_device *dev, struct kvm_device_attr *attr), unsigned long arg) { struct kvm_device_attr attr; if (!accessor) return -EPERM; if (copy_from_user(&attr, (void __user *)arg, sizeof(attr))) return -EFAULT; return accessor(dev, &attr); } static long kvm_device_ioctl(struct file *filp, unsigned int ioctl, unsigned long arg) { struct kvm_device *dev = filp->private_data; if (dev->kvm->mm != current->mm || dev->kvm->vm_dead) return -EIO; switch (ioctl) { case KVM_SET_DEVICE_ATTR: return kvm_device_ioctl_attr(dev, dev->ops->set_attr, arg); case KVM_GET_DEVICE_ATTR: return kvm_device_ioctl_attr(dev, dev->ops->get_attr, arg); case KVM_HAS_DEVICE_ATTR: return kvm_device_ioctl_attr(dev, dev->ops->has_attr, arg); default: if (dev->ops->ioctl) return dev->ops->ioctl(dev, ioctl, arg); return -ENOTTY; } } static int kvm_device_release(struct inode *inode, struct file *filp) { struct kvm_device *dev = filp->private_data; struct kvm *kvm = dev->kvm; if (dev->ops->release) { mutex_lock(&kvm->lock); list_del_rcu(&dev->vm_node); synchronize_rcu(); dev->ops->release(dev); mutex_unlock(&kvm->lock); } kvm_put_kvm(kvm); return 0; } static struct file_operations kvm_device_fops = { .unlocked_ioctl = kvm_device_ioctl, .release = kvm_device_release, KVM_COMPAT(kvm_device_ioctl), .mmap = kvm_device_mmap, }; struct kvm_device *kvm_device_from_filp(struct file *filp) { if (filp->f_op != &kvm_device_fops) return NULL; return filp->private_data; } static const struct kvm_device_ops *kvm_device_ops_table[KVM_DEV_TYPE_MAX] = { #ifdef CONFIG_KVM_MPIC [KVM_DEV_TYPE_FSL_MPIC_20] = &kvm_mpic_ops, [KVM_DEV_TYPE_FSL_MPIC_42] = &kvm_mpic_ops, #endif }; int kvm_register_device_ops(const struct kvm_device_ops *ops, u32 type) { if (type >= ARRAY_SIZE(kvm_device_ops_table)) return -ENOSPC; if (kvm_device_ops_table[type] != NULL) return -EEXIST; kvm_device_ops_table[type] = ops; return 0; } void kvm_unregister_device_ops(u32 type) { if (kvm_device_ops_table[type] != NULL) kvm_device_ops_table[type] = NULL; } static int kvm_ioctl_create_device(struct kvm *kvm, struct kvm_create_device *cd) { const struct kvm_device_ops *ops; struct kvm_device *dev; bool test = cd->flags & KVM_CREATE_DEVICE_TEST; int type; int ret; if (cd->type >= ARRAY_SIZE(kvm_device_ops_table)) return -ENODEV; type = array_index_nospec(cd->type, ARRAY_SIZE(kvm_device_ops_table)); ops = kvm_device_ops_table[type]; if (ops == NULL) return -ENODEV; if (test) return 0; dev = kzalloc(sizeof(*dev), GFP_KERNEL_ACCOUNT); if (!dev) return -ENOMEM; dev->ops = ops; dev->kvm = kvm; mutex_lock(&kvm->lock); ret = ops->create(dev, type); if (ret < 0) { mutex_unlock(&kvm->lock); kfree(dev); return ret; } list_add_rcu(&dev->vm_node, &kvm->devices); mutex_unlock(&kvm->lock); if (ops->init) ops->init(dev); kvm_get_kvm(kvm); ret = anon_inode_getfd(ops->name, &kvm_device_fops, dev, O_RDWR | O_CLOEXEC); if (ret < 0) { kvm_put_kvm_no_destroy(kvm); mutex_lock(&kvm->lock); list_del_rcu(&dev->vm_node); synchronize_rcu(); if (ops->release) ops->release(dev); mutex_unlock(&kvm->lock); if (ops->destroy) ops->destroy(dev); return ret; } cd->fd = ret; return 0; } static int kvm_vm_ioctl_check_extension_generic(struct kvm *kvm, long arg) { switch (arg) { case KVM_CAP_USER_MEMORY: case KVM_CAP_USER_MEMORY2: case KVM_CAP_DESTROY_MEMORY_REGION_WORKS: case KVM_CAP_JOIN_MEMORY_REGIONS_WORKS: case KVM_CAP_INTERNAL_ERROR_DATA: #ifdef CONFIG_HAVE_KVM_MSI case KVM_CAP_SIGNAL_MSI: #endif #ifdef CONFIG_HAVE_KVM_IRQCHIP case KVM_CAP_IRQFD: #endif case KVM_CAP_IOEVENTFD_ANY_LENGTH: case KVM_CAP_CHECK_EXTENSION_VM: case KVM_CAP_ENABLE_CAP_VM: case KVM_CAP_HALT_POLL: return 1; #ifdef CONFIG_KVM_MMIO case KVM_CAP_COALESCED_MMIO: return KVM_COALESCED_MMIO_PAGE_OFFSET; case KVM_CAP_COALESCED_PIO: return 1; #endif #ifdef CONFIG_KVM_GENERIC_DIRTYLOG_READ_PROTECT case KVM_CAP_MANUAL_DIRTY_LOG_PROTECT2: return KVM_DIRTY_LOG_MANUAL_CAPS; #endif #ifdef CONFIG_HAVE_KVM_IRQ_ROUTING case KVM_CAP_IRQ_ROUTING: return KVM_MAX_IRQ_ROUTES; #endif #if KVM_MAX_NR_ADDRESS_SPACES > 1 case KVM_CAP_MULTI_ADDRESS_SPACE: if (kvm) return kvm_arch_nr_memslot_as_ids(kvm); return KVM_MAX_NR_ADDRESS_SPACES; #endif case KVM_CAP_NR_MEMSLOTS: return KVM_USER_MEM_SLOTS; case KVM_CAP_DIRTY_LOG_RING: #ifdef CONFIG_HAVE_KVM_DIRTY_RING_TSO return KVM_DIRTY_RING_MAX_ENTRIES * sizeof(struct kvm_dirty_gfn); #else return 0; #endif case KVM_CAP_DIRTY_LOG_RING_ACQ_REL: #ifdef CONFIG_HAVE_KVM_DIRTY_RING_ACQ_REL return KVM_DIRTY_RING_MAX_ENTRIES * sizeof(struct kvm_dirty_gfn); #else return 0; #endif #ifdef CONFIG_NEED_KVM_DIRTY_RING_WITH_BITMAP case KVM_CAP_DIRTY_LOG_RING_WITH_BITMAP: #endif case KVM_CAP_BINARY_STATS_FD: case KVM_CAP_SYSTEM_EVENT_DATA: case KVM_CAP_DEVICE_CTRL: return 1; #ifdef CONFIG_KVM_GENERIC_MEMORY_ATTRIBUTES case KVM_CAP_MEMORY_ATTRIBUTES: return kvm_supported_mem_attributes(kvm); #endif #ifdef CONFIG_KVM_PRIVATE_MEM case KVM_CAP_GUEST_MEMFD: return !kvm || kvm_arch_has_private_mem(kvm); #endif default: break; } return kvm_vm_ioctl_check_extension(kvm, arg); } static int kvm_vm_ioctl_enable_dirty_log_ring(struct kvm *kvm, u32 size) { int r; if (!KVM_DIRTY_LOG_PAGE_OFFSET) return -EINVAL; /* the size should be power of 2 */ if (!size || (size & (size - 1))) return -EINVAL; /* Should be bigger to keep the reserved entries, or a page */ if (size < kvm_dirty_ring_get_rsvd_entries(kvm) * sizeof(struct kvm_dirty_gfn) || size < PAGE_SIZE) return -EINVAL; if (size > KVM_DIRTY_RING_MAX_ENTRIES * sizeof(struct kvm_dirty_gfn)) return -E2BIG; /* We only allow it to set once */ if (kvm->dirty_ring_size) return -EINVAL; mutex_lock(&kvm->lock); if (kvm->created_vcpus) { /* We don't allow to change this value after vcpu created */ r = -EINVAL; } else { kvm->dirty_ring_size = size; r = 0; } mutex_unlock(&kvm->lock); return r; } static int kvm_vm_ioctl_reset_dirty_pages(struct kvm *kvm) { unsigned long i; struct kvm_vcpu *vcpu; int cleared = 0; if (!kvm->dirty_ring_size) return -EINVAL; mutex_lock(&kvm->slots_lock); kvm_for_each_vcpu(i, vcpu, kvm) cleared += kvm_dirty_ring_reset(vcpu->kvm, &vcpu->dirty_ring); mutex_unlock(&kvm->slots_lock); if (cleared) kvm_flush_remote_tlbs(kvm); return cleared; } int __attribute__((weak)) kvm_vm_ioctl_enable_cap(struct kvm *kvm, struct kvm_enable_cap *cap) { return -EINVAL; } bool kvm_are_all_memslots_empty(struct kvm *kvm) { int i; lockdep_assert_held(&kvm->slots_lock); for (i = 0; i < kvm_arch_nr_memslot_as_ids(kvm); i++) { if (!kvm_memslots_empty(__kvm_memslots(kvm, i))) return false; } return true; } EXPORT_SYMBOL_GPL(kvm_are_all_memslots_empty); static int kvm_vm_ioctl_enable_cap_generic(struct kvm *kvm, struct kvm_enable_cap *cap) { switch (cap->cap) { #ifdef CONFIG_KVM_GENERIC_DIRTYLOG_READ_PROTECT case KVM_CAP_MANUAL_DIRTY_LOG_PROTECT2: { u64 allowed_options = KVM_DIRTY_LOG_MANUAL_PROTECT_ENABLE; if (cap->args[0] & KVM_DIRTY_LOG_MANUAL_PROTECT_ENABLE) allowed_options = KVM_DIRTY_LOG_MANUAL_CAPS; if (cap->flags || (cap->args[0] & ~allowed_options)) return -EINVAL; kvm->manual_dirty_log_protect = cap->args[0]; return 0; } #endif case KVM_CAP_HALT_POLL: { if (cap->flags || cap->args[0] != (unsigned int)cap->args[0]) return -EINVAL; kvm->max_halt_poll_ns = cap->args[0]; /* * Ensure kvm->override_halt_poll_ns does not become visible * before kvm->max_halt_poll_ns. * * Pairs with the smp_rmb() in kvm_vcpu_max_halt_poll_ns(). */ smp_wmb(); kvm->override_halt_poll_ns = true; return 0; } case KVM_CAP_DIRTY_LOG_RING: case KVM_CAP_DIRTY_LOG_RING_ACQ_REL: if (!kvm_vm_ioctl_check_extension_generic(kvm, cap->cap)) return -EINVAL; return kvm_vm_ioctl_enable_dirty_log_ring(kvm, cap->args[0]); case KVM_CAP_DIRTY_LOG_RING_WITH_BITMAP: { int r = -EINVAL; if (!IS_ENABLED(CONFIG_NEED_KVM_DIRTY_RING_WITH_BITMAP) || !kvm->dirty_ring_size || cap->flags) return r; mutex_lock(&kvm->slots_lock); /* * For simplicity, allow enabling ring+bitmap if and only if * there are no memslots, e.g. to ensure all memslots allocate * a bitmap after the capability is enabled. */ if (kvm_are_all_memslots_empty(kvm)) { kvm->dirty_ring_with_bitmap = true; r = 0; } mutex_unlock(&kvm->slots_lock); return r; } default: return kvm_vm_ioctl_enable_cap(kvm, cap); } } static ssize_t kvm_vm_stats_read(struct file *file, char __user *user_buffer, size_t size, loff_t *offset) { struct kvm *kvm = file->private_data; return kvm_stats_read(kvm->stats_id, &kvm_vm_stats_header, &kvm_vm_stats_desc[0], &kvm->stat, sizeof(kvm->stat), user_buffer, size, offset); } static int kvm_vm_stats_release(struct inode *inode, struct file *file) { struct kvm *kvm = file->private_data; kvm_put_kvm(kvm); return 0; } static const struct file_operations kvm_vm_stats_fops = { .owner = THIS_MODULE, .read = kvm_vm_stats_read, .release = kvm_vm_stats_release, .llseek = noop_llseek, }; static int kvm_vm_ioctl_get_stats_fd(struct kvm *kvm) { int fd; struct file *file; fd = get_unused_fd_flags(O_CLOEXEC); if (fd < 0) return fd; file = anon_inode_getfile_fmode("kvm-vm-stats", &kvm_vm_stats_fops, kvm, O_RDONLY, FMODE_PREAD); if (IS_ERR(file)) { put_unused_fd(fd); return PTR_ERR(file); } kvm_get_kvm(kvm); fd_install(fd, file); return fd; } #define SANITY_CHECK_MEM_REGION_FIELD(field) \ do { \ BUILD_BUG_ON(offsetof(struct kvm_userspace_memory_region, field) != \ offsetof(struct kvm_userspace_memory_region2, field)); \ BUILD_BUG_ON(sizeof_field(struct kvm_userspace_memory_region, field) != \ sizeof_field(struct kvm_userspace_memory_region2, field)); \ } while (0) static long kvm_vm_ioctl(struct file *filp, unsigned int ioctl, unsigned long arg) { struct kvm *kvm = filp->private_data; void __user *argp = (void __user *)arg; int r; if (kvm->mm != current->mm || kvm->vm_dead) return -EIO; switch (ioctl) { case KVM_CREATE_VCPU: r = kvm_vm_ioctl_create_vcpu(kvm, arg); break; case KVM_ENABLE_CAP: { struct kvm_enable_cap cap; r = -EFAULT; if (copy_from_user(&cap, argp, sizeof(cap))) goto out; r = kvm_vm_ioctl_enable_cap_generic(kvm, &cap); break; } case KVM_SET_USER_MEMORY_REGION2: case KVM_SET_USER_MEMORY_REGION: { struct kvm_userspace_memory_region2 mem; unsigned long size; if (ioctl == KVM_SET_USER_MEMORY_REGION) { /* * Fields beyond struct kvm_userspace_memory_region shouldn't be * accessed, but avoid leaking kernel memory in case of a bug. */ memset(&mem, 0, sizeof(mem)); size = sizeof(struct kvm_userspace_memory_region); } else { size = sizeof(struct kvm_userspace_memory_region2); } /* Ensure the common parts of the two structs are identical. */ SANITY_CHECK_MEM_REGION_FIELD(slot); SANITY_CHECK_MEM_REGION_FIELD(flags); SANITY_CHECK_MEM_REGION_FIELD(guest_phys_addr); SANITY_CHECK_MEM_REGION_FIELD(memory_size); SANITY_CHECK_MEM_REGION_FIELD(userspace_addr); r = -EFAULT; if (copy_from_user(&mem, argp, size)) goto out; r = -EINVAL; if (ioctl == KVM_SET_USER_MEMORY_REGION && (mem.flags & ~KVM_SET_USER_MEMORY_REGION_V1_FLAGS)) goto out; r = kvm_vm_ioctl_set_memory_region(kvm, &mem); break; } case KVM_GET_DIRTY_LOG: { struct kvm_dirty_log log; r = -EFAULT; if (copy_from_user(&log, argp, sizeof(log))) goto out; r = kvm_vm_ioctl_get_dirty_log(kvm, &log); break; } #ifdef CONFIG_KVM_GENERIC_DIRTYLOG_READ_PROTECT case KVM_CLEAR_DIRTY_LOG: { struct kvm_clear_dirty_log log; r = -EFAULT; if (copy_from_user(&log, argp, sizeof(log))) goto out; r = kvm_vm_ioctl_clear_dirty_log(kvm, &log); break; } #endif #ifdef CONFIG_KVM_MMIO case KVM_REGISTER_COALESCED_MMIO: { struct kvm_coalesced_mmio_zone zone; r = -EFAULT; if (copy_from_user(&zone, argp, sizeof(zone))) goto out; r = kvm_vm_ioctl_register_coalesced_mmio(kvm, &zone); break; } case KVM_UNREGISTER_COALESCED_MMIO: { struct kvm_coalesced_mmio_zone zone; r = -EFAULT; if (copy_from_user(&zone, argp, sizeof(zone))) goto out; r = kvm_vm_ioctl_unregister_coalesced_mmio(kvm, &zone); break; } #endif case KVM_IRQFD: { struct kvm_irqfd data; r = -EFAULT; if (copy_from_user(&data, argp, sizeof(data))) goto out; r = kvm_irqfd(kvm, &data); break; } case KVM_IOEVENTFD: { struct kvm_ioeventfd data; r = -EFAULT; if (copy_from_user(&data, argp, sizeof(data))) goto out; r = kvm_ioeventfd(kvm, &data); break; } #ifdef CONFIG_HAVE_KVM_MSI case KVM_SIGNAL_MSI: { struct kvm_msi msi; r = -EFAULT; if (copy_from_user(&msi, argp, sizeof(msi))) goto out; r = kvm_send_userspace_msi(kvm, &msi); break; } #endif #ifdef __KVM_HAVE_IRQ_LINE case KVM_IRQ_LINE_STATUS: case KVM_IRQ_LINE: { struct kvm_irq_level irq_event; r = -EFAULT; if (copy_from_user(&irq_event, argp, sizeof(irq_event))) goto out; r = kvm_vm_ioctl_irq_line(kvm, &irq_event, ioctl == KVM_IRQ_LINE_STATUS); if (r) goto out; r = -EFAULT; if (ioctl == KVM_IRQ_LINE_STATUS) { if (copy_to_user(argp, &irq_event, sizeof(irq_event))) goto out; } r = 0; break; } #endif #ifdef CONFIG_HAVE_KVM_IRQ_ROUTING case KVM_SET_GSI_ROUTING: { struct kvm_irq_routing routing; struct kvm_irq_routing __user *urouting; struct kvm_irq_routing_entry *entries = NULL; r = -EFAULT; if (copy_from_user(&routing, argp, sizeof(routing))) goto out; r = -EINVAL; if (!kvm_arch_can_set_irq_routing(kvm)) goto out; if (routing.nr > KVM_MAX_IRQ_ROUTES) goto out; if (routing.flags) goto out; if (routing.nr) { urouting = argp; entries = vmemdup_array_user(urouting->entries, routing.nr, sizeof(*entries)); if (IS_ERR(entries)) { r = PTR_ERR(entries); goto out; } } r = kvm_set_irq_routing(kvm, entries, routing.nr, routing.flags); kvfree(entries); break; } #endif /* CONFIG_HAVE_KVM_IRQ_ROUTING */ #ifdef CONFIG_KVM_GENERIC_MEMORY_ATTRIBUTES case KVM_SET_MEMORY_ATTRIBUTES: { struct kvm_memory_attributes attrs; r = -EFAULT; if (copy_from_user(&attrs, argp, sizeof(attrs))) goto out; r = kvm_vm_ioctl_set_mem_attributes(kvm, &attrs); break; } #endif /* CONFIG_KVM_GENERIC_MEMORY_ATTRIBUTES */ case KVM_CREATE_DEVICE: { struct kvm_create_device cd; r = -EFAULT; if (copy_from_user(&cd, argp, sizeof(cd))) goto out; r = kvm_ioctl_create_device(kvm, &cd); if (r) goto out; r = -EFAULT; if (copy_to_user(argp, &cd, sizeof(cd))) goto out; r = 0; break; } case KVM_CHECK_EXTENSION: r = kvm_vm_ioctl_check_extension_generic(kvm, arg); break; case KVM_RESET_DIRTY_RINGS: r = kvm_vm_ioctl_reset_dirty_pages(kvm); break; case KVM_GET_STATS_FD: r = kvm_vm_ioctl_get_stats_fd(kvm); break; #ifdef CONFIG_KVM_PRIVATE_MEM case KVM_CREATE_GUEST_MEMFD: { struct kvm_create_guest_memfd guest_memfd; r = -EFAULT; if (copy_from_user(&guest_memfd, argp, sizeof(guest_memfd))) goto out; r = kvm_gmem_create(kvm, &guest_memfd); break; } #endif default: r = kvm_arch_vm_ioctl(filp, ioctl, arg); } out: return r; } #ifdef CONFIG_KVM_COMPAT struct compat_kvm_dirty_log { __u32 slot; __u32 padding1; union { compat_uptr_t dirty_bitmap; /* one bit per page */ __u64 padding2; }; }; struct compat_kvm_clear_dirty_log { __u32 slot; __u32 num_pages; __u64 first_page; union { compat_uptr_t dirty_bitmap; /* one bit per page */ __u64 padding2; }; }; long __weak kvm_arch_vm_compat_ioctl(struct file *filp, unsigned int ioctl, unsigned long arg) { return -ENOTTY; } static long kvm_vm_compat_ioctl(struct file *filp, unsigned int ioctl, unsigned long arg) { struct kvm *kvm = filp->private_data; int r; if (kvm->mm != current->mm || kvm->vm_dead) return -EIO; r = kvm_arch_vm_compat_ioctl(filp, ioctl, arg); if (r != -ENOTTY) return r; switch (ioctl) { #ifdef CONFIG_KVM_GENERIC_DIRTYLOG_READ_PROTECT case KVM_CLEAR_DIRTY_LOG: { struct compat_kvm_clear_dirty_log compat_log; struct kvm_clear_dirty_log log; if (copy_from_user(&compat_log, (void __user *)arg, sizeof(compat_log))) return -EFAULT; log.slot = compat_log.slot; log.num_pages = compat_log.num_pages; log.first_page = compat_log.first_page; log.padding2 = compat_log.padding2; log.dirty_bitmap = compat_ptr(compat_log.dirty_bitmap); r = kvm_vm_ioctl_clear_dirty_log(kvm, &log); break; } #endif case KVM_GET_DIRTY_LOG: { struct compat_kvm_dirty_log compat_log; struct kvm_dirty_log log; if (copy_from_user(&compat_log, (void __user *)arg, sizeof(compat_log))) return -EFAULT; log.slot = compat_log.slot; log.padding1 = compat_log.padding1; log.padding2 = compat_log.padding2; log.dirty_bitmap = compat_ptr(compat_log.dirty_bitmap); r = kvm_vm_ioctl_get_dirty_log(kvm, &log); break; } default: r = kvm_vm_ioctl(filp, ioctl, arg); } return r; } #endif static struct file_operations kvm_vm_fops = { .release = kvm_vm_release, .unlocked_ioctl = kvm_vm_ioctl, .llseek = noop_llseek, KVM_COMPAT(kvm_vm_compat_ioctl), }; bool file_is_kvm(struct file *file) { return file && file->f_op == &kvm_vm_fops; } EXPORT_SYMBOL_GPL(file_is_kvm); static int kvm_dev_ioctl_create_vm(unsigned long type) { char fdname[ITOA_MAX_LEN + 1]; int r, fd; struct kvm *kvm; struct file *file; fd = get_unused_fd_flags(O_CLOEXEC); if (fd < 0) return fd; snprintf(fdname, sizeof(fdname), "%d", fd); kvm = kvm_create_vm(type, fdname); if (IS_ERR(kvm)) { r = PTR_ERR(kvm); goto put_fd; } file = anon_inode_getfile("kvm-vm", &kvm_vm_fops, kvm, O_RDWR); if (IS_ERR(file)) { r = PTR_ERR(file); goto put_kvm; } /* * Don't call kvm_put_kvm anymore at this point; file->f_op is * already set, with ->release() being kvm_vm_release(). In error * cases it will be called by the final fput(file) and will take * care of doing kvm_put_kvm(kvm). */ kvm_uevent_notify_change(KVM_EVENT_CREATE_VM, kvm); fd_install(fd, file); return fd; put_kvm: kvm_put_kvm(kvm); put_fd: put_unused_fd(fd); return r; } static long kvm_dev_ioctl(struct file *filp, unsigned int ioctl, unsigned long arg) { int r = -EINVAL; switch (ioctl) { case KVM_GET_API_VERSION: if (arg) goto out; r = KVM_API_VERSION; break; case KVM_CREATE_VM: r = kvm_dev_ioctl_create_vm(arg); break; case KVM_CHECK_EXTENSION: r = kvm_vm_ioctl_check_extension_generic(NULL, arg); break; case KVM_GET_VCPU_MMAP_SIZE: if (arg) goto out; r = PAGE_SIZE; /* struct kvm_run */ #ifdef CONFIG_X86 r += PAGE_SIZE; /* pio data page */ #endif #ifdef CONFIG_KVM_MMIO r += PAGE_SIZE; /* coalesced mmio ring page */ #endif break; default: return kvm_arch_dev_ioctl(filp, ioctl, arg); } out: return r; } static struct file_operations kvm_chardev_ops = { .unlocked_ioctl = kvm_dev_ioctl, .llseek = noop_llseek, KVM_COMPAT(kvm_dev_ioctl), }; static struct miscdevice kvm_dev = { KVM_MINOR, "kvm", &kvm_chardev_ops, }; #ifdef CONFIG_KVM_GENERIC_HARDWARE_ENABLING bool enable_virt_at_load = true; module_param(enable_virt_at_load, bool, 0444); EXPORT_SYMBOL_GPL(enable_virt_at_load); __visible bool kvm_rebooting; EXPORT_SYMBOL_GPL(kvm_rebooting); static DEFINE_PER_CPU(bool, virtualization_enabled); static DEFINE_MUTEX(kvm_usage_lock); static int kvm_usage_count; __weak void kvm_arch_enable_virtualization(void) { } __weak void kvm_arch_disable_virtualization(void) { } static int kvm_enable_virtualization_cpu(void) { if (__this_cpu_read(virtualization_enabled)) return 0; if (kvm_arch_enable_virtualization_cpu()) { pr_info("kvm: enabling virtualization on CPU%d failed\n", raw_smp_processor_id()); return -EIO; } __this_cpu_write(virtualization_enabled, true); return 0; } static int kvm_online_cpu(unsigned int cpu) { /* * Abort the CPU online process if hardware virtualization cannot * be enabled. Otherwise running VMs would encounter unrecoverable * errors when scheduled to this CPU. */ return kvm_enable_virtualization_cpu(); } static void kvm_disable_virtualization_cpu(void *ign) { if (!__this_cpu_read(virtualization_enabled)) return; kvm_arch_disable_virtualization_cpu(); __this_cpu_write(virtualization_enabled, false); } static int kvm_offline_cpu(unsigned int cpu) { kvm_disable_virtualization_cpu(NULL); return 0; } static void kvm_shutdown(void) { /* * Disable hardware virtualization and set kvm_rebooting to indicate * that KVM has asynchronously disabled hardware virtualization, i.e. * that relevant errors and exceptions aren't entirely unexpected. * Some flavors of hardware virtualization need to be disabled before * transferring control to firmware (to perform shutdown/reboot), e.g. * on x86, virtualization can block INIT interrupts, which are used by * firmware to pull APs back under firmware control. Note, this path * is used for both shutdown and reboot scenarios, i.e. neither name is * 100% comprehensive. */ pr_info("kvm: exiting hardware virtualization\n"); kvm_rebooting = true; on_each_cpu(kvm_disable_virtualization_cpu, NULL, 1); } static int kvm_suspend(void) { /* * Secondary CPUs and CPU hotplug are disabled across the suspend/resume * callbacks, i.e. no need to acquire kvm_usage_lock to ensure the usage * count is stable. Assert that kvm_usage_lock is not held to ensure * the system isn't suspended while KVM is enabling hardware. Hardware * enabling can be preempted, but the task cannot be frozen until it has * dropped all locks (userspace tasks are frozen via a fake signal). */ lockdep_assert_not_held(&kvm_usage_lock); lockdep_assert_irqs_disabled(); kvm_disable_virtualization_cpu(NULL); return 0; } static void kvm_resume(void) { lockdep_assert_not_held(&kvm_usage_lock); lockdep_assert_irqs_disabled(); WARN_ON_ONCE(kvm_enable_virtualization_cpu()); } static struct syscore_ops kvm_syscore_ops = { .suspend = kvm_suspend, .resume = kvm_resume, .shutdown = kvm_shutdown, }; int kvm_enable_virtualization(void) { int r; guard(mutex)(&kvm_usage_lock); if (kvm_usage_count++) return 0; kvm_arch_enable_virtualization(); r = cpuhp_setup_state(CPUHP_AP_KVM_ONLINE, "kvm/cpu:online", kvm_online_cpu, kvm_offline_cpu); if (r) goto err_cpuhp; register_syscore_ops(&kvm_syscore_ops); /* * Undo virtualization enabling and bail if the system is going down. * If userspace initiated a forced reboot, e.g. reboot -f, then it's * possible for an in-flight operation to enable virtualization after * syscore_shutdown() is called, i.e. without kvm_shutdown() being * invoked. Note, this relies on system_state being set _before_ * kvm_shutdown(), e.g. to ensure either kvm_shutdown() is invoked * or this CPU observes the impending shutdown. Which is why KVM uses * a syscore ops hook instead of registering a dedicated reboot * notifier (the latter runs before system_state is updated). */ if (system_state == SYSTEM_HALT || system_state == SYSTEM_POWER_OFF || system_state == SYSTEM_RESTART) { r = -EBUSY; goto err_rebooting; } return 0; err_rebooting: unregister_syscore_ops(&kvm_syscore_ops); cpuhp_remove_state(CPUHP_AP_KVM_ONLINE); err_cpuhp: kvm_arch_disable_virtualization(); --kvm_usage_count; return r; } EXPORT_SYMBOL_GPL(kvm_enable_virtualization); void kvm_disable_virtualization(void) { guard(mutex)(&kvm_usage_lock); if (--kvm_usage_count) return; unregister_syscore_ops(&kvm_syscore_ops); cpuhp_remove_state(CPUHP_AP_KVM_ONLINE); kvm_arch_disable_virtualization(); } EXPORT_SYMBOL_GPL(kvm_disable_virtualization); static int kvm_init_virtualization(void) { if (enable_virt_at_load) return kvm_enable_virtualization(); return 0; } static void kvm_uninit_virtualization(void) { if (enable_virt_at_load) kvm_disable_virtualization(); } #else /* CONFIG_KVM_GENERIC_HARDWARE_ENABLING */ static int kvm_init_virtualization(void) { return 0; } static void kvm_uninit_virtualization(void) { } #endif /* CONFIG_KVM_GENERIC_HARDWARE_ENABLING */ static void kvm_iodevice_destructor(struct kvm_io_device *dev) { if (dev->ops->destructor) dev->ops->destructor(dev); } static void kvm_io_bus_destroy(struct kvm_io_bus *bus) { int i; for (i = 0; i < bus->dev_count; i++) { struct kvm_io_device *pos = bus->range[i].dev; kvm_iodevice_destructor(pos); } kfree(bus); } static inline int kvm_io_bus_cmp(const struct kvm_io_range *r1, const struct kvm_io_range *r2) { gpa_t addr1 = r1->addr; gpa_t addr2 = r2->addr; if (addr1 < addr2) return -1; /* If r2->len == 0, match the exact address. If r2->len != 0, * accept any overlapping write. Any order is acceptable for * overlapping ranges, because kvm_io_bus_get_first_dev ensures * we process all of them. */ if (r2->len) { addr1 += r1->len; addr2 += r2->len; } if (addr1 > addr2) return 1; return 0; } static int kvm_io_bus_sort_cmp(const void *p1, const void *p2) { return kvm_io_bus_cmp(p1, p2); } static int kvm_io_bus_get_first_dev(struct kvm_io_bus *bus, gpa_t addr, int len) { struct kvm_io_range *range, key; int off; key = (struct kvm_io_range) { .addr = addr, .len = len, }; range = bsearch(&key, bus->range, bus->dev_count, sizeof(struct kvm_io_range), kvm_io_bus_sort_cmp); if (range == NULL) return -ENOENT; off = range - bus->range; while (off > 0 && kvm_io_bus_cmp(&key, &bus->range[off-1]) == 0) off--; return off; } static int __kvm_io_bus_write(struct kvm_vcpu *vcpu, struct kvm_io_bus *bus, struct kvm_io_range *range, const void *val) { int idx; idx = kvm_io_bus_get_first_dev(bus, range->addr, range->len); if (idx < 0) return -EOPNOTSUPP; while (idx < bus->dev_count && kvm_io_bus_cmp(range, &bus->range[idx]) == 0) { if (!kvm_iodevice_write(vcpu, bus->range[idx].dev, range->addr, range->len, val)) return idx; idx++; } return -EOPNOTSUPP; } int kvm_io_bus_write(struct kvm_vcpu *vcpu, enum kvm_bus bus_idx, gpa_t addr, int len, const void *val) { struct kvm_io_bus *bus; struct kvm_io_range range; int r; range = (struct kvm_io_range) { .addr = addr, .len = len, }; bus = srcu_dereference(vcpu->kvm->buses[bus_idx], &vcpu->kvm->srcu); if (!bus) return -ENOMEM; r = __kvm_io_bus_write(vcpu, bus, &range, val); return r < 0 ? r : 0; } EXPORT_SYMBOL_GPL(kvm_io_bus_write); int kvm_io_bus_write_cookie(struct kvm_vcpu *vcpu, enum kvm_bus bus_idx, gpa_t addr, int len, const void *val, long cookie) { struct kvm_io_bus *bus; struct kvm_io_range range; range = (struct kvm_io_range) { .addr = addr, .len = len, }; bus = srcu_dereference(vcpu->kvm->buses[bus_idx], &vcpu->kvm->srcu); if (!bus) return -ENOMEM; /* First try the device referenced by cookie. */ if ((cookie >= 0) && (cookie < bus->dev_count) && (kvm_io_bus_cmp(&range, &bus->range[cookie]) == 0)) if (!kvm_iodevice_write(vcpu, bus->range[cookie].dev, addr, len, val)) return cookie; /* * cookie contained garbage; fall back to search and return the * correct cookie value. */ return __kvm_io_bus_write(vcpu, bus, &range, val); } static int __kvm_io_bus_read(struct kvm_vcpu *vcpu, struct kvm_io_bus *bus, struct kvm_io_range *range, void *val) { int idx; idx = kvm_io_bus_get_first_dev(bus, range->addr, range->len); if (idx < 0) return -EOPNOTSUPP; while (idx < bus->dev_count && kvm_io_bus_cmp(range, &bus->range[idx]) == 0) { if (!kvm_iodevice_read(vcpu, bus->range[idx].dev, range->addr, range->len, val)) return idx; idx++; } return -EOPNOTSUPP; } int kvm_io_bus_read(struct kvm_vcpu *vcpu, enum kvm_bus bus_idx, gpa_t addr, int len, void *val) { struct kvm_io_bus *bus; struct kvm_io_range range; int r; range = (struct kvm_io_range) { .addr = addr, .len = len, }; bus = srcu_dereference(vcpu->kvm->buses[bus_idx], &vcpu->kvm->srcu); if (!bus) return -ENOMEM; r = __kvm_io_bus_read(vcpu, bus, &range, val); return r < 0 ? r : 0; } EXPORT_SYMBOL_GPL(kvm_io_bus_read); int kvm_io_bus_register_dev(struct kvm *kvm, enum kvm_bus bus_idx, gpa_t addr, int len, struct kvm_io_device *dev) { int i; struct kvm_io_bus *new_bus, *bus; struct kvm_io_range range; lockdep_assert_held(&kvm->slots_lock); bus = kvm_get_bus(kvm, bus_idx); if (!bus) return -ENOMEM; /* exclude ioeventfd which is limited by maximum fd */ if (bus->dev_count - bus->ioeventfd_count > NR_IOBUS_DEVS - 1) return -ENOSPC; new_bus = kmalloc(struct_size(bus, range, bus->dev_count + 1), GFP_KERNEL_ACCOUNT); if (!new_bus) return -ENOMEM; range = (struct kvm_io_range) { .addr = addr, .len = len, .dev = dev, }; for (i = 0; i < bus->dev_count; i++) if (kvm_io_bus_cmp(&bus->range[i], &range) > 0) break; memcpy(new_bus, bus, sizeof(*bus) + i * sizeof(struct kvm_io_range)); new_bus->dev_count++; new_bus->range[i] = range; memcpy(new_bus->range + i + 1, bus->range + i, (bus->dev_count - i) * sizeof(struct kvm_io_range)); rcu_assign_pointer(kvm->buses[bus_idx], new_bus); synchronize_srcu_expedited(&kvm->srcu); kfree(bus); return 0; } int kvm_io_bus_unregister_dev(struct kvm *kvm, enum kvm_bus bus_idx, struct kvm_io_device *dev) { int i; struct kvm_io_bus *new_bus, *bus; lockdep_assert_held(&kvm->slots_lock); bus = kvm_get_bus(kvm, bus_idx); if (!bus) return 0; for (i = 0; i < bus->dev_count; i++) { if (bus->range[i].dev == dev) { break; } } if (i == bus->dev_count) return 0; new_bus = kmalloc(struct_size(bus, range, bus->dev_count - 1), GFP_KERNEL_ACCOUNT); if (new_bus) { memcpy(new_bus, bus, struct_size(bus, range, i)); new_bus->dev_count--; memcpy(new_bus->range + i, bus->range + i + 1, flex_array_size(new_bus, range, new_bus->dev_count - i)); } rcu_assign_pointer(kvm->buses[bus_idx], new_bus); synchronize_srcu_expedited(&kvm->srcu); /* * If NULL bus is installed, destroy the old bus, including all the * attached devices. Otherwise, destroy the caller's device only. */ if (!new_bus) { pr_err("kvm: failed to shrink bus, removing it completely\n"); kvm_io_bus_destroy(bus); return -ENOMEM; } kvm_iodevice_destructor(dev); kfree(bus); return 0; } struct kvm_io_device *kvm_io_bus_get_dev(struct kvm *kvm, enum kvm_bus bus_idx, gpa_t addr) { struct kvm_io_bus *bus; int dev_idx, srcu_idx; struct kvm_io_device *iodev = NULL; srcu_idx = srcu_read_lock(&kvm->srcu); bus = srcu_dereference(kvm->buses[bus_idx], &kvm->srcu); if (!bus) goto out_unlock; dev_idx = kvm_io_bus_get_first_dev(bus, addr, 1); if (dev_idx < 0) goto out_unlock; iodev = bus->range[dev_idx].dev; out_unlock: srcu_read_unlock(&kvm->srcu, srcu_idx); return iodev; } EXPORT_SYMBOL_GPL(kvm_io_bus_get_dev); static int kvm_debugfs_open(struct inode *inode, struct file *file, int (*get)(void *, u64 *), int (*set)(void *, u64), const char *fmt) { int ret; struct kvm_stat_data *stat_data = inode->i_private; /* * The debugfs files are a reference to the kvm struct which * is still valid when kvm_destroy_vm is called. kvm_get_kvm_safe * avoids the race between open and the removal of the debugfs directory. */ if (!kvm_get_kvm_safe(stat_data->kvm)) return -ENOENT; ret = simple_attr_open(inode, file, get, kvm_stats_debugfs_mode(stat_data->desc) & 0222 ? set : NULL, fmt); if (ret) kvm_put_kvm(stat_data->kvm); return ret; } static int kvm_debugfs_release(struct inode *inode, struct file *file) { struct kvm_stat_data *stat_data = inode->i_private; simple_attr_release(inode, file); kvm_put_kvm(stat_data->kvm); return 0; } static int kvm_get_stat_per_vm(struct kvm *kvm, size_t offset, u64 *val) { *val = *(u64 *)((void *)(&kvm->stat) + offset); return 0; } static int kvm_clear_stat_per_vm(struct kvm *kvm, size_t offset) { *(u64 *)((void *)(&kvm->stat) + offset) = 0; return 0; } static int kvm_get_stat_per_vcpu(struct kvm *kvm, size_t offset, u64 *val) { unsigned long i; struct kvm_vcpu *vcpu; *val = 0; kvm_for_each_vcpu(i, vcpu, kvm) *val += *(u64 *)((void *)(&vcpu->stat) + offset); return 0; } static int kvm_clear_stat_per_vcpu(struct kvm *kvm, size_t offset) { unsigned long i; struct kvm_vcpu *vcpu; kvm_for_each_vcpu(i, vcpu, kvm) *(u64 *)((void *)(&vcpu->stat) + offset) = 0; return 0; } static int kvm_stat_data_get(void *data, u64 *val) { int r = -EFAULT; struct kvm_stat_data *stat_data = data; switch (stat_data->kind) { case KVM_STAT_VM: r = kvm_get_stat_per_vm(stat_data->kvm, stat_data->desc->desc.offset, val); break; case KVM_STAT_VCPU: r = kvm_get_stat_per_vcpu(stat_data->kvm, stat_data->desc->desc.offset, val); break; } return r; } static int kvm_stat_data_clear(void *data, u64 val) { int r = -EFAULT; struct kvm_stat_data *stat_data = data; if (val) return -EINVAL; switch (stat_data->kind) { case KVM_STAT_VM: r = kvm_clear_stat_per_vm(stat_data->kvm, stat_data->desc->desc.offset); break; case KVM_STAT_VCPU: r = kvm_clear_stat_per_vcpu(stat_data->kvm, stat_data->desc->desc.offset); break; } return r; } static int kvm_stat_data_open(struct inode *inode, struct file *file) { __simple_attr_check_format("%llu\n", 0ull); return kvm_debugfs_open(inode, file, kvm_stat_data_get, kvm_stat_data_clear, "%llu\n"); } static const struct file_operations stat_fops_per_vm = { .owner = THIS_MODULE, .open = kvm_stat_data_open, .release = kvm_debugfs_release, .read = simple_attr_read, .write = simple_attr_write, }; static int vm_stat_get(void *_offset, u64 *val) { unsigned offset = (long)_offset; struct kvm *kvm; u64 tmp_val; *val = 0; mutex_lock(&kvm_lock); list_for_each_entry(kvm, &vm_list, vm_list) { kvm_get_stat_per_vm(kvm, offset, &tmp_val); *val += tmp_val; } mutex_unlock(&kvm_lock); return 0; } static int vm_stat_clear(void *_offset, u64 val) { unsigned offset = (long)_offset; struct kvm *kvm; if (val) return -EINVAL; mutex_lock(&kvm_lock); list_for_each_entry(kvm, &vm_list, vm_list) { kvm_clear_stat_per_vm(kvm, offset); } mutex_unlock(&kvm_lock); return 0; } DEFINE_SIMPLE_ATTRIBUTE(vm_stat_fops, vm_stat_get, vm_stat_clear, "%llu\n"); DEFINE_SIMPLE_ATTRIBUTE(vm_stat_readonly_fops, vm_stat_get, NULL, "%llu\n"); static int vcpu_stat_get(void *_offset, u64 *val) { unsigned offset = (long)_offset; struct kvm *kvm; u64 tmp_val; *val = 0; mutex_lock(&kvm_lock); list_for_each_entry(kvm, &vm_list, vm_list) { kvm_get_stat_per_vcpu(kvm, offset, &tmp_val); *val += tmp_val; } mutex_unlock(&kvm_lock); return 0; } static int vcpu_stat_clear(void *_offset, u64 val) { unsigned offset = (long)_offset; struct kvm *kvm; if (val) return -EINVAL; mutex_lock(&kvm_lock); list_for_each_entry(kvm, &vm_list, vm_list) { kvm_clear_stat_per_vcpu(kvm, offset); } mutex_unlock(&kvm_lock); return 0; } DEFINE_SIMPLE_ATTRIBUTE(vcpu_stat_fops, vcpu_stat_get, vcpu_stat_clear, "%llu\n"); DEFINE_SIMPLE_ATTRIBUTE(vcpu_stat_readonly_fops, vcpu_stat_get, NULL, "%llu\n"); static void kvm_uevent_notify_change(unsigned int type, struct kvm *kvm) { struct kobj_uevent_env *env; unsigned long long created, active; if (!kvm_dev.this_device || !kvm) return; mutex_lock(&kvm_lock); if (type == KVM_EVENT_CREATE_VM) { kvm_createvm_count++; kvm_active_vms++; } else if (type == KVM_EVENT_DESTROY_VM) { kvm_active_vms--; } created = kvm_createvm_count; active = kvm_active_vms; mutex_unlock(&kvm_lock); env = kzalloc(sizeof(*env), GFP_KERNEL); if (!env) return; add_uevent_var(env, "CREATED=%llu", created); add_uevent_var(env, "COUNT=%llu", active); if (type == KVM_EVENT_CREATE_VM) { add_uevent_var(env, "EVENT=create"); kvm->userspace_pid = task_pid_nr(current); } else if (type == KVM_EVENT_DESTROY_VM) { add_uevent_var(env, "EVENT=destroy"); } add_uevent_var(env, "PID=%d", kvm->userspace_pid); if (!IS_ERR(kvm->debugfs_dentry)) { char *tmp, *p = kmalloc(PATH_MAX, GFP_KERNEL); if (p) { tmp = dentry_path_raw(kvm->debugfs_dentry, p, PATH_MAX); if (!IS_ERR(tmp)) add_uevent_var(env, "STATS_PATH=%s", tmp); kfree(p); } } /* no need for checks, since we are adding at most only 5 keys */ env->envp[env->envp_idx++] = NULL; kobject_uevent_env(&kvm_dev.this_device->kobj, KOBJ_CHANGE, env->envp); kfree(env); } static void kvm_init_debug(void) { const struct file_operations *fops; const struct _kvm_stats_desc *pdesc; int i; kvm_debugfs_dir = debugfs_create_dir("kvm", NULL); for (i = 0; i < kvm_vm_stats_header.num_desc; ++i) { pdesc = &kvm_vm_stats_desc[i]; if (kvm_stats_debugfs_mode(pdesc) & 0222) fops = &vm_stat_fops; else fops = &vm_stat_readonly_fops; debugfs_create_file(pdesc->name, kvm_stats_debugfs_mode(pdesc), kvm_debugfs_dir, (void *)(long)pdesc->desc.offset, fops); } for (i = 0; i < kvm_vcpu_stats_header.num_desc; ++i) { pdesc = &kvm_vcpu_stats_desc[i]; if (kvm_stats_debugfs_mode(pdesc) & 0222) fops = &vcpu_stat_fops; else fops = &vcpu_stat_readonly_fops; debugfs_create_file(pdesc->name, kvm_stats_debugfs_mode(pdesc), kvm_debugfs_dir, (void *)(long)pdesc->desc.offset, fops); } } static inline struct kvm_vcpu *preempt_notifier_to_vcpu(struct preempt_notifier *pn) { return container_of(pn, struct kvm_vcpu, preempt_notifier); } static void kvm_sched_in(struct preempt_notifier *pn, int cpu) { struct kvm_vcpu *vcpu = preempt_notifier_to_vcpu(pn); WRITE_ONCE(vcpu->preempted, false); WRITE_ONCE(vcpu->ready, false); __this_cpu_write(kvm_running_vcpu, vcpu); kvm_arch_vcpu_load(vcpu, cpu); WRITE_ONCE(vcpu->scheduled_out, false); } static void kvm_sched_out(struct preempt_notifier *pn, struct task_struct *next) { struct kvm_vcpu *vcpu = preempt_notifier_to_vcpu(pn); WRITE_ONCE(vcpu->scheduled_out, true); if (task_is_runnable(current) && vcpu->wants_to_run) { WRITE_ONCE(vcpu->preempted, true); WRITE_ONCE(vcpu->ready, true); } kvm_arch_vcpu_put(vcpu); __this_cpu_write(kvm_running_vcpu, NULL); } /** * kvm_get_running_vcpu - get the vcpu running on the current CPU. * * We can disable preemption locally around accessing the per-CPU variable, * and use the resolved vcpu pointer after enabling preemption again, * because even if the current thread is migrated to another CPU, reading * the per-CPU value later will give us the same value as we update the * per-CPU variable in the preempt notifier handlers. */ struct kvm_vcpu *kvm_get_running_vcpu(void) { struct kvm_vcpu *vcpu; preempt_disable(); vcpu = __this_cpu_read(kvm_running_vcpu); preempt_enable(); return vcpu; } EXPORT_SYMBOL_GPL(kvm_get_running_vcpu); /** * kvm_get_running_vcpus - get the per-CPU array of currently running vcpus. */ struct kvm_vcpu * __percpu *kvm_get_running_vcpus(void) { return &kvm_running_vcpu; } #ifdef CONFIG_GUEST_PERF_EVENTS static unsigned int kvm_guest_state(void) { struct kvm_vcpu *vcpu = kvm_get_running_vcpu(); unsigned int state; if (!kvm_arch_pmi_in_guest(vcpu)) return 0; state = PERF_GUEST_ACTIVE; if (!kvm_arch_vcpu_in_kernel(vcpu)) state |= PERF_GUEST_USER; return state; } static unsigned long kvm_guest_get_ip(void) { struct kvm_vcpu *vcpu = kvm_get_running_vcpu(); /* Retrieving the IP must be guarded by a call to kvm_guest_state(). */ if (WARN_ON_ONCE(!kvm_arch_pmi_in_guest(vcpu))) return 0; return kvm_arch_vcpu_get_ip(vcpu); } static struct perf_guest_info_callbacks kvm_guest_cbs = { .state = kvm_guest_state, .get_ip = kvm_guest_get_ip, .handle_intel_pt_intr = NULL, }; void kvm_register_perf_callbacks(unsigned int (*pt_intr_handler)(void)) { kvm_guest_cbs.handle_intel_pt_intr = pt_intr_handler; perf_register_guest_info_callbacks(&kvm_guest_cbs); } void kvm_unregister_perf_callbacks(void) { perf_unregister_guest_info_callbacks(&kvm_guest_cbs); } #endif int kvm_init(unsigned vcpu_size, unsigned vcpu_align, struct module *module) { int r; int cpu; /* A kmem cache lets us meet the alignment requirements of fx_save. */ if (!vcpu_align) vcpu_align = __alignof__(struct kvm_vcpu); kvm_vcpu_cache = kmem_cache_create_usercopy("kvm_vcpu", vcpu_size, vcpu_align, SLAB_ACCOUNT, offsetof(struct kvm_vcpu, arch), offsetofend(struct kvm_vcpu, stats_id) - offsetof(struct kvm_vcpu, arch), NULL); if (!kvm_vcpu_cache) return -ENOMEM; for_each_possible_cpu(cpu) { if (!alloc_cpumask_var_node(&per_cpu(cpu_kick_mask, cpu), GFP_KERNEL, cpu_to_node(cpu))) { r = -ENOMEM; goto err_cpu_kick_mask; } } r = kvm_irqfd_init(); if (r) goto err_irqfd; r = kvm_async_pf_init(); if (r) goto err_async_pf; kvm_chardev_ops.owner = module; kvm_vm_fops.owner = module; kvm_vcpu_fops.owner = module; kvm_device_fops.owner = module; kvm_preempt_ops.sched_in = kvm_sched_in; kvm_preempt_ops.sched_out = kvm_sched_out; kvm_init_debug(); r = kvm_vfio_ops_init(); if (WARN_ON_ONCE(r)) goto err_vfio; kvm_gmem_init(module); r = kvm_init_virtualization(); if (r) goto err_virt; /* * Registration _must_ be the very last thing done, as this exposes * /dev/kvm to userspace, i.e. all infrastructure must be setup! */ r = misc_register(&kvm_dev); if (r) { pr_err("kvm: misc device register failed\n"); goto err_register; } return 0; err_register: kvm_uninit_virtualization(); err_virt: kvm_vfio_ops_exit(); err_vfio: kvm_async_pf_deinit(); err_async_pf: kvm_irqfd_exit(); err_irqfd: err_cpu_kick_mask: for_each_possible_cpu(cpu) free_cpumask_var(per_cpu(cpu_kick_mask, cpu)); kmem_cache_destroy(kvm_vcpu_cache); return r; } EXPORT_SYMBOL_GPL(kvm_init); void kvm_exit(void) { int cpu; /* * Note, unregistering /dev/kvm doesn't strictly need to come first, * fops_get(), a.k.a. try_module_get(), prevents acquiring references * to KVM while the module is being stopped. */ misc_deregister(&kvm_dev); kvm_uninit_virtualization(); debugfs_remove_recursive(kvm_debugfs_dir); for_each_possible_cpu(cpu) free_cpumask_var(per_cpu(cpu_kick_mask, cpu)); kmem_cache_destroy(kvm_vcpu_cache); kvm_vfio_ops_exit(); kvm_async_pf_deinit(); kvm_irqfd_exit(); } EXPORT_SYMBOL_GPL(kvm_exit);
1 3 3 3 3 5 27 28 27 28 28 28 27 27 28 26 22 1 6 5 5 3 1 1 7 1 6 5 6 6 3 11 6 2 1 1 3 3 28 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 1517 1518 1519 1520 1521 1522 1523 1524 1525 1526 1527 1528 1529 1530 1531 1532 1533 1534 1535 1536 1537 1538 1539 1540 1541 1542 1543 1544 1545 1546 1547 1548 1549 1550 1551 1552 1553 1554 1555 1556 1557 1558 1559 1560 1561 1562 1563 1564 1565 1566 1567 1568 1569 1570 1571 1572 1573 1574 1575 1576 1577 1578 1579 1580 1581 1582 1583 1584 1585 1586 1587 1588 1589 1590 1591 1592 1593 1594 1595 1596 1597 1598 1599 1600 1601 1602 1603 1604 1605 1606 1607 1608 1609 1610 1611 1612 1613 1614 1615 1616 1617 1618 1619 1620 1621 1622 1623 1624 1625 1626 1627 1628 1629 1630 1631 1632 1633 1634 1635 1636 1637 1638 1639 1640 1641 1642 1643 1644 1645 1646 1647 1648 1649 1650 1651 1652 1653 1654 1655 1656 1657 1658 1659 1660 1661 1662 1663 1664 1665 1666 1667 1668 1669 1670 1671 1672 1673 1674 1675 1676 1677 1678 1679 1680 1681 1682 1683 1684 1685 1686 1687 1688 1689 1690 1691 1692 1693 1694 1695 1696 1697 1698 1699 1700 1701 1702 1703 1704 1705 1706 1707 1708 1709 1710 1711 1712 1713 1714 1715 1716 1717 1718 1719 1720 1721 1722 1723 1724 1725 1726 1727 1728 1729 1730 1731 1732 1733 1734 1735 1736 1737 1738 1739 1740 1741 1742 1743 1744 1745 1746 1747 1748 1749 1750 1751 1752 1753 1754 1755 1756 1757 1758 1759 1760 1761 1762 1763 1764 1765 1766 1767 1768 1769 1770 1771 1772 1773 1774 1775 1776 1777 1778 1779 1780 1781 1782 1783 1784 1785 1786 1787 1788 1789 1790 1791 1792 1793 1794 1795 1796 1797 1798 1799 1800 1801 1802 1803 1804 1805 1806 1807 1808 1809 1810 1811 1812 1813 1814 1815 1816 1817 1818 1819 1820 1821 1822 1823 1824 1825 1826 1827 1828 1829 1830 1831 1832 1833 1834 1835 1836 1837 1838 1839 1840 1841 1842 1843 1844 1845 1846 1847 1848 1849 1850 1851 1852 1853 1854 1855 1856 1857 1858 1859 1860 1861 1862 1863 1864 1865 1866 1867 1868 1869 1870 1871 1872 1873 1874 1875 1876 1877 1878 1879 1880 1881 1882 1883 1884 1885 1886 1887 1888 1889 1890 1891 1892 1893 1894 1895 1896 1897 1898 1899 1900 1901 1902 1903 1904 1905 1906 1907 1908 1909 1910 1911 1912 1913 1914 1915 1916 1917 1918 1919 1920 1921 1922 1923 1924 1925 1926 1927 1928 1929 1930 1931 1932 1933 1934 1935 1936 1937 1938 1939 1940 1941 1942 1943 1944 1945 1946 1947 1948 1949 1950 1951 1952 1953 1954 1955 1956 1957 1958 1959 1960 1961 1962 1963 1964 1965 1966 1967 1968 1969 1970 1971 1972 1973 1974 1975 1976 1977 1978 1979 1980 1981 1982 1983 1984 1985 1986 1987 1988 1989 1990 1991 1992 1993 1994 1995 1996 1997 1998 1999 2000 2001 2002 2003 2004 2005 2006 2007 2008 2009 2010 2011 2012 2013 2014 2015 2016 2017 2018 2019 2020 2021 2022 2023 2024 2025 2026 2027 2028 2029 2030 2031 2032 2033 2034 2035 2036 2037 2038 2039 2040 2041 2042 2043 2044 2045 2046 2047 2048 2049 2050 2051 2052 2053 2054 2055 2056 2057 2058 2059 2060 2061 2062 2063 2064 2065 2066 2067 2068 2069 2070 2071 2072 2073 2074 2075 2076 2077 2078 2079 2080 2081 2082 2083 2084 2085 2086 2087 2088 2089 2090 2091 2092 2093 2094 2095 2096 2097 2098 2099 2100 2101 // SPDX-License-Identifier: GPL-2.0-only #include <linux/kernel.h> #include <linux/skbuff.h> #include <linux/export.h> #include <linux/ip.h> #include <linux/ipv6.h> #include <linux/if_vlan.h> #include <linux/filter.h> #include <net/dsa.h> #include <net/dst_metadata.h> #include <net/ip.h> #include <net/ipv6.h> #include <net/gre.h> #include <net/pptp.h> #include <net/tipc.h> #include <linux/igmp.h> #include <linux/icmp.h> #include <linux/sctp.h> #include <linux/dccp.h> #include <linux/if_tunnel.h> #include <linux/if_pppox.h> #include <linux/ppp_defs.h> #include <linux/stddef.h> #include <linux/if_ether.h> #include <linux/if_hsr.h> #include <linux/mpls.h> #include <linux/tcp.h> #include <linux/ptp_classify.h> #include <net/flow_dissector.h> #include <net/pkt_cls.h> #include <scsi/fc/fc_fcoe.h> #include <uapi/linux/batadv_packet.h> #include <linux/bpf.h> #if IS_ENABLED(CONFIG_NF_CONNTRACK) #include <net/netfilter/nf_conntrack_core.h> #include <net/netfilter/nf_conntrack_labels.h> #endif #include <linux/bpf-netns.h> static void dissector_set_key(struct flow_dissector *flow_dissector, enum flow_dissector_key_id key_id) { flow_dissector->used_keys |= (1ULL << key_id); } void skb_flow_dissector_init(struct flow_dissector *flow_dissector, const struct flow_dissector_key *key, unsigned int key_count) { unsigned int i; memset(flow_dissector, 0, sizeof(*flow_dissector)); for (i = 0; i < key_count; i++, key++) { /* User should make sure that every key target offset is within * boundaries of unsigned short. */ BUG_ON(key->offset > USHRT_MAX); BUG_ON(dissector_uses_key(flow_dissector, key->key_id)); dissector_set_key(flow_dissector, key->key_id); flow_dissector->offset[key->key_id] = key->offset; } /* Ensure that the dissector always includes control and basic key. * That way we are able to avoid handling lack of these in fast path. */ BUG_ON(!dissector_uses_key(flow_dissector, FLOW_DISSECTOR_KEY_CONTROL)); BUG_ON(!dissector_uses_key(flow_dissector, FLOW_DISSECTOR_KEY_BASIC)); } EXPORT_SYMBOL(skb_flow_dissector_init); #ifdef CONFIG_BPF_SYSCALL int flow_dissector_bpf_prog_attach_check(struct net *net, struct bpf_prog *prog) { enum netns_bpf_attach_type type = NETNS_BPF_FLOW_DISSECTOR; if (net == &init_net) { /* BPF flow dissector in the root namespace overrides * any per-net-namespace one. When attaching to root, * make sure we don't have any BPF program attached * to the non-root namespaces. */ struct net *ns; for_each_net(ns) { if (ns == &init_net) continue; if (rcu_access_pointer(ns->bpf.run_array[type])) return -EEXIST; } } else { /* Make sure root flow dissector is not attached * when attaching to the non-root namespace. */ if (rcu_access_pointer(init_net.bpf.run_array[type])) return -EEXIST; } return 0; } #endif /* CONFIG_BPF_SYSCALL */ /** * skb_flow_get_ports - extract the upper layer ports and return them * @skb: sk_buff to extract the ports from * @thoff: transport header offset * @ip_proto: protocol for which to get port offset * @data: raw buffer pointer to the packet, if NULL use skb->data * @hlen: packet header length, if @data is NULL use skb_headlen(skb) * * The function will try to retrieve the ports at offset thoff + poff where poff * is the protocol port offset returned from proto_ports_offset */ __be32 skb_flow_get_ports(const struct sk_buff *skb, int thoff, u8 ip_proto, const void *data, int hlen) { int poff = proto_ports_offset(ip_proto); if (!data) { data = skb->data; hlen = skb_headlen(skb); } if (poff >= 0) { __be32 *ports, _ports; ports = __skb_header_pointer(skb, thoff + poff, sizeof(_ports), data, hlen, &_ports); if (ports) return *ports; } return 0; } EXPORT_SYMBOL(skb_flow_get_ports); static bool icmp_has_id(u8 type) { switch (type) { case ICMP_ECHO: case ICMP_ECHOREPLY: case ICMP_TIMESTAMP: case ICMP_TIMESTAMPREPLY: case ICMPV6_ECHO_REQUEST: case ICMPV6_ECHO_REPLY: return true; } return false; } /** * skb_flow_get_icmp_tci - extract ICMP(6) Type, Code and Identifier fields * @skb: sk_buff to extract from * @key_icmp: struct flow_dissector_key_icmp to fill * @data: raw buffer pointer to the packet * @thoff: offset to extract at * @hlen: packet header length */ void skb_flow_get_icmp_tci(const struct sk_buff *skb, struct flow_dissector_key_icmp *key_icmp, const void *data, int thoff, int hlen) { struct icmphdr *ih, _ih; ih = __skb_header_pointer(skb, thoff, sizeof(_ih), data, hlen, &_ih); if (!ih) return; key_icmp->type = ih->type; key_icmp->code = ih->code; /* As we use 0 to signal that the Id field is not present, * avoid confusion with packets without such field */ if (icmp_has_id(ih->type)) key_icmp->id = ih->un.echo.id ? ntohs(ih->un.echo.id) : 1; else key_icmp->id = 0; } EXPORT_SYMBOL(skb_flow_get_icmp_tci); /* If FLOW_DISSECTOR_KEY_ICMP is set, dissect an ICMP packet * using skb_flow_get_icmp_tci(). */ static void __skb_flow_dissect_icmp(const struct sk_buff *skb, struct flow_dissector *flow_dissector, void *target_container, const void *data, int thoff, int hlen) { struct flow_dissector_key_icmp *key_icmp; if (!dissector_uses_key(flow_dissector, FLOW_DISSECTOR_KEY_ICMP)) return; key_icmp = skb_flow_dissector_target(flow_dissector, FLOW_DISSECTOR_KEY_ICMP, target_container); skb_flow_get_icmp_tci(skb, key_icmp, data, thoff, hlen); } static void __skb_flow_dissect_ah(const struct sk_buff *skb, struct flow_dissector *flow_dissector, void *target_container, const void *data, int nhoff, int hlen) { struct flow_dissector_key_ipsec *key_ah; struct ip_auth_hdr _hdr, *hdr; if (!dissector_uses_key(flow_dissector, FLOW_DISSECTOR_KEY_IPSEC)) return; hdr = __skb_header_pointer(skb, nhoff, sizeof(_hdr), data, hlen, &_hdr); if (!hdr) return; key_ah = skb_flow_dissector_target(flow_dissector, FLOW_DISSECTOR_KEY_IPSEC, target_container); key_ah->spi = hdr->spi; } static void __skb_flow_dissect_esp(const struct sk_buff *skb, struct flow_dissector *flow_dissector, void *target_container, const void *data, int nhoff, int hlen) { struct flow_dissector_key_ipsec *key_esp; struct ip_esp_hdr _hdr, *hdr; if (!dissector_uses_key(flow_dissector, FLOW_DISSECTOR_KEY_IPSEC)) return; hdr = __skb_header_pointer(skb, nhoff, sizeof(_hdr), data, hlen, &_hdr); if (!hdr) return; key_esp = skb_flow_dissector_target(flow_dissector, FLOW_DISSECTOR_KEY_IPSEC, target_container); key_esp->spi = hdr->spi; } static void __skb_flow_dissect_l2tpv3(const struct sk_buff *skb, struct flow_dissector *flow_dissector, void *target_container, const void *data, int nhoff, int hlen) { struct flow_dissector_key_l2tpv3 *key_l2tpv3; struct { __be32 session_id; } *hdr, _hdr; if (!dissector_uses_key(flow_dissector, FLOW_DISSECTOR_KEY_L2TPV3)) return; hdr = __skb_header_pointer(skb, nhoff, sizeof(_hdr), data, hlen, &_hdr); if (!hdr) return; key_l2tpv3 = skb_flow_dissector_target(flow_dissector, FLOW_DISSECTOR_KEY_L2TPV3, target_container); key_l2tpv3->session_id = hdr->session_id; } void skb_flow_dissect_meta(const struct sk_buff *skb, struct flow_dissector *flow_dissector, void *target_container) { struct flow_dissector_key_meta *meta; if (!dissector_uses_key(flow_dissector, FLOW_DISSECTOR_KEY_META)) return; meta = skb_flow_dissector_target(flow_dissector, FLOW_DISSECTOR_KEY_META, target_container); meta->ingress_ifindex = skb->skb_iif; #if IS_ENABLED(CONFIG_NET_TC_SKB_EXT) if (tc_skb_ext_tc_enabled()) { struct tc_skb_ext *ext; ext = skb_ext_find(skb, TC_SKB_EXT); if (ext) meta->l2_miss = ext->l2_miss; } #endif } EXPORT_SYMBOL(skb_flow_dissect_meta); static void skb_flow_dissect_set_enc_control(enum flow_dissector_key_id type, u32 ctrl_flags, struct flow_dissector *flow_dissector, void *target_container) { struct flow_dissector_key_control *ctrl; if (!dissector_uses_key(flow_dissector, FLOW_DISSECTOR_KEY_ENC_CONTROL)) return; ctrl = skb_flow_dissector_target(flow_dissector, FLOW_DISSECTOR_KEY_ENC_CONTROL, target_container); ctrl->addr_type = type; ctrl->flags = ctrl_flags; } void skb_flow_dissect_ct(const struct sk_buff *skb, struct flow_dissector *flow_dissector, void *target_container, u16 *ctinfo_map, size_t mapsize, bool post_ct, u16 zone) { #if IS_ENABLED(CONFIG_NF_CONNTRACK) struct flow_dissector_key_ct *key; enum ip_conntrack_info ctinfo; struct nf_conn_labels *cl; struct nf_conn *ct; if (!dissector_uses_key(flow_dissector, FLOW_DISSECTOR_KEY_CT)) return; ct = nf_ct_get(skb, &ctinfo); if (!ct && !post_ct) return; key = skb_flow_dissector_target(flow_dissector, FLOW_DISSECTOR_KEY_CT, target_container); if (!ct) { key->ct_state = TCA_FLOWER_KEY_CT_FLAGS_TRACKED | TCA_FLOWER_KEY_CT_FLAGS_INVALID; key->ct_zone = zone; return; } if (ctinfo < mapsize) key->ct_state = ctinfo_map[ctinfo]; #if IS_ENABLED(CONFIG_NF_CONNTRACK_ZONES) key->ct_zone = ct->zone.id; #endif #if IS_ENABLED(CONFIG_NF_CONNTRACK_MARK) key->ct_mark = READ_ONCE(ct->mark); #endif cl = nf_ct_labels_find(ct); if (cl) memcpy(key->ct_labels, cl->bits, sizeof(key->ct_labels)); #endif /* CONFIG_NF_CONNTRACK */ } EXPORT_SYMBOL(skb_flow_dissect_ct); void skb_flow_dissect_tunnel_info(const struct sk_buff *skb, struct flow_dissector *flow_dissector, void *target_container) { struct ip_tunnel_info *info; struct ip_tunnel_key *key; u32 ctrl_flags = 0; /* A quick check to see if there might be something to do. */ if (!dissector_uses_key(flow_dissector, FLOW_DISSECTOR_KEY_ENC_KEYID) && !dissector_uses_key(flow_dissector, FLOW_DISSECTOR_KEY_ENC_IPV4_ADDRS) && !dissector_uses_key(flow_dissector, FLOW_DISSECTOR_KEY_ENC_IPV6_ADDRS) && !dissector_uses_key(flow_dissector, FLOW_DISSECTOR_KEY_ENC_CONTROL) && !dissector_uses_key(flow_dissector, FLOW_DISSECTOR_KEY_ENC_PORTS) && !dissector_uses_key(flow_dissector, FLOW_DISSECTOR_KEY_ENC_IP) && !dissector_uses_key(flow_dissector, FLOW_DISSECTOR_KEY_ENC_OPTS)) return; info = skb_tunnel_info(skb); if (!info) return; key = &info->key; if (test_bit(IP_TUNNEL_CSUM_BIT, key->tun_flags)) ctrl_flags |= FLOW_DIS_F_TUNNEL_CSUM; if (test_bit(IP_TUNNEL_DONT_FRAGMENT_BIT, key->tun_flags)) ctrl_flags |= FLOW_DIS_F_TUNNEL_DONT_FRAGMENT; if (test_bit(IP_TUNNEL_OAM_BIT, key->tun_flags)) ctrl_flags |= FLOW_DIS_F_TUNNEL_OAM; if (test_bit(IP_TUNNEL_CRIT_OPT_BIT, key->tun_flags)) ctrl_flags |= FLOW_DIS_F_TUNNEL_CRIT_OPT; switch (ip_tunnel_info_af(info)) { case AF_INET: skb_flow_dissect_set_enc_control(FLOW_DISSECTOR_KEY_IPV4_ADDRS, ctrl_flags, flow_dissector, target_container); if (dissector_uses_key(flow_dissector, FLOW_DISSECTOR_KEY_ENC_IPV4_ADDRS)) { struct flow_dissector_key_ipv4_addrs *ipv4; ipv4 = skb_flow_dissector_target(flow_dissector, FLOW_DISSECTOR_KEY_ENC_IPV4_ADDRS, target_container); ipv4->src = key->u.ipv4.src; ipv4->dst = key->u.ipv4.dst; } break; case AF_INET6: skb_flow_dissect_set_enc_control(FLOW_DISSECTOR_KEY_IPV6_ADDRS, ctrl_flags, flow_dissector, target_container); if (dissector_uses_key(flow_dissector, FLOW_DISSECTOR_KEY_ENC_IPV6_ADDRS)) { struct flow_dissector_key_ipv6_addrs *ipv6; ipv6 = skb_flow_dissector_target(flow_dissector, FLOW_DISSECTOR_KEY_ENC_IPV6_ADDRS, target_container); ipv6->src = key->u.ipv6.src; ipv6->dst = key->u.ipv6.dst; } break; default: skb_flow_dissect_set_enc_control(0, ctrl_flags, flow_dissector, target_container); break; } if (dissector_uses_key(flow_dissector, FLOW_DISSECTOR_KEY_ENC_KEYID)) { struct flow_dissector_key_keyid *keyid; keyid = skb_flow_dissector_target(flow_dissector, FLOW_DISSECTOR_KEY_ENC_KEYID, target_container); keyid->keyid = tunnel_id_to_key32(key->tun_id); } if (dissector_uses_key(flow_dissector, FLOW_DISSECTOR_KEY_ENC_PORTS)) { struct flow_dissector_key_ports *tp; tp = skb_flow_dissector_target(flow_dissector, FLOW_DISSECTOR_KEY_ENC_PORTS, target_container); tp->src = key->tp_src; tp->dst = key->tp_dst; } if (dissector_uses_key(flow_dissector, FLOW_DISSECTOR_KEY_ENC_IP)) { struct flow_dissector_key_ip *ip; ip = skb_flow_dissector_target(flow_dissector, FLOW_DISSECTOR_KEY_ENC_IP, target_container); ip->tos = key->tos; ip->ttl = key->ttl; } if (dissector_uses_key(flow_dissector, FLOW_DISSECTOR_KEY_ENC_OPTS)) { struct flow_dissector_key_enc_opts *enc_opt; IP_TUNNEL_DECLARE_FLAGS(flags) = { }; u32 val; enc_opt = skb_flow_dissector_target(flow_dissector, FLOW_DISSECTOR_KEY_ENC_OPTS, target_container); if (!info->options_len) return; enc_opt->len = info->options_len; ip_tunnel_info_opts_get(enc_opt->data, info); ip_tunnel_set_options_present(flags); ip_tunnel_flags_and(flags, info->key.tun_flags, flags); val = find_next_bit(flags, __IP_TUNNEL_FLAG_NUM, IP_TUNNEL_GENEVE_OPT_BIT); enc_opt->dst_opt_type = val < __IP_TUNNEL_FLAG_NUM ? val : 0; } } EXPORT_SYMBOL(skb_flow_dissect_tunnel_info); void skb_flow_dissect_hash(const struct sk_buff *skb, struct flow_dissector *flow_dissector, void *target_container) { struct flow_dissector_key_hash *key; if (!dissector_uses_key(flow_dissector, FLOW_DISSECTOR_KEY_HASH)) return; key = skb_flow_dissector_target(flow_dissector, FLOW_DISSECTOR_KEY_HASH, target_container); key->hash = skb_get_hash_raw(skb); } EXPORT_SYMBOL(skb_flow_dissect_hash); static enum flow_dissect_ret __skb_flow_dissect_mpls(const struct sk_buff *skb, struct flow_dissector *flow_dissector, void *target_container, const void *data, int nhoff, int hlen, int lse_index, bool *entropy_label) { struct mpls_label *hdr, _hdr; u32 entry, label, bos; if (!dissector_uses_key(flow_dissector, FLOW_DISSECTOR_KEY_MPLS_ENTROPY) && !dissector_uses_key(flow_dissector, FLOW_DISSECTOR_KEY_MPLS)) return FLOW_DISSECT_RET_OUT_GOOD; if (lse_index >= FLOW_DIS_MPLS_MAX) return FLOW_DISSECT_RET_OUT_GOOD; hdr = __skb_header_pointer(skb, nhoff, sizeof(_hdr), data, hlen, &_hdr); if (!hdr) return FLOW_DISSECT_RET_OUT_BAD; entry = ntohl(hdr->entry); label = (entry & MPLS_LS_LABEL_MASK) >> MPLS_LS_LABEL_SHIFT; bos = (entry & MPLS_LS_S_MASK) >> MPLS_LS_S_SHIFT; if (dissector_uses_key(flow_dissector, FLOW_DISSECTOR_KEY_MPLS)) { struct flow_dissector_key_mpls *key_mpls; struct flow_dissector_mpls_lse *lse; key_mpls = skb_flow_dissector_target(flow_dissector, FLOW_DISSECTOR_KEY_MPLS, target_container); lse = &key_mpls->ls[lse_index]; lse->mpls_ttl = (entry & MPLS_LS_TTL_MASK) >> MPLS_LS_TTL_SHIFT; lse->mpls_bos = bos; lse->mpls_tc = (entry & MPLS_LS_TC_MASK) >> MPLS_LS_TC_SHIFT; lse->mpls_label = label; dissector_set_mpls_lse(key_mpls, lse_index); } if (*entropy_label && dissector_uses_key(flow_dissector, FLOW_DISSECTOR_KEY_MPLS_ENTROPY)) { struct flow_dissector_key_keyid *key_keyid; key_keyid = skb_flow_dissector_target(flow_dissector, FLOW_DISSECTOR_KEY_MPLS_ENTROPY, target_container); key_keyid->keyid = cpu_to_be32(label); } *entropy_label = label == MPLS_LABEL_ENTROPY; return bos ? FLOW_DISSECT_RET_OUT_GOOD : FLOW_DISSECT_RET_PROTO_AGAIN; } static enum flow_dissect_ret __skb_flow_dissect_arp(const struct sk_buff *skb, struct flow_dissector *flow_dissector, void *target_container, const void *data, int nhoff, int hlen) { struct flow_dissector_key_arp *key_arp; struct { unsigned char ar_sha[ETH_ALEN]; unsigned char ar_sip[4]; unsigned char ar_tha[ETH_ALEN]; unsigned char ar_tip[4]; } *arp_eth, _arp_eth; const struct arphdr *arp; struct arphdr _arp; if (!dissector_uses_key(flow_dissector, FLOW_DISSECTOR_KEY_ARP)) return FLOW_DISSECT_RET_OUT_GOOD; arp = __skb_header_pointer(skb, nhoff, sizeof(_arp), data, hlen, &_arp); if (!arp) return FLOW_DISSECT_RET_OUT_BAD; if (arp->ar_hrd != htons(ARPHRD_ETHER) || arp->ar_pro != htons(ETH_P_IP) || arp->ar_hln != ETH_ALEN || arp->ar_pln != 4 || (arp->ar_op != htons(ARPOP_REPLY) && arp->ar_op != htons(ARPOP_REQUEST))) return FLOW_DISSECT_RET_OUT_BAD; arp_eth = __skb_header_pointer(skb, nhoff + sizeof(_arp), sizeof(_arp_eth), data, hlen, &_arp_eth); if (!arp_eth) return FLOW_DISSECT_RET_OUT_BAD; key_arp = skb_flow_dissector_target(flow_dissector, FLOW_DISSECTOR_KEY_ARP, target_container); memcpy(&key_arp->sip, arp_eth->ar_sip, sizeof(key_arp->sip)); memcpy(&key_arp->tip, arp_eth->ar_tip, sizeof(key_arp->tip)); /* Only store the lower byte of the opcode; * this covers ARPOP_REPLY and ARPOP_REQUEST. */ key_arp->op = ntohs(arp->ar_op) & 0xff; ether_addr_copy(key_arp->sha, arp_eth->ar_sha); ether_addr_copy(key_arp->tha, arp_eth->ar_tha); return FLOW_DISSECT_RET_OUT_GOOD; } static enum flow_dissect_ret __skb_flow_dissect_cfm(const struct sk_buff *skb, struct flow_dissector *flow_dissector, void *target_container, const void *data, int nhoff, int hlen) { struct flow_dissector_key_cfm *key, *hdr, _hdr; if (!dissector_uses_key(flow_dissector, FLOW_DISSECTOR_KEY_CFM)) return FLOW_DISSECT_RET_OUT_GOOD; hdr = __skb_header_pointer(skb, nhoff, sizeof(*key), data, hlen, &_hdr); if (!hdr) return FLOW_DISSECT_RET_OUT_BAD; key = skb_flow_dissector_target(flow_dissector, FLOW_DISSECTOR_KEY_CFM, target_container); key->mdl_ver = hdr->mdl_ver; key->opcode = hdr->opcode; return FLOW_DISSECT_RET_OUT_GOOD; } static enum flow_dissect_ret __skb_flow_dissect_gre(const struct sk_buff *skb, struct flow_dissector_key_control *key_control, struct flow_dissector *flow_dissector, void *target_container, const void *data, __be16 *p_proto, int *p_nhoff, int *p_hlen, unsigned int flags) { struct flow_dissector_key_keyid *key_keyid; struct gre_base_hdr *hdr, _hdr; int offset = 0; u16 gre_ver; hdr = __skb_header_pointer(skb, *p_nhoff, sizeof(_hdr), data, *p_hlen, &_hdr); if (!hdr) return FLOW_DISSECT_RET_OUT_BAD; /* Only look inside GRE without routing */ if (hdr->flags & GRE_ROUTING) return FLOW_DISSECT_RET_OUT_GOOD; /* Only look inside GRE for version 0 and 1 */ gre_ver = ntohs(hdr->flags & GRE_VERSION); if (gre_ver > 1) return FLOW_DISSECT_RET_OUT_GOOD; *p_proto = hdr->protocol; if (gre_ver) { /* Version1 must be PPTP, and check the flags */ if (!(*p_proto == GRE_PROTO_PPP && (hdr->flags & GRE_KEY))) return FLOW_DISSECT_RET_OUT_GOOD; } offset += sizeof(struct gre_base_hdr); if (hdr->flags & GRE_CSUM) offset += sizeof_field(struct gre_full_hdr, csum) + sizeof_field(struct gre_full_hdr, reserved1); if (hdr->flags & GRE_KEY) { const __be32 *keyid; __be32 _keyid; keyid = __skb_header_pointer(skb, *p_nhoff + offset, sizeof(_keyid), data, *p_hlen, &_keyid); if (!keyid) return FLOW_DISSECT_RET_OUT_BAD; if (dissector_uses_key(flow_dissector, FLOW_DISSECTOR_KEY_GRE_KEYID)) { key_keyid = skb_flow_dissector_target(flow_dissector, FLOW_DISSECTOR_KEY_GRE_KEYID, target_container); if (gre_ver == 0) key_keyid->keyid = *keyid; else key_keyid->keyid = *keyid & GRE_PPTP_KEY_MASK; } offset += sizeof_field(struct gre_full_hdr, key); } if (hdr->flags & GRE_SEQ) offset += sizeof_field(struct pptp_gre_header, seq); if (gre_ver == 0) { if (*p_proto == htons(ETH_P_TEB)) { const struct ethhdr *eth; struct ethhdr _eth; eth = __skb_header_pointer(skb, *p_nhoff + offset, sizeof(_eth), data, *p_hlen, &_eth); if (!eth) return FLOW_DISSECT_RET_OUT_BAD; *p_proto = eth->h_proto; offset += sizeof(*eth); /* Cap headers that we access via pointers at the * end of the Ethernet header as our maximum alignment * at that point is only 2 bytes. */ if (NET_IP_ALIGN) *p_hlen = *p_nhoff + offset; } } else { /* version 1, must be PPTP */ u8 _ppp_hdr[PPP_HDRLEN]; u8 *ppp_hdr; if (hdr->flags & GRE_ACK) offset += sizeof_field(struct pptp_gre_header, ack); ppp_hdr = __skb_header_pointer(skb, *p_nhoff + offset, sizeof(_ppp_hdr), data, *p_hlen, _ppp_hdr); if (!ppp_hdr) return FLOW_DISSECT_RET_OUT_BAD; switch (PPP_PROTOCOL(ppp_hdr)) { case PPP_IP: *p_proto = htons(ETH_P_IP); break; case PPP_IPV6: *p_proto = htons(ETH_P_IPV6); break; default: /* Could probably catch some more like MPLS */ break; } offset += PPP_HDRLEN; } *p_nhoff += offset; key_control->flags |= FLOW_DIS_ENCAPSULATION; if (flags & FLOW_DISSECTOR_F_STOP_AT_ENCAP) return FLOW_DISSECT_RET_OUT_GOOD; return FLOW_DISSECT_RET_PROTO_AGAIN; } /** * __skb_flow_dissect_batadv() - dissect batman-adv header * @skb: sk_buff to with the batman-adv header * @key_control: flow dissectors control key * @data: raw buffer pointer to the packet, if NULL use skb->data * @p_proto: pointer used to update the protocol to process next * @p_nhoff: pointer used to update inner network header offset * @hlen: packet header length * @flags: any combination of FLOW_DISSECTOR_F_* * * ETH_P_BATMAN packets are tried to be dissected. Only * &struct batadv_unicast packets are actually processed because they contain an * inner ethernet header and are usually followed by actual network header. This * allows the flow dissector to continue processing the packet. * * Return: FLOW_DISSECT_RET_PROTO_AGAIN when &struct batadv_unicast was found, * FLOW_DISSECT_RET_OUT_GOOD when dissector should stop after encapsulation, * otherwise FLOW_DISSECT_RET_OUT_BAD */ static enum flow_dissect_ret __skb_flow_dissect_batadv(const struct sk_buff *skb, struct flow_dissector_key_control *key_control, const void *data, __be16 *p_proto, int *p_nhoff, int hlen, unsigned int flags) { struct { struct batadv_unicast_packet batadv_unicast; struct ethhdr eth; } *hdr, _hdr; hdr = __skb_header_pointer(skb, *p_nhoff, sizeof(_hdr), data, hlen, &_hdr); if (!hdr) return FLOW_DISSECT_RET_OUT_BAD; if (hdr->batadv_unicast.version != BATADV_COMPAT_VERSION) return FLOW_DISSECT_RET_OUT_BAD; if (hdr->batadv_unicast.packet_type != BATADV_UNICAST) return FLOW_DISSECT_RET_OUT_BAD; *p_proto = hdr->eth.h_proto; *p_nhoff += sizeof(*hdr); key_control->flags |= FLOW_DIS_ENCAPSULATION; if (flags & FLOW_DISSECTOR_F_STOP_AT_ENCAP) return FLOW_DISSECT_RET_OUT_GOOD; return FLOW_DISSECT_RET_PROTO_AGAIN; } static void __skb_flow_dissect_tcp(const struct sk_buff *skb, struct flow_dissector *flow_dissector, void *target_container, const void *data, int thoff, int hlen) { struct flow_dissector_key_tcp *key_tcp; struct tcphdr *th, _th; if (!dissector_uses_key(flow_dissector, FLOW_DISSECTOR_KEY_TCP)) return; th = __skb_header_pointer(skb, thoff, sizeof(_th), data, hlen, &_th); if (!th) return; if (unlikely(__tcp_hdrlen(th) < sizeof(_th))) return; key_tcp = skb_flow_dissector_target(flow_dissector, FLOW_DISSECTOR_KEY_TCP, target_container); key_tcp->flags = (*(__be16 *) &tcp_flag_word(th) & htons(0x0FFF)); } static void __skb_flow_dissect_ports(const struct sk_buff *skb, struct flow_dissector *flow_dissector, void *target_container, const void *data, int nhoff, u8 ip_proto, int hlen) { struct flow_dissector_key_ports_range *key_ports_range = NULL; struct flow_dissector_key_ports *key_ports = NULL; __be32 ports; if (dissector_uses_key(flow_dissector, FLOW_DISSECTOR_KEY_PORTS)) key_ports = skb_flow_dissector_target(flow_dissector, FLOW_DISSECTOR_KEY_PORTS, target_container); if (dissector_uses_key(flow_dissector, FLOW_DISSECTOR_KEY_PORTS_RANGE)) key_ports_range = skb_flow_dissector_target(flow_dissector, FLOW_DISSECTOR_KEY_PORTS_RANGE, target_container); if (!key_ports && !key_ports_range) return; ports = skb_flow_get_ports(skb, nhoff, ip_proto, data, hlen); if (key_ports) key_ports->ports = ports; if (key_ports_range) key_ports_range->tp.ports = ports; } static void __skb_flow_dissect_ipv4(const struct sk_buff *skb, struct flow_dissector *flow_dissector, void *target_container, const void *data, const struct iphdr *iph) { struct flow_dissector_key_ip *key_ip; if (!dissector_uses_key(flow_dissector, FLOW_DISSECTOR_KEY_IP)) return; key_ip = skb_flow_dissector_target(flow_dissector, FLOW_DISSECTOR_KEY_IP, target_container); key_ip->tos = iph->tos; key_ip->ttl = iph->ttl; } static void __skb_flow_dissect_ipv6(const struct sk_buff *skb, struct flow_dissector *flow_dissector, void *target_container, const void *data, const struct ipv6hdr *iph) { struct flow_dissector_key_ip *key_ip; if (!dissector_uses_key(flow_dissector, FLOW_DISSECTOR_KEY_IP)) return; key_ip = skb_flow_dissector_target(flow_dissector, FLOW_DISSECTOR_KEY_IP, target_container); key_ip->tos = ipv6_get_dsfield(iph); key_ip->ttl = iph->hop_limit; } /* Maximum number of protocol headers that can be parsed in * __skb_flow_dissect */ #define MAX_FLOW_DISSECT_HDRS 15 static bool skb_flow_dissect_allowed(int *num_hdrs) { ++*num_hdrs; return (*num_hdrs <= MAX_FLOW_DISSECT_HDRS); } static void __skb_flow_bpf_to_target(const struct bpf_flow_keys *flow_keys, struct flow_dissector *flow_dissector, void *target_container) { struct flow_dissector_key_ports_range *key_ports_range = NULL; struct flow_dissector_key_ports *key_ports = NULL; struct flow_dissector_key_control *key_control; struct flow_dissector_key_basic *key_basic; struct flow_dissector_key_addrs *key_addrs; struct flow_dissector_key_tags *key_tags; key_control = skb_flow_dissector_target(flow_dissector, FLOW_DISSECTOR_KEY_CONTROL, target_container); key_control->thoff = flow_keys->thoff; if (flow_keys->is_frag) key_control->flags |= FLOW_DIS_IS_FRAGMENT; if (flow_keys->is_first_frag) key_control->flags |= FLOW_DIS_FIRST_FRAG; if (flow_keys->is_encap) key_control->flags |= FLOW_DIS_ENCAPSULATION; key_basic = skb_flow_dissector_target(flow_dissector, FLOW_DISSECTOR_KEY_BASIC, target_container); key_basic->n_proto = flow_keys->n_proto; key_basic->ip_proto = flow_keys->ip_proto; if (flow_keys->addr_proto == ETH_P_IP && dissector_uses_key(flow_dissector, FLOW_DISSECTOR_KEY_IPV4_ADDRS)) { key_addrs = skb_flow_dissector_target(flow_dissector, FLOW_DISSECTOR_KEY_IPV4_ADDRS, target_container); key_addrs->v4addrs.src = flow_keys->ipv4_src; key_addrs->v4addrs.dst = flow_keys->ipv4_dst; key_control->addr_type = FLOW_DISSECTOR_KEY_IPV4_ADDRS; } else if (flow_keys->addr_proto == ETH_P_IPV6 && dissector_uses_key(flow_dissector, FLOW_DISSECTOR_KEY_IPV6_ADDRS)) { key_addrs = skb_flow_dissector_target(flow_dissector, FLOW_DISSECTOR_KEY_IPV6_ADDRS, target_container); memcpy(&key_addrs->v6addrs.src, &flow_keys->ipv6_src, sizeof(key_addrs->v6addrs.src)); memcpy(&key_addrs->v6addrs.dst, &flow_keys->ipv6_dst, sizeof(key_addrs->v6addrs.dst)); key_control->addr_type = FLOW_DISSECTOR_KEY_IPV6_ADDRS; } if (dissector_uses_key(flow_dissector, FLOW_DISSECTOR_KEY_PORTS)) { key_ports = skb_flow_dissector_target(flow_dissector, FLOW_DISSECTOR_KEY_PORTS, target_container); key_ports->src = flow_keys->sport; key_ports->dst = flow_keys->dport; } if (dissector_uses_key(flow_dissector, FLOW_DISSECTOR_KEY_PORTS_RANGE)) { key_ports_range = skb_flow_dissector_target(flow_dissector, FLOW_DISSECTOR_KEY_PORTS_RANGE, target_container); key_ports_range->tp.src = flow_keys->sport; key_ports_range->tp.dst = flow_keys->dport; } if (dissector_uses_key(flow_dissector, FLOW_DISSECTOR_KEY_FLOW_LABEL)) { key_tags = skb_flow_dissector_target(flow_dissector, FLOW_DISSECTOR_KEY_FLOW_LABEL, target_container); key_tags->flow_label = ntohl(flow_keys->flow_label); } } u32 bpf_flow_dissect(struct bpf_prog *prog, struct bpf_flow_dissector *ctx, __be16 proto, int nhoff, int hlen, unsigned int flags) { struct bpf_flow_keys *flow_keys = ctx->flow_keys; u32 result; /* Pass parameters to the BPF program */ memset(flow_keys, 0, sizeof(*flow_keys)); flow_keys->n_proto = proto; flow_keys->nhoff = nhoff; flow_keys->thoff = flow_keys->nhoff; BUILD_BUG_ON((int)BPF_FLOW_DISSECTOR_F_PARSE_1ST_FRAG != (int)FLOW_DISSECTOR_F_PARSE_1ST_FRAG); BUILD_BUG_ON((int)BPF_FLOW_DISSECTOR_F_STOP_AT_FLOW_LABEL != (int)FLOW_DISSECTOR_F_STOP_AT_FLOW_LABEL); BUILD_BUG_ON((int)BPF_FLOW_DISSECTOR_F_STOP_AT_ENCAP != (int)FLOW_DISSECTOR_F_STOP_AT_ENCAP); flow_keys->flags = flags; result = bpf_prog_run_pin_on_cpu(prog, ctx); flow_keys->nhoff = clamp_t(u16, flow_keys->nhoff, nhoff, hlen); flow_keys->thoff = clamp_t(u16, flow_keys->thoff, flow_keys->nhoff, hlen); return result; } static bool is_pppoe_ses_hdr_valid(const struct pppoe_hdr *hdr) { return hdr->ver == 1 && hdr->type == 1 && hdr->code == 0; } /** * __skb_flow_dissect - extract the flow_keys struct and return it * @net: associated network namespace, derived from @skb if NULL * @skb: sk_buff to extract the flow from, can be NULL if the rest are specified * @flow_dissector: list of keys to dissect * @target_container: target structure to put dissected values into * @data: raw buffer pointer to the packet, if NULL use skb->data * @proto: protocol for which to get the flow, if @data is NULL use skb->protocol * @nhoff: network header offset, if @data is NULL use skb_network_offset(skb) * @hlen: packet header length, if @data is NULL use skb_headlen(skb) * @flags: flags that control the dissection process, e.g. * FLOW_DISSECTOR_F_STOP_AT_ENCAP. * * The function will try to retrieve individual keys into target specified * by flow_dissector from either the skbuff or a raw buffer specified by the * rest parameters. * * Caller must take care of zeroing target container memory. */ bool __skb_flow_dissect(const struct net *net, const struct sk_buff *skb, struct flow_dissector *flow_dissector, void *target_container, const void *data, __be16 proto, int nhoff, int hlen, unsigned int flags) { struct flow_dissector_key_control *key_control; struct flow_dissector_key_basic *key_basic; struct flow_dissector_key_addrs *key_addrs; struct flow_dissector_key_tags *key_tags; struct flow_dissector_key_vlan *key_vlan; enum flow_dissect_ret fdret; enum flow_dissector_key_id dissector_vlan = FLOW_DISSECTOR_KEY_MAX; bool mpls_el = false; int mpls_lse = 0; int num_hdrs = 0; u8 ip_proto = 0; bool ret; if (!data) { data = skb->data; proto = skb_vlan_tag_present(skb) ? skb->vlan_proto : skb->protocol; nhoff = skb_network_offset(skb); hlen = skb_headlen(skb); #if IS_ENABLED(CONFIG_NET_DSA) if (unlikely(skb->dev && netdev_uses_dsa(skb->dev) && proto == htons(ETH_P_XDSA))) { struct metadata_dst *md_dst = skb_metadata_dst(skb); const struct dsa_device_ops *ops; int offset = 0; ops = skb->dev->dsa_ptr->tag_ops; /* Only DSA header taggers break flow dissection */ if (ops->needed_headroom && (!md_dst || md_dst->type != METADATA_HW_PORT_MUX)) { if (ops->flow_dissect) ops->flow_dissect(skb, &proto, &offset); else dsa_tag_generic_flow_dissect(skb, &proto, &offset); hlen -= offset; nhoff += offset; } } #endif } /* It is ensured by skb_flow_dissector_init() that control key will * be always present. */ key_control = skb_flow_dissector_target(flow_dissector, FLOW_DISSECTOR_KEY_CONTROL, target_container); /* It is ensured by skb_flow_dissector_init() that basic key will * be always present. */ key_basic = skb_flow_dissector_target(flow_dissector, FLOW_DISSECTOR_KEY_BASIC, target_container); rcu_read_lock(); if (skb) { if (!net) { if (skb->dev) net = dev_net_rcu(skb->dev); else if (skb->sk) net = sock_net(skb->sk); } } DEBUG_NET_WARN_ON_ONCE(!net); if (net) { enum netns_bpf_attach_type type = NETNS_BPF_FLOW_DISSECTOR; struct bpf_prog_array *run_array; run_array = rcu_dereference(init_net.bpf.run_array[type]); if (!run_array) run_array = rcu_dereference(net->bpf.run_array[type]); if (run_array) { struct bpf_flow_keys flow_keys; struct bpf_flow_dissector ctx = { .flow_keys = &flow_keys, .data = data, .data_end = data + hlen, }; __be16 n_proto = proto; struct bpf_prog *prog; u32 result; if (skb) { ctx.skb = skb; /* we can't use 'proto' in the skb case * because it might be set to skb->vlan_proto * which has been pulled from the data */ n_proto = skb->protocol; } prog = READ_ONCE(run_array->items[0].prog); result = bpf_flow_dissect(prog, &ctx, n_proto, nhoff, hlen, flags); if (result != BPF_FLOW_DISSECTOR_CONTINUE) { __skb_flow_bpf_to_target(&flow_keys, flow_dissector, target_container); rcu_read_unlock(); return result == BPF_OK; } } } rcu_read_unlock(); if (dissector_uses_key(flow_dissector, FLOW_DISSECTOR_KEY_ETH_ADDRS)) { struct ethhdr *eth = eth_hdr(skb); struct flow_dissector_key_eth_addrs *key_eth_addrs; key_eth_addrs = skb_flow_dissector_target(flow_dissector, FLOW_DISSECTOR_KEY_ETH_ADDRS, target_container); memcpy(key_eth_addrs, eth, sizeof(*key_eth_addrs)); } if (dissector_uses_key(flow_dissector, FLOW_DISSECTOR_KEY_NUM_OF_VLANS)) { struct flow_dissector_key_num_of_vlans *key_num_of_vlans; key_num_of_vlans = skb_flow_dissector_target(flow_dissector, FLOW_DISSECTOR_KEY_NUM_OF_VLANS, target_container); key_num_of_vlans->num_of_vlans = 0; } proto_again: fdret = FLOW_DISSECT_RET_CONTINUE; switch (proto) { case htons(ETH_P_IP): { const struct iphdr *iph; struct iphdr _iph; iph = __skb_header_pointer(skb, nhoff, sizeof(_iph), data, hlen, &_iph); if (!iph || iph->ihl < 5) { fdret = FLOW_DISSECT_RET_OUT_BAD; break; } nhoff += iph->ihl * 4; ip_proto = iph->protocol; if (dissector_uses_key(flow_dissector, FLOW_DISSECTOR_KEY_IPV4_ADDRS)) { key_addrs = skb_flow_dissector_target(flow_dissector, FLOW_DISSECTOR_KEY_IPV4_ADDRS, target_container); memcpy(&key_addrs->v4addrs.src, &iph->saddr, sizeof(key_addrs->v4addrs.src)); memcpy(&key_addrs->v4addrs.dst, &iph->daddr, sizeof(key_addrs->v4addrs.dst)); key_control->addr_type = FLOW_DISSECTOR_KEY_IPV4_ADDRS; } __skb_flow_dissect_ipv4(skb, flow_dissector, target_container, data, iph); if (ip_is_fragment(iph)) { key_control->flags |= FLOW_DIS_IS_FRAGMENT; if (iph->frag_off & htons(IP_OFFSET)) { fdret = FLOW_DISSECT_RET_OUT_GOOD; break; } else { key_control->flags |= FLOW_DIS_FIRST_FRAG; if (!(flags & FLOW_DISSECTOR_F_PARSE_1ST_FRAG)) { fdret = FLOW_DISSECT_RET_OUT_GOOD; break; } } } break; } case htons(ETH_P_IPV6): { const struct ipv6hdr *iph; struct ipv6hdr _iph; iph = __skb_header_pointer(skb, nhoff, sizeof(_iph), data, hlen, &_iph); if (!iph) { fdret = FLOW_DISSECT_RET_OUT_BAD; break; } ip_proto = iph->nexthdr; nhoff += sizeof(struct ipv6hdr); if (dissector_uses_key(flow_dissector, FLOW_DISSECTOR_KEY_IPV6_ADDRS)) { key_addrs = skb_flow_dissector_target(flow_dissector, FLOW_DISSECTOR_KEY_IPV6_ADDRS, target_container); memcpy(&key_addrs->v6addrs.src, &iph->saddr, sizeof(key_addrs->v6addrs.src)); memcpy(&key_addrs->v6addrs.dst, &iph->daddr, sizeof(key_addrs->v6addrs.dst)); key_control->addr_type = FLOW_DISSECTOR_KEY_IPV6_ADDRS; } if ((dissector_uses_key(flow_dissector, FLOW_DISSECTOR_KEY_FLOW_LABEL) || (flags & FLOW_DISSECTOR_F_STOP_AT_FLOW_LABEL)) && ip6_flowlabel(iph)) { __be32 flow_label = ip6_flowlabel(iph); if (dissector_uses_key(flow_dissector, FLOW_DISSECTOR_KEY_FLOW_LABEL)) { key_tags = skb_flow_dissector_target(flow_dissector, FLOW_DISSECTOR_KEY_FLOW_LABEL, target_container); key_tags->flow_label = ntohl(flow_label); } if (flags & FLOW_DISSECTOR_F_STOP_AT_FLOW_LABEL) { fdret = FLOW_DISSECT_RET_OUT_GOOD; break; } } __skb_flow_dissect_ipv6(skb, flow_dissector, target_container, data, iph); break; } case htons(ETH_P_8021AD): case htons(ETH_P_8021Q): { const struct vlan_hdr *vlan = NULL; struct vlan_hdr _vlan; __be16 saved_vlan_tpid = proto; if (dissector_vlan == FLOW_DISSECTOR_KEY_MAX && skb && skb_vlan_tag_present(skb)) { proto = skb->protocol; } else { vlan = __skb_header_pointer(skb, nhoff, sizeof(_vlan), data, hlen, &_vlan); if (!vlan) { fdret = FLOW_DISSECT_RET_OUT_BAD; break; } proto = vlan->h_vlan_encapsulated_proto; nhoff += sizeof(*vlan); } if (dissector_uses_key(flow_dissector, FLOW_DISSECTOR_KEY_NUM_OF_VLANS) && !(key_control->flags & FLOW_DIS_ENCAPSULATION)) { struct flow_dissector_key_num_of_vlans *key_nvs; key_nvs = skb_flow_dissector_target(flow_dissector, FLOW_DISSECTOR_KEY_NUM_OF_VLANS, target_container); key_nvs->num_of_vlans++; } if (dissector_vlan == FLOW_DISSECTOR_KEY_MAX) { dissector_vlan = FLOW_DISSECTOR_KEY_VLAN; } else if (dissector_vlan == FLOW_DISSECTOR_KEY_VLAN) { dissector_vlan = FLOW_DISSECTOR_KEY_CVLAN; } else { fdret = FLOW_DISSECT_RET_PROTO_AGAIN; break; } if (dissector_uses_key(flow_dissector, dissector_vlan)) { key_vlan = skb_flow_dissector_target(flow_dissector, dissector_vlan, target_container); if (!vlan) { key_vlan->vlan_id = skb_vlan_tag_get_id(skb); key_vlan->vlan_priority = skb_vlan_tag_get_prio(skb); } else { key_vlan->vlan_id = ntohs(vlan->h_vlan_TCI) & VLAN_VID_MASK; key_vlan->vlan_priority = (ntohs(vlan->h_vlan_TCI) & VLAN_PRIO_MASK) >> VLAN_PRIO_SHIFT; } key_vlan->vlan_tpid = saved_vlan_tpid; key_vlan->vlan_eth_type = proto; } fdret = FLOW_DISSECT_RET_PROTO_AGAIN; break; } case htons(ETH_P_PPP_SES): { struct { struct pppoe_hdr hdr; __be16 proto; } *hdr, _hdr; u16 ppp_proto; hdr = __skb_header_pointer(skb, nhoff, sizeof(_hdr), data, hlen, &_hdr); if (!hdr) { fdret = FLOW_DISSECT_RET_OUT_BAD; break; } if (!is_pppoe_ses_hdr_valid(&hdr->hdr)) { fdret = FLOW_DISSECT_RET_OUT_BAD; break; } /* least significant bit of the most significant octet * indicates if protocol field was compressed */ ppp_proto = ntohs(hdr->proto); if (ppp_proto & 0x0100) { ppp_proto = ppp_proto >> 8; nhoff += PPPOE_SES_HLEN - 1; } else { nhoff += PPPOE_SES_HLEN; } if (ppp_proto == PPP_IP) { proto = htons(ETH_P_IP); fdret = FLOW_DISSECT_RET_PROTO_AGAIN; } else if (ppp_proto == PPP_IPV6) { proto = htons(ETH_P_IPV6); fdret = FLOW_DISSECT_RET_PROTO_AGAIN; } else if (ppp_proto == PPP_MPLS_UC) { proto = htons(ETH_P_MPLS_UC); fdret = FLOW_DISSECT_RET_PROTO_AGAIN; } else if (ppp_proto == PPP_MPLS_MC) { proto = htons(ETH_P_MPLS_MC); fdret = FLOW_DISSECT_RET_PROTO_AGAIN; } else if (ppp_proto_is_valid(ppp_proto)) { fdret = FLOW_DISSECT_RET_OUT_GOOD; } else { fdret = FLOW_DISSECT_RET_OUT_BAD; break; } if (dissector_uses_key(flow_dissector, FLOW_DISSECTOR_KEY_PPPOE)) { struct flow_dissector_key_pppoe *key_pppoe; key_pppoe = skb_flow_dissector_target(flow_dissector, FLOW_DISSECTOR_KEY_PPPOE, target_container); key_pppoe->session_id = hdr->hdr.sid; key_pppoe->ppp_proto = htons(ppp_proto); key_pppoe->type = htons(ETH_P_PPP_SES); } break; } case htons(ETH_P_TIPC): { struct tipc_basic_hdr *hdr, _hdr; hdr = __skb_header_pointer(skb, nhoff, sizeof(_hdr), data, hlen, &_hdr); if (!hdr) { fdret = FLOW_DISSECT_RET_OUT_BAD; break; } if (dissector_uses_key(flow_dissector, FLOW_DISSECTOR_KEY_TIPC)) { key_addrs = skb_flow_dissector_target(flow_dissector, FLOW_DISSECTOR_KEY_TIPC, target_container); key_addrs->tipckey.key = tipc_hdr_rps_key(hdr); key_control->addr_type = FLOW_DISSECTOR_KEY_TIPC; } fdret = FLOW_DISSECT_RET_OUT_GOOD; break; } case htons(ETH_P_MPLS_UC): case htons(ETH_P_MPLS_MC): fdret = __skb_flow_dissect_mpls(skb, flow_dissector, target_container, data, nhoff, hlen, mpls_lse, &mpls_el); nhoff += sizeof(struct mpls_label); mpls_lse++; break; case htons(ETH_P_FCOE): if ((hlen - nhoff) < FCOE_HEADER_LEN) { fdret = FLOW_DISSECT_RET_OUT_BAD; break; } nhoff += FCOE_HEADER_LEN; fdret = FLOW_DISSECT_RET_OUT_GOOD; break; case htons(ETH_P_ARP): case htons(ETH_P_RARP): fdret = __skb_flow_dissect_arp(skb, flow_dissector, target_container, data, nhoff, hlen); break; case htons(ETH_P_BATMAN): fdret = __skb_flow_dissect_batadv(skb, key_control, data, &proto, &nhoff, hlen, flags); break; case htons(ETH_P_1588): { struct ptp_header *hdr, _hdr; hdr = __skb_header_pointer(skb, nhoff, sizeof(_hdr), data, hlen, &_hdr); if (!hdr) { fdret = FLOW_DISSECT_RET_OUT_BAD; break; } nhoff += sizeof(struct ptp_header); fdret = FLOW_DISSECT_RET_OUT_GOOD; break; } case htons(ETH_P_PRP): case htons(ETH_P_HSR): { struct hsr_tag *hdr, _hdr; hdr = __skb_header_pointer(skb, nhoff, sizeof(_hdr), data, hlen, &_hdr); if (!hdr) { fdret = FLOW_DISSECT_RET_OUT_BAD; break; } proto = hdr->encap_proto; nhoff += HSR_HLEN; fdret = FLOW_DISSECT_RET_PROTO_AGAIN; break; } case htons(ETH_P_CFM): fdret = __skb_flow_dissect_cfm(skb, flow_dissector, target_container, data, nhoff, hlen); break; default: fdret = FLOW_DISSECT_RET_OUT_BAD; break; } /* Process result of proto processing */ switch (fdret) { case FLOW_DISSECT_RET_OUT_GOOD: goto out_good; case FLOW_DISSECT_RET_PROTO_AGAIN: if (skb_flow_dissect_allowed(&num_hdrs)) goto proto_again; goto out_good; case FLOW_DISSECT_RET_CONTINUE: case FLOW_DISSECT_RET_IPPROTO_AGAIN: break; case FLOW_DISSECT_RET_OUT_BAD: default: goto out_bad; } ip_proto_again: fdret = FLOW_DISSECT_RET_CONTINUE; switch (ip_proto) { case IPPROTO_GRE: if (flags & FLOW_DISSECTOR_F_STOP_BEFORE_ENCAP) { fdret = FLOW_DISSECT_RET_OUT_GOOD; break; } fdret = __skb_flow_dissect_gre(skb, key_control, flow_dissector, target_container, data, &proto, &nhoff, &hlen, flags); break; case NEXTHDR_HOP: case NEXTHDR_ROUTING: case NEXTHDR_DEST: { u8 _opthdr[2], *opthdr; if (proto != htons(ETH_P_IPV6)) break; opthdr = __skb_header_pointer(skb, nhoff, sizeof(_opthdr), data, hlen, &_opthdr); if (!opthdr) { fdret = FLOW_DISSECT_RET_OUT_BAD; break; } ip_proto = opthdr[0]; nhoff += (opthdr[1] + 1) << 3; fdret = FLOW_DISSECT_RET_IPPROTO_AGAIN; break; } case NEXTHDR_FRAGMENT: { struct frag_hdr _fh, *fh; if (proto != htons(ETH_P_IPV6)) break; fh = __skb_header_pointer(skb, nhoff, sizeof(_fh), data, hlen, &_fh); if (!fh) { fdret = FLOW_DISSECT_RET_OUT_BAD; break; } key_control->flags |= FLOW_DIS_IS_FRAGMENT; nhoff += sizeof(_fh); ip_proto = fh->nexthdr; if (!(fh->frag_off & htons(IP6_OFFSET))) { key_control->flags |= FLOW_DIS_FIRST_FRAG; if (flags & FLOW_DISSECTOR_F_PARSE_1ST_FRAG) { fdret = FLOW_DISSECT_RET_IPPROTO_AGAIN; break; } } fdret = FLOW_DISSECT_RET_OUT_GOOD; break; } case IPPROTO_IPIP: if (flags & FLOW_DISSECTOR_F_STOP_BEFORE_ENCAP) { fdret = FLOW_DISSECT_RET_OUT_GOOD; break; } proto = htons(ETH_P_IP); key_control->flags |= FLOW_DIS_ENCAPSULATION; if (flags & FLOW_DISSECTOR_F_STOP_AT_ENCAP) { fdret = FLOW_DISSECT_RET_OUT_GOOD; break; } fdret = FLOW_DISSECT_RET_PROTO_AGAIN; break; case IPPROTO_IPV6: if (flags & FLOW_DISSECTOR_F_STOP_BEFORE_ENCAP) { fdret = FLOW_DISSECT_RET_OUT_GOOD; break; } proto = htons(ETH_P_IPV6); key_control->flags |= FLOW_DIS_ENCAPSULATION; if (flags & FLOW_DISSECTOR_F_STOP_AT_ENCAP) { fdret = FLOW_DISSECT_RET_OUT_GOOD; break; } fdret = FLOW_DISSECT_RET_PROTO_AGAIN; break; case IPPROTO_MPLS: proto = htons(ETH_P_MPLS_UC); fdret = FLOW_DISSECT_RET_PROTO_AGAIN; break; case IPPROTO_TCP: __skb_flow_dissect_tcp(skb, flow_dissector, target_container, data, nhoff, hlen); break; case IPPROTO_ICMP: case IPPROTO_ICMPV6: __skb_flow_dissect_icmp(skb, flow_dissector, target_container, data, nhoff, hlen); break; case IPPROTO_L2TP: __skb_flow_dissect_l2tpv3(skb, flow_dissector, target_container, data, nhoff, hlen); break; case IPPROTO_ESP: __skb_flow_dissect_esp(skb, flow_dissector, target_container, data, nhoff, hlen); break; case IPPROTO_AH: __skb_flow_dissect_ah(skb, flow_dissector, target_container, data, nhoff, hlen); break; default: break; } if (!(key_control->flags & FLOW_DIS_IS_FRAGMENT)) __skb_flow_dissect_ports(skb, flow_dissector, target_container, data, nhoff, ip_proto, hlen); /* Process result of IP proto processing */ switch (fdret) { case FLOW_DISSECT_RET_PROTO_AGAIN: if (skb_flow_dissect_allowed(&num_hdrs)) goto proto_again; break; case FLOW_DISSECT_RET_IPPROTO_AGAIN: if (skb_flow_dissect_allowed(&num_hdrs)) goto ip_proto_again; break; case FLOW_DISSECT_RET_OUT_GOOD: case FLOW_DISSECT_RET_CONTINUE: break; case FLOW_DISSECT_RET_OUT_BAD: default: goto out_bad; } out_good: ret = true; out: key_control->thoff = min_t(u16, nhoff, skb ? skb->len : hlen); key_basic->n_proto = proto; key_basic->ip_proto = ip_proto; return ret; out_bad: ret = false; goto out; } EXPORT_SYMBOL(__skb_flow_dissect); static siphash_aligned_key_t hashrnd; static __always_inline void __flow_hash_secret_init(void) { net_get_random_once(&hashrnd, sizeof(hashrnd)); } static const void *flow_keys_hash_start(const struct flow_keys *flow) { BUILD_BUG_ON(FLOW_KEYS_HASH_OFFSET % SIPHASH_ALIGNMENT); return &flow->FLOW_KEYS_HASH_START_FIELD; } static inline size_t flow_keys_hash_length(const struct flow_keys *flow) { size_t diff = FLOW_KEYS_HASH_OFFSET + sizeof(flow->addrs); BUILD_BUG_ON((sizeof(*flow) - FLOW_KEYS_HASH_OFFSET) % sizeof(u32)); switch (flow->control.addr_type) { case FLOW_DISSECTOR_KEY_IPV4_ADDRS: diff -= sizeof(flow->addrs.v4addrs); break; case FLOW_DISSECTOR_KEY_IPV6_ADDRS: diff -= sizeof(flow->addrs.v6addrs); break; case FLOW_DISSECTOR_KEY_TIPC: diff -= sizeof(flow->addrs.tipckey); break; } return sizeof(*flow) - diff; } __be32 flow_get_u32_src(const struct flow_keys *flow) { switch (flow->control.addr_type) { case FLOW_DISSECTOR_KEY_IPV4_ADDRS: return flow->addrs.v4addrs.src; case FLOW_DISSECTOR_KEY_IPV6_ADDRS: return (__force __be32)ipv6_addr_hash( &flow->addrs.v6addrs.src); case FLOW_DISSECTOR_KEY_TIPC: return flow->addrs.tipckey.key; default: return 0; } } EXPORT_SYMBOL(flow_get_u32_src); __be32 flow_get_u32_dst(const struct flow_keys *flow) { switch (flow->control.addr_type) { case FLOW_DISSECTOR_KEY_IPV4_ADDRS: return flow->addrs.v4addrs.dst; case FLOW_DISSECTOR_KEY_IPV6_ADDRS: return (__force __be32)ipv6_addr_hash( &flow->addrs.v6addrs.dst); default: return 0; } } EXPORT_SYMBOL(flow_get_u32_dst); /* Sort the source and destination IP and the ports, * to have consistent hash within the two directions */ static inline void __flow_hash_consistentify(struct flow_keys *keys) { int addr_diff, i; switch (keys->control.addr_type) { case FLOW_DISSECTOR_KEY_IPV4_ADDRS: if ((__force u32)keys->addrs.v4addrs.dst < (__force u32)keys->addrs.v4addrs.src) swap(keys->addrs.v4addrs.src, keys->addrs.v4addrs.dst); if ((__force u16)keys->ports.dst < (__force u16)keys->ports.src) { swap(keys->ports.src, keys->ports.dst); } break; case FLOW_DISSECTOR_KEY_IPV6_ADDRS: addr_diff = memcmp(&keys->addrs.v6addrs.dst, &keys->addrs.v6addrs.src, sizeof(keys->addrs.v6addrs.dst)); if (addr_diff < 0) { for (i = 0; i < 4; i++) swap(keys->addrs.v6addrs.src.s6_addr32[i], keys->addrs.v6addrs.dst.s6_addr32[i]); } if ((__force u16)keys->ports.dst < (__force u16)keys->ports.src) { swap(keys->ports.src, keys->ports.dst); } break; } } static inline u32 __flow_hash_from_keys(struct flow_keys *keys, const siphash_key_t *keyval) { u32 hash; __flow_hash_consistentify(keys); hash = siphash(flow_keys_hash_start(keys), flow_keys_hash_length(keys), keyval); if (!hash) hash = 1; return hash; } u32 flow_hash_from_keys(struct flow_keys *keys) { __flow_hash_secret_init(); return __flow_hash_from_keys(keys, &hashrnd); } EXPORT_SYMBOL(flow_hash_from_keys); u32 flow_hash_from_keys_seed(struct flow_keys *keys, const siphash_key_t *keyval) { return __flow_hash_from_keys(keys, keyval); } EXPORT_SYMBOL(flow_hash_from_keys_seed); static inline u32 ___skb_get_hash(const struct sk_buff *skb, struct flow_keys *keys, const siphash_key_t *keyval) { skb_flow_dissect_flow_keys(skb, keys, FLOW_DISSECTOR_F_STOP_AT_FLOW_LABEL); return __flow_hash_from_keys(keys, keyval); } struct _flow_keys_digest_data { __be16 n_proto; u8 ip_proto; u8 padding; __be32 ports; __be32 src; __be32 dst; }; void make_flow_keys_digest(struct flow_keys_digest *digest, const struct flow_keys *flow) { struct _flow_keys_digest_data *data = (struct _flow_keys_digest_data *)digest; BUILD_BUG_ON(sizeof(*data) > sizeof(*digest)); memset(digest, 0, sizeof(*digest)); data->n_proto = flow->basic.n_proto; data->ip_proto = flow->basic.ip_proto; data->ports = flow->ports.ports; data->src = flow->addrs.v4addrs.src; data->dst = flow->addrs.v4addrs.dst; } EXPORT_SYMBOL(make_flow_keys_digest); static struct flow_dissector flow_keys_dissector_symmetric __read_mostly; u32 __skb_get_hash_symmetric_net(const struct net *net, const struct sk_buff *skb) { struct flow_keys keys; __flow_hash_secret_init(); memset(&keys, 0, sizeof(keys)); __skb_flow_dissect(net, skb, &flow_keys_dissector_symmetric, &keys, NULL, 0, 0, 0, 0); return __flow_hash_from_keys(&keys, &hashrnd); } EXPORT_SYMBOL_GPL(__skb_get_hash_symmetric_net); /** * __skb_get_hash_net: calculate a flow hash * @net: associated network namespace, derived from @skb if NULL * @skb: sk_buff to calculate flow hash from * * This function calculates a flow hash based on src/dst addresses * and src/dst port numbers. Sets hash in skb to non-zero hash value * on success, zero indicates no valid hash. Also, sets l4_hash in skb * if hash is a canonical 4-tuple hash over transport ports. */ void __skb_get_hash_net(const struct net *net, struct sk_buff *skb) { struct flow_keys keys; u32 hash; memset(&keys, 0, sizeof(keys)); __skb_flow_dissect(net, skb, &flow_keys_dissector, &keys, NULL, 0, 0, 0, FLOW_DISSECTOR_F_STOP_AT_FLOW_LABEL); __flow_hash_secret_init(); hash = __flow_hash_from_keys(&keys, &hashrnd); __skb_set_sw_hash(skb, hash, flow_keys_have_l4(&keys)); } EXPORT_SYMBOL(__skb_get_hash_net); __u32 skb_get_hash_perturb(const struct sk_buff *skb, const siphash_key_t *perturb) { struct flow_keys keys; return ___skb_get_hash(skb, &keys, perturb); } EXPORT_SYMBOL(skb_get_hash_perturb); u32 __skb_get_poff(const struct sk_buff *skb, const void *data, const struct flow_keys_basic *keys, int hlen) { u32 poff = keys->control.thoff; /* skip L4 headers for fragments after the first */ if ((keys->control.flags & FLOW_DIS_IS_FRAGMENT) && !(keys->control.flags & FLOW_DIS_FIRST_FRAG)) return poff; switch (keys->basic.ip_proto) { case IPPROTO_TCP: { /* access doff as u8 to avoid unaligned access */ const u8 *doff; u8 _doff; doff = __skb_header_pointer(skb, poff + 12, sizeof(_doff), data, hlen, &_doff); if (!doff) return poff; poff += max_t(u32, sizeof(struct tcphdr), (*doff & 0xF0) >> 2); break; } case IPPROTO_UDP: case IPPROTO_UDPLITE: poff += sizeof(struct udphdr); break; /* For the rest, we do not really care about header * extensions at this point for now. */ case IPPROTO_ICMP: poff += sizeof(struct icmphdr); break; case IPPROTO_ICMPV6: poff += sizeof(struct icmp6hdr); break; case IPPROTO_IGMP: poff += sizeof(struct igmphdr); break; case IPPROTO_DCCP: poff += sizeof(struct dccp_hdr); break; case IPPROTO_SCTP: poff += sizeof(struct sctphdr); break; } return poff; } /** * skb_get_poff - get the offset to the payload * @skb: sk_buff to get the payload offset from * * The function will get the offset to the payload as far as it could * be dissected. The main user is currently BPF, so that we can dynamically * truncate packets without needing to push actual payload to the user * space and can analyze headers only, instead. */ u32 skb_get_poff(const struct sk_buff *skb) { struct flow_keys_basic keys; if (!skb_flow_dissect_flow_keys_basic(NULL, skb, &keys, NULL, 0, 0, 0, 0)) return 0; return __skb_get_poff(skb, skb->data, &keys, skb_headlen(skb)); } __u32 __get_hash_from_flowi6(const struct flowi6 *fl6, struct flow_keys *keys) { memset(keys, 0, sizeof(*keys)); memcpy(&keys->addrs.v6addrs.src, &fl6->saddr, sizeof(keys->addrs.v6addrs.src)); memcpy(&keys->addrs.v6addrs.dst, &fl6->daddr, sizeof(keys->addrs.v6addrs.dst)); keys->control.addr_type = FLOW_DISSECTOR_KEY_IPV6_ADDRS; keys->ports.src = fl6->fl6_sport; keys->ports.dst = fl6->fl6_dport; keys->keyid.keyid = fl6->fl6_gre_key; keys->tags.flow_label = (__force u32)flowi6_get_flowlabel(fl6); keys->basic.ip_proto = fl6->flowi6_proto; return flow_hash_from_keys(keys); } EXPORT_SYMBOL(__get_hash_from_flowi6); static const struct flow_dissector_key flow_keys_dissector_keys[] = { { .key_id = FLOW_DISSECTOR_KEY_CONTROL, .offset = offsetof(struct flow_keys, control), }, { .key_id = FLOW_DISSECTOR_KEY_BASIC, .offset = offsetof(struct flow_keys, basic), }, { .key_id = FLOW_DISSECTOR_KEY_IPV4_ADDRS, .offset = offsetof(struct flow_keys, addrs.v4addrs), }, { .key_id = FLOW_DISSECTOR_KEY_IPV6_ADDRS, .offset = offsetof(struct flow_keys, addrs.v6addrs), }, { .key_id = FLOW_DISSECTOR_KEY_TIPC, .offset = offsetof(struct flow_keys, addrs.tipckey), }, { .key_id = FLOW_DISSECTOR_KEY_PORTS, .offset = offsetof(struct flow_keys, ports), }, { .key_id = FLOW_DISSECTOR_KEY_VLAN, .offset = offsetof(struct flow_keys, vlan), }, { .key_id = FLOW_DISSECTOR_KEY_FLOW_LABEL, .offset = offsetof(struct flow_keys, tags), }, { .key_id = FLOW_DISSECTOR_KEY_GRE_KEYID, .offset = offsetof(struct flow_keys, keyid), }, }; static const struct flow_dissector_key flow_keys_dissector_symmetric_keys[] = { { .key_id = FLOW_DISSECTOR_KEY_CONTROL, .offset = offsetof(struct flow_keys, control), }, { .key_id = FLOW_DISSECTOR_KEY_BASIC, .offset = offsetof(struct flow_keys, basic), }, { .key_id = FLOW_DISSECTOR_KEY_IPV4_ADDRS, .offset = offsetof(struct flow_keys, addrs.v4addrs), }, { .key_id = FLOW_DISSECTOR_KEY_IPV6_ADDRS, .offset = offsetof(struct flow_keys, addrs.v6addrs), }, { .key_id = FLOW_DISSECTOR_KEY_PORTS, .offset = offsetof(struct flow_keys, ports), }, }; static const struct flow_dissector_key flow_keys_basic_dissector_keys[] = { { .key_id = FLOW_DISSECTOR_KEY_CONTROL, .offset = offsetof(struct flow_keys, control), }, { .key_id = FLOW_DISSECTOR_KEY_BASIC, .offset = offsetof(struct flow_keys, basic), }, }; struct flow_dissector flow_keys_dissector __read_mostly; EXPORT_SYMBOL(flow_keys_dissector); struct flow_dissector flow_keys_basic_dissector __read_mostly; EXPORT_SYMBOL(flow_keys_basic_dissector); static int __init init_default_flow_dissectors(void) { skb_flow_dissector_init(&flow_keys_dissector, flow_keys_dissector_keys, ARRAY_SIZE(flow_keys_dissector_keys)); skb_flow_dissector_init(&flow_keys_dissector_symmetric, flow_keys_dissector_symmetric_keys, ARRAY_SIZE(flow_keys_dissector_symmetric_keys)); skb_flow_dissector_init(&flow_keys_basic_dissector, flow_keys_basic_dissector_keys, ARRAY_SIZE(flow_keys_basic_dissector_keys)); return 0; } core_initcall(init_default_flow_dissectors);
1 1 1 1 1 1 1 1 1 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 1517 1518 1519 1520 1521 1522 1523 1524 1525 1526 1527 1528 1529 1530 1531 1532 1533 1534 1535 1536 1537 1538 1539 1540 1541 1542 1543 1544 1545 1546 1547 1548 1549 1550 1551 1552 1553 1554 1555 1556 1557 1558 1559 1560 1561 1562 1563 1564 1565 1566 1567 1568 1569 1570 1571 1572 1573 1574 1575 1576 1577 1578 1579 1580 1581 1582 1583 1584 1585 1586 1587 1588 1589 1590 1591 1592 1593 1594 1595 1596 1597 1598 1599 1600 1601 1602 1603 1604 1605 1606 1607 1608 1609 1610 1611 1612 1613 1614 1615 1616 1617 1618 1619 1620 1621 1622 1623 1624 1625 1626 1627 1628 1629 1630 1631 1632 1633 1634 1635 1636 1637 1638 1639 1640 1641 1642 1643 1644 1645 1646 1647 1648 1649 1650 1651 1652 1653 1654 1655 1656 1657 1658 1659 1660 1661 1662 1663 1664 1665 1666 1667 1668 1669 1670 1671 1672 1673 1674 1675 1676 1677 1678 1679 1680 1681 1682 1683 1684 1685 1686 1687 1688 1689 1690 1691 1692 1693 1694 1695 1696 1697 1698 1699 1700 1701 1702 1703 1704 1705 1706 1707 1708 1709 1710 1711 1712 1713 1714 1715 1716 1717 1718 1719 1720 1721 1722 1723 1724 1725 1726 1727 1728 1729 1730 1731 1732 1733 1734 1735 1736 1737 1738 1739 1740 1741 1742 1743 1744 1745 1746 1747 1748 1749 1750 1751 1752 1753 1754 1755 1756 1757 1758 1759 1760 1761 1762 1763 1764 1765 1766 1767 1768 1769 1770 1771 1772 1773 1774 1775 1776 1777 1778 1779 1780 1781 1782 1783 1784 1785 1786 1787 1788 1789 1790 1791 1792 1793 1794 1795 1796 1797 1798 1799 1800 1801 1802 1803 1804 1805 1806 1807 1808 1809 1810 1811 1812 1813 1814 1815 1816 1817 1818 1819 1820 1821 1822 1823 1824 1825 1826 1827 1828 1829 1830 1831 1832 1833 1834 1835 1836 1837 1838 1839 1840 1841 1842 1843 1844 1845 1846 1847 1848 1849 1850 1851 1852 1853 1854 1855 1856 1857 1858 1859 1860 1861 1862 1863 1864 1865 1866 1867 1868 1869 1870 1871 1872 1873 1874 1875 1876 1877 1878 1879 1880 1881 1882 1883 1884 1885 1886 1887 1888 1889 1890 1891 1892 1893 1894 1895 1896 1897 1898 1899 1900 1901 1902 1903 1904 1905 1906 1907 1908 1909 1910 1911 1912 1913 1914 1915 1916 1917 1918 1919 1920 1921 1922 1923 1924 1925 1926 1927 1928 1929 1930 1931 1932 1933 1934 1935 1936 1937 1938 1939 1940 1941 1942 1943 1944 1945 1946 1947 1948 1949 1950 1951 1952 1953 1954 1955 1956 1957 1958 1959 1960 1961 1962 1963 1964 1965 1966 1967 1968 1969 1970 1971 1972 1973 1974 1975 1976 1977 1978 1979 1980 1981 1982 1983 1984 1985 1986 1987 1988 1989 1990 1991 1992 1993 1994 1995 1996 1997 1998 1999 2000 2001 2002 2003 2004 2005 2006 2007 2008 2009 2010 2011 2012 2013 2014 2015 2016 2017 2018 2019 2020 2021 2022 2023 2024 2025 2026 2027 2028 2029 2030 2031 2032 2033 2034 2035 2036 2037 2038 2039 2040 2041 2042 2043 2044 2045 2046 2047 2048 2049 2050 2051 2052 2053 2054 2055 2056 2057 2058 2059 2060 2061 2062 2063 2064 2065 2066 2067 2068 2069 2070 2071 2072 2073 2074 2075 2076 2077 2078 2079 2080 2081 2082 2083 2084 2085 2086 2087 2088 2089 2090 2091 2092 2093 2094 2095 2096 2097 2098 2099 2100 2101 2102 2103 2104 2105 2106 2107 2108 2109 2110 2111 2112 2113 2114 2115 2116 2117 2118 2119 2120 2121 2122 2123 2124 2125 2126 2127 2128 2129 2130 2131 2132 2133 2134 2135 2136 2137 2138 2139 2140 2141 2142 2143 2144 2145 2146 2147 2148 2149 2150 2151 2152 2153 2154 2155 2156 2157 2158 2159 2160 2161 2162 2163 2164 2165 2166 2167 2168 2169 2170 2171 2172 2173 2174 2175 2176 2177 2178 2179 2180 2181 2182 2183 2184 2185 2186 2187 2188 2189 2190 2191 2192 2193 2194 2195 2196 2197 2198 2199 2200 2201 2202 2203 2204 2205 2206 2207 2208 2209 2210 2211 2212 2213 2214 2215 2216 2217 2218 2219 2220 2221 2222 2223 2224 2225 2226 2227 2228 2229 2230 2231 2232 2233 2234 2235 2236 2237 2238 2239 2240 2241 2242 2243 2244 2245 2246 2247 2248 2249 2250 2251 2252 2253 2254 2255 2256 2257 2258 2259 2260 2261 2262 2263 2264 2265 2266 2267 2268 2269 2270 2271 2272 2273 2274 2275 2276 2277 2278 2279 2280 2281 2282 2283 2284 2285 2286 2287 2288 2289 2290 2291 2292 2293 2294 2295 2296 2297 2298 2299 2300 2301 2302 2303 2304 2305 2306 2307 2308 2309 2310 2311 2312 2313 2314 2315 2316 2317 2318 2319 // SPDX-License-Identifier: GPL-2.0 /* * linux/fs/super.c * * Copyright (C) 1991, 1992 Linus Torvalds * * super.c contains code to handle: - mount structures * - super-block tables * - filesystem drivers list * - mount system call * - umount system call * - ustat system call * * GK 2/5/95 - Changed to support mounting the root fs via NFS * * Added kerneld support: Jacques Gelinas and Bjorn Ekwall * Added change_root: Werner Almesberger & Hans Lermen, Feb '96 * Added options to /proc/mounts: * Torbjörn Lindh (torbjorn.lindh@gopta.se), April 14, 1996. * Added devfs support: Richard Gooch <rgooch@atnf.csiro.au>, 13-JAN-1998 * Heavily rewritten for 'one fs - one tree' dcache architecture. AV, Mar 2000 */ #include <linux/export.h> #include <linux/slab.h> #include <linux/blkdev.h> #include <linux/mount.h> #include <linux/security.h> #include <linux/writeback.h> /* for the emergency remount stuff */ #include <linux/idr.h> #include <linux/mutex.h> #include <linux/backing-dev.h> #include <linux/rculist_bl.h> #include <linux/fscrypt.h> #include <linux/fsnotify.h> #include <linux/lockdep.h> #include <linux/user_namespace.h> #include <linux/fs_context.h> #include <uapi/linux/mount.h> #include "internal.h" static int thaw_super_locked(struct super_block *sb, enum freeze_holder who, const void *freeze_owner); static LIST_HEAD(super_blocks); static DEFINE_SPINLOCK(sb_lock); static char *sb_writers_name[SB_FREEZE_LEVELS] = { "sb_writers", "sb_pagefaults", "sb_internal", }; static inline void __super_lock(struct super_block *sb, bool excl) { if (excl) down_write(&sb->s_umount); else down_read(&sb->s_umount); } static inline void super_unlock(struct super_block *sb, bool excl) { if (excl) up_write(&sb->s_umount); else up_read(&sb->s_umount); } static inline void __super_lock_excl(struct super_block *sb) { __super_lock(sb, true); } static inline void super_unlock_excl(struct super_block *sb) { super_unlock(sb, true); } static inline void super_unlock_shared(struct super_block *sb) { super_unlock(sb, false); } static bool super_flags(const struct super_block *sb, unsigned int flags) { /* * Pairs with smp_store_release() in super_wake() and ensures * that we see @flags after we're woken. */ return smp_load_acquire(&sb->s_flags) & flags; } /** * super_lock - wait for superblock to become ready and lock it * @sb: superblock to wait for * @excl: whether exclusive access is required * * If the superblock has neither passed through vfs_get_tree() or * generic_shutdown_super() yet wait for it to happen. Either superblock * creation will succeed and SB_BORN is set by vfs_get_tree() or we're * woken and we'll see SB_DYING. * * The caller must have acquired a temporary reference on @sb->s_count. * * Return: The function returns true if SB_BORN was set and with * s_umount held. The function returns false if SB_DYING was * set and without s_umount held. */ static __must_check bool super_lock(struct super_block *sb, bool excl) { lockdep_assert_not_held(&sb->s_umount); /* wait until the superblock is ready or dying */ wait_var_event(&sb->s_flags, super_flags(sb, SB_BORN | SB_DYING)); /* Don't pointlessly acquire s_umount. */ if (super_flags(sb, SB_DYING)) return false; __super_lock(sb, excl); /* * Has gone through generic_shutdown_super() in the meantime. * @sb->s_root is NULL and @sb->s_active is 0. No one needs to * grab a reference to this. Tell them so. */ if (sb->s_flags & SB_DYING) { super_unlock(sb, excl); return false; } WARN_ON_ONCE(!(sb->s_flags & SB_BORN)); return true; } /* wait and try to acquire read-side of @sb->s_umount */ static inline bool super_lock_shared(struct super_block *sb) { return super_lock(sb, false); } /* wait and try to acquire write-side of @sb->s_umount */ static inline bool super_lock_excl(struct super_block *sb) { return super_lock(sb, true); } /* wake waiters */ #define SUPER_WAKE_FLAGS (SB_BORN | SB_DYING | SB_DEAD) static void super_wake(struct super_block *sb, unsigned int flag) { WARN_ON_ONCE((flag & ~SUPER_WAKE_FLAGS)); WARN_ON_ONCE(hweight32(flag & SUPER_WAKE_FLAGS) > 1); /* * Pairs with smp_load_acquire() in super_lock() to make sure * all initializations in the superblock are seen by the user * seeing SB_BORN sent. */ smp_store_release(&sb->s_flags, sb->s_flags | flag); /* * Pairs with the barrier in prepare_to_wait_event() to make sure * ___wait_var_event() either sees SB_BORN set or * waitqueue_active() check in wake_up_var() sees the waiter. */ smp_mb(); wake_up_var(&sb->s_flags); } /* * One thing we have to be careful of with a per-sb shrinker is that we don't * drop the last active reference to the superblock from within the shrinker. * If that happens we could trigger unregistering the shrinker from within the * shrinker path and that leads to deadlock on the shrinker_mutex. Hence we * take a passive reference to the superblock to avoid this from occurring. */ static unsigned long super_cache_scan(struct shrinker *shrink, struct shrink_control *sc) { struct super_block *sb; long fs_objects = 0; long total_objects; long freed = 0; long dentries; long inodes; sb = shrink->private_data; /* * Deadlock avoidance. We may hold various FS locks, and we don't want * to recurse into the FS that called us in clear_inode() and friends.. */ if (!(sc->gfp_mask & __GFP_FS)) return SHRINK_STOP; if (!super_trylock_shared(sb)) return SHRINK_STOP; if (sb->s_op->nr_cached_objects) fs_objects = sb->s_op->nr_cached_objects(sb, sc); inodes = list_lru_shrink_count(&sb->s_inode_lru, sc); dentries = list_lru_shrink_count(&sb->s_dentry_lru, sc); total_objects = dentries + inodes + fs_objects; if (!total_objects) total_objects = 1; /* proportion the scan between the caches */ dentries = mult_frac(sc->nr_to_scan, dentries, total_objects); inodes = mult_frac(sc->nr_to_scan, inodes, total_objects); fs_objects = mult_frac(sc->nr_to_scan, fs_objects, total_objects); /* * prune the dcache first as the icache is pinned by it, then * prune the icache, followed by the filesystem specific caches * * Ensure that we always scan at least one object - memcg kmem * accounting uses this to fully empty the caches. */ sc->nr_to_scan = dentries + 1; freed = prune_dcache_sb(sb, sc); sc->nr_to_scan = inodes + 1; freed += prune_icache_sb(sb, sc); if (fs_objects) { sc->nr_to_scan = fs_objects + 1; freed += sb->s_op->free_cached_objects(sb, sc); } super_unlock_shared(sb); return freed; } static unsigned long super_cache_count(struct shrinker *shrink, struct shrink_control *sc) { struct super_block *sb; long total_objects = 0; sb = shrink->private_data; /* * We don't call super_trylock_shared() here as it is a scalability * bottleneck, so we're exposed to partial setup state. The shrinker * rwsem does not protect filesystem operations backing * list_lru_shrink_count() or s_op->nr_cached_objects(). Counts can * change between super_cache_count and super_cache_scan, so we really * don't need locks here. * * However, if we are currently mounting the superblock, the underlying * filesystem might be in a state of partial construction and hence it * is dangerous to access it. super_trylock_shared() uses a SB_BORN check * to avoid this situation, so do the same here. The memory barrier is * matched with the one in mount_fs() as we don't hold locks here. */ if (!(sb->s_flags & SB_BORN)) return 0; smp_rmb(); if (sb->s_op && sb->s_op->nr_cached_objects) total_objects = sb->s_op->nr_cached_objects(sb, sc); total_objects += list_lru_shrink_count(&sb->s_dentry_lru, sc); total_objects += list_lru_shrink_count(&sb->s_inode_lru, sc); if (!total_objects) return SHRINK_EMPTY; total_objects = vfs_pressure_ratio(total_objects); return total_objects; } static void destroy_super_work(struct work_struct *work) { struct super_block *s = container_of(work, struct super_block, destroy_work); fsnotify_sb_free(s); security_sb_free(s); put_user_ns(s->s_user_ns); kfree(s->s_subtype); for (int i = 0; i < SB_FREEZE_LEVELS; i++) percpu_free_rwsem(&s->s_writers.rw_sem[i]); kfree(s); } static void destroy_super_rcu(struct rcu_head *head) { struct super_block *s = container_of(head, struct super_block, rcu); INIT_WORK(&s->destroy_work, destroy_super_work); schedule_work(&s->destroy_work); } /* Free a superblock that has never been seen by anyone */ static void destroy_unused_super(struct super_block *s) { if (!s) return; super_unlock_excl(s); list_lru_destroy(&s->s_dentry_lru); list_lru_destroy(&s->s_inode_lru); shrinker_free(s->s_shrink); /* no delays needed */ destroy_super_work(&s->destroy_work); } /** * alloc_super - create new superblock * @type: filesystem type superblock should belong to * @flags: the mount flags * @user_ns: User namespace for the super_block * * Allocates and initializes a new &struct super_block. alloc_super() * returns a pointer new superblock or %NULL if allocation had failed. */ static struct super_block *alloc_super(struct file_system_type *type, int flags, struct user_namespace *user_ns) { struct super_block *s = kzalloc(sizeof(struct super_block), GFP_KERNEL); static const struct super_operations default_op; int i; if (!s) return NULL; INIT_LIST_HEAD(&s->s_mounts); s->s_user_ns = get_user_ns(user_ns); init_rwsem(&s->s_umount); lockdep_set_class(&s->s_umount, &type->s_umount_key); /* * sget() can have s_umount recursion. * * When it cannot find a suitable sb, it allocates a new * one (this one), and tries again to find a suitable old * one. * * In case that succeeds, it will acquire the s_umount * lock of the old one. Since these are clearly distrinct * locks, and this object isn't exposed yet, there's no * risk of deadlocks. * * Annotate this by putting this lock in a different * subclass. */ down_write_nested(&s->s_umount, SINGLE_DEPTH_NESTING); if (security_sb_alloc(s)) goto fail; for (i = 0; i < SB_FREEZE_LEVELS; i++) { if (__percpu_init_rwsem(&s->s_writers.rw_sem[i], sb_writers_name[i], &type->s_writers_key[i])) goto fail; } s->s_bdi = &noop_backing_dev_info; s->s_flags = flags; if (s->s_user_ns != &init_user_ns) s->s_iflags |= SB_I_NODEV; INIT_HLIST_NODE(&s->s_instances); INIT_HLIST_BL_HEAD(&s->s_roots); mutex_init(&s->s_sync_lock); INIT_LIST_HEAD(&s->s_inodes); spin_lock_init(&s->s_inode_list_lock); INIT_LIST_HEAD(&s->s_inodes_wb); spin_lock_init(&s->s_inode_wblist_lock); s->s_count = 1; atomic_set(&s->s_active, 1); mutex_init(&s->s_vfs_rename_mutex); lockdep_set_class(&s->s_vfs_rename_mutex, &type->s_vfs_rename_key); init_rwsem(&s->s_dquot.dqio_sem); s->s_maxbytes = MAX_NON_LFS; s->s_op = &default_op; s->s_time_gran = 1000000000; s->s_time_min = TIME64_MIN; s->s_time_max = TIME64_MAX; s->s_shrink = shrinker_alloc(SHRINKER_NUMA_AWARE | SHRINKER_MEMCG_AWARE, "sb-%s", type->name); if (!s->s_shrink) goto fail; s->s_shrink->scan_objects = super_cache_scan; s->s_shrink->count_objects = super_cache_count; s->s_shrink->batch = 1024; s->s_shrink->private_data = s; if (list_lru_init_memcg(&s->s_dentry_lru, s->s_shrink)) goto fail; if (list_lru_init_memcg(&s->s_inode_lru, s->s_shrink)) goto fail; return s; fail: destroy_unused_super(s); return NULL; } /* Superblock refcounting */ /* * Drop a superblock's refcount. The caller must hold sb_lock. */ static void __put_super(struct super_block *s) { if (!--s->s_count) { list_del_init(&s->s_list); WARN_ON(s->s_dentry_lru.node); WARN_ON(s->s_inode_lru.node); WARN_ON(!list_empty(&s->s_mounts)); call_rcu(&s->rcu, destroy_super_rcu); } } /** * put_super - drop a temporary reference to superblock * @sb: superblock in question * * Drops a temporary reference, frees superblock if there's no * references left. */ void put_super(struct super_block *sb) { spin_lock(&sb_lock); __put_super(sb); spin_unlock(&sb_lock); } static void kill_super_notify(struct super_block *sb) { lockdep_assert_not_held(&sb->s_umount); /* already notified earlier */ if (sb->s_flags & SB_DEAD) return; /* * Remove it from @fs_supers so it isn't found by new * sget{_fc}() walkers anymore. Any concurrent mounter still * managing to grab a temporary reference is guaranteed to * already see SB_DYING and will wait until we notify them about * SB_DEAD. */ spin_lock(&sb_lock); hlist_del_init(&sb->s_instances); spin_unlock(&sb_lock); /* * Let concurrent mounts know that this thing is really dead. * We don't need @sb->s_umount here as every concurrent caller * will see SB_DYING and either discard the superblock or wait * for SB_DEAD. */ super_wake(sb, SB_DEAD); } /** * deactivate_locked_super - drop an active reference to superblock * @s: superblock to deactivate * * Drops an active reference to superblock, converting it into a temporary * one if there is no other active references left. In that case we * tell fs driver to shut it down and drop the temporary reference we * had just acquired. * * Caller holds exclusive lock on superblock; that lock is released. */ void deactivate_locked_super(struct super_block *s) { struct file_system_type *fs = s->s_type; if (atomic_dec_and_test(&s->s_active)) { shrinker_free(s->s_shrink); fs->kill_sb(s); kill_super_notify(s); /* * Since list_lru_destroy() may sleep, we cannot call it from * put_super(), where we hold the sb_lock. Therefore we destroy * the lru lists right now. */ list_lru_destroy(&s->s_dentry_lru); list_lru_destroy(&s->s_inode_lru); put_filesystem(fs); put_super(s); } else { super_unlock_excl(s); } } EXPORT_SYMBOL(deactivate_locked_super); /** * deactivate_super - drop an active reference to superblock * @s: superblock to deactivate * * Variant of deactivate_locked_super(), except that superblock is *not* * locked by caller. If we are going to drop the final active reference, * lock will be acquired prior to that. */ void deactivate_super(struct super_block *s) { if (!atomic_add_unless(&s->s_active, -1, 1)) { __super_lock_excl(s); deactivate_locked_super(s); } } EXPORT_SYMBOL(deactivate_super); /** * grab_super - acquire an active reference to a superblock * @sb: superblock to acquire * * Acquire a temporary reference on a superblock and try to trade it for * an active reference. This is used in sget{_fc}() to wait for a * superblock to either become SB_BORN or for it to pass through * sb->kill() and be marked as SB_DEAD. * * Return: This returns true if an active reference could be acquired, * false if not. */ static bool grab_super(struct super_block *sb) { bool locked; sb->s_count++; spin_unlock(&sb_lock); locked = super_lock_excl(sb); if (locked) { if (atomic_inc_not_zero(&sb->s_active)) { put_super(sb); return true; } super_unlock_excl(sb); } wait_var_event(&sb->s_flags, super_flags(sb, SB_DEAD)); put_super(sb); return false; } /* * super_trylock_shared - try to grab ->s_umount shared * @sb: reference we are trying to grab * * Try to prevent fs shutdown. This is used in places where we * cannot take an active reference but we need to ensure that the * filesystem is not shut down while we are working on it. It returns * false if we cannot acquire s_umount or if we lose the race and * filesystem already got into shutdown, and returns true with the s_umount * lock held in read mode in case of success. On successful return, * the caller must drop the s_umount lock when done. * * Note that unlike get_super() et.al. this one does *not* bump ->s_count. * The reason why it's safe is that we are OK with doing trylock instead * of down_read(). There's a couple of places that are OK with that, but * it's very much not a general-purpose interface. */ bool super_trylock_shared(struct super_block *sb) { if (down_read_trylock(&sb->s_umount)) { if (!(sb->s_flags & SB_DYING) && sb->s_root && (sb->s_flags & SB_BORN)) return true; super_unlock_shared(sb); } return false; } /** * retire_super - prevents superblock from being reused * @sb: superblock to retire * * The function marks superblock to be ignored in superblock test, which * prevents it from being reused for any new mounts. If the superblock has * a private bdi, it also unregisters it, but doesn't reduce the refcount * of the superblock to prevent potential races. The refcount is reduced * by generic_shutdown_super(). The function can not be called * concurrently with generic_shutdown_super(). It is safe to call the * function multiple times, subsequent calls have no effect. * * The marker will affect the re-use only for block-device-based * superblocks. Other superblocks will still get marked if this function * is used, but that will not affect their reusability. */ void retire_super(struct super_block *sb) { WARN_ON(!sb->s_bdev); __super_lock_excl(sb); if (sb->s_iflags & SB_I_PERSB_BDI) { bdi_unregister(sb->s_bdi); sb->s_iflags &= ~SB_I_PERSB_BDI; } sb->s_iflags |= SB_I_RETIRED; super_unlock_excl(sb); } EXPORT_SYMBOL(retire_super); /** * generic_shutdown_super - common helper for ->kill_sb() * @sb: superblock to kill * * generic_shutdown_super() does all fs-independent work on superblock * shutdown. Typical ->kill_sb() should pick all fs-specific objects * that need destruction out of superblock, call generic_shutdown_super() * and release aforementioned objects. Note: dentries and inodes _are_ * taken care of and do not need specific handling. * * Upon calling this function, the filesystem may no longer alter or * rearrange the set of dentries belonging to this super_block, nor may it * change the attachments of dentries to inodes. */ void generic_shutdown_super(struct super_block *sb) { const struct super_operations *sop = sb->s_op; if (sb->s_root) { shrink_dcache_for_umount(sb); sync_filesystem(sb); sb->s_flags &= ~SB_ACTIVE; cgroup_writeback_umount(sb); /* Evict all inodes with zero refcount. */ evict_inodes(sb); /* * Clean up and evict any inodes that still have references due * to fsnotify or the security policy. */ fsnotify_sb_delete(sb); security_sb_delete(sb); if (sb->s_dio_done_wq) { destroy_workqueue(sb->s_dio_done_wq); sb->s_dio_done_wq = NULL; } if (sop->put_super) sop->put_super(sb); /* * Now that all potentially-encrypted inodes have been evicted, * the fscrypt keyring can be destroyed. */ fscrypt_destroy_keyring(sb); if (CHECK_DATA_CORRUPTION(!list_empty(&sb->s_inodes), NULL, "VFS: Busy inodes after unmount of %s (%s)", sb->s_id, sb->s_type->name)) { /* * Adding a proper bailout path here would be hard, but * we can at least make it more likely that a later * iput_final() or such crashes cleanly. */ struct inode *inode; spin_lock(&sb->s_inode_list_lock); list_for_each_entry(inode, &sb->s_inodes, i_sb_list) { inode->i_op = VFS_PTR_POISON; inode->i_sb = VFS_PTR_POISON; inode->i_mapping = VFS_PTR_POISON; } spin_unlock(&sb->s_inode_list_lock); } } /* * Broadcast to everyone that grabbed a temporary reference to this * superblock before we removed it from @fs_supers that the superblock * is dying. Every walker of @fs_supers outside of sget{_fc}() will now * discard this superblock and treat it as dead. * * We leave the superblock on @fs_supers so it can be found by * sget{_fc}() until we passed sb->kill_sb(). */ super_wake(sb, SB_DYING); super_unlock_excl(sb); if (sb->s_bdi != &noop_backing_dev_info) { if (sb->s_iflags & SB_I_PERSB_BDI) bdi_unregister(sb->s_bdi); bdi_put(sb->s_bdi); sb->s_bdi = &noop_backing_dev_info; } } EXPORT_SYMBOL(generic_shutdown_super); bool mount_capable(struct fs_context *fc) { if (!(fc->fs_type->fs_flags & FS_USERNS_MOUNT)) return capable(CAP_SYS_ADMIN); else return ns_capable(fc->user_ns, CAP_SYS_ADMIN); } /** * sget_fc - Find or create a superblock * @fc: Filesystem context. * @test: Comparison callback * @set: Setup callback * * Create a new superblock or find an existing one. * * The @test callback is used to find a matching existing superblock. * Whether or not the requested parameters in @fc are taken into account * is specific to the @test callback that is used. They may even be * completely ignored. * * If an extant superblock is matched, it will be returned unless: * * (1) the namespace the filesystem context @fc and the extant * superblock's namespace differ * * (2) the filesystem context @fc has requested that reusing an extant * superblock is not allowed * * In both cases EBUSY will be returned. * * If no match is made, a new superblock will be allocated and basic * initialisation will be performed (s_type, s_fs_info and s_id will be * set and the @set callback will be invoked), the superblock will be * published and it will be returned in a partially constructed state * with SB_BORN and SB_ACTIVE as yet unset. * * Return: On success, an extant or newly created superblock is * returned. On failure an error pointer is returned. */ struct super_block *sget_fc(struct fs_context *fc, int (*test)(struct super_block *, struct fs_context *), int (*set)(struct super_block *, struct fs_context *)) { struct super_block *s = NULL; struct super_block *old; struct user_namespace *user_ns = fc->global ? &init_user_ns : fc->user_ns; int err; /* * Never allow s_user_ns != &init_user_ns when FS_USERNS_MOUNT is * not set, as the filesystem is likely unprepared to handle it. * This can happen when fsconfig() is called from init_user_ns with * an fs_fd opened in another user namespace. */ if (user_ns != &init_user_ns && !(fc->fs_type->fs_flags & FS_USERNS_MOUNT)) { errorfc(fc, "VFS: Mounting from non-initial user namespace is not allowed"); return ERR_PTR(-EPERM); } retry: spin_lock(&sb_lock); if (test) { hlist_for_each_entry(old, &fc->fs_type->fs_supers, s_instances) { if (test(old, fc)) goto share_extant_sb; } } if (!s) { spin_unlock(&sb_lock); s = alloc_super(fc->fs_type, fc->sb_flags, user_ns); if (!s) return ERR_PTR(-ENOMEM); goto retry; } s->s_fs_info = fc->s_fs_info; err = set(s, fc); if (err) { s->s_fs_info = NULL; spin_unlock(&sb_lock); destroy_unused_super(s); return ERR_PTR(err); } fc->s_fs_info = NULL; s->s_type = fc->fs_type; s->s_iflags |= fc->s_iflags; strscpy(s->s_id, s->s_type->name, sizeof(s->s_id)); /* * Make the superblock visible on @super_blocks and @fs_supers. * It's in a nascent state and users should wait on SB_BORN or * SB_DYING to be set. */ list_add_tail(&s->s_list, &super_blocks); hlist_add_head(&s->s_instances, &s->s_type->fs_supers); spin_unlock(&sb_lock); get_filesystem(s->s_type); shrinker_register(s->s_shrink); return s; share_extant_sb: if (user_ns != old->s_user_ns || fc->exclusive) { spin_unlock(&sb_lock); destroy_unused_super(s); if (fc->exclusive) warnfc(fc, "reusing existing filesystem not allowed"); else warnfc(fc, "reusing existing filesystem in another namespace not allowed"); return ERR_PTR(-EBUSY); } if (!grab_super(old)) goto retry; destroy_unused_super(s); return old; } EXPORT_SYMBOL(sget_fc); /** * sget - find or create a superblock * @type: filesystem type superblock should belong to * @test: comparison callback * @set: setup callback * @flags: mount flags * @data: argument to each of them */ struct super_block *sget(struct file_system_type *type, int (*test)(struct super_block *,void *), int (*set)(struct super_block *,void *), int flags, void *data) { struct user_namespace *user_ns = current_user_ns(); struct super_block *s = NULL; struct super_block *old; int err; retry: spin_lock(&sb_lock); if (test) { hlist_for_each_entry(old, &type->fs_supers, s_instances) { if (!test(old, data)) continue; if (user_ns != old->s_user_ns) { spin_unlock(&sb_lock); destroy_unused_super(s); return ERR_PTR(-EBUSY); } if (!grab_super(old)) goto retry; destroy_unused_super(s); return old; } } if (!s) { spin_unlock(&sb_lock); s = alloc_super(type, flags, user_ns); if (!s) return ERR_PTR(-ENOMEM); goto retry; } err = set(s, data); if (err) { spin_unlock(&sb_lock); destroy_unused_super(s); return ERR_PTR(err); } s->s_type = type; strscpy(s->s_id, type->name, sizeof(s->s_id)); list_add_tail(&s->s_list, &super_blocks); hlist_add_head(&s->s_instances, &type->fs_supers); spin_unlock(&sb_lock); get_filesystem(type); shrinker_register(s->s_shrink); return s; } EXPORT_SYMBOL(sget); void drop_super(struct super_block *sb) { super_unlock_shared(sb); put_super(sb); } EXPORT_SYMBOL(drop_super); void drop_super_exclusive(struct super_block *sb) { super_unlock_excl(sb); put_super(sb); } EXPORT_SYMBOL(drop_super_exclusive); enum super_iter_flags_t { SUPER_ITER_EXCL = (1U << 0), SUPER_ITER_UNLOCKED = (1U << 1), SUPER_ITER_REVERSE = (1U << 2), }; static inline struct super_block *first_super(enum super_iter_flags_t flags) { if (flags & SUPER_ITER_REVERSE) return list_last_entry(&super_blocks, struct super_block, s_list); return list_first_entry(&super_blocks, struct super_block, s_list); } static inline struct super_block *next_super(struct super_block *sb, enum super_iter_flags_t flags) { if (flags & SUPER_ITER_REVERSE) return list_prev_entry(sb, s_list); return list_next_entry(sb, s_list); } static void __iterate_supers(void (*f)(struct super_block *, void *), void *arg, enum super_iter_flags_t flags) { struct super_block *sb, *p = NULL; bool excl = flags & SUPER_ITER_EXCL; guard(spinlock)(&sb_lock); for (sb = first_super(flags); !list_entry_is_head(sb, &super_blocks, s_list); sb = next_super(sb, flags)) { if (super_flags(sb, SB_DYING)) continue; sb->s_count++; spin_unlock(&sb_lock); if (flags & SUPER_ITER_UNLOCKED) { f(sb, arg); } else if (super_lock(sb, excl)) { f(sb, arg); super_unlock(sb, excl); } spin_lock(&sb_lock); if (p) __put_super(p); p = sb; } if (p) __put_super(p); } void iterate_supers(void (*f)(struct super_block *, void *), void *arg) { __iterate_supers(f, arg, 0); } /** * iterate_supers_type - call function for superblocks of given type * @type: fs type * @f: function to call * @arg: argument to pass to it * * Scans the superblock list and calls given function, passing it * locked superblock and given argument. */ void iterate_supers_type(struct file_system_type *type, void (*f)(struct super_block *, void *), void *arg) { struct super_block *sb, *p = NULL; spin_lock(&sb_lock); hlist_for_each_entry(sb, &type->fs_supers, s_instances) { bool locked; if (super_flags(sb, SB_DYING)) continue; sb->s_count++; spin_unlock(&sb_lock); locked = super_lock_shared(sb); if (locked) { f(sb, arg); super_unlock_shared(sb); } spin_lock(&sb_lock); if (p) __put_super(p); p = sb; } if (p) __put_super(p); spin_unlock(&sb_lock); } EXPORT_SYMBOL(iterate_supers_type); struct super_block *user_get_super(dev_t dev, bool excl) { struct super_block *sb; spin_lock(&sb_lock); list_for_each_entry(sb, &super_blocks, s_list) { bool locked; if (sb->s_dev != dev) continue; sb->s_count++; spin_unlock(&sb_lock); locked = super_lock(sb, excl); if (locked) return sb; spin_lock(&sb_lock); __put_super(sb); break; } spin_unlock(&sb_lock); return NULL; } /** * reconfigure_super - asks filesystem to change superblock parameters * @fc: The superblock and configuration * * Alters the configuration parameters of a live superblock. */ int reconfigure_super(struct fs_context *fc) { struct super_block *sb = fc->root->d_sb; int retval; bool remount_ro = false; bool remount_rw = false; bool force = fc->sb_flags & SB_FORCE; if (fc->sb_flags_mask & ~MS_RMT_MASK) return -EINVAL; if (sb->s_writers.frozen != SB_UNFROZEN) return -EBUSY; retval = security_sb_remount(sb, fc->security); if (retval) return retval; if (fc->sb_flags_mask & SB_RDONLY) { #ifdef CONFIG_BLOCK if (!(fc->sb_flags & SB_RDONLY) && sb->s_bdev && bdev_read_only(sb->s_bdev)) return -EACCES; #endif remount_rw = !(fc->sb_flags & SB_RDONLY) && sb_rdonly(sb); remount_ro = (fc->sb_flags & SB_RDONLY) && !sb_rdonly(sb); } if (remount_ro) { if (!hlist_empty(&sb->s_pins)) { super_unlock_excl(sb); group_pin_kill(&sb->s_pins); __super_lock_excl(sb); if (!sb->s_root) return 0; if (sb->s_writers.frozen != SB_UNFROZEN) return -EBUSY; remount_ro = !sb_rdonly(sb); } } shrink_dcache_sb(sb); /* If we are reconfiguring to RDONLY and current sb is read/write, * make sure there are no files open for writing. */ if (remount_ro) { if (force) { sb_start_ro_state_change(sb); } else { retval = sb_prepare_remount_readonly(sb); if (retval) return retval; } } else if (remount_rw) { /* * Protect filesystem's reconfigure code from writes from * userspace until reconfigure finishes. */ sb_start_ro_state_change(sb); } if (fc->ops->reconfigure) { retval = fc->ops->reconfigure(fc); if (retval) { if (!force) goto cancel_readonly; /* If forced remount, go ahead despite any errors */ WARN(1, "forced remount of a %s fs returned %i\n", sb->s_type->name, retval); } } WRITE_ONCE(sb->s_flags, ((sb->s_flags & ~fc->sb_flags_mask) | (fc->sb_flags & fc->sb_flags_mask))); sb_end_ro_state_change(sb); /* * Some filesystems modify their metadata via some other path than the * bdev buffer cache (eg. use a private mapping, or directories in * pagecache, etc). Also file data modifications go via their own * mappings. So If we try to mount readonly then copy the filesystem * from bdev, we could get stale data, so invalidate it to give a best * effort at coherency. */ if (remount_ro && sb->s_bdev) invalidate_bdev(sb->s_bdev); return 0; cancel_readonly: sb_end_ro_state_change(sb); return retval; } static void do_emergency_remount_callback(struct super_block *sb, void *unused) { if (sb->s_bdev && !sb_rdonly(sb)) { struct fs_context *fc; fc = fs_context_for_reconfigure(sb->s_root, SB_RDONLY | SB_FORCE, SB_RDONLY); if (!IS_ERR(fc)) { if (parse_monolithic_mount_data(fc, NULL) == 0) (void)reconfigure_super(fc); put_fs_context(fc); } } } static void do_emergency_remount(struct work_struct *work) { __iterate_supers(do_emergency_remount_callback, NULL, SUPER_ITER_EXCL | SUPER_ITER_REVERSE); kfree(work); printk("Emergency Remount complete\n"); } void emergency_remount(void) { struct work_struct *work; work = kmalloc(sizeof(*work), GFP_ATOMIC); if (work) { INIT_WORK(work, do_emergency_remount); schedule_work(work); } } static void do_thaw_all_callback(struct super_block *sb, void *unused) { if (IS_ENABLED(CONFIG_BLOCK)) while (sb->s_bdev && !bdev_thaw(sb->s_bdev)) pr_warn("Emergency Thaw on %pg\n", sb->s_bdev); thaw_super_locked(sb, FREEZE_HOLDER_USERSPACE, NULL); return; } static void do_thaw_all(struct work_struct *work) { __iterate_supers(do_thaw_all_callback, NULL, SUPER_ITER_EXCL); kfree(work); printk(KERN_WARNING "Emergency Thaw complete\n"); } /** * emergency_thaw_all -- forcibly thaw every frozen filesystem * * Used for emergency unfreeze of all filesystems via SysRq */ void emergency_thaw_all(void) { struct work_struct *work; work = kmalloc(sizeof(*work), GFP_ATOMIC); if (work) { INIT_WORK(work, do_thaw_all); schedule_work(work); } } static inline bool get_active_super(struct super_block *sb) { bool active = false; if (super_lock_excl(sb)) { active = atomic_inc_not_zero(&sb->s_active); super_unlock_excl(sb); } return active; } static const char *filesystems_freeze_ptr = "filesystems_freeze"; static void filesystems_freeze_callback(struct super_block *sb, void *unused) { if (!sb->s_op->freeze_fs && !sb->s_op->freeze_super) return; if (!get_active_super(sb)) return; if (sb->s_op->freeze_super) sb->s_op->freeze_super(sb, FREEZE_EXCL | FREEZE_HOLDER_KERNEL, filesystems_freeze_ptr); else freeze_super(sb, FREEZE_EXCL | FREEZE_HOLDER_KERNEL, filesystems_freeze_ptr); deactivate_super(sb); } void filesystems_freeze(void) { __iterate_supers(filesystems_freeze_callback, NULL, SUPER_ITER_UNLOCKED | SUPER_ITER_REVERSE); } static void filesystems_thaw_callback(struct super_block *sb, void *unused) { if (!sb->s_op->freeze_fs && !sb->s_op->freeze_super) return; if (!get_active_super(sb)) return; if (sb->s_op->thaw_super) sb->s_op->thaw_super(sb, FREEZE_EXCL | FREEZE_HOLDER_KERNEL, filesystems_freeze_ptr); else thaw_super(sb, FREEZE_EXCL | FREEZE_HOLDER_KERNEL, filesystems_freeze_ptr); deactivate_super(sb); } void filesystems_thaw(void) { __iterate_supers(filesystems_thaw_callback, NULL, SUPER_ITER_UNLOCKED); } static DEFINE_IDA(unnamed_dev_ida); /** * get_anon_bdev - Allocate a block device for filesystems which don't have one. * @p: Pointer to a dev_t. * * Filesystems which don't use real block devices can call this function * to allocate a virtual block device. * * Context: Any context. Frequently called while holding sb_lock. * Return: 0 on success, -EMFILE if there are no anonymous bdevs left * or -ENOMEM if memory allocation failed. */ int get_anon_bdev(dev_t *p) { int dev; /* * Many userspace utilities consider an FSID of 0 invalid. * Always return at least 1 from get_anon_bdev. */ dev = ida_alloc_range(&unnamed_dev_ida, 1, (1 << MINORBITS) - 1, GFP_ATOMIC); if (dev == -ENOSPC) dev = -EMFILE; if (dev < 0) return dev; *p = MKDEV(0, dev); return 0; } EXPORT_SYMBOL(get_anon_bdev); void free_anon_bdev(dev_t dev) { ida_free(&unnamed_dev_ida, MINOR(dev)); } EXPORT_SYMBOL(free_anon_bdev); int set_anon_super(struct super_block *s, void *data) { return get_anon_bdev(&s->s_dev); } EXPORT_SYMBOL(set_anon_super); void kill_anon_super(struct super_block *sb) { dev_t dev = sb->s_dev; generic_shutdown_super(sb); kill_super_notify(sb); free_anon_bdev(dev); } EXPORT_SYMBOL(kill_anon_super); void kill_litter_super(struct super_block *sb) { if (sb->s_root) d_genocide(sb->s_root); kill_anon_super(sb); } EXPORT_SYMBOL(kill_litter_super); int set_anon_super_fc(struct super_block *sb, struct fs_context *fc) { return set_anon_super(sb, NULL); } EXPORT_SYMBOL(set_anon_super_fc); static int test_keyed_super(struct super_block *sb, struct fs_context *fc) { return sb->s_fs_info == fc->s_fs_info; } static int test_single_super(struct super_block *s, struct fs_context *fc) { return 1; } static int vfs_get_super(struct fs_context *fc, int (*test)(struct super_block *, struct fs_context *), int (*fill_super)(struct super_block *sb, struct fs_context *fc)) { struct super_block *sb; int err; sb = sget_fc(fc, test, set_anon_super_fc); if (IS_ERR(sb)) return PTR_ERR(sb); if (!sb->s_root) { err = fill_super(sb, fc); if (err) goto error; sb->s_flags |= SB_ACTIVE; } fc->root = dget(sb->s_root); return 0; error: deactivate_locked_super(sb); return err; } int get_tree_nodev(struct fs_context *fc, int (*fill_super)(struct super_block *sb, struct fs_context *fc)) { return vfs_get_super(fc, NULL, fill_super); } EXPORT_SYMBOL(get_tree_nodev); int get_tree_single(struct fs_context *fc, int (*fill_super)(struct super_block *sb, struct fs_context *fc)) { return vfs_get_super(fc, test_single_super, fill_super); } EXPORT_SYMBOL(get_tree_single); int get_tree_keyed(struct fs_context *fc, int (*fill_super)(struct super_block *sb, struct fs_context *fc), void *key) { fc->s_fs_info = key; return vfs_get_super(fc, test_keyed_super, fill_super); } EXPORT_SYMBOL(get_tree_keyed); static int set_bdev_super(struct super_block *s, void *data) { s->s_dev = *(dev_t *)data; return 0; } static int super_s_dev_set(struct super_block *s, struct fs_context *fc) { return set_bdev_super(s, fc->sget_key); } static int super_s_dev_test(struct super_block *s, struct fs_context *fc) { return !(s->s_iflags & SB_I_RETIRED) && s->s_dev == *(dev_t *)fc->sget_key; } /** * sget_dev - Find or create a superblock by device number * @fc: Filesystem context. * @dev: device number * * Find or create a superblock using the provided device number that * will be stored in fc->sget_key. * * If an extant superblock is matched, then that will be returned with * an elevated reference count that the caller must transfer or discard. * * If no match is made, a new superblock will be allocated and basic * initialisation will be performed (s_type, s_fs_info, s_id, s_dev will * be set). The superblock will be published and it will be returned in * a partially constructed state with SB_BORN and SB_ACTIVE as yet * unset. * * Return: an existing or newly created superblock on success, an error * pointer on failure. */ struct super_block *sget_dev(struct fs_context *fc, dev_t dev) { fc->sget_key = &dev; return sget_fc(fc, super_s_dev_test, super_s_dev_set); } EXPORT_SYMBOL(sget_dev); #ifdef CONFIG_BLOCK /* * Lock the superblock that is holder of the bdev. Returns the superblock * pointer if we successfully locked the superblock and it is alive. Otherwise * we return NULL and just unlock bdev->bd_holder_lock. * * The function must be called with bdev->bd_holder_lock and releases it. */ static struct super_block *bdev_super_lock(struct block_device *bdev, bool excl) __releases(&bdev->bd_holder_lock) { struct super_block *sb = bdev->bd_holder; bool locked; lockdep_assert_held(&bdev->bd_holder_lock); lockdep_assert_not_held(&sb->s_umount); lockdep_assert_not_held(&bdev->bd_disk->open_mutex); /* Make sure sb doesn't go away from under us */ spin_lock(&sb_lock); sb->s_count++; spin_unlock(&sb_lock); mutex_unlock(&bdev->bd_holder_lock); locked = super_lock(sb, excl); /* * If the superblock wasn't already SB_DYING then we hold * s_umount and can safely drop our temporary reference. */ put_super(sb); if (!locked) return NULL; if (!sb->s_root || !(sb->s_flags & SB_ACTIVE)) { super_unlock(sb, excl); return NULL; } return sb; } static void fs_bdev_mark_dead(struct block_device *bdev, bool surprise) { struct super_block *sb; sb = bdev_super_lock(bdev, false); if (!sb) return; if (!surprise) sync_filesystem(sb); shrink_dcache_sb(sb); evict_inodes(sb); if (sb->s_op->shutdown) sb->s_op->shutdown(sb); super_unlock_shared(sb); } static void fs_bdev_sync(struct block_device *bdev) { struct super_block *sb; sb = bdev_super_lock(bdev, false); if (!sb) return; sync_filesystem(sb); super_unlock_shared(sb); } static struct super_block *get_bdev_super(struct block_device *bdev) { bool active = false; struct super_block *sb; sb = bdev_super_lock(bdev, true); if (sb) { active = atomic_inc_not_zero(&sb->s_active); super_unlock_excl(sb); } if (!active) return NULL; return sb; } /** * fs_bdev_freeze - freeze owning filesystem of block device * @bdev: block device * * Freeze the filesystem that owns this block device if it is still * active. * * A filesystem that owns multiple block devices may be frozen from each * block device and won't be unfrozen until all block devices are * unfrozen. Each block device can only freeze the filesystem once as we * nest freezes for block devices in the block layer. * * Return: If the freeze was successful zero is returned. If the freeze * failed a negative error code is returned. */ static int fs_bdev_freeze(struct block_device *bdev) { struct super_block *sb; int error = 0; lockdep_assert_held(&bdev->bd_fsfreeze_mutex); sb = get_bdev_super(bdev); if (!sb) return -EINVAL; if (sb->s_op->freeze_super) error = sb->s_op->freeze_super(sb, FREEZE_MAY_NEST | FREEZE_HOLDER_USERSPACE, NULL); else error = freeze_super(sb, FREEZE_MAY_NEST | FREEZE_HOLDER_USERSPACE, NULL); if (!error) error = sync_blockdev(bdev); deactivate_super(sb); return error; } /** * fs_bdev_thaw - thaw owning filesystem of block device * @bdev: block device * * Thaw the filesystem that owns this block device. * * A filesystem that owns multiple block devices may be frozen from each * block device and won't be unfrozen until all block devices are * unfrozen. Each block device can only freeze the filesystem once as we * nest freezes for block devices in the block layer. * * Return: If the thaw was successful zero is returned. If the thaw * failed a negative error code is returned. If this function * returns zero it doesn't mean that the filesystem is unfrozen * as it may have been frozen multiple times (kernel may hold a * freeze or might be frozen from other block devices). */ static int fs_bdev_thaw(struct block_device *bdev) { struct super_block *sb; int error; lockdep_assert_held(&bdev->bd_fsfreeze_mutex); /* * The block device may have been frozen before it was claimed by a * filesystem. Concurrently another process might try to mount that * frozen block device and has temporarily claimed the block device for * that purpose causing a concurrent fs_bdev_thaw() to end up here. The * mounter is already about to abort mounting because they still saw an * elevanted bdev->bd_fsfreeze_count so get_bdev_super() will return * NULL in that case. */ sb = get_bdev_super(bdev); if (!sb) return -EINVAL; if (sb->s_op->thaw_super) error = sb->s_op->thaw_super(sb, FREEZE_MAY_NEST | FREEZE_HOLDER_USERSPACE, NULL); else error = thaw_super(sb, FREEZE_MAY_NEST | FREEZE_HOLDER_USERSPACE, NULL); deactivate_super(sb); return error; } const struct blk_holder_ops fs_holder_ops = { .mark_dead = fs_bdev_mark_dead, .sync = fs_bdev_sync, .freeze = fs_bdev_freeze, .thaw = fs_bdev_thaw, }; EXPORT_SYMBOL_GPL(fs_holder_ops); int setup_bdev_super(struct super_block *sb, int sb_flags, struct fs_context *fc) { blk_mode_t mode = sb_open_mode(sb_flags); struct file *bdev_file; struct block_device *bdev; bdev_file = bdev_file_open_by_dev(sb->s_dev, mode, sb, &fs_holder_ops); if (IS_ERR(bdev_file)) { if (fc) errorf(fc, "%s: Can't open blockdev", fc->source); return PTR_ERR(bdev_file); } bdev = file_bdev(bdev_file); /* * This really should be in blkdev_get_by_dev, but right now can't due * to legacy issues that require us to allow opening a block device node * writable from userspace even for a read-only block device. */ if ((mode & BLK_OPEN_WRITE) && bdev_read_only(bdev)) { bdev_fput(bdev_file); return -EACCES; } /* * It is enough to check bdev was not frozen before we set * s_bdev as freezing will wait until SB_BORN is set. */ if (atomic_read(&bdev->bd_fsfreeze_count) > 0) { if (fc) warnf(fc, "%pg: Can't mount, blockdev is frozen", bdev); bdev_fput(bdev_file); return -EBUSY; } spin_lock(&sb_lock); sb->s_bdev_file = bdev_file; sb->s_bdev = bdev; sb->s_bdi = bdi_get(bdev->bd_disk->bdi); if (bdev_stable_writes(bdev)) sb->s_iflags |= SB_I_STABLE_WRITES; spin_unlock(&sb_lock); snprintf(sb->s_id, sizeof(sb->s_id), "%pg", bdev); shrinker_debugfs_rename(sb->s_shrink, "sb-%s:%s", sb->s_type->name, sb->s_id); sb_set_blocksize(sb, block_size(bdev)); return 0; } EXPORT_SYMBOL_GPL(setup_bdev_super); /** * get_tree_bdev_flags - Get a superblock based on a single block device * @fc: The filesystem context holding the parameters * @fill_super: Helper to initialise a new superblock * @flags: GET_TREE_BDEV_* flags */ int get_tree_bdev_flags(struct fs_context *fc, int (*fill_super)(struct super_block *sb, struct fs_context *fc), unsigned int flags) { struct super_block *s; int error = 0; dev_t dev; if (!fc->source) return invalf(fc, "No source specified"); error = lookup_bdev(fc->source, &dev); if (error) { if (!(flags & GET_TREE_BDEV_QUIET_LOOKUP)) errorf(fc, "%s: Can't lookup blockdev", fc->source); return error; } fc->sb_flags |= SB_NOSEC; s = sget_dev(fc, dev); if (IS_ERR(s)) return PTR_ERR(s); if (s->s_root) { /* Don't summarily change the RO/RW state. */ if ((fc->sb_flags ^ s->s_flags) & SB_RDONLY) { warnf(fc, "%pg: Can't mount, would change RO state", s->s_bdev); deactivate_locked_super(s); return -EBUSY; } } else { error = setup_bdev_super(s, fc->sb_flags, fc); if (!error) error = fill_super(s, fc); if (error) { deactivate_locked_super(s); return error; } s->s_flags |= SB_ACTIVE; } BUG_ON(fc->root); fc->root = dget(s->s_root); return 0; } EXPORT_SYMBOL_GPL(get_tree_bdev_flags); /** * get_tree_bdev - Get a superblock based on a single block device * @fc: The filesystem context holding the parameters * @fill_super: Helper to initialise a new superblock */ int get_tree_bdev(struct fs_context *fc, int (*fill_super)(struct super_block *, struct fs_context *)) { return get_tree_bdev_flags(fc, fill_super, 0); } EXPORT_SYMBOL(get_tree_bdev); static int test_bdev_super(struct super_block *s, void *data) { return !(s->s_iflags & SB_I_RETIRED) && s->s_dev == *(dev_t *)data; } struct dentry *mount_bdev(struct file_system_type *fs_type, int flags, const char *dev_name, void *data, int (*fill_super)(struct super_block *, void *, int)) { struct super_block *s; int error; dev_t dev; error = lookup_bdev(dev_name, &dev); if (error) return ERR_PTR(error); flags |= SB_NOSEC; s = sget(fs_type, test_bdev_super, set_bdev_super, flags, &dev); if (IS_ERR(s)) return ERR_CAST(s); if (s->s_root) { if ((flags ^ s->s_flags) & SB_RDONLY) { deactivate_locked_super(s); return ERR_PTR(-EBUSY); } } else { error = setup_bdev_super(s, flags, NULL); if (!error) error = fill_super(s, data, flags & SB_SILENT ? 1 : 0); if (error) { deactivate_locked_super(s); return ERR_PTR(error); } s->s_flags |= SB_ACTIVE; } return dget(s->s_root); } EXPORT_SYMBOL(mount_bdev); void kill_block_super(struct super_block *sb) { struct block_device *bdev = sb->s_bdev; generic_shutdown_super(sb); if (bdev) { sync_blockdev(bdev); bdev_fput(sb->s_bdev_file); } } EXPORT_SYMBOL(kill_block_super); #endif struct dentry *mount_nodev(struct file_system_type *fs_type, int flags, void *data, int (*fill_super)(struct super_block *, void *, int)) { int error; struct super_block *s = sget(fs_type, NULL, set_anon_super, flags, NULL); if (IS_ERR(s)) return ERR_CAST(s); error = fill_super(s, data, flags & SB_SILENT ? 1 : 0); if (error) { deactivate_locked_super(s); return ERR_PTR(error); } s->s_flags |= SB_ACTIVE; return dget(s->s_root); } EXPORT_SYMBOL(mount_nodev); /** * vfs_get_tree - Get the mountable root * @fc: The superblock configuration context. * * The filesystem is invoked to get or create a superblock which can then later * be used for mounting. The filesystem places a pointer to the root to be * used for mounting in @fc->root. */ int vfs_get_tree(struct fs_context *fc) { struct super_block *sb; int error; if (fc->root) return -EBUSY; /* Get the mountable root in fc->root, with a ref on the root and a ref * on the superblock. */ error = fc->ops->get_tree(fc); if (error < 0) return error; if (!fc->root) { pr_err("Filesystem %s get_tree() didn't set fc->root, returned %i\n", fc->fs_type->name, error); /* We don't know what the locking state of the superblock is - * if there is a superblock. */ BUG(); } sb = fc->root->d_sb; WARN_ON(!sb->s_bdi); /* * super_wake() contains a memory barrier which also care of * ordering for super_cache_count(). We place it before setting * SB_BORN as the data dependency between the two functions is * the superblock structure contents that we just set up, not * the SB_BORN flag. */ super_wake(sb, SB_BORN); error = security_sb_set_mnt_opts(sb, fc->security, 0, NULL); if (unlikely(error)) { fc_drop_locked(fc); return error; } /* * filesystems should never set s_maxbytes larger than MAX_LFS_FILESIZE * but s_maxbytes was an unsigned long long for many releases. Throw * this warning for a little while to try and catch filesystems that * violate this rule. */ WARN((sb->s_maxbytes < 0), "%s set sb->s_maxbytes to " "negative value (%lld)\n", fc->fs_type->name, sb->s_maxbytes); return 0; } EXPORT_SYMBOL(vfs_get_tree); /* * Setup private BDI for given superblock. It gets automatically cleaned up * in generic_shutdown_super(). */ int super_setup_bdi_name(struct super_block *sb, char *fmt, ...) { struct backing_dev_info *bdi; int err; va_list args; bdi = bdi_alloc(NUMA_NO_NODE); if (!bdi) return -ENOMEM; va_start(args, fmt); err = bdi_register_va(bdi, fmt, args); va_end(args); if (err) { bdi_put(bdi); return err; } WARN_ON(sb->s_bdi != &noop_backing_dev_info); sb->s_bdi = bdi; sb->s_iflags |= SB_I_PERSB_BDI; return 0; } EXPORT_SYMBOL(super_setup_bdi_name); /* * Setup private BDI for given superblock. I gets automatically cleaned up * in generic_shutdown_super(). */ int super_setup_bdi(struct super_block *sb) { static atomic_long_t bdi_seq = ATOMIC_LONG_INIT(0); return super_setup_bdi_name(sb, "%.28s-%ld", sb->s_type->name, atomic_long_inc_return(&bdi_seq)); } EXPORT_SYMBOL(super_setup_bdi); /** * sb_wait_write - wait until all writers to given file system finish * @sb: the super for which we wait * @level: type of writers we wait for (normal vs page fault) * * This function waits until there are no writers of given type to given file * system. */ static void sb_wait_write(struct super_block *sb, int level) { percpu_down_write(sb->s_writers.rw_sem + level-1); } /* * We are going to return to userspace and forget about these locks, the * ownership goes to the caller of thaw_super() which does unlock(). */ static void lockdep_sb_freeze_release(struct super_block *sb) { int level; for (level = SB_FREEZE_LEVELS - 1; level >= 0; level--) percpu_rwsem_release(sb->s_writers.rw_sem + level, _THIS_IP_); } /* * Tell lockdep we are holding these locks before we call ->unfreeze_fs(sb). */ static void lockdep_sb_freeze_acquire(struct super_block *sb) { int level; for (level = 0; level < SB_FREEZE_LEVELS; ++level) percpu_rwsem_acquire(sb->s_writers.rw_sem + level, 0, _THIS_IP_); } static void sb_freeze_unlock(struct super_block *sb, int level) { for (level--; level >= 0; level--) percpu_up_write(sb->s_writers.rw_sem + level); } static int wait_for_partially_frozen(struct super_block *sb) { int ret = 0; do { unsigned short old = sb->s_writers.frozen; up_write(&sb->s_umount); ret = wait_var_event_killable(&sb->s_writers.frozen, sb->s_writers.frozen != old); down_write(&sb->s_umount); } while (ret == 0 && sb->s_writers.frozen != SB_UNFROZEN && sb->s_writers.frozen != SB_FREEZE_COMPLETE); return ret; } #define FREEZE_HOLDERS (FREEZE_HOLDER_KERNEL | FREEZE_HOLDER_USERSPACE) #define FREEZE_FLAGS (FREEZE_HOLDERS | FREEZE_MAY_NEST | FREEZE_EXCL) static inline int freeze_inc(struct super_block *sb, enum freeze_holder who) { WARN_ON_ONCE((who & ~FREEZE_FLAGS)); WARN_ON_ONCE(hweight32(who & FREEZE_HOLDERS) > 1); if (who & FREEZE_HOLDER_KERNEL) ++sb->s_writers.freeze_kcount; if (who & FREEZE_HOLDER_USERSPACE) ++sb->s_writers.freeze_ucount; return sb->s_writers.freeze_kcount + sb->s_writers.freeze_ucount; } static inline int freeze_dec(struct super_block *sb, enum freeze_holder who) { WARN_ON_ONCE((who & ~FREEZE_FLAGS)); WARN_ON_ONCE(hweight32(who & FREEZE_HOLDERS) > 1); if ((who & FREEZE_HOLDER_KERNEL) && sb->s_writers.freeze_kcount) --sb->s_writers.freeze_kcount; if ((who & FREEZE_HOLDER_USERSPACE) && sb->s_writers.freeze_ucount) --sb->s_writers.freeze_ucount; return sb->s_writers.freeze_kcount + sb->s_writers.freeze_ucount; } static inline bool may_freeze(struct super_block *sb, enum freeze_holder who, const void *freeze_owner) { lockdep_assert_held(&sb->s_umount); WARN_ON_ONCE((who & ~FREEZE_FLAGS)); WARN_ON_ONCE(hweight32(who & FREEZE_HOLDERS) > 1); if (who & FREEZE_EXCL) { if (WARN_ON_ONCE(!(who & FREEZE_HOLDER_KERNEL))) return false; if (WARN_ON_ONCE(who & ~(FREEZE_EXCL | FREEZE_HOLDER_KERNEL))) return false; if (WARN_ON_ONCE(!freeze_owner)) return false; /* This freeze already has a specific owner. */ if (sb->s_writers.freeze_owner) return false; /* * This is already frozen multiple times so we're just * going to take a reference count and mark the freeze as * being owned by the caller. */ if (sb->s_writers.freeze_kcount + sb->s_writers.freeze_ucount) sb->s_writers.freeze_owner = freeze_owner; return true; } if (who & FREEZE_HOLDER_KERNEL) return (who & FREEZE_MAY_NEST) || sb->s_writers.freeze_kcount == 0; if (who & FREEZE_HOLDER_USERSPACE) return (who & FREEZE_MAY_NEST) || sb->s_writers.freeze_ucount == 0; return false; } static inline bool may_unfreeze(struct super_block *sb, enum freeze_holder who, const void *freeze_owner) { lockdep_assert_held(&sb->s_umount); WARN_ON_ONCE((who & ~FREEZE_FLAGS)); WARN_ON_ONCE(hweight32(who & FREEZE_HOLDERS) > 1); if (who & FREEZE_EXCL) { if (WARN_ON_ONCE(!(who & FREEZE_HOLDER_KERNEL))) return false; if (WARN_ON_ONCE(who & ~(FREEZE_EXCL | FREEZE_HOLDER_KERNEL))) return false; if (WARN_ON_ONCE(!freeze_owner)) return false; if (WARN_ON_ONCE(sb->s_writers.freeze_kcount == 0)) return false; /* This isn't exclusively frozen. */ if (!sb->s_writers.freeze_owner) return false; /* This isn't exclusively frozen by us. */ if (sb->s_writers.freeze_owner != freeze_owner) return false; /* * This is still frozen multiple times so we're just * going to drop our reference count and undo our * exclusive freeze. */ if ((sb->s_writers.freeze_kcount + sb->s_writers.freeze_ucount) > 1) sb->s_writers.freeze_owner = NULL; return true; } if (who & FREEZE_HOLDER_KERNEL) { /* * Someone's trying to steal the reference belonging to * @sb->s_writers.freeze_owner. */ if (sb->s_writers.freeze_kcount == 1 && sb->s_writers.freeze_owner) return false; return sb->s_writers.freeze_kcount > 0; } if (who & FREEZE_HOLDER_USERSPACE) return sb->s_writers.freeze_ucount > 0; return false; } /** * freeze_super - lock the filesystem and force it into a consistent state * @sb: the super to lock * @who: context that wants to freeze * @freeze_owner: owner of the freeze * * Syncs the super to make sure the filesystem is consistent and calls the fs's * freeze_fs. Subsequent calls to this without first thawing the fs may return * -EBUSY. * * @who should be: * * %FREEZE_HOLDER_USERSPACE if userspace wants to freeze the fs; * * %FREEZE_HOLDER_KERNEL if the kernel wants to freeze the fs. * * %FREEZE_MAY_NEST whether nesting freeze and thaw requests is allowed. * * The @who argument distinguishes between the kernel and userspace trying to * freeze the filesystem. Although there cannot be multiple kernel freezes or * multiple userspace freezes in effect at any given time, the kernel and * userspace can both hold a filesystem frozen. The filesystem remains frozen * until there are no kernel or userspace freezes in effect. * * A filesystem may hold multiple devices and thus a filesystems may be * frozen through the block layer via multiple block devices. In this * case the request is marked as being allowed to nest by passing * FREEZE_MAY_NEST. The filesystem remains frozen until all block * devices are unfrozen. If multiple freezes are attempted without * FREEZE_MAY_NEST -EBUSY will be returned. * * During this function, sb->s_writers.frozen goes through these values: * * SB_UNFROZEN: File system is normal, all writes progress as usual. * * SB_FREEZE_WRITE: The file system is in the process of being frozen. New * writes should be blocked, though page faults are still allowed. We wait for * all writes to complete and then proceed to the next stage. * * SB_FREEZE_PAGEFAULT: Freezing continues. Now also page faults are blocked * but internal fs threads can still modify the filesystem (although they * should not dirty new pages or inodes), writeback can run etc. After waiting * for all running page faults we sync the filesystem which will clean all * dirty pages and inodes (no new dirty pages or inodes can be created when * sync is running). * * SB_FREEZE_FS: The file system is frozen. Now all internal sources of fs * modification are blocked (e.g. XFS preallocation truncation on inode * reclaim). This is usually implemented by blocking new transactions for * filesystems that have them and need this additional guard. After all * internal writers are finished we call ->freeze_fs() to finish filesystem * freezing. Then we transition to SB_FREEZE_COMPLETE state. This state is * mostly auxiliary for filesystems to verify they do not modify frozen fs. * * sb->s_writers.frozen is protected by sb->s_umount. * * Return: If the freeze was successful zero is returned. If the freeze * failed a negative error code is returned. */ int freeze_super(struct super_block *sb, enum freeze_holder who, const void *freeze_owner) { int ret; if (!super_lock_excl(sb)) { WARN_ON_ONCE("Dying superblock while freezing!"); return -EINVAL; } atomic_inc(&sb->s_active); retry: if (sb->s_writers.frozen == SB_FREEZE_COMPLETE) { if (may_freeze(sb, who, freeze_owner)) ret = !!WARN_ON_ONCE(freeze_inc(sb, who) == 1); else ret = -EBUSY; /* All freezers share a single active reference. */ deactivate_locked_super(sb); return ret; } if (sb->s_writers.frozen != SB_UNFROZEN) { ret = wait_for_partially_frozen(sb); if (ret) { deactivate_locked_super(sb); return ret; } goto retry; } if (sb_rdonly(sb)) { /* Nothing to do really... */ WARN_ON_ONCE(freeze_inc(sb, who) > 1); sb->s_writers.freeze_owner = freeze_owner; sb->s_writers.frozen = SB_FREEZE_COMPLETE; wake_up_var(&sb->s_writers.frozen); super_unlock_excl(sb); return 0; } sb->s_writers.frozen = SB_FREEZE_WRITE; /* Release s_umount to preserve sb_start_write -> s_umount ordering */ super_unlock_excl(sb); sb_wait_write(sb, SB_FREEZE_WRITE); __super_lock_excl(sb); /* Now we go and block page faults... */ sb->s_writers.frozen = SB_FREEZE_PAGEFAULT; sb_wait_write(sb, SB_FREEZE_PAGEFAULT); /* All writers are done so after syncing there won't be dirty data */ ret = sync_filesystem(sb); if (ret) { sb->s_writers.frozen = SB_UNFROZEN; sb_freeze_unlock(sb, SB_FREEZE_PAGEFAULT); wake_up_var(&sb->s_writers.frozen); deactivate_locked_super(sb); return ret; } /* Now wait for internal filesystem counter */ sb->s_writers.frozen = SB_FREEZE_FS; sb_wait_write(sb, SB_FREEZE_FS); if (sb->s_op->freeze_fs) { ret = sb->s_op->freeze_fs(sb); if (ret) { printk(KERN_ERR "VFS:Filesystem freeze failed\n"); sb->s_writers.frozen = SB_UNFROZEN; sb_freeze_unlock(sb, SB_FREEZE_FS); wake_up_var(&sb->s_writers.frozen); deactivate_locked_super(sb); return ret; } } /* * For debugging purposes so that fs can warn if it sees write activity * when frozen is set to SB_FREEZE_COMPLETE, and for thaw_super(). */ WARN_ON_ONCE(freeze_inc(sb, who) > 1); sb->s_writers.freeze_owner = freeze_owner; sb->s_writers.frozen = SB_FREEZE_COMPLETE; wake_up_var(&sb->s_writers.frozen); lockdep_sb_freeze_release(sb); super_unlock_excl(sb); return 0; } EXPORT_SYMBOL(freeze_super); /* * Undoes the effect of a freeze_super_locked call. If the filesystem is * frozen both by userspace and the kernel, a thaw call from either source * removes that state without releasing the other state or unlocking the * filesystem. */ static int thaw_super_locked(struct super_block *sb, enum freeze_holder who, const void *freeze_owner) { int error = -EINVAL; if (sb->s_writers.frozen != SB_FREEZE_COMPLETE) goto out_unlock; if (!may_unfreeze(sb, who, freeze_owner)) goto out_unlock; /* * All freezers share a single active reference. * So just unlock in case there are any left. */ if (freeze_dec(sb, who)) goto out_unlock; if (sb_rdonly(sb)) { sb->s_writers.frozen = SB_UNFROZEN; sb->s_writers.freeze_owner = NULL; wake_up_var(&sb->s_writers.frozen); goto out_deactivate; } lockdep_sb_freeze_acquire(sb); if (sb->s_op->unfreeze_fs) { error = sb->s_op->unfreeze_fs(sb); if (error) { pr_err("VFS: Filesystem thaw failed\n"); freeze_inc(sb, who); lockdep_sb_freeze_release(sb); goto out_unlock; } } sb->s_writers.frozen = SB_UNFROZEN; sb->s_writers.freeze_owner = NULL; wake_up_var(&sb->s_writers.frozen); sb_freeze_unlock(sb, SB_FREEZE_FS); out_deactivate: deactivate_locked_super(sb); return 0; out_unlock: super_unlock_excl(sb); return error; } /** * thaw_super -- unlock filesystem * @sb: the super to thaw * @who: context that wants to freeze * @freeze_owner: owner of the freeze * * Unlocks the filesystem and marks it writeable again after freeze_super() * if there are no remaining freezes on the filesystem. * * @who should be: * * %FREEZE_HOLDER_USERSPACE if userspace wants to thaw the fs; * * %FREEZE_HOLDER_KERNEL if the kernel wants to thaw the fs. * * %FREEZE_MAY_NEST whether nesting freeze and thaw requests is allowed * * A filesystem may hold multiple devices and thus a filesystems may * have been frozen through the block layer via multiple block devices. * The filesystem remains frozen until all block devices are unfrozen. */ int thaw_super(struct super_block *sb, enum freeze_holder who, const void *freeze_owner) { if (!super_lock_excl(sb)) { WARN_ON_ONCE("Dying superblock while thawing!"); return -EINVAL; } return thaw_super_locked(sb, who, freeze_owner); } EXPORT_SYMBOL(thaw_super); /* * Create workqueue for deferred direct IO completions. We allocate the * workqueue when it's first needed. This avoids creating workqueue for * filesystems that don't need it and also allows us to create the workqueue * late enough so the we can include s_id in the name of the workqueue. */ int sb_init_dio_done_wq(struct super_block *sb) { struct workqueue_struct *old; struct workqueue_struct *wq = alloc_workqueue("dio/%s", WQ_MEM_RECLAIM, 0, sb->s_id); if (!wq) return -ENOMEM; /* * This has to be atomic as more DIOs can race to create the workqueue */ old = cmpxchg(&sb->s_dio_done_wq, NULL, wq); /* Someone created workqueue before us? Free ours... */ if (old) destroy_workqueue(wq); return 0; } EXPORT_SYMBOL_GPL(sb_init_dio_done_wq);
315 315 315 315 315 315 195 196 3 3 3 315 314 315 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 // SPDX-License-Identifier: GPL-2.0 /* * kobject.c - library routines for handling generic kernel objects * * Copyright (c) 2002-2003 Patrick Mochel <mochel@osdl.org> * Copyright (c) 2006-2007 Greg Kroah-Hartman <greg@kroah.com> * Copyright (c) 2006-2007 Novell Inc. * * Please see the file Documentation/core-api/kobject.rst for critical information * about using the kobject interface. */ #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt #include <linux/kobject.h> #include <linux/string.h> #include <linux/export.h> #include <linux/stat.h> #include <linux/slab.h> #include <linux/random.h> /** * kobject_namespace() - Return @kobj's namespace tag. * @kobj: kobject in question * * Returns namespace tag of @kobj if its parent has namespace ops enabled * and thus @kobj should have a namespace tag associated with it. Returns * %NULL otherwise. */ const void *kobject_namespace(const struct kobject *kobj) { const struct kobj_ns_type_operations *ns_ops = kobj_ns_ops(kobj); if (!ns_ops || ns_ops->type == KOBJ_NS_TYPE_NONE) return NULL; return kobj->ktype->namespace(kobj); } /** * kobject_get_ownership() - Get sysfs ownership data for @kobj. * @kobj: kobject in question * @uid: kernel user ID for sysfs objects * @gid: kernel group ID for sysfs objects * * Returns initial uid/gid pair that should be used when creating sysfs * representation of given kobject. Normally used to adjust ownership of * objects in a container. */ void kobject_get_ownership(const struct kobject *kobj, kuid_t *uid, kgid_t *gid) { *uid = GLOBAL_ROOT_UID; *gid = GLOBAL_ROOT_GID; if (kobj->ktype->get_ownership) kobj->ktype->get_ownership(kobj, uid, gid); } static bool kobj_ns_type_is_valid(enum kobj_ns_type type) { if ((type <= KOBJ_NS_TYPE_NONE) || (type >= KOBJ_NS_TYPES)) return false; return true; } static int create_dir(struct kobject *kobj) { const struct kobj_type *ktype = get_ktype(kobj); const struct kobj_ns_type_operations *ops; int error; error = sysfs_create_dir_ns(kobj, kobject_namespace(kobj)); if (error) return error; if (ktype) { error = sysfs_create_groups(kobj, ktype->default_groups); if (error) { sysfs_remove_dir(kobj); return error; } } /* * @kobj->sd may be deleted by an ancestor going away. Hold an * extra reference so that it stays until @kobj is gone. */ sysfs_get(kobj->sd); /* * If @kobj has ns_ops, its children need to be filtered based on * their namespace tags. Enable namespace support on @kobj->sd. */ ops = kobj_child_ns_ops(kobj); if (ops) { BUG_ON(!kobj_ns_type_is_valid(ops->type)); BUG_ON(!kobj_ns_type_registered(ops->type)); sysfs_enable_ns(kobj->sd); } return 0; } static int get_kobj_path_length(const struct kobject *kobj) { int length = 1; const struct kobject *parent = kobj; /* walk up the ancestors until we hit the one pointing to the * root. * Add 1 to strlen for leading '/' of each level. */ do { if (kobject_name(parent) == NULL) return 0; length += strlen(kobject_name(parent)) + 1; parent = parent->parent; } while (parent); return length; } static int fill_kobj_path(const struct kobject *kobj, char *path, int length) { const struct kobject *parent; --length; for (parent = kobj; parent; parent = parent->parent) { int cur = strlen(kobject_name(parent)); /* back up enough to print this name with '/' */ length -= cur; if (length <= 0) return -EINVAL; memcpy(path + length, kobject_name(parent), cur); *(path + --length) = '/'; } pr_debug("'%s' (%p): %s: path = '%s'\n", kobject_name(kobj), kobj, __func__, path); return 0; } /** * kobject_get_path() - Allocate memory and fill in the path for @kobj. * @kobj: kobject in question, with which to build the path * @gfp_mask: the allocation type used to allocate the path * * Return: The newly allocated memory, caller must free with kfree(). */ char *kobject_get_path(const struct kobject *kobj, gfp_t gfp_mask) { char *path; int len; retry: len = get_kobj_path_length(kobj); if (len == 0) return NULL; path = kzalloc(len, gfp_mask); if (!path) return NULL; if (fill_kobj_path(kobj, path, len)) { kfree(path); goto retry; } return path; } EXPORT_SYMBOL_GPL(kobject_get_path); /* add the kobject to its kset's list */ static void kobj_kset_join(struct kobject *kobj) { if (!kobj->kset) return; kset_get(kobj->kset); spin_lock(&kobj->kset->list_lock); list_add_tail(&kobj->entry, &kobj->kset->list); spin_unlock(&kobj->kset->list_lock); } /* remove the kobject from its kset's list */ static void kobj_kset_leave(struct kobject *kobj) { if (!kobj->kset) return; spin_lock(&kobj->kset->list_lock); list_del_init(&kobj->entry); spin_unlock(&kobj->kset->list_lock); kset_put(kobj->kset); } static void kobject_init_internal(struct kobject *kobj) { if (!kobj) return; kref_init(&kobj->kref); INIT_LIST_HEAD(&kobj->entry); kobj->state_in_sysfs = 0; kobj->state_add_uevent_sent = 0; kobj->state_remove_uevent_sent = 0; kobj->state_initialized = 1; } static int kobject_add_internal(struct kobject *kobj) { int error = 0; struct kobject *parent; if (!kobj) return -ENOENT; if (!kobj->name || !kobj->name[0]) { WARN(1, "kobject: (%p): attempted to be registered with empty name!\n", kobj); return -EINVAL; } parent = kobject_get(kobj->parent); /* join kset if set, use it as parent if we do not already have one */ if (kobj->kset) { if (!parent) parent = kobject_get(&kobj->kset->kobj); kobj_kset_join(kobj); kobj->parent = parent; } pr_debug("'%s' (%p): %s: parent: '%s', set: '%s'\n", kobject_name(kobj), kobj, __func__, parent ? kobject_name(parent) : "<NULL>", kobj->kset ? kobject_name(&kobj->kset->kobj) : "<NULL>"); error = create_dir(kobj); if (error) { kobj_kset_leave(kobj); kobject_put(parent); kobj->parent = NULL; /* be noisy on error issues */ if (error == -EEXIST) pr_err("%s failed for %s with -EEXIST, don't try to register things with the same name in the same directory.\n", __func__, kobject_name(kobj)); else pr_err("%s failed for %s (error: %d parent: %s)\n", __func__, kobject_name(kobj), error, parent ? kobject_name(parent) : "'none'"); } else kobj->state_in_sysfs = 1; return error; } /** * kobject_set_name_vargs() - Set the name of a kobject. * @kobj: struct kobject to set the name of * @fmt: format string used to build the name * @vargs: vargs to format the string. */ int kobject_set_name_vargs(struct kobject *kobj, const char *fmt, va_list vargs) { const char *s; if (kobj->name && !fmt) return 0; s = kvasprintf_const(GFP_KERNEL, fmt, vargs); if (!s) return -ENOMEM; /* * ewww... some of these buggers have '/' in the name ... If * that's the case, we need to make sure we have an actual * allocated copy to modify, since kvasprintf_const may have * returned something from .rodata. */ if (strchr(s, '/')) { char *t; t = kstrdup(s, GFP_KERNEL); kfree_const(s); if (!t) return -ENOMEM; s = strreplace(t, '/', '!'); } kfree_const(kobj->name); kobj->name = s; return 0; } /** * kobject_set_name() - Set the name of a kobject. * @kobj: struct kobject to set the name of * @fmt: format string used to build the name * * This sets the name of the kobject. If you have already added the * kobject to the system, you must call kobject_rename() in order to * change the name of the kobject. */ int kobject_set_name(struct kobject *kobj, const char *fmt, ...) { va_list vargs; int retval; va_start(vargs, fmt); retval = kobject_set_name_vargs(kobj, fmt, vargs); va_end(vargs); return retval; } EXPORT_SYMBOL(kobject_set_name); /** * kobject_init() - Initialize a kobject structure. * @kobj: pointer to the kobject to initialize * @ktype: pointer to the ktype for this kobject. * * This function will properly initialize a kobject such that it can then * be passed to the kobject_add() call. * * After this function is called, the kobject MUST be cleaned up by a call * to kobject_put(), not by a call to kfree directly to ensure that all of * the memory is cleaned up properly. */ void kobject_init(struct kobject *kobj, const struct kobj_type *ktype) { char *err_str; if (!kobj) { err_str = "invalid kobject pointer!"; goto error; } if (!ktype) { err_str = "must have a ktype to be initialized properly!\n"; goto error; } if (kobj->state_initialized) { /* do not error out as sometimes we can recover */ pr_err("kobject (%p): tried to init an initialized object, something is seriously wrong.\n", kobj); dump_stack_lvl(KERN_ERR); } kobject_init_internal(kobj); kobj->ktype = ktype; return; error: pr_err("kobject (%p): %s\n", kobj, err_str); dump_stack_lvl(KERN_ERR); } EXPORT_SYMBOL(kobject_init); static __printf(3, 0) int kobject_add_varg(struct kobject *kobj, struct kobject *parent, const char *fmt, va_list vargs) { int retval; retval = kobject_set_name_vargs(kobj, fmt, vargs); if (retval) { pr_err("can not set name properly!\n"); return retval; } kobj->parent = parent; return kobject_add_internal(kobj); } /** * kobject_add() - The main kobject add function. * @kobj: the kobject to add * @parent: pointer to the parent of the kobject. * @fmt: format to name the kobject with. * * The kobject name is set and added to the kobject hierarchy in this * function. * * If @parent is set, then the parent of the @kobj will be set to it. * If @parent is NULL, then the parent of the @kobj will be set to the * kobject associated with the kset assigned to this kobject. If no kset * is assigned to the kobject, then the kobject will be located in the * root of the sysfs tree. * * Note, no "add" uevent will be created with this call, the caller should set * up all of the necessary sysfs files for the object and then call * kobject_uevent() with the UEVENT_ADD parameter to ensure that * userspace is properly notified of this kobject's creation. * * Return: If this function returns an error, kobject_put() must be * called to properly clean up the memory associated with the * object. Under no instance should the kobject that is passed * to this function be directly freed with a call to kfree(), * that can leak memory. * * If this function returns success, kobject_put() must also be called * in order to properly clean up the memory associated with the object. * * In short, once this function is called, kobject_put() MUST be called * when the use of the object is finished in order to properly free * everything. */ int kobject_add(struct kobject *kobj, struct kobject *parent, const char *fmt, ...) { va_list args; int retval; if (!kobj) return -EINVAL; if (!kobj->state_initialized) { pr_err("kobject '%s' (%p): tried to add an uninitialized object, something is seriously wrong.\n", kobject_name(kobj), kobj); dump_stack_lvl(KERN_ERR); return -EINVAL; } va_start(args, fmt); retval = kobject_add_varg(kobj, parent, fmt, args); va_end(args); return retval; } EXPORT_SYMBOL(kobject_add); /** * kobject_init_and_add() - Initialize a kobject structure and add it to * the kobject hierarchy. * @kobj: pointer to the kobject to initialize * @ktype: pointer to the ktype for this kobject. * @parent: pointer to the parent of this kobject. * @fmt: the name of the kobject. * * This function combines the call to kobject_init() and kobject_add(). * * If this function returns an error, kobject_put() must be called to * properly clean up the memory associated with the object. This is the * same type of error handling after a call to kobject_add() and kobject * lifetime rules are the same here. */ int kobject_init_and_add(struct kobject *kobj, const struct kobj_type *ktype, struct kobject *parent, const char *fmt, ...) { va_list args; int retval; kobject_init(kobj, ktype); va_start(args, fmt); retval = kobject_add_varg(kobj, parent, fmt, args); va_end(args); return retval; } EXPORT_SYMBOL_GPL(kobject_init_and_add); /** * kobject_rename() - Change the name of an object. * @kobj: object in question. * @new_name: object's new name * * It is the responsibility of the caller to provide mutual * exclusion between two different calls of kobject_rename * on the same kobject and to ensure that new_name is valid and * won't conflict with other kobjects. */ int kobject_rename(struct kobject *kobj, const char *new_name) { int error = 0; const char *devpath = NULL; const char *dup_name = NULL, *name; char *devpath_string = NULL; char *envp[2]; kobj = kobject_get(kobj); if (!kobj) return -EINVAL; if (!kobj->parent) { kobject_put(kobj); return -EINVAL; } devpath = kobject_get_path(kobj, GFP_KERNEL); if (!devpath) { error = -ENOMEM; goto out; } devpath_string = kmalloc(strlen(devpath) + 15, GFP_KERNEL); if (!devpath_string) { error = -ENOMEM; goto out; } sprintf(devpath_string, "DEVPATH_OLD=%s", devpath); envp[0] = devpath_string; envp[1] = NULL; name = dup_name = kstrdup_const(new_name, GFP_KERNEL); if (!name) { error = -ENOMEM; goto out; } error = sysfs_rename_dir_ns(kobj, new_name, kobject_namespace(kobj)); if (error) goto out; /* Install the new kobject name */ dup_name = kobj->name; kobj->name = name; /* This function is mostly/only used for network interface. * Some hotplug package track interfaces by their name and * therefore want to know when the name is changed by the user. */ kobject_uevent_env(kobj, KOBJ_MOVE, envp); out: kfree_const(dup_name); kfree(devpath_string); kfree(devpath); kobject_put(kobj); return error; } EXPORT_SYMBOL_GPL(kobject_rename); /** * kobject_move() - Move object to another parent. * @kobj: object in question. * @new_parent: object's new parent (can be NULL) */ int kobject_move(struct kobject *kobj, struct kobject *new_parent) { int error; struct kobject *old_parent; const char *devpath = NULL; char *devpath_string = NULL; char *envp[2]; kobj = kobject_get(kobj); if (!kobj) return -EINVAL; new_parent = kobject_get(new_parent); if (!new_parent) { if (kobj->kset) new_parent = kobject_get(&kobj->kset->kobj); } /* old object path */ devpath = kobject_get_path(kobj, GFP_KERNEL); if (!devpath) { error = -ENOMEM; goto out; } devpath_string = kmalloc(strlen(devpath) + 15, GFP_KERNEL); if (!devpath_string) { error = -ENOMEM; goto out; } sprintf(devpath_string, "DEVPATH_OLD=%s", devpath); envp[0] = devpath_string; envp[1] = NULL; error = sysfs_move_dir_ns(kobj, new_parent, kobject_namespace(kobj)); if (error) goto out; old_parent = kobj->parent; kobj->parent = new_parent; new_parent = NULL; kobject_put(old_parent); kobject_uevent_env(kobj, KOBJ_MOVE, envp); out: kobject_put(new_parent); kobject_put(kobj); kfree(devpath_string); kfree(devpath); return error; } EXPORT_SYMBOL_GPL(kobject_move); static void __kobject_del(struct kobject *kobj) { struct kernfs_node *sd; const struct kobj_type *ktype; sd = kobj->sd; ktype = get_ktype(kobj); if (ktype) sysfs_remove_groups(kobj, ktype->default_groups); /* send "remove" if the caller did not do it but sent "add" */ if (kobj->state_add_uevent_sent && !kobj->state_remove_uevent_sent) { pr_debug("'%s' (%p): auto cleanup 'remove' event\n", kobject_name(kobj), kobj); kobject_uevent(kobj, KOBJ_REMOVE); } sysfs_remove_dir(kobj); sysfs_put(sd); kobj->state_in_sysfs = 0; kobj_kset_leave(kobj); kobj->parent = NULL; } /** * kobject_del() - Unlink kobject from hierarchy. * @kobj: object. * * This is the function that should be called to delete an object * successfully added via kobject_add(). */ void kobject_del(struct kobject *kobj) { struct kobject *parent; if (!kobj) return; parent = kobj->parent; __kobject_del(kobj); kobject_put(parent); } EXPORT_SYMBOL(kobject_del); /** * kobject_get() - Increment refcount for object. * @kobj: object. */ struct kobject *kobject_get(struct kobject *kobj) { if (kobj) { if (!kobj->state_initialized) WARN(1, KERN_WARNING "kobject: '%s' (%p): is not initialized, yet kobject_get() is being called.\n", kobject_name(kobj), kobj); kref_get(&kobj->kref); } return kobj; } EXPORT_SYMBOL(kobject_get); struct kobject * __must_check kobject_get_unless_zero(struct kobject *kobj) { if (!kobj) return NULL; if (!kref_get_unless_zero(&kobj->kref)) kobj = NULL; return kobj; } EXPORT_SYMBOL(kobject_get_unless_zero); /* * kobject_cleanup - free kobject resources. * @kobj: object to cleanup */ static void kobject_cleanup(struct kobject *kobj) { struct kobject *parent = kobj->parent; const struct kobj_type *t = get_ktype(kobj); const char *name = kobj->name; pr_debug("'%s' (%p): %s, parent %p\n", kobject_name(kobj), kobj, __func__, kobj->parent); if (t && !t->release) pr_debug("'%s' (%p): does not have a release() function, it is broken and must be fixed. See Documentation/core-api/kobject.rst.\n", kobject_name(kobj), kobj); /* remove from sysfs if the caller did not do it */ if (kobj->state_in_sysfs) { pr_debug("'%s' (%p): auto cleanup kobject_del\n", kobject_name(kobj), kobj); __kobject_del(kobj); } else { /* avoid dropping the parent reference unnecessarily */ parent = NULL; } if (t && t->release) { pr_debug("'%s' (%p): calling ktype release\n", kobject_name(kobj), kobj); t->release(kobj); } /* free name if we allocated it */ if (name) { pr_debug("'%s': free name\n", name); kfree_const(name); } kobject_put(parent); } #ifdef CONFIG_DEBUG_KOBJECT_RELEASE static void kobject_delayed_cleanup(struct work_struct *work) { kobject_cleanup(container_of(to_delayed_work(work), struct kobject, release)); } #endif static void kobject_release(struct kref *kref) { struct kobject *kobj = container_of(kref, struct kobject, kref); #ifdef CONFIG_DEBUG_KOBJECT_RELEASE unsigned long delay = HZ + HZ * get_random_u32_below(4); pr_info("'%s' (%p): %s, parent %p (delayed %ld)\n", kobject_name(kobj), kobj, __func__, kobj->parent, delay); INIT_DELAYED_WORK(&kobj->release, kobject_delayed_cleanup); schedule_delayed_work(&kobj->release, delay); #else kobject_cleanup(kobj); #endif } /** * kobject_put() - Decrement refcount for object. * @kobj: object. * * Decrement the refcount, and if 0, call kobject_cleanup(). */ void kobject_put(struct kobject *kobj) { if (kobj) { if (!kobj->state_initialized) WARN(1, KERN_WARNING "kobject: '%s' (%p): is not initialized, yet kobject_put() is being called.\n", kobject_name(kobj), kobj); kref_put(&kobj->kref, kobject_release); } } EXPORT_SYMBOL(kobject_put); static void dynamic_kobj_release(struct kobject *kobj) { pr_debug("(%p): %s\n", kobj, __func__); kfree(kobj); } static const struct kobj_type dynamic_kobj_ktype = { .release = dynamic_kobj_release, .sysfs_ops = &kobj_sysfs_ops, }; /** * kobject_create() - Create a struct kobject dynamically. * * This function creates a kobject structure dynamically and sets it up * to be a "dynamic" kobject with a default release function set up. * * If the kobject was not able to be created, NULL will be returned. * The kobject structure returned from here must be cleaned up with a * call to kobject_put() and not kfree(), as kobject_init() has * already been called on this structure. */ static struct kobject *kobject_create(void) { struct kobject *kobj; kobj = kzalloc(sizeof(*kobj), GFP_KERNEL); if (!kobj) return NULL; kobject_init(kobj, &dynamic_kobj_ktype); return kobj; } /** * kobject_create_and_add() - Create a struct kobject dynamically and * register it with sysfs. * @name: the name for the kobject * @parent: the parent kobject of this kobject, if any. * * This function creates a kobject structure dynamically and registers it * with sysfs. When you are finished with this structure, call * kobject_put() and the structure will be dynamically freed when * it is no longer being used. * * If the kobject was not able to be created, NULL will be returned. */ struct kobject *kobject_create_and_add(const char *name, struct kobject *parent) { struct kobject *kobj; int retval; kobj = kobject_create(); if (!kobj) return NULL; retval = kobject_add(kobj, parent, "%s", name); if (retval) { pr_warn("%s: kobject_add error: %d\n", __func__, retval); kobject_put(kobj); kobj = NULL; } return kobj; } EXPORT_SYMBOL_GPL(kobject_create_and_add); /** * kset_init() - Initialize a kset for use. * @k: kset */ void kset_init(struct kset *k) { kobject_init_internal(&k->kobj); INIT_LIST_HEAD(&k->list); spin_lock_init(&k->list_lock); } /* default kobject attribute operations */ static ssize_t kobj_attr_show(struct kobject *kobj, struct attribute *attr, char *buf) { struct kobj_attribute *kattr; ssize_t ret = -EIO; kattr = container_of(attr, struct kobj_attribute, attr); if (kattr->show) ret = kattr->show(kobj, kattr, buf); return ret; } static ssize_t kobj_attr_store(struct kobject *kobj, struct attribute *attr, const char *buf, size_t count) { struct kobj_attribute *kattr; ssize_t ret = -EIO; kattr = container_of(attr, struct kobj_attribute, attr); if (kattr->store) ret = kattr->store(kobj, kattr, buf, count); return ret; } const struct sysfs_ops kobj_sysfs_ops = { .show = kobj_attr_show, .store = kobj_attr_store, }; EXPORT_SYMBOL_GPL(kobj_sysfs_ops); /** * kset_register() - Initialize and add a kset. * @k: kset. * * NOTE: On error, the kset.kobj.name allocated by() kobj_set_name() * is freed, it can not be used any more. */ int kset_register(struct kset *k) { int err; if (!k) return -EINVAL; if (!k->kobj.ktype) { pr_err("must have a ktype to be initialized properly!\n"); return -EINVAL; } kset_init(k); err = kobject_add_internal(&k->kobj); if (err) { kfree_const(k->kobj.name); /* Set it to NULL to avoid accessing bad pointer in callers. */ k->kobj.name = NULL; return err; } kobject_uevent(&k->kobj, KOBJ_ADD); return 0; } EXPORT_SYMBOL(kset_register); /** * kset_unregister() - Remove a kset. * @k: kset. */ void kset_unregister(struct kset *k) { if (!k) return; kobject_del(&k->kobj); kobject_put(&k->kobj); } EXPORT_SYMBOL(kset_unregister); /** * kset_find_obj() - Search for object in kset. * @kset: kset we're looking in. * @name: object's name. * * Lock kset via @kset->subsys, and iterate over @kset->list, * looking for a matching kobject. If matching object is found * take a reference and return the object. */ struct kobject *kset_find_obj(struct kset *kset, const char *name) { struct kobject *k; struct kobject *ret = NULL; spin_lock(&kset->list_lock); list_for_each_entry(k, &kset->list, entry) { if (kobject_name(k) && !strcmp(kobject_name(k), name)) { ret = kobject_get_unless_zero(k); break; } } spin_unlock(&kset->list_lock); return ret; } EXPORT_SYMBOL_GPL(kset_find_obj); static void kset_release(struct kobject *kobj) { struct kset *kset = container_of(kobj, struct kset, kobj); pr_debug("'%s' (%p): %s\n", kobject_name(kobj), kobj, __func__); kfree(kset); } static void kset_get_ownership(const struct kobject *kobj, kuid_t *uid, kgid_t *gid) { if (kobj->parent) kobject_get_ownership(kobj->parent, uid, gid); } static const struct kobj_type kset_ktype = { .sysfs_ops = &kobj_sysfs_ops, .release = kset_release, .get_ownership = kset_get_ownership, }; /** * kset_create() - Create a struct kset dynamically. * * @name: the name for the kset * @uevent_ops: a struct kset_uevent_ops for the kset * @parent_kobj: the parent kobject of this kset, if any. * * This function creates a kset structure dynamically. This structure can * then be registered with the system and show up in sysfs with a call to * kset_register(). When you are finished with this structure, if * kset_register() has been called, call kset_unregister() and the * structure will be dynamically freed when it is no longer being used. * * If the kset was not able to be created, NULL will be returned. */ static struct kset *kset_create(const char *name, const struct kset_uevent_ops *uevent_ops, struct kobject *parent_kobj) { struct kset *kset; int retval; kset = kzalloc(sizeof(*kset), GFP_KERNEL); if (!kset) return NULL; retval = kobject_set_name(&kset->kobj, "%s", name); if (retval) { kfree(kset); return NULL; } kset->uevent_ops = uevent_ops; kset->kobj.parent = parent_kobj; /* * The kobject of this kset will have a type of kset_ktype and belong to * no kset itself. That way we can properly free it when it is * finished being used. */ kset->kobj.ktype = &kset_ktype; kset->kobj.kset = NULL; return kset; } /** * kset_create_and_add() - Create a struct kset dynamically and add it to sysfs. * * @name: the name for the kset * @uevent_ops: a struct kset_uevent_ops for the kset * @parent_kobj: the parent kobject of this kset, if any. * * This function creates a kset structure dynamically and registers it * with sysfs. When you are finished with this structure, call * kset_unregister() and the structure will be dynamically freed when it * is no longer being used. * * If the kset was not able to be created, NULL will be returned. */ struct kset *kset_create_and_add(const char *name, const struct kset_uevent_ops *uevent_ops, struct kobject *parent_kobj) { struct kset *kset; int error; kset = kset_create(name, uevent_ops, parent_kobj); if (!kset) return NULL; error = kset_register(kset); if (error) { kfree(kset); return NULL; } return kset; } EXPORT_SYMBOL_GPL(kset_create_and_add); static DEFINE_SPINLOCK(kobj_ns_type_lock); static const struct kobj_ns_type_operations *kobj_ns_ops_tbl[KOBJ_NS_TYPES]; int kobj_ns_type_register(const struct kobj_ns_type_operations *ops) { enum kobj_ns_type type = ops->type; int error; spin_lock(&kobj_ns_type_lock); error = -EINVAL; if (!kobj_ns_type_is_valid(type)) goto out; error = -EBUSY; if (kobj_ns_ops_tbl[type]) goto out; error = 0; kobj_ns_ops_tbl[type] = ops; out: spin_unlock(&kobj_ns_type_lock); return error; } int kobj_ns_type_registered(enum kobj_ns_type type) { int registered = 0; spin_lock(&kobj_ns_type_lock); if (kobj_ns_type_is_valid(type)) registered = kobj_ns_ops_tbl[type] != NULL; spin_unlock(&kobj_ns_type_lock); return registered; } const struct kobj_ns_type_operations *kobj_child_ns_ops(const struct kobject *parent) { const struct kobj_ns_type_operations *ops = NULL; if (parent && parent->ktype && parent->ktype->child_ns_type) ops = parent->ktype->child_ns_type(parent); return ops; } const struct kobj_ns_type_operations *kobj_ns_ops(const struct kobject *kobj) { return kobj_child_ns_ops(kobj->parent); } bool kobj_ns_current_may_mount(enum kobj_ns_type type) { bool may_mount = true; spin_lock(&kobj_ns_type_lock); if (kobj_ns_type_is_valid(type) && kobj_ns_ops_tbl[type]) may_mount = kobj_ns_ops_tbl[type]->current_may_mount(); spin_unlock(&kobj_ns_type_lock); return may_mount; } void *kobj_ns_grab_current(enum kobj_ns_type type) { void *ns = NULL; spin_lock(&kobj_ns_type_lock); if (kobj_ns_type_is_valid(type) && kobj_ns_ops_tbl[type]) ns = kobj_ns_ops_tbl[type]->grab_current_ns(); spin_unlock(&kobj_ns_type_lock); return ns; } EXPORT_SYMBOL_GPL(kobj_ns_grab_current); void kobj_ns_drop(enum kobj_ns_type type, void *ns) { spin_lock(&kobj_ns_type_lock); if (kobj_ns_type_is_valid(type) && kobj_ns_ops_tbl[type] && kobj_ns_ops_tbl[type]->drop_ns) kobj_ns_ops_tbl[type]->drop_ns(ns); spin_unlock(&kobj_ns_type_lock); } EXPORT_SYMBOL_GPL(kobj_ns_drop);
160 45 142 7 4 5 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 // SPDX-License-Identifier: GPL-2.0 #include <linux/export.h> #include <linux/spinlock.h> #include <linux/atomic.h> /* * This is an implementation of the notion of "decrement a * reference count, and return locked if it decremented to zero". * * NOTE NOTE NOTE! This is _not_ equivalent to * * if (atomic_dec_and_test(&atomic)) { * spin_lock(&lock); * return 1; * } * return 0; * * because the spin-lock and the decrement must be * "atomic". */ int _atomic_dec_and_lock(atomic_t *atomic, spinlock_t *lock) { /* Subtract 1 from counter unless that drops it to 0 (ie. it was 1) */ if (atomic_add_unless(atomic, -1, 1)) return 0; /* Otherwise do it the slow way */ spin_lock(lock); if (atomic_dec_and_test(atomic)) return 1; spin_unlock(lock); return 0; } EXPORT_SYMBOL(_atomic_dec_and_lock); int _atomic_dec_and_lock_irqsave(atomic_t *atomic, spinlock_t *lock, unsigned long *flags) { /* Subtract 1 from counter unless that drops it to 0 (ie. it was 1) */ if (atomic_add_unless(atomic, -1, 1)) return 0; /* Otherwise do it the slow way */ spin_lock_irqsave(lock, *flags); if (atomic_dec_and_test(atomic)) return 1; spin_unlock_irqrestore(lock, *flags); return 0; } EXPORT_SYMBOL(_atomic_dec_and_lock_irqsave); int _atomic_dec_and_raw_lock(atomic_t *atomic, raw_spinlock_t *lock) { /* Subtract 1 from counter unless that drops it to 0 (ie. it was 1) */ if (atomic_add_unless(atomic, -1, 1)) return 0; /* Otherwise do it the slow way */ raw_spin_lock(lock); if (atomic_dec_and_test(atomic)) return 1; raw_spin_unlock(lock); return 0; } EXPORT_SYMBOL(_atomic_dec_and_raw_lock); int _atomic_dec_and_raw_lock_irqsave(atomic_t *atomic, raw_spinlock_t *lock, unsigned long *flags) { /* Subtract 1 from counter unless that drops it to 0 (ie. it was 1) */ if (atomic_add_unless(atomic, -1, 1)) return 0; /* Otherwise do it the slow way */ raw_spin_lock_irqsave(lock, *flags); if (atomic_dec_and_test(atomic)) return 1; raw_spin_unlock_irqrestore(lock, *flags); return 0; } EXPORT_SYMBOL(_atomic_dec_and_raw_lock_irqsave);
1009 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 /* SPDX-License-Identifier: GPL-2.0-only */ /* * Copyright (C) 2016 ARM Ltd. */ #ifndef __ASM_PGTABLE_PROT_H #define __ASM_PGTABLE_PROT_H #include <asm/memory.h> #include <asm/pgtable-hwdef.h> #include <linux/const.h> /* * Software defined PTE bits definition. */ #define PTE_WRITE (PTE_DBM) /* same as DBM (51) */ #define PTE_SWP_EXCLUSIVE (_AT(pteval_t, 1) << 2) /* only for swp ptes */ #define PTE_DIRTY (_AT(pteval_t, 1) << 55) #define PTE_SPECIAL (_AT(pteval_t, 1) << 56) #define PTE_DEVMAP (_AT(pteval_t, 1) << 57) /* * PTE_PRESENT_INVALID=1 & PTE_VALID=0 indicates that the pte's fields should be * interpreted according to the HW layout by SW but any attempted HW access to * the address will result in a fault. pte_present() returns true. */ #define PTE_PRESENT_INVALID (PTE_NG) /* only when !PTE_VALID */ #ifdef CONFIG_HAVE_ARCH_USERFAULTFD_WP #define PTE_UFFD_WP (_AT(pteval_t, 1) << 58) /* uffd-wp tracking */ #define PTE_SWP_UFFD_WP (_AT(pteval_t, 1) << 3) /* only for swp ptes */ #else #define PTE_UFFD_WP (_AT(pteval_t, 0)) #define PTE_SWP_UFFD_WP (_AT(pteval_t, 0)) #endif /* CONFIG_HAVE_ARCH_USERFAULTFD_WP */ #define _PROT_DEFAULT (PTE_TYPE_PAGE | PTE_AF | PTE_SHARED) #define PROT_DEFAULT (PTE_TYPE_PAGE | PTE_MAYBE_NG | PTE_MAYBE_SHARED | PTE_AF) #define PROT_SECT_DEFAULT (PMD_TYPE_SECT | PMD_MAYBE_NG | PMD_MAYBE_SHARED | PMD_SECT_AF) #define PROT_DEVICE_nGnRnE (PROT_DEFAULT | PTE_PXN | PTE_UXN | PTE_WRITE | PTE_ATTRINDX(MT_DEVICE_nGnRnE)) #define PROT_DEVICE_nGnRE (PROT_DEFAULT | PTE_PXN | PTE_UXN | PTE_WRITE | PTE_ATTRINDX(MT_DEVICE_nGnRE)) #define PROT_NORMAL_NC (PROT_DEFAULT | PTE_PXN | PTE_UXN | PTE_WRITE | PTE_ATTRINDX(MT_NORMAL_NC)) #define PROT_NORMAL (PROT_DEFAULT | PTE_PXN | PTE_UXN | PTE_WRITE | PTE_ATTRINDX(MT_NORMAL)) #define PROT_NORMAL_TAGGED (PROT_DEFAULT | PTE_PXN | PTE_UXN | PTE_WRITE | PTE_ATTRINDX(MT_NORMAL_TAGGED)) #define PROT_SECT_DEVICE_nGnRE (PROT_SECT_DEFAULT | PMD_SECT_PXN | PMD_SECT_UXN | PMD_ATTRINDX(MT_DEVICE_nGnRE)) #define PROT_SECT_NORMAL (PROT_SECT_DEFAULT | PMD_SECT_PXN | PMD_SECT_UXN | PTE_WRITE | PMD_ATTRINDX(MT_NORMAL)) #define PROT_SECT_NORMAL_EXEC (PROT_SECT_DEFAULT | PMD_SECT_UXN | PMD_ATTRINDX(MT_NORMAL)) #define _PAGE_DEFAULT (_PROT_DEFAULT | PTE_ATTRINDX(MT_NORMAL)) #define _PAGE_KERNEL (PROT_NORMAL) #define _PAGE_KERNEL_RO ((PROT_NORMAL & ~PTE_WRITE) | PTE_RDONLY) #define _PAGE_KERNEL_ROX ((PROT_NORMAL & ~(PTE_WRITE | PTE_PXN)) | PTE_RDONLY) #define _PAGE_KERNEL_EXEC (PROT_NORMAL & ~PTE_PXN) #define _PAGE_KERNEL_EXEC_CONT ((PROT_NORMAL & ~PTE_PXN) | PTE_CONT) #define _PAGE_SHARED (_PAGE_DEFAULT | PTE_USER | PTE_RDONLY | PTE_NG | PTE_PXN | PTE_UXN | PTE_WRITE) #define _PAGE_SHARED_EXEC (_PAGE_DEFAULT | PTE_USER | PTE_RDONLY | PTE_NG | PTE_PXN | PTE_WRITE) #define _PAGE_READONLY (_PAGE_DEFAULT | PTE_USER | PTE_RDONLY | PTE_NG | PTE_PXN | PTE_UXN) #define _PAGE_READONLY_EXEC (_PAGE_DEFAULT | PTE_USER | PTE_RDONLY | PTE_NG | PTE_PXN) #define _PAGE_EXECONLY (_PAGE_DEFAULT | PTE_RDONLY | PTE_NG | PTE_PXN) #ifndef __ASSEMBLY__ #include <asm/cpufeature.h> #include <asm/pgtable-types.h> #include <asm/rsi.h> extern bool arm64_use_ng_mappings; extern unsigned long prot_ns_shared; #define PROT_NS_SHARED (is_realm_world() ? prot_ns_shared : 0) #define PTE_MAYBE_NG (arm64_use_ng_mappings ? PTE_NG : 0) #define PMD_MAYBE_NG (arm64_use_ng_mappings ? PMD_SECT_NG : 0) #ifndef CONFIG_ARM64_LPA2 #define lpa2_is_enabled() false #define PTE_MAYBE_SHARED PTE_SHARED #define PMD_MAYBE_SHARED PMD_SECT_S #define PHYS_MASK_SHIFT (CONFIG_ARM64_PA_BITS) #else static inline bool __pure lpa2_is_enabled(void) { return read_tcr() & TCR_DS; } #define PTE_MAYBE_SHARED (lpa2_is_enabled() ? 0 : PTE_SHARED) #define PMD_MAYBE_SHARED (lpa2_is_enabled() ? 0 : PMD_SECT_S) #define PHYS_MASK_SHIFT (lpa2_is_enabled() ? CONFIG_ARM64_PA_BITS : 48) #endif /* * Highest possible physical address supported. */ #define PHYS_MASK ((UL(1) << PHYS_MASK_SHIFT) - 1) /* * If we have userspace only BTI we don't want to mark kernel pages * guarded even if the system does support BTI. */ #define PTE_MAYBE_GP (system_supports_bti_kernel() ? PTE_GP : 0) #define PAGE_KERNEL __pgprot(_PAGE_KERNEL) #define PAGE_KERNEL_RO __pgprot(_PAGE_KERNEL_RO) #define PAGE_KERNEL_ROX __pgprot(_PAGE_KERNEL_ROX) #define PAGE_KERNEL_EXEC __pgprot(_PAGE_KERNEL_EXEC) #define PAGE_KERNEL_EXEC_CONT __pgprot(_PAGE_KERNEL_EXEC_CONT) #define PAGE_S2_MEMATTR(attr, has_fwb) \ ({ \ u64 __val; \ if (has_fwb) \ __val = PTE_S2_MEMATTR(MT_S2_FWB_ ## attr); \ else \ __val = PTE_S2_MEMATTR(MT_S2_ ## attr); \ __val; \ }) #define PAGE_NONE __pgprot(((_PAGE_DEFAULT) & ~PTE_VALID) | PTE_PRESENT_INVALID | PTE_RDONLY | PTE_NG | PTE_PXN | PTE_UXN) /* shared+writable pages are clean by default, hence PTE_RDONLY|PTE_WRITE */ #define PAGE_SHARED __pgprot(_PAGE_SHARED) #define PAGE_SHARED_EXEC __pgprot(_PAGE_SHARED_EXEC) #define PAGE_READONLY __pgprot(_PAGE_READONLY) #define PAGE_READONLY_EXEC __pgprot(_PAGE_READONLY_EXEC) #define PAGE_EXECONLY __pgprot(_PAGE_EXECONLY) #endif /* __ASSEMBLY__ */ #define pte_pi_index(pte) ( \ ((pte & BIT(PTE_PI_IDX_3)) >> (PTE_PI_IDX_3 - 3)) | \ ((pte & BIT(PTE_PI_IDX_2)) >> (PTE_PI_IDX_2 - 2)) | \ ((pte & BIT(PTE_PI_IDX_1)) >> (PTE_PI_IDX_1 - 1)) | \ ((pte & BIT(PTE_PI_IDX_0)) >> (PTE_PI_IDX_0 - 0))) /* * Page types used via Permission Indirection Extension (PIE). PIE uses * the USER, DBM, PXN and UXN bits to to generate an index which is used * to look up the actual permission in PIR_ELx and PIRE0_EL1. We define * combinations we use on non-PIE systems with the same encoding, for * convenience these are listed here as comments as are the unallocated * encodings. */ /* 0: PAGE_DEFAULT */ /* 1: PTE_USER */ /* 2: PTE_WRITE */ /* 3: PTE_WRITE | PTE_USER */ /* 4: PAGE_EXECONLY PTE_PXN */ /* 5: PAGE_READONLY_EXEC PTE_PXN | PTE_USER */ /* 6: PTE_PXN | PTE_WRITE */ /* 7: PAGE_SHARED_EXEC PTE_PXN | PTE_WRITE | PTE_USER */ /* 8: PAGE_KERNEL_ROX PTE_UXN */ /* 9: PAGE_GCS_RO PTE_UXN | PTE_USER */ /* a: PAGE_KERNEL_EXEC PTE_UXN | PTE_WRITE */ /* b: PAGE_GCS PTE_UXN | PTE_WRITE | PTE_USER */ /* c: PAGE_KERNEL_RO PTE_UXN | PTE_PXN */ /* d: PAGE_READONLY PTE_UXN | PTE_PXN | PTE_USER */ /* e: PAGE_KERNEL PTE_UXN | PTE_PXN | PTE_WRITE */ /* f: PAGE_SHARED PTE_UXN | PTE_PXN | PTE_WRITE | PTE_USER */ #define _PAGE_GCS (_PAGE_DEFAULT | PTE_NG | PTE_UXN | PTE_WRITE | PTE_USER) #define _PAGE_GCS_RO (_PAGE_DEFAULT | PTE_NG | PTE_UXN | PTE_USER) #define PAGE_GCS __pgprot(_PAGE_GCS) #define PAGE_GCS_RO __pgprot(_PAGE_GCS_RO) #define PIE_E0 ( \ PIRx_ELx_PERM_PREP(pte_pi_index(_PAGE_GCS), PIE_GCS) | \ PIRx_ELx_PERM_PREP(pte_pi_index(_PAGE_GCS_RO), PIE_R) | \ PIRx_ELx_PERM_PREP(pte_pi_index(_PAGE_EXECONLY), PIE_X_O) | \ PIRx_ELx_PERM_PREP(pte_pi_index(_PAGE_READONLY_EXEC), PIE_RX_O) | \ PIRx_ELx_PERM_PREP(pte_pi_index(_PAGE_SHARED_EXEC), PIE_RWX_O) | \ PIRx_ELx_PERM_PREP(pte_pi_index(_PAGE_READONLY), PIE_R_O) | \ PIRx_ELx_PERM_PREP(pte_pi_index(_PAGE_SHARED), PIE_RW_O)) #define PIE_E1 ( \ PIRx_ELx_PERM_PREP(pte_pi_index(_PAGE_GCS), PIE_NONE_O) | \ PIRx_ELx_PERM_PREP(pte_pi_index(_PAGE_GCS_RO), PIE_NONE_O) | \ PIRx_ELx_PERM_PREP(pte_pi_index(_PAGE_EXECONLY), PIE_NONE_O) | \ PIRx_ELx_PERM_PREP(pte_pi_index(_PAGE_READONLY_EXEC), PIE_R) | \ PIRx_ELx_PERM_PREP(pte_pi_index(_PAGE_SHARED_EXEC), PIE_RW) | \ PIRx_ELx_PERM_PREP(pte_pi_index(_PAGE_READONLY), PIE_R) | \ PIRx_ELx_PERM_PREP(pte_pi_index(_PAGE_SHARED), PIE_RW) | \ PIRx_ELx_PERM_PREP(pte_pi_index(_PAGE_KERNEL_ROX), PIE_RX) | \ PIRx_ELx_PERM_PREP(pte_pi_index(_PAGE_KERNEL_EXEC), PIE_RWX) | \ PIRx_ELx_PERM_PREP(pte_pi_index(_PAGE_KERNEL_RO), PIE_R) | \ PIRx_ELx_PERM_PREP(pte_pi_index(_PAGE_KERNEL), PIE_RW)) #endif /* __ASM_PGTABLE_PROT_H */
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 /* SPDX-License-Identifier: GPL-2.0 */ #ifndef _LINUX_CPUSET_H #define _LINUX_CPUSET_H /* * cpuset interface * * Copyright (C) 2003 BULL SA * Copyright (C) 2004-2006 Silicon Graphics, Inc. * */ #include <linux/sched.h> #include <linux/sched/topology.h> #include <linux/sched/task.h> #include <linux/cpumask.h> #include <linux/nodemask.h> #include <linux/mm.h> #include <linux/mmu_context.h> #include <linux/jump_label.h> #ifdef CONFIG_CPUSETS /* * Static branch rewrites can happen in an arbitrary order for a given * key. In code paths where we need to loop with read_mems_allowed_begin() and * read_mems_allowed_retry() to get a consistent view of mems_allowed, we need * to ensure that begin() always gets rewritten before retry() in the * disabled -> enabled transition. If not, then if local irqs are disabled * around the loop, we can deadlock since retry() would always be * comparing the latest value of the mems_allowed seqcount against 0 as * begin() still would see cpusets_enabled() as false. The enabled -> disabled * transition should happen in reverse order for the same reasons (want to stop * looking at real value of mems_allowed.sequence in retry() first). */ extern struct static_key_false cpusets_pre_enable_key; extern struct static_key_false cpusets_enabled_key; extern struct static_key_false cpusets_insane_config_key; static inline bool cpusets_enabled(void) { return static_branch_unlikely(&cpusets_enabled_key); } static inline void cpuset_inc(void) { static_branch_inc_cpuslocked(&cpusets_pre_enable_key); static_branch_inc_cpuslocked(&cpusets_enabled_key); } static inline void cpuset_dec(void) { static_branch_dec_cpuslocked(&cpusets_enabled_key); static_branch_dec_cpuslocked(&cpusets_pre_enable_key); } /* * This will get enabled whenever a cpuset configuration is considered * unsupportable in general. E.g. movable only node which cannot satisfy * any non movable allocations (see update_nodemask). Page allocator * needs to make additional checks for those configurations and this * check is meant to guard those checks without any overhead for sane * configurations. */ static inline bool cpusets_insane_config(void) { return static_branch_unlikely(&cpusets_insane_config_key); } extern int cpuset_init(void); extern void cpuset_init_smp(void); extern void cpuset_force_rebuild(void); extern void cpuset_update_active_cpus(void); extern void inc_dl_tasks_cs(struct task_struct *task); extern void dec_dl_tasks_cs(struct task_struct *task); extern void cpuset_lock(void); extern void cpuset_unlock(void); extern void cpuset_cpus_allowed(struct task_struct *p, struct cpumask *mask); extern bool cpuset_cpus_allowed_fallback(struct task_struct *p); extern bool cpuset_cpu_is_isolated(int cpu); extern nodemask_t cpuset_mems_allowed(struct task_struct *p); #define cpuset_current_mems_allowed (current->mems_allowed) void cpuset_init_current_mems_allowed(void); int cpuset_nodemask_valid_mems_allowed(nodemask_t *nodemask); extern bool cpuset_current_node_allowed(int node, gfp_t gfp_mask); static inline bool __cpuset_zone_allowed(struct zone *z, gfp_t gfp_mask) { return cpuset_current_node_allowed(zone_to_nid(z), gfp_mask); } static inline bool cpuset_zone_allowed(struct zone *z, gfp_t gfp_mask) { if (cpusets_enabled()) return __cpuset_zone_allowed(z, gfp_mask); return true; } extern int cpuset_mems_allowed_intersects(const struct task_struct *tsk1, const struct task_struct *tsk2); #ifdef CONFIG_CPUSETS_V1 #define cpuset_memory_pressure_bump() \ do { \ if (cpuset_memory_pressure_enabled) \ __cpuset_memory_pressure_bump(); \ } while (0) extern int cpuset_memory_pressure_enabled; extern void __cpuset_memory_pressure_bump(void); #else static inline void cpuset_memory_pressure_bump(void) { } #endif extern void cpuset_task_status_allowed(struct seq_file *m, struct task_struct *task); extern int proc_cpuset_show(struct seq_file *m, struct pid_namespace *ns, struct pid *pid, struct task_struct *tsk); extern int cpuset_mem_spread_node(void); static inline int cpuset_do_page_mem_spread(void) { return task_spread_page(current); } extern bool current_cpuset_is_being_rebound(void); extern void dl_rebuild_rd_accounting(void); extern void rebuild_sched_domains(void); extern void cpuset_print_current_mems_allowed(void); extern void cpuset_reset_sched_domains(void); /* * read_mems_allowed_begin is required when making decisions involving * mems_allowed such as during page allocation. mems_allowed can be updated in * parallel and depending on the new value an operation can fail potentially * causing process failure. A retry loop with read_mems_allowed_begin and * read_mems_allowed_retry prevents these artificial failures. */ static inline unsigned int read_mems_allowed_begin(void) { if (!static_branch_unlikely(&cpusets_pre_enable_key)) return 0; return read_seqcount_begin(&current->mems_allowed_seq); } /* * If this returns true, the operation that took place after * read_mems_allowed_begin may have failed artificially due to a concurrent * update of mems_allowed. It is up to the caller to retry the operation if * appropriate. */ static inline bool read_mems_allowed_retry(unsigned int seq) { if (!static_branch_unlikely(&cpusets_enabled_key)) return false; return read_seqcount_retry(&current->mems_allowed_seq, seq); } static inline void set_mems_allowed(nodemask_t nodemask) { unsigned long flags; task_lock(current); local_irq_save(flags); write_seqcount_begin(&current->mems_allowed_seq); current->mems_allowed = nodemask; write_seqcount_end(&current->mems_allowed_seq); local_irq_restore(flags); task_unlock(current); } extern bool cpuset_node_allowed(struct cgroup *cgroup, int nid); #else /* !CONFIG_CPUSETS */ static inline bool cpusets_enabled(void) { return false; } static inline bool cpusets_insane_config(void) { return false; } static inline int cpuset_init(void) { return 0; } static inline void cpuset_init_smp(void) {} static inline void cpuset_force_rebuild(void) { } static inline void cpuset_update_active_cpus(void) { partition_sched_domains(1, NULL, NULL); } static inline void inc_dl_tasks_cs(struct task_struct *task) { } static inline void dec_dl_tasks_cs(struct task_struct *task) { } static inline void cpuset_lock(void) { } static inline void cpuset_unlock(void) { } static inline void cpuset_cpus_allowed(struct task_struct *p, struct cpumask *mask) { cpumask_copy(mask, task_cpu_possible_mask(p)); } static inline bool cpuset_cpus_allowed_fallback(struct task_struct *p) { return false; } static inline bool cpuset_cpu_is_isolated(int cpu) { return false; } static inline nodemask_t cpuset_mems_allowed(struct task_struct *p) { return node_possible_map; } #define cpuset_current_mems_allowed (node_states[N_MEMORY]) static inline void cpuset_init_current_mems_allowed(void) {} static inline int cpuset_nodemask_valid_mems_allowed(nodemask_t *nodemask) { return 1; } static inline bool __cpuset_zone_allowed(struct zone *z, gfp_t gfp_mask) { return true; } static inline bool cpuset_zone_allowed(struct zone *z, gfp_t gfp_mask) { return true; } static inline int cpuset_mems_allowed_intersects(const struct task_struct *tsk1, const struct task_struct *tsk2) { return 1; } static inline void cpuset_memory_pressure_bump(void) {} static inline void cpuset_task_status_allowed(struct seq_file *m, struct task_struct *task) { } static inline int cpuset_mem_spread_node(void) { return 0; } static inline int cpuset_do_page_mem_spread(void) { return 0; } static inline bool current_cpuset_is_being_rebound(void) { return false; } static inline void dl_rebuild_rd_accounting(void) { } static inline void rebuild_sched_domains(void) { partition_sched_domains(1, NULL, NULL); } static inline void cpuset_reset_sched_domains(void) { partition_sched_domains(1, NULL, NULL); } static inline void cpuset_print_current_mems_allowed(void) { } static inline void set_mems_allowed(nodemask_t nodemask) { } static inline unsigned int read_mems_allowed_begin(void) { return 0; } static inline bool read_mems_allowed_retry(unsigned int seq) { return false; } static inline bool cpuset_node_allowed(struct cgroup *cgroup, int nid) { return true; } #endif /* !CONFIG_CPUSETS */ #endif /* _LINUX_CPUSET_H */
1868 1864 1871 1862 1872 1859 1866 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 // SPDX-License-Identifier: GPL-2.0 #include <linux/compiler.h> #include <linux/context_tracking.h> #include <linux/errno.h> #include <linux/nospec.h> #include <linux/ptrace.h> #include <linux/randomize_kstack.h> #include <linux/syscalls.h> #include <asm/debug-monitors.h> #include <asm/exception.h> #include <asm/fpsimd.h> #include <asm/syscall.h> #include <asm/thread_info.h> #include <asm/unistd.h> #include <asm/unistd_compat_32.h> long compat_arm_syscall(struct pt_regs *regs, int scno); long sys_ni_syscall(void); static long do_ni_syscall(struct pt_regs *regs, int scno) { if (is_compat_task()) { long ret = compat_arm_syscall(regs, scno); if (ret != -ENOSYS) return ret; } return sys_ni_syscall(); } static long __invoke_syscall(struct pt_regs *regs, syscall_fn_t syscall_fn) { return syscall_fn(regs); } static void invoke_syscall(struct pt_regs *regs, unsigned int scno, unsigned int sc_nr, const syscall_fn_t syscall_table[]) { long ret; add_random_kstack_offset(); if (scno < sc_nr) { syscall_fn_t syscall_fn; syscall_fn = syscall_table[array_index_nospec(scno, sc_nr)]; ret = __invoke_syscall(regs, syscall_fn); } else { ret = do_ni_syscall(regs, scno); } syscall_set_return_value(current, regs, 0, ret); /* * This value will get limited by KSTACK_OFFSET_MAX(), which is 10 * bits. The actual entropy will be further reduced by the compiler * when applying stack alignment constraints: the AAPCS mandates a * 16-byte aligned SP at function boundaries, which will remove the * 4 low bits from any entropy chosen here. * * The resulting 6 bits of entropy is seen in SP[9:4]. */ choose_random_kstack_offset(get_random_u16()); } static inline bool has_syscall_work(unsigned long flags) { return unlikely(flags & _TIF_SYSCALL_WORK); } static void el0_svc_common(struct pt_regs *regs, int scno, int sc_nr, const syscall_fn_t syscall_table[]) { unsigned long flags = read_thread_flags(); regs->orig_x0 = regs->regs[0]; regs->syscallno = scno; /* * BTI note: * The architecture does not guarantee that SPSR.BTYPE is zero * on taking an SVC, so we could return to userspace with a * non-zero BTYPE after the syscall. * * This shouldn't matter except when userspace is explicitly * doing something stupid, such as setting PROT_BTI on a page * that lacks conforming BTI/PACIxSP instructions, falling * through from one executable page to another with differing * PROT_BTI, or messing with BTYPE via ptrace: in such cases, * userspace should not be surprised if a SIGILL occurs on * syscall return. * * So, don't touch regs->pstate & PSR_BTYPE_MASK here. * (Similarly for HVC and SMC elsewhere.) */ if (flags & _TIF_MTE_ASYNC_FAULT) { /* * Process the asynchronous tag check fault before the actual * syscall. do_notify_resume() will send a signal to userspace * before the syscall is restarted. */ syscall_set_return_value(current, regs, -ERESTARTNOINTR, 0); return; } if (has_syscall_work(flags)) { /* * The de-facto standard way to skip a system call using ptrace * is to set the system call to -1 (NO_SYSCALL) and set x0 to a * suitable error code for consumption by userspace. However, * this cannot be distinguished from a user-issued syscall(-1) * and so we must set x0 to -ENOSYS here in case the tracer doesn't * issue the skip and we fall into trace_exit with x0 preserved. * * This is slightly odd because it also means that if a tracer * sets the system call number to -1 but does not initialise x0, * then x0 will be preserved for all system calls apart from a * user-issued syscall(-1). However, requesting a skip and not * setting the return value is unlikely to do anything sensible * anyway. */ if (scno == NO_SYSCALL) syscall_set_return_value(current, regs, -ENOSYS, 0); scno = syscall_trace_enter(regs); if (scno == NO_SYSCALL) goto trace_exit; } invoke_syscall(regs, scno, sc_nr, syscall_table); /* * The tracing status may have changed under our feet, so we have to * check again. However, if we were tracing entry, then we always trace * exit regardless, as the old entry assembly did. */ if (!has_syscall_work(flags) && !IS_ENABLED(CONFIG_DEBUG_RSEQ)) { flags = read_thread_flags(); if (!has_syscall_work(flags) && !(flags & _TIF_SINGLESTEP)) return; } trace_exit: syscall_trace_exit(regs); } void do_el0_svc(struct pt_regs *regs) { el0_svc_common(regs, regs->regs[8], __NR_syscalls, sys_call_table); } #ifdef CONFIG_COMPAT void do_el0_svc_compat(struct pt_regs *regs) { el0_svc_common(regs, regs->regs[7], __NR_compat32_syscalls, compat_sys_call_table); } #endif
1578 21 1559 1618 36 36 1610 1618 1605 227 225 2 226 1614 1613 226 1585 1614 1611 1611 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 // SPDX-License-Identifier: GPL-2.0 /* * security/tomoyo/util.c * * Copyright (C) 2005-2011 NTT DATA CORPORATION */ #include <linux/slab.h> #include <linux/rculist.h> #include "common.h" /* Lock for protecting policy. */ DEFINE_MUTEX(tomoyo_policy_lock); /* Has /sbin/init started? */ bool tomoyo_policy_loaded; /* * Mapping table from "enum tomoyo_mac_index" to * "enum tomoyo_mac_category_index". */ const u8 tomoyo_index2category[TOMOYO_MAX_MAC_INDEX] = { /* CONFIG::file group */ [TOMOYO_MAC_FILE_EXECUTE] = TOMOYO_MAC_CATEGORY_FILE, [TOMOYO_MAC_FILE_OPEN] = TOMOYO_MAC_CATEGORY_FILE, [TOMOYO_MAC_FILE_CREATE] = TOMOYO_MAC_CATEGORY_FILE, [TOMOYO_MAC_FILE_UNLINK] = TOMOYO_MAC_CATEGORY_FILE, [TOMOYO_MAC_FILE_GETATTR] = TOMOYO_MAC_CATEGORY_FILE, [TOMOYO_MAC_FILE_MKDIR] = TOMOYO_MAC_CATEGORY_FILE, [TOMOYO_MAC_FILE_RMDIR] = TOMOYO_MAC_CATEGORY_FILE, [TOMOYO_MAC_FILE_MKFIFO] = TOMOYO_MAC_CATEGORY_FILE, [TOMOYO_MAC_FILE_MKSOCK] = TOMOYO_MAC_CATEGORY_FILE, [TOMOYO_MAC_FILE_TRUNCATE] = TOMOYO_MAC_CATEGORY_FILE, [TOMOYO_MAC_FILE_SYMLINK] = TOMOYO_MAC_CATEGORY_FILE, [TOMOYO_MAC_FILE_MKBLOCK] = TOMOYO_MAC_CATEGORY_FILE, [TOMOYO_MAC_FILE_MKCHAR] = TOMOYO_MAC_CATEGORY_FILE, [TOMOYO_MAC_FILE_LINK] = TOMOYO_MAC_CATEGORY_FILE, [TOMOYO_MAC_FILE_RENAME] = TOMOYO_MAC_CATEGORY_FILE, [TOMOYO_MAC_FILE_CHMOD] = TOMOYO_MAC_CATEGORY_FILE, [TOMOYO_MAC_FILE_CHOWN] = TOMOYO_MAC_CATEGORY_FILE, [TOMOYO_MAC_FILE_CHGRP] = TOMOYO_MAC_CATEGORY_FILE, [TOMOYO_MAC_FILE_IOCTL] = TOMOYO_MAC_CATEGORY_FILE, [TOMOYO_MAC_FILE_CHROOT] = TOMOYO_MAC_CATEGORY_FILE, [TOMOYO_MAC_FILE_MOUNT] = TOMOYO_MAC_CATEGORY_FILE, [TOMOYO_MAC_FILE_UMOUNT] = TOMOYO_MAC_CATEGORY_FILE, [TOMOYO_MAC_FILE_PIVOT_ROOT] = TOMOYO_MAC_CATEGORY_FILE, /* CONFIG::network group */ [TOMOYO_MAC_NETWORK_INET_STREAM_BIND] = TOMOYO_MAC_CATEGORY_NETWORK, [TOMOYO_MAC_NETWORK_INET_STREAM_LISTEN] = TOMOYO_MAC_CATEGORY_NETWORK, [TOMOYO_MAC_NETWORK_INET_STREAM_CONNECT] = TOMOYO_MAC_CATEGORY_NETWORK, [TOMOYO_MAC_NETWORK_INET_DGRAM_BIND] = TOMOYO_MAC_CATEGORY_NETWORK, [TOMOYO_MAC_NETWORK_INET_DGRAM_SEND] = TOMOYO_MAC_CATEGORY_NETWORK, [TOMOYO_MAC_NETWORK_INET_RAW_BIND] = TOMOYO_MAC_CATEGORY_NETWORK, [TOMOYO_MAC_NETWORK_INET_RAW_SEND] = TOMOYO_MAC_CATEGORY_NETWORK, [TOMOYO_MAC_NETWORK_UNIX_STREAM_BIND] = TOMOYO_MAC_CATEGORY_NETWORK, [TOMOYO_MAC_NETWORK_UNIX_STREAM_LISTEN] = TOMOYO_MAC_CATEGORY_NETWORK, [TOMOYO_MAC_NETWORK_UNIX_STREAM_CONNECT] = TOMOYO_MAC_CATEGORY_NETWORK, [TOMOYO_MAC_NETWORK_UNIX_DGRAM_BIND] = TOMOYO_MAC_CATEGORY_NETWORK, [TOMOYO_MAC_NETWORK_UNIX_DGRAM_SEND] = TOMOYO_MAC_CATEGORY_NETWORK, [TOMOYO_MAC_NETWORK_UNIX_SEQPACKET_BIND] = TOMOYO_MAC_CATEGORY_NETWORK, [TOMOYO_MAC_NETWORK_UNIX_SEQPACKET_LISTEN] = TOMOYO_MAC_CATEGORY_NETWORK, [TOMOYO_MAC_NETWORK_UNIX_SEQPACKET_CONNECT] = TOMOYO_MAC_CATEGORY_NETWORK, /* CONFIG::misc group */ [TOMOYO_MAC_ENVIRON] = TOMOYO_MAC_CATEGORY_MISC, }; /** * tomoyo_convert_time - Convert time_t to YYYY/MM/DD hh/mm/ss. * * @time64: Seconds since 1970/01/01 00:00:00. * @stamp: Pointer to "struct tomoyo_time". * * Returns nothing. */ void tomoyo_convert_time(time64_t time64, struct tomoyo_time *stamp) { struct tm tm; time64_to_tm(time64, 0, &tm); stamp->sec = tm.tm_sec; stamp->min = tm.tm_min; stamp->hour = tm.tm_hour; stamp->day = tm.tm_mday; stamp->month = tm.tm_mon + 1; stamp->year = tm.tm_year + 1900; } /** * tomoyo_permstr - Find permission keywords. * * @string: String representation for permissions in foo/bar/buz format. * @keyword: Keyword to find from @string/ * * Returns true if @keyword was found in @string, false otherwise. * * This function assumes that strncmp(w1, w2, strlen(w1)) != 0 if w1 != w2. */ bool tomoyo_permstr(const char *string, const char *keyword) { const char *cp = strstr(string, keyword); if (cp) return cp == string || *(cp - 1) == '/'; return false; } /** * tomoyo_read_token - Read a word from a line. * * @param: Pointer to "struct tomoyo_acl_param". * * Returns a word on success, "" otherwise. * * To allow the caller to skip NULL check, this function returns "" rather than * NULL if there is no more words to read. */ char *tomoyo_read_token(struct tomoyo_acl_param *param) { char *pos = param->data; char *del = strchr(pos, ' '); if (del) *del++ = '\0'; else del = pos + strlen(pos); param->data = del; return pos; } static bool tomoyo_correct_path2(const char *filename, const size_t len); /** * tomoyo_get_domainname - Read a domainname from a line. * * @param: Pointer to "struct tomoyo_acl_param". * * Returns a domainname on success, NULL otherwise. */ const struct tomoyo_path_info *tomoyo_get_domainname (struct tomoyo_acl_param *param) { char *start = param->data; char *pos = start; while (*pos) { if (*pos++ != ' ' || tomoyo_correct_path2(pos, strchrnul(pos, ' ') - pos)) continue; *(pos - 1) = '\0'; break; } param->data = pos; if (tomoyo_correct_domain(start)) return tomoyo_get_name(start); return NULL; } /** * tomoyo_parse_ulong - Parse an "unsigned long" value. * * @result: Pointer to "unsigned long". * @str: Pointer to string to parse. * * Returns one of values in "enum tomoyo_value_type". * * The @src is updated to point the first character after the value * on success. */ u8 tomoyo_parse_ulong(unsigned long *result, char **str) { const char *cp = *str; char *ep; int base = 10; if (*cp == '0') { char c = *(cp + 1); if (c == 'x' || c == 'X') { base = 16; cp += 2; } else if (c >= '0' && c <= '7') { base = 8; cp++; } } *result = simple_strtoul(cp, &ep, base); if (cp == ep) return TOMOYO_VALUE_TYPE_INVALID; *str = ep; switch (base) { case 16: return TOMOYO_VALUE_TYPE_HEXADECIMAL; case 8: return TOMOYO_VALUE_TYPE_OCTAL; default: return TOMOYO_VALUE_TYPE_DECIMAL; } } /** * tomoyo_print_ulong - Print an "unsigned long" value. * * @buffer: Pointer to buffer. * @buffer_len: Size of @buffer. * @value: An "unsigned long" value. * @type: Type of @value. * * Returns nothing. */ void tomoyo_print_ulong(char *buffer, const int buffer_len, const unsigned long value, const u8 type) { if (type == TOMOYO_VALUE_TYPE_DECIMAL) snprintf(buffer, buffer_len, "%lu", value); else if (type == TOMOYO_VALUE_TYPE_OCTAL) snprintf(buffer, buffer_len, "0%lo", value); else if (type == TOMOYO_VALUE_TYPE_HEXADECIMAL) snprintf(buffer, buffer_len, "0x%lX", value); else snprintf(buffer, buffer_len, "type(%u)", type); } /** * tomoyo_parse_name_union - Parse a tomoyo_name_union. * * @param: Pointer to "struct tomoyo_acl_param". * @ptr: Pointer to "struct tomoyo_name_union". * * Returns true on success, false otherwise. */ bool tomoyo_parse_name_union(struct tomoyo_acl_param *param, struct tomoyo_name_union *ptr) { char *filename; if (param->data[0] == '@') { param->data++; ptr->group = tomoyo_get_group(param, TOMOYO_PATH_GROUP); return ptr->group != NULL; } filename = tomoyo_read_token(param); if (!tomoyo_correct_word(filename)) return false; ptr->filename = tomoyo_get_name(filename); return ptr->filename != NULL; } /** * tomoyo_parse_number_union - Parse a tomoyo_number_union. * * @param: Pointer to "struct tomoyo_acl_param". * @ptr: Pointer to "struct tomoyo_number_union". * * Returns true on success, false otherwise. */ bool tomoyo_parse_number_union(struct tomoyo_acl_param *param, struct tomoyo_number_union *ptr) { char *data; u8 type; unsigned long v; memset(ptr, 0, sizeof(*ptr)); if (param->data[0] == '@') { param->data++; ptr->group = tomoyo_get_group(param, TOMOYO_NUMBER_GROUP); return ptr->group != NULL; } data = tomoyo_read_token(param); type = tomoyo_parse_ulong(&v, &data); if (type == TOMOYO_VALUE_TYPE_INVALID) return false; ptr->values[0] = v; ptr->value_type[0] = type; if (!*data) { ptr->values[1] = v; ptr->value_type[1] = type; return true; } if (*data++ != '-') return false; type = tomoyo_parse_ulong(&v, &data); if (type == TOMOYO_VALUE_TYPE_INVALID || *data || ptr->values[0] > v) return false; ptr->values[1] = v; ptr->value_type[1] = type; return true; } /** * tomoyo_byte_range - Check whether the string is a \ooo style octal value. * * @str: Pointer to the string. * * Returns true if @str is a \ooo style octal value, false otherwise. * * TOMOYO uses \ooo style representation for 0x01 - 0x20 and 0x7F - 0xFF. * This function verifies that \ooo is in valid range. */ static inline bool tomoyo_byte_range(const char *str) { return *str >= '0' && *str++ <= '3' && *str >= '0' && *str++ <= '7' && *str >= '0' && *str <= '7'; } /** * tomoyo_alphabet_char - Check whether the character is an alphabet. * * @c: The character to check. * * Returns true if @c is an alphabet character, false otherwise. */ static inline bool tomoyo_alphabet_char(const char c) { return (c >= 'A' && c <= 'Z') || (c >= 'a' && c <= 'z'); } /** * tomoyo_make_byte - Make byte value from three octal characters. * * @c1: The first character. * @c2: The second character. * @c3: The third character. * * Returns byte value. */ static inline u8 tomoyo_make_byte(const u8 c1, const u8 c2, const u8 c3) { return ((c1 - '0') << 6) + ((c2 - '0') << 3) + (c3 - '0'); } /** * tomoyo_valid - Check whether the character is a valid char. * * @c: The character to check. * * Returns true if @c is a valid character, false otherwise. */ static inline bool tomoyo_valid(const unsigned char c) { return c > ' ' && c < 127; } /** * tomoyo_invalid - Check whether the character is an invalid char. * * @c: The character to check. * * Returns true if @c is an invalid character, false otherwise. */ static inline bool tomoyo_invalid(const unsigned char c) { return c && (c <= ' ' || c >= 127); } /** * tomoyo_str_starts - Check whether the given string starts with the given keyword. * * @src: Pointer to pointer to the string. * @find: Pointer to the keyword. * * Returns true if @src starts with @find, false otherwise. * * The @src is updated to point the first character after the @find * if @src starts with @find. */ bool tomoyo_str_starts(char **src, const char *find) { const int len = strlen(find); char *tmp = *src; if (strncmp(tmp, find, len)) return false; tmp += len; *src = tmp; return true; } /** * tomoyo_normalize_line - Format string. * * @buffer: The line to normalize. * * Leading and trailing whitespaces are removed. * Multiple whitespaces are packed into single space. * * Returns nothing. */ void tomoyo_normalize_line(unsigned char *buffer) { unsigned char *sp = buffer; unsigned char *dp = buffer; bool first = true; while (tomoyo_invalid(*sp)) sp++; while (*sp) { if (!first) *dp++ = ' '; first = false; while (tomoyo_valid(*sp)) *dp++ = *sp++; while (tomoyo_invalid(*sp)) sp++; } *dp = '\0'; } /** * tomoyo_correct_word2 - Validate a string. * * @string: The string to check. Maybe non-'\0'-terminated. * @len: Length of @string. * * Check whether the given string follows the naming rules. * Returns true if @string follows the naming rules, false otherwise. */ static bool tomoyo_correct_word2(const char *string, size_t len) { u8 recursion = 20; const char *const start = string; bool in_repetition = false; if (!len) goto out; while (len--) { unsigned char c = *string++; if (c == '\\') { if (!len--) goto out; c = *string++; if (c >= '0' && c <= '3') { unsigned char d; unsigned char e; if (!len-- || !len--) goto out; d = *string++; e = *string++; if (d < '0' || d > '7' || e < '0' || e > '7') goto out; c = tomoyo_make_byte(c, d, e); if (c <= ' ' || c >= 127) continue; goto out; } switch (c) { case '\\': /* "\\" */ case '+': /* "\+" */ case '?': /* "\?" */ case 'x': /* "\x" */ case 'a': /* "\a" */ case '-': /* "\-" */ continue; } if (!recursion--) goto out; switch (c) { case '*': /* "\*" */ case '@': /* "\@" */ case '$': /* "\$" */ case 'X': /* "\X" */ case 'A': /* "\A" */ continue; case '{': /* "/\{" */ if (string - 3 < start || *(string - 3) != '/') goto out; in_repetition = true; continue; case '}': /* "\}/" */ if (*string != '/') goto out; if (!in_repetition) goto out; in_repetition = false; continue; } goto out; } else if (in_repetition && c == '/') { goto out; } else if (c <= ' ' || c >= 127) { goto out; } } if (in_repetition) goto out; return true; out: return false; } /** * tomoyo_correct_word - Validate a string. * * @string: The string to check. * * Check whether the given string follows the naming rules. * Returns true if @string follows the naming rules, false otherwise. */ bool tomoyo_correct_word(const char *string) { return tomoyo_correct_word2(string, strlen(string)); } /** * tomoyo_correct_path2 - Check whether the given pathname follows the naming rules. * * @filename: The pathname to check. * @len: Length of @filename. * * Returns true if @filename follows the naming rules, false otherwise. */ static bool tomoyo_correct_path2(const char *filename, const size_t len) { const char *cp1 = memchr(filename, '/', len); const char *cp2 = memchr(filename, '.', len); return cp1 && (!cp2 || (cp1 < cp2)) && tomoyo_correct_word2(filename, len); } /** * tomoyo_correct_path - Validate a pathname. * * @filename: The pathname to check. * * Check whether the given pathname follows the naming rules. * Returns true if @filename follows the naming rules, false otherwise. */ bool tomoyo_correct_path(const char *filename) { return tomoyo_correct_path2(filename, strlen(filename)); } /** * tomoyo_correct_domain - Check whether the given domainname follows the naming rules. * * @domainname: The domainname to check. * * Returns true if @domainname follows the naming rules, false otherwise. */ bool tomoyo_correct_domain(const unsigned char *domainname) { if (!domainname || !tomoyo_domain_def(domainname)) return false; domainname = strchr(domainname, ' '); if (!domainname++) return true; while (1) { const unsigned char *cp = strchr(domainname, ' '); if (!cp) break; if (!tomoyo_correct_path2(domainname, cp - domainname)) return false; domainname = cp + 1; } return tomoyo_correct_path(domainname); } /** * tomoyo_domain_def - Check whether the given token can be a domainname. * * @buffer: The token to check. * * Returns true if @buffer possibly be a domainname, false otherwise. */ bool tomoyo_domain_def(const unsigned char *buffer) { const unsigned char *cp; int len; if (*buffer != '<') return false; cp = strchr(buffer, ' '); if (!cp) len = strlen(buffer); else len = cp - buffer; if (buffer[len - 1] != '>' || !tomoyo_correct_word2(buffer + 1, len - 2)) return false; return true; } /** * tomoyo_find_domain - Find a domain by the given name. * * @domainname: The domainname to find. * * Returns pointer to "struct tomoyo_domain_info" if found, NULL otherwise. * * Caller holds tomoyo_read_lock(). */ struct tomoyo_domain_info *tomoyo_find_domain(const char *domainname) { struct tomoyo_domain_info *domain; struct tomoyo_path_info name; name.name = domainname; tomoyo_fill_path_info(&name); list_for_each_entry_rcu(domain, &tomoyo_domain_list, list, srcu_read_lock_held(&tomoyo_ss)) { if (!domain->is_deleted && !tomoyo_pathcmp(&name, domain->domainname)) return domain; } return NULL; } /** * tomoyo_const_part_length - Evaluate the initial length without a pattern in a token. * * @filename: The string to evaluate. * * Returns the initial length without a pattern in @filename. */ static int tomoyo_const_part_length(const char *filename) { char c; int len = 0; if (!filename) return 0; while ((c = *filename++) != '\0') { if (c != '\\') { len++; continue; } c = *filename++; switch (c) { case '\\': /* "\\" */ len += 2; continue; case '0': /* "\ooo" */ case '1': case '2': case '3': c = *filename++; if (c < '0' || c > '7') break; c = *filename++; if (c < '0' || c > '7') break; len += 4; continue; } break; } return len; } /** * tomoyo_fill_path_info - Fill in "struct tomoyo_path_info" members. * * @ptr: Pointer to "struct tomoyo_path_info" to fill in. * * The caller sets "struct tomoyo_path_info"->name. */ void tomoyo_fill_path_info(struct tomoyo_path_info *ptr) { const char *name = ptr->name; const int len = strlen(name); ptr->const_len = tomoyo_const_part_length(name); ptr->is_dir = len && (name[len - 1] == '/'); ptr->is_patterned = (ptr->const_len < len); ptr->hash = full_name_hash(NULL, name, len); } /** * tomoyo_file_matches_pattern2 - Pattern matching without '/' character and "\-" pattern. * * @filename: The start of string to check. * @filename_end: The end of string to check. * @pattern: The start of pattern to compare. * @pattern_end: The end of pattern to compare. * * Returns true if @filename matches @pattern, false otherwise. */ static bool tomoyo_file_matches_pattern2(const char *filename, const char *filename_end, const char *pattern, const char *pattern_end) { while (filename < filename_end && pattern < pattern_end) { char c; int i; int j; if (*pattern != '\\') { if (*filename++ != *pattern++) return false; continue; } c = *filename; pattern++; switch (*pattern) { case '?': if (c == '/') { return false; } else if (c == '\\') { if (filename[1] == '\\') filename++; else if (tomoyo_byte_range(filename + 1)) filename += 3; else return false; } break; case '\\': if (c != '\\') return false; if (*++filename != '\\') return false; break; case '+': if (!isdigit(c)) return false; break; case 'x': if (!isxdigit(c)) return false; break; case 'a': if (!tomoyo_alphabet_char(c)) return false; break; case '0': case '1': case '2': case '3': if (c == '\\' && tomoyo_byte_range(filename + 1) && strncmp(filename + 1, pattern, 3) == 0) { filename += 3; pattern += 2; break; } return false; /* Not matched. */ case '*': case '@': for (i = 0; i <= filename_end - filename; i++) { if (tomoyo_file_matches_pattern2( filename + i, filename_end, pattern + 1, pattern_end)) return true; c = filename[i]; if (c == '.' && *pattern == '@') break; if (c != '\\') continue; if (filename[i + 1] == '\\') i++; else if (tomoyo_byte_range(filename + i + 1)) i += 3; else break; /* Bad pattern. */ } return false; /* Not matched. */ default: j = 0; c = *pattern; if (c == '$') { while (isdigit(filename[j])) j++; } else if (c == 'X') { while (isxdigit(filename[j])) j++; } else if (c == 'A') { while (tomoyo_alphabet_char(filename[j])) j++; } for (i = 1; i <= j; i++) { if (tomoyo_file_matches_pattern2( filename + i, filename_end, pattern + 1, pattern_end)) return true; } return false; /* Not matched or bad pattern. */ } filename++; pattern++; } while (*pattern == '\\' && (*(pattern + 1) == '*' || *(pattern + 1) == '@')) pattern += 2; return filename == filename_end && pattern == pattern_end; } /** * tomoyo_file_matches_pattern - Pattern matching without '/' character. * * @filename: The start of string to check. * @filename_end: The end of string to check. * @pattern: The start of pattern to compare. * @pattern_end: The end of pattern to compare. * * Returns true if @filename matches @pattern, false otherwise. */ static bool tomoyo_file_matches_pattern(const char *filename, const char *filename_end, const char *pattern, const char *pattern_end) { const char *pattern_start = pattern; bool first = true; bool result; while (pattern < pattern_end - 1) { /* Split at "\-" pattern. */ if (*pattern++ != '\\' || *pattern++ != '-') continue; result = tomoyo_file_matches_pattern2(filename, filename_end, pattern_start, pattern - 2); if (first) result = !result; if (result) return false; first = false; pattern_start = pattern; } result = tomoyo_file_matches_pattern2(filename, filename_end, pattern_start, pattern_end); return first ? result : !result; } /** * tomoyo_path_matches_pattern2 - Do pathname pattern matching. * * @f: The start of string to check. * @p: The start of pattern to compare. * * Returns true if @f matches @p, false otherwise. */ static bool tomoyo_path_matches_pattern2(const char *f, const char *p) { const char *f_delimiter; const char *p_delimiter; while (*f && *p) { f_delimiter = strchr(f, '/'); if (!f_delimiter) f_delimiter = f + strlen(f); p_delimiter = strchr(p, '/'); if (!p_delimiter) p_delimiter = p + strlen(p); if (*p == '\\' && *(p + 1) == '{') goto recursive; if (!tomoyo_file_matches_pattern(f, f_delimiter, p, p_delimiter)) return false; f = f_delimiter; if (*f) f++; p = p_delimiter; if (*p) p++; } /* Ignore trailing "\*" and "\@" in @pattern. */ while (*p == '\\' && (*(p + 1) == '*' || *(p + 1) == '@')) p += 2; return !*f && !*p; recursive: /* * The "\{" pattern is permitted only after '/' character. * This guarantees that below "*(p - 1)" is safe. * Also, the "\}" pattern is permitted only before '/' character * so that "\{" + "\}" pair will not break the "\-" operator. */ if (*(p - 1) != '/' || p_delimiter <= p + 3 || *p_delimiter != '/' || *(p_delimiter - 1) != '}' || *(p_delimiter - 2) != '\\') return false; /* Bad pattern. */ do { /* Compare current component with pattern. */ if (!tomoyo_file_matches_pattern(f, f_delimiter, p + 2, p_delimiter - 2)) break; /* Proceed to next component. */ f = f_delimiter; if (!*f) break; f++; /* Continue comparison. */ if (tomoyo_path_matches_pattern2(f, p_delimiter + 1)) return true; f_delimiter = strchr(f, '/'); } while (f_delimiter); return false; /* Not matched. */ } /** * tomoyo_path_matches_pattern - Check whether the given filename matches the given pattern. * * @filename: The filename to check. * @pattern: The pattern to compare. * * Returns true if matches, false otherwise. * * The following patterns are available. * \\ \ itself. * \ooo Octal representation of a byte. * \* Zero or more repetitions of characters other than '/'. * \@ Zero or more repetitions of characters other than '/' or '.'. * \? 1 byte character other than '/'. * \$ One or more repetitions of decimal digits. * \+ 1 decimal digit. * \X One or more repetitions of hexadecimal digits. * \x 1 hexadecimal digit. * \A One or more repetitions of alphabet characters. * \a 1 alphabet character. * * \- Subtraction operator. * * /\{dir\}/ '/' + 'One or more repetitions of dir/' (e.g. /dir/ /dir/dir/ * /dir/dir/dir/ ). */ bool tomoyo_path_matches_pattern(const struct tomoyo_path_info *filename, const struct tomoyo_path_info *pattern) { const char *f = filename->name; const char *p = pattern->name; const int len = pattern->const_len; /* If @pattern doesn't contain pattern, I can use strcmp(). */ if (!pattern->is_patterned) return !tomoyo_pathcmp(filename, pattern); /* Don't compare directory and non-directory. */ if (filename->is_dir != pattern->is_dir) return false; /* Compare the initial length without patterns. */ if (strncmp(f, p, len)) return false; f += len; p += len; return tomoyo_path_matches_pattern2(f, p); } /** * tomoyo_get_exe - Get tomoyo_realpath() of current process. * * Returns the tomoyo_realpath() of current process on success, NULL otherwise. * * This function uses kzalloc(), so the caller must call kfree() * if this function didn't return NULL. */ const char *tomoyo_get_exe(void) { struct file *exe_file; const char *cp; struct mm_struct *mm = current->mm; if (!mm) return NULL; exe_file = get_mm_exe_file(mm); if (!exe_file) return NULL; cp = tomoyo_realpath_from_path(&exe_file->f_path); fput(exe_file); return cp; } /** * tomoyo_get_mode - Get MAC mode. * * @ns: Pointer to "struct tomoyo_policy_namespace". * @profile: Profile number. * @index: Index number of functionality. * * Returns mode. */ int tomoyo_get_mode(const struct tomoyo_policy_namespace *ns, const u8 profile, const u8 index) { u8 mode; struct tomoyo_profile *p; if (!tomoyo_policy_loaded) return TOMOYO_CONFIG_DISABLED; p = tomoyo_profile(ns, profile); mode = p->config[index]; if (mode == TOMOYO_CONFIG_USE_DEFAULT) mode = p->config[tomoyo_index2category[index] + TOMOYO_MAX_MAC_INDEX]; if (mode == TOMOYO_CONFIG_USE_DEFAULT) mode = p->default_config; return mode & 3; } /** * tomoyo_init_request_info - Initialize "struct tomoyo_request_info" members. * * @r: Pointer to "struct tomoyo_request_info" to initialize. * @domain: Pointer to "struct tomoyo_domain_info". NULL for tomoyo_domain(). * @index: Index number of functionality. * * Returns mode. */ int tomoyo_init_request_info(struct tomoyo_request_info *r, struct tomoyo_domain_info *domain, const u8 index) { u8 profile; memset(r, 0, sizeof(*r)); if (!domain) domain = tomoyo_domain(); r->domain = domain; profile = domain->profile; r->profile = profile; r->type = index; r->mode = tomoyo_get_mode(domain->ns, profile, index); return r->mode; } /** * tomoyo_domain_quota_is_ok - Check for domain's quota. * * @r: Pointer to "struct tomoyo_request_info". * * Returns true if the domain is not exceeded quota, false otherwise. * * Caller holds tomoyo_read_lock(). */ bool tomoyo_domain_quota_is_ok(struct tomoyo_request_info *r) { unsigned int count = 0; struct tomoyo_domain_info *domain = r->domain; struct tomoyo_acl_info *ptr; if (r->mode != TOMOYO_CONFIG_LEARNING) return false; if (!domain) return true; if (READ_ONCE(domain->flags[TOMOYO_DIF_QUOTA_WARNED])) return false; list_for_each_entry_rcu(ptr, &domain->acl_info_list, list, srcu_read_lock_held(&tomoyo_ss)) { u16 perm; if (ptr->is_deleted) continue; /* * Reading perm bitmap might race with tomoyo_merge_*() because * caller does not hold tomoyo_policy_lock mutex. But exceeding * max_learning_entry parameter by a few entries does not harm. */ switch (ptr->type) { case TOMOYO_TYPE_PATH_ACL: perm = data_race(container_of(ptr, struct tomoyo_path_acl, head)->perm); break; case TOMOYO_TYPE_PATH2_ACL: perm = data_race(container_of(ptr, struct tomoyo_path2_acl, head)->perm); break; case TOMOYO_TYPE_PATH_NUMBER_ACL: perm = data_race(container_of(ptr, struct tomoyo_path_number_acl, head) ->perm); break; case TOMOYO_TYPE_MKDEV_ACL: perm = data_race(container_of(ptr, struct tomoyo_mkdev_acl, head)->perm); break; case TOMOYO_TYPE_INET_ACL: perm = data_race(container_of(ptr, struct tomoyo_inet_acl, head)->perm); break; case TOMOYO_TYPE_UNIX_ACL: perm = data_race(container_of(ptr, struct tomoyo_unix_acl, head)->perm); break; case TOMOYO_TYPE_MANUAL_TASK_ACL: perm = 0; break; default: perm = 1; } count += hweight16(perm); } if (count < tomoyo_profile(domain->ns, domain->profile)-> pref[TOMOYO_PREF_MAX_LEARNING_ENTRY]) return true; WRITE_ONCE(domain->flags[TOMOYO_DIF_QUOTA_WARNED], true); /* r->granted = false; */ tomoyo_write_log(r, "%s", tomoyo_dif[TOMOYO_DIF_QUOTA_WARNED]); #ifndef CONFIG_SECURITY_TOMOYO_INSECURE_BUILTIN_SETTING pr_warn("WARNING: Domain '%s' has too many ACLs to hold. Stopped learning mode.\n", domain->domainname->name); #endif return false; }
8 8 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 /* SPDX-License-Identifier: GPL-2.0-only */ /* Authors: Karl MacMillan <kmacmillan@tresys.com> * Frank Mayer <mayerf@tresys.com> * Copyright (C) 2003 - 2004 Tresys Technology, LLC */ #include <linux/kernel.h> #include <linux/errno.h> #include <linux/string.h> #include <linux/spinlock.h> #include <linux/slab.h> #include "security.h" #include "conditional.h" #include "services.h" /* * cond_evaluate_expr evaluates a conditional expr * in reverse polish notation. It returns true (1), false (0), * or undefined (-1). Undefined occurs when the expression * exceeds the stack depth of COND_EXPR_MAXDEPTH. */ static int cond_evaluate_expr(struct policydb *p, struct cond_expr *expr) { u32 i; int s[COND_EXPR_MAXDEPTH]; int sp = -1; if (expr->len == 0) return -1; for (i = 0; i < expr->len; i++) { struct cond_expr_node *node = &expr->nodes[i]; switch (node->expr_type) { case COND_BOOL: if (sp == (COND_EXPR_MAXDEPTH - 1)) return -1; sp++; s[sp] = p->bool_val_to_struct[node->boolean - 1]->state; break; case COND_NOT: if (sp < 0) return -1; s[sp] = !s[sp]; break; case COND_OR: if (sp < 1) return -1; sp--; s[sp] |= s[sp + 1]; break; case COND_AND: if (sp < 1) return -1; sp--; s[sp] &= s[sp + 1]; break; case COND_XOR: if (sp < 1) return -1; sp--; s[sp] ^= s[sp + 1]; break; case COND_EQ: if (sp < 1) return -1; sp--; s[sp] = (s[sp] == s[sp + 1]); break; case COND_NEQ: if (sp < 1) return -1; sp--; s[sp] = (s[sp] != s[sp + 1]); break; default: return -1; } } return s[0]; } /* * evaluate_cond_node evaluates the conditional stored in * a struct cond_node and if the result is different than the * current state of the node it sets the rules in the true/false * list appropriately. If the result of the expression is undefined * all of the rules are disabled for safety. */ static void evaluate_cond_node(struct policydb *p, struct cond_node *node) { struct avtab_node *avnode; int new_state; u32 i; new_state = cond_evaluate_expr(p, &node->expr); if (new_state != node->cur_state) { node->cur_state = new_state; if (new_state == -1) pr_err("SELinux: expression result was undefined - disabling all rules.\n"); /* turn the rules on or off */ for (i = 0; i < node->true_list.len; i++) { avnode = node->true_list.nodes[i]; if (new_state <= 0) avnode->key.specified &= ~AVTAB_ENABLED; else avnode->key.specified |= AVTAB_ENABLED; } for (i = 0; i < node->false_list.len; i++) { avnode = node->false_list.nodes[i]; /* -1 or 1 */ if (new_state) avnode->key.specified &= ~AVTAB_ENABLED; else avnode->key.specified |= AVTAB_ENABLED; } } } void evaluate_cond_nodes(struct policydb *p) { u32 i; for (i = 0; i < p->cond_list_len; i++) evaluate_cond_node(p, &p->cond_list[i]); } void cond_policydb_init(struct policydb *p) { p->bool_val_to_struct = NULL; p->cond_list = NULL; p->cond_list_len = 0; avtab_init(&p->te_cond_avtab); } static void cond_node_destroy(struct cond_node *node) { kfree(node->expr.nodes); /* the avtab_ptr_t nodes are destroyed by the avtab */ kfree(node->true_list.nodes); kfree(node->false_list.nodes); } static void cond_list_destroy(struct policydb *p) { u32 i; for (i = 0; i < p->cond_list_len; i++) cond_node_destroy(&p->cond_list[i]); kfree(p->cond_list); p->cond_list = NULL; p->cond_list_len = 0; } void cond_policydb_destroy(struct policydb *p) { kfree(p->bool_val_to_struct); avtab_destroy(&p->te_cond_avtab); cond_list_destroy(p); } int cond_init_bool_indexes(struct policydb *p) { kfree(p->bool_val_to_struct); p->bool_val_to_struct = kmalloc_array( p->p_bools.nprim, sizeof(*p->bool_val_to_struct), GFP_KERNEL); if (!p->bool_val_to_struct) return -ENOMEM; avtab_hash_eval(&p->te_cond_avtab, "conditional_rules"); return 0; } int cond_destroy_bool(void *key, void *datum, void *p) { kfree(key); kfree(datum); return 0; } int cond_index_bool(void *key, void *datum, void *datap) { struct policydb *p; struct cond_bool_datum *booldatum; booldatum = datum; p = datap; if (!booldatum->value || booldatum->value > p->p_bools.nprim) return -EINVAL; p->sym_val_to_name[SYM_BOOLS][booldatum->value - 1] = key; p->bool_val_to_struct[booldatum->value - 1] = booldatum; return 0; } static int bool_isvalid(struct cond_bool_datum *b) { if (!(b->state == 0 || b->state == 1)) return 0; return 1; } int cond_read_bool(struct policydb *p, struct symtab *s, struct policy_file *fp) { char *key = NULL; struct cond_bool_datum *booldatum; __le32 buf[3]; u32 len; int rc; booldatum = kzalloc(sizeof(*booldatum), GFP_KERNEL); if (!booldatum) return -ENOMEM; rc = next_entry(buf, fp, sizeof(buf)); if (rc) goto err; booldatum->value = le32_to_cpu(buf[0]); booldatum->state = le32_to_cpu(buf[1]); rc = -EINVAL; if (!bool_isvalid(booldatum)) goto err; len = le32_to_cpu(buf[2]); rc = str_read(&key, GFP_KERNEL, fp, len); if (rc) goto err; rc = symtab_insert(s, key, booldatum); if (rc) goto err; return 0; err: cond_destroy_bool(key, booldatum, NULL); return rc; } struct cond_insertf_data { struct policydb *p; struct avtab_node **dst; struct cond_av_list *other; }; static int cond_insertf(struct avtab *a, const struct avtab_key *k, const struct avtab_datum *d, void *ptr) { struct cond_insertf_data *data = ptr; struct policydb *p = data->p; struct cond_av_list *other = data->other; struct avtab_node *node_ptr; u32 i; bool found; /* * For type rules we have to make certain there aren't any * conflicting rules by searching the te_avtab and the * cond_te_avtab. */ if (k->specified & AVTAB_TYPE) { if (avtab_search_node(&p->te_avtab, k)) { pr_err("SELinux: type rule already exists outside of a conditional.\n"); return -EINVAL; } /* * If we are reading the false list other will be a pointer to * the true list. We can have duplicate entries if there is only * 1 other entry and it is in our true list. * * If we are reading the true list (other == NULL) there shouldn't * be any other entries. */ if (other) { node_ptr = avtab_search_node(&p->te_cond_avtab, k); if (node_ptr) { if (avtab_search_node_next(node_ptr, k->specified)) { pr_err("SELinux: too many conflicting type rules.\n"); return -EINVAL; } found = false; for (i = 0; i < other->len; i++) { if (other->nodes[i] == node_ptr) { found = true; break; } } if (!found) { pr_err("SELinux: conflicting type rules.\n"); return -EINVAL; } } } else { if (avtab_search_node(&p->te_cond_avtab, k)) { pr_err("SELinux: conflicting type rules when adding type rule for true.\n"); return -EINVAL; } } } node_ptr = avtab_insert_nonunique(&p->te_cond_avtab, k, d); if (!node_ptr) { pr_err("SELinux: could not insert rule.\n"); return -ENOMEM; } *data->dst = node_ptr; return 0; } static int cond_read_av_list(struct policydb *p, struct policy_file *fp, struct cond_av_list *list, struct cond_av_list *other) { int rc; __le32 buf[1]; u32 i, len; struct cond_insertf_data data; rc = next_entry(buf, fp, sizeof(u32)); if (rc) return rc; len = le32_to_cpu(buf[0]); if (len == 0) return 0; list->nodes = kcalloc(len, sizeof(*list->nodes), GFP_KERNEL); if (!list->nodes) return -ENOMEM; data.p = p; data.other = other; for (i = 0; i < len; i++) { data.dst = &list->nodes[i]; rc = avtab_read_item(&p->te_cond_avtab, fp, p, cond_insertf, &data, true); if (rc) { kfree(list->nodes); list->nodes = NULL; return rc; } } list->len = len; return 0; } static int expr_node_isvalid(struct policydb *p, struct cond_expr_node *expr) { if (expr->expr_type <= 0 || expr->expr_type > COND_LAST) { pr_err("SELinux: conditional expressions uses unknown operator.\n"); return 0; } if (expr->boolean > p->p_bools.nprim) { pr_err("SELinux: conditional expressions uses unknown bool.\n"); return 0; } return 1; } static int cond_read_node(struct policydb *p, struct cond_node *node, struct policy_file *fp) { __le32 buf[2]; u32 i, len; int rc; rc = next_entry(buf, fp, sizeof(u32) * 2); if (rc) return rc; node->cur_state = le32_to_cpu(buf[0]); /* expr */ len = le32_to_cpu(buf[1]); node->expr.nodes = kcalloc(len, sizeof(*node->expr.nodes), GFP_KERNEL); if (!node->expr.nodes) return -ENOMEM; node->expr.len = len; for (i = 0; i < len; i++) { struct cond_expr_node *expr = &node->expr.nodes[i]; rc = next_entry(buf, fp, sizeof(u32) * 2); if (rc) return rc; expr->expr_type = le32_to_cpu(buf[0]); expr->boolean = le32_to_cpu(buf[1]); if (!expr_node_isvalid(p, expr)) return -EINVAL; } rc = cond_read_av_list(p, fp, &node->true_list, NULL); if (rc) return rc; return cond_read_av_list(p, fp, &node->false_list, &node->true_list); } int cond_read_list(struct policydb *p, struct policy_file *fp) { __le32 buf[1]; u32 i, len; int rc; rc = next_entry(buf, fp, sizeof(buf)); if (rc) return rc; len = le32_to_cpu(buf[0]); p->cond_list = kcalloc(len, sizeof(*p->cond_list), GFP_KERNEL); if (!p->cond_list) return -ENOMEM; rc = avtab_alloc(&(p->te_cond_avtab), p->te_avtab.nel); if (rc) goto err; p->cond_list_len = len; for (i = 0; i < len; i++) { rc = cond_read_node(p, &p->cond_list[i], fp); if (rc) goto err; } return 0; err: cond_list_destroy(p); return rc; } int cond_write_bool(void *vkey, void *datum, void *ptr) { char *key = vkey; struct cond_bool_datum *booldatum = datum; struct policy_data *pd = ptr; struct policy_file *fp = pd->fp; __le32 buf[3]; u32 len; int rc; len = strlen(key); buf[0] = cpu_to_le32(booldatum->value); buf[1] = cpu_to_le32(booldatum->state); buf[2] = cpu_to_le32(len); rc = put_entry(buf, sizeof(u32), 3, fp); if (rc) return rc; rc = put_entry(key, 1, len, fp); if (rc) return rc; return 0; } /* * cond_write_cond_av_list doesn't write out the av_list nodes. * Instead it writes out the key/value pairs from the avtab. This * is necessary because there is no way to uniquely identifying rules * in the avtab so it is not possible to associate individual rules * in the avtab with a conditional without saving them as part of * the conditional. This means that the avtab with the conditional * rules will not be saved but will be rebuilt on policy load. */ static int cond_write_av_list(struct policydb *p, struct cond_av_list *list, struct policy_file *fp) { __le32 buf[1]; u32 i; int rc; buf[0] = cpu_to_le32(list->len); rc = put_entry(buf, sizeof(u32), 1, fp); if (rc) return rc; for (i = 0; i < list->len; i++) { rc = avtab_write_item(p, list->nodes[i], fp); if (rc) return rc; } return 0; } static int cond_write_node(struct policydb *p, struct cond_node *node, struct policy_file *fp) { __le32 buf[2]; int rc; u32 i; buf[0] = cpu_to_le32(node->cur_state); rc = put_entry(buf, sizeof(u32), 1, fp); if (rc) return rc; buf[0] = cpu_to_le32(node->expr.len); rc = put_entry(buf, sizeof(u32), 1, fp); if (rc) return rc; for (i = 0; i < node->expr.len; i++) { buf[0] = cpu_to_le32(node->expr.nodes[i].expr_type); buf[1] = cpu_to_le32(node->expr.nodes[i].boolean); rc = put_entry(buf, sizeof(u32), 2, fp); if (rc) return rc; } rc = cond_write_av_list(p, &node->true_list, fp); if (rc) return rc; rc = cond_write_av_list(p, &node->false_list, fp); if (rc) return rc; return 0; } int cond_write_list(struct policydb *p, struct policy_file *fp) { u32 i; __le32 buf[1]; int rc; buf[0] = cpu_to_le32(p->cond_list_len); rc = put_entry(buf, sizeof(u32), 1, fp); if (rc) return rc; for (i = 0; i < p->cond_list_len; i++) { rc = cond_write_node(p, &p->cond_list[i], fp); if (rc) return rc; } return 0; } void cond_compute_xperms(struct avtab *ctab, struct avtab_key *key, struct extended_perms_decision *xpermd) { struct avtab_node *node; if (!ctab || !key || !xpermd) return; for (node = avtab_search_node(ctab, key); node; node = avtab_search_node_next(node, key->specified)) { if (node->key.specified & AVTAB_ENABLED) services_compute_xperms_decision(xpermd, node); } } /* Determine whether additional permissions are granted by the conditional * av table, and if so, add them to the result */ void cond_compute_av(struct avtab *ctab, struct avtab_key *key, struct av_decision *avd, struct extended_perms *xperms) { struct avtab_node *node; if (!ctab || !key || !avd) return; for (node = avtab_search_node(ctab, key); node; node = avtab_search_node_next(node, key->specified)) { if ((u16)(AVTAB_ALLOWED | AVTAB_ENABLED) == (node->key.specified & (AVTAB_ALLOWED | AVTAB_ENABLED))) avd->allowed |= node->datum.u.data; if ((u16)(AVTAB_AUDITDENY | AVTAB_ENABLED) == (node->key.specified & (AVTAB_AUDITDENY | AVTAB_ENABLED))) /* Since a '0' in an auditdeny mask represents a * permission we do NOT want to audit (dontaudit), we use * the '&' operand to ensure that all '0's in the mask * are retained (much unlike the allow and auditallow cases). */ avd->auditdeny &= node->datum.u.data; if ((u16)(AVTAB_AUDITALLOW | AVTAB_ENABLED) == (node->key.specified & (AVTAB_AUDITALLOW | AVTAB_ENABLED))) avd->auditallow |= node->datum.u.data; if (xperms && (node->key.specified & AVTAB_ENABLED) && (node->key.specified & AVTAB_XPERMS)) services_compute_xperms_drivers(xperms, node); } } static int cond_dup_av_list(struct cond_av_list *new, const struct cond_av_list *orig, struct avtab *avtab) { u32 i; memset(new, 0, sizeof(*new)); new->nodes = kcalloc(orig->len, sizeof(*new->nodes), GFP_KERNEL); if (!new->nodes) return -ENOMEM; for (i = 0; i < orig->len; i++) { new->nodes[i] = avtab_insert_nonunique( avtab, &orig->nodes[i]->key, &orig->nodes[i]->datum); if (!new->nodes[i]) return -ENOMEM; new->len++; } return 0; } static int duplicate_policydb_cond_list(struct policydb *newp, const struct policydb *origp) { int rc; u32 i; rc = avtab_alloc_dup(&newp->te_cond_avtab, &origp->te_cond_avtab); if (rc) return rc; newp->cond_list_len = 0; newp->cond_list = kcalloc(origp->cond_list_len, sizeof(*newp->cond_list), GFP_KERNEL); if (!newp->cond_list) goto error; for (i = 0; i < origp->cond_list_len; i++) { struct cond_node *newn = &newp->cond_list[i]; const struct cond_node *orign = &origp->cond_list[i]; newp->cond_list_len++; newn->cur_state = orign->cur_state; newn->expr.nodes = kmemdup(orign->expr.nodes, orign->expr.len * sizeof(*orign->expr.nodes), GFP_KERNEL); if (!newn->expr.nodes) goto error; newn->expr.len = orign->expr.len; rc = cond_dup_av_list(&newn->true_list, &orign->true_list, &newp->te_cond_avtab); if (rc) goto error; rc = cond_dup_av_list(&newn->false_list, &orign->false_list, &newp->te_cond_avtab); if (rc) goto error; } return 0; error: avtab_destroy(&newp->te_cond_avtab); cond_list_destroy(newp); return -ENOMEM; } static int cond_bools_destroy(void *key, void *datum, void *args) { /* key was not copied so no need to free here */ kfree(datum); return 0; } static int cond_bools_copy(struct hashtab_node *new, const struct hashtab_node *orig, void *args) { struct cond_bool_datum *datum; datum = kmemdup(orig->datum, sizeof(struct cond_bool_datum), GFP_KERNEL); if (!datum) return -ENOMEM; new->key = orig->key; /* No need to copy, never modified */ new->datum = datum; return 0; } static int cond_bools_index(void *key, void *datum, void *args) { struct cond_bool_datum *booldatum, **cond_bool_array; booldatum = datum; cond_bool_array = args; cond_bool_array[booldatum->value - 1] = booldatum; return 0; } static int duplicate_policydb_bools(struct policydb *newdb, const struct policydb *orig) { struct cond_bool_datum **cond_bool_array; int rc; cond_bool_array = kmalloc_array(orig->p_bools.nprim, sizeof(*orig->bool_val_to_struct), GFP_KERNEL); if (!cond_bool_array) return -ENOMEM; rc = hashtab_duplicate(&newdb->p_bools.table, &orig->p_bools.table, cond_bools_copy, cond_bools_destroy, NULL); if (rc) { kfree(cond_bool_array); return -ENOMEM; } hashtab_map(&newdb->p_bools.table, cond_bools_index, cond_bool_array); newdb->bool_val_to_struct = cond_bool_array; newdb->p_bools.nprim = orig->p_bools.nprim; return 0; } void cond_policydb_destroy_dup(struct policydb *p) { hashtab_map(&p->p_bools.table, cond_bools_destroy, NULL); hashtab_destroy(&p->p_bools.table); cond_policydb_destroy(p); } int cond_policydb_dup(struct policydb *new, const struct policydb *orig) { cond_policydb_init(new); if (duplicate_policydb_bools(new, orig)) return -ENOMEM; if (duplicate_policydb_cond_list(new, orig)) { cond_policydb_destroy_dup(new); return -ENOMEM; } return 0; }
1892 367 115 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 /* SPDX-License-Identifier: GPL-2.0-only */ /* * Copyright (C) 2013 Huawei Ltd. * Author: Jiang Liu <liuj97@gmail.com> * * Based on arch/arm/include/asm/jump_label.h */ #ifndef __ASM_JUMP_LABEL_H #define __ASM_JUMP_LABEL_H #ifndef __ASSEMBLY__ #include <linux/types.h> #include <asm/insn.h> #define HAVE_JUMP_LABEL_BATCH #define JUMP_LABEL_NOP_SIZE AARCH64_INSN_SIZE #define JUMP_TABLE_ENTRY(key, label) \ ".pushsection __jump_table, \"aw\"\n\t" \ ".align 3\n\t" \ ".long 1b - ., " label " - .\n\t" \ ".quad " key " - .\n\t" \ ".popsection\n\t" /* This macro is also expanded on the Rust side. */ #define ARCH_STATIC_BRANCH_ASM(key, label) \ "1: nop\n\t" \ JUMP_TABLE_ENTRY(key, label) static __always_inline bool arch_static_branch(struct static_key * const key, const bool branch) { char *k = &((char *)key)[branch]; asm goto( ARCH_STATIC_BRANCH_ASM("%c0", "%l[l_yes]") : : "i"(k) : : l_yes ); return false; l_yes: return true; } static __always_inline bool arch_static_branch_jump(struct static_key * const key, const bool branch) { char *k = &((char *)key)[branch]; asm goto( "1: b %l[l_yes] \n\t" JUMP_TABLE_ENTRY("%c0", "%l[l_yes]") : : "i"(k) : : l_yes ); return false; l_yes: return true; } #endif /* __ASSEMBLY__ */ #endif /* __ASM_JUMP_LABEL_H */
6 6 6 6 5 5 6 6 5 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 1517 1518 1519 1520 1521 1522 1523 1524 1525 1526 1527 1528 1529 1530 1531 1532 1533 1534 1535 1536 1537 1538 1539 1540 1541 1542 1543 1544 1545 1546 1547 1548 1549 1550 1551 1552 1553 1554 1555 1556 1557 1558 1559 1560 1561 1562 1563 1564 1565 1566 1567 1568 1569 1570 1571 1572 1573 1574 1575 1576 1577 1578 1579 1580 1581 1582 1583 1584 1585 1586 1587 1588 1589 1590 1591 1592 1593 1594 1595 1596 1597 1598 1599 1600 1601 1602 1603 1604 1605 1606 1607 1608 1609 1610 1611 1612 1613 1614 1615 1616 1617 1618 1619 1620 1621 1622 1623 1624 1625 1626 1627 1628 1629 1630 1631 1632 1633 1634 1635 1636 1637 1638 1639 1640 1641 1642 1643 1644 1645 1646 1647 1648 1649 1650 1651 1652 1653 1654 1655 1656 1657 1658 1659 1660 1661 1662 1663 1664 1665 1666 1667 1668 1669 1670 1671 1672 1673 1674 1675 1676 1677 1678 1679 1680 1681 1682 1683 1684 1685 1686 1687 1688 1689 1690 1691 1692 1693 1694 1695 1696 1697 1698 1699 1700 1701 1702 1703 1704 1705 1706 1707 1708 1709 1710 1711 1712 1713 1714 1715 1716 1717 1718 1719 1720 1721 1722 1723 1724 1725 1726 1727 1728 1729 1730 1731 1732 1733 1734 1735 1736 1737 1738 1739 1740 1741 1742 1743 1744 1745 1746 1747 1748 1749 1750 1751 1752 1753 1754 1755 1756 1757 1758 1759 1760 1761 1762 1763 1764 1765 1766 1767 1768 1769 1770 1771 1772 1773 1774 1775 1776 1777 1778 1779 1780 1781 1782 1783 1784 1785 1786 1787 1788 1789 1790 1791 1792 1793 1794 1795 1796 1797 1798 1799 1800 1801 1802 1803 1804 1805 1806 1807 1808 1809 1810 1811 1812 1813 1814 1815 1816 1817 1818 1819 1820 1821 1822 1823 1824 1825 1826 1827 1828 1829 1830 1831 1832 1833 1834 1835 1836 1837 1838 1839 1840 1841 1842 1843 1844 1845 1846 1847 1848 1849 1850 1851 1852 1853 1854 1855 1856 1857 1858 1859 1860 1861 1862 1863 1864 1865 1866 1867 1868 1869 1870 1871 1872 1873 1874 1875 1876 1877 1878 1879 1880 1881 1882 1883 1884 1885 1886 1887 1888 1889 1890 1891 1892 1893 1894 1895 1896 1897 1898 1899 1900 1901 1902 1903 1904 1905 1906 1907 1908 1909 1910 1911 1912 1913 1914 1915 1916 1917 1918 1919 1920 1921 1922 1923 1924 1925 1926 1927 1928 1929 1930 1931 1932 1933 1934 1935 1936 1937 1938 1939 1940 1941 1942 1943 1944 1945 1946 1947 1948 1949 1950 1951 1952 1953 1954 1955 1956 1957 1958 1959 1960 1961 1962 1963 1964 1965 1966 1967 1968 1969 1970 1971 1972 1973 1974 1975 1976 1977 1978 1979 1980 1981 1982 1983 1984 1985 1986 1987 1988 1989 1990 1991 1992 1993 1994 1995 1996 1997 1998 1999 2000 2001 2002 2003 2004 2005 2006 2007 2008 2009 2010 2011 2012 2013 2014 2015 2016 2017 2018 2019 2020 2021 2022 2023 2024 2025 2026 2027 2028 2029 2030 2031 2032 2033 2034 2035 2036 2037 2038 2039 2040 2041 2042 2043 2044 2045 2046 2047 2048 2049 2050 2051 2052 2053 2054 2055 2056 2057 2058 2059 2060 2061 2062 2063 2064 2065 2066 2067 2068 2069 2070 2071 2072 2073 2074 2075 2076 2077 2078 2079 2080 2081 2082 2083 2084 2085 2086 2087 2088 2089 2090 2091 2092 2093 2094 2095 2096 2097 2098 2099 2100 2101 2102 2103 2104 2105 2106 2107 2108 2109 2110 2111 2112 2113 2114 2115 2116 2117 2118 2119 2120 2121 2122 2123 2124 2125 2126 2127 2128 2129 2130 2131 2132 2133 2134 2135 2136 2137 2138 2139 2140 2141 2142 2143 2144 2145 2146 2147 2148 2149 2150 2151 2152 2153 2154 2155 2156 2157 2158 2159 2160 2161 2162 2163 2164 2165 2166 2167 2168 2169 2170 2171 2172 2173 2174 2175 2176 2177 2178 2179 2180 2181 2182 2183 2184 2185 2186 2187 2188 2189 2190 2191 2192 2193 2194 2195 2196 2197 2198 2199 2200 2201 2202 2203 2204 2205 2206 2207 2208 2209 2210 2211 2212 2213 2214 2215 2216 2217 2218 2219 2220 2221 2222 2223 2224 2225 2226 2227 2228 2229 2230 2231 2232 2233 2234 2235 2236 2237 2238 2239 2240 2241 2242 2243 2244 2245 2246 2247 2248 2249 2250 2251 2252 2253 2254 2255 2256 2257 2258 2259 2260 2261 2262 2263 2264 2265 2266 2267 2268 2269 2270 2271 2272 2273 2274 2275 2276 2277 2278 2279 2280 2281 2282 2283 2284 2285 2286 2287 2288 2289 2290 2291 2292 2293 2294 2295 2296 2297 2298 2299 2300 2301 2302 2303 2304 2305 2306 2307 2308 2309 2310 2311 2312 2313 2314 2315 2316 2317 2318 2319 2320 2321 2322 2323 2324 2325 2326 2327 2328 2329 2330 2331 2332 2333 2334 2335 2336 2337 2338 2339 2340 2341 2342 2343 2344 2345 2346 2347 2348 2349 2350 2351 2352 2353 2354 2355 2356 2357 2358 2359 2360 2361 2362 2363 2364 2365 2366 2367 2368 2369 2370 2371 2372 2373 2374 2375 2376 2377 2378 2379 2380 2381 2382 2383 2384 2385 2386 2387 2388 2389 2390 2391 2392 2393 2394 2395 2396 2397 2398 2399 2400 2401 2402 2403 2404 2405 2406 2407 2408 2409 2410 2411 2412 2413 2414 2415 2416 2417 2418 2419 2420 2421 2422 2423 2424 2425 2426 2427 2428 2429 2430 2431 2432 2433 2434 2435 2436 2437 2438 2439 2440 2441 2442 2443 2444 2445 2446 2447 2448 2449 2450 2451 2452 2453 2454 2455 2456 2457 2458 2459 2460 2461 2462 2463 2464 2465 2466 2467 2468 2469 2470 2471 2472 2473 2474 2475 2476 2477 2478 2479 2480 2481 2482 2483 2484 2485 2486 2487 2488 2489 2490 2491 2492 2493 2494 2495 2496 2497 2498 2499 2500 2501 2502 2503 2504 2505 2506 2507 2508 2509 2510 2511 2512 2513 2514 2515 2516 2517 2518 2519 2520 2521 2522 2523 2524 2525 2526 2527 2528 2529 2530 2531 2532 2533 2534 2535 2536 2537 2538 2539 2540 2541 2542 2543 2544 2545 2546 2547 2548 2549 2550 2551 2552 2553 2554 2555 2556 2557 2558 2559 2560 2561 2562 2563 2564 2565 2566 2567 2568 2569 2570 2571 2572 2573 2574 2575 2576 2577 2578 2579 2580 2581 2582 2583 2584 2585 2586 2587 2588 2589 2590 2591 2592 2593 2594 2595 2596 2597 2598 2599 2600 2601 2602 2603 2604 2605 2606 2607 2608 2609 2610 2611 2612 2613 2614 2615 2616 2617 2618 2619 2620 2621 2622 2623 2624 2625 2626 2627 2628 2629 2630 2631 2632 2633 2634 2635 2636 2637 2638 2639 2640 2641 2642 2643 2644 2645 2646 2647 2648 2649 2650 2651 2652 2653 2654 2655 2656 2657 2658 2659 2660 2661 2662 2663 2664 2665 2666 2667 2668 2669 2670 2671 2672 2673 2674 2675 2676 2677 2678 2679 2680 2681 2682 2683 2684 2685 2686 2687 2688 2689 2690 2691 2692 2693 2694 2695 2696 2697 2698 2699 2700 2701 2702 2703 2704 2705 2706 2707 2708 2709 2710 2711 2712 2713 2714 2715 2716 2717 2718 2719 2720 2721 2722 2723 2724 2725 2726 2727 2728 2729 2730 2731 2732 2733 2734 2735 2736 2737 2738 2739 2740 2741 2742 2743 2744 2745 2746 2747 2748 2749 2750 2751 2752 2753 2754 2755 2756 2757 2758 2759 2760 2761 2762 2763 2764 2765 2766 2767 2768 2769 2770 2771 2772 2773 2774 2775 2776 2777 2778 2779 2780 2781 2782 2783 2784 2785 2786 2787 2788 2789 2790 2791 2792 2793 2794 2795 2796 2797 2798 2799 2800 2801 2802 2803 2804 2805 2806 2807 2808 2809 2810 2811 2812 2813 2814 2815 2816 2817 2818 2819 2820 2821 2822 2823 2824 2825 2826 2827 2828 2829 2830 2831 2832 2833 2834 2835 2836 2837 2838 2839 2840 2841 2842 2843 2844 2845 2846 2847 2848 2849 2850 2851 2852 2853 2854 2855 2856 2857 2858 2859 2860 2861 2862 2863 2864 2865 2866 2867 2868 2869 2870 2871 2872 2873 2874 2875 2876 2877 2878 2879 2880 2881 2882 2883 2884 2885 2886 2887 2888 2889 2890 2891 2892 2893 2894 2895 2896 2897 2898 2899 2900 2901 2902 2903 2904 2905 2906 2907 2908 2909 2910 2911 2912 2913 2914 2915 2916 2917 2918 2919 2920 2921 2922 2923 2924 2925 2926 2927 2928 2929 2930 2931 2932 2933 2934 2935 2936 2937 2938 2939 2940 2941 2942 2943 2944 2945 2946 2947 2948 2949 2950 2951 2952 2953 2954 2955 2956 2957 2958 2959 2960 2961 2962 2963 2964 2965 2966 2967 2968 2969 2970 2971 2972 2973 2974 2975 2976 2977 2978 2979 2980 2981 2982 2983 2984 2985 2986 2987 2988 2989 2990 2991 2992 2993 2994 2995 2996 2997 2998 2999 3000 3001 3002 3003 3004 3005 3006 3007 3008 3009 3010 3011 3012 3013 3014 3015 3016 3017 3018 3019 3020 3021 3022 3023 3024 3025 3026 3027 3028 3029 3030 3031 3032 3033 3034 3035 3036 3037 3038 3039 3040 3041 3042 3043 3044 3045 3046 3047 3048 3049 3050 3051 3052 3053 3054 3055 3056 3057 3058 3059 3060 3061 3062 3063 3064 3065 3066 3067 3068 3069 3070 3071 3072 3073 3074 3075 3076 3077 3078 3079 3080 3081 3082 3083 3084 3085 3086 3087 3088 3089 3090 3091 3092 3093 3094 3095 3096 3097 3098 3099 3100 3101 3102 3103 3104 3105 /* * Copyright (c) 2004 Topspin Communications. All rights reserved. * Copyright (c) 2005 Sun Microsystems, Inc. All rights reserved. * * This software is available to you under a choice of one of two * licenses. You may choose to be licensed under the terms of the GNU * General Public License (GPL) Version 2, available from the file * COPYING in the main directory of this source tree, or the * OpenIB.org BSD license below: * * Redistribution and use in source and binary forms, with or * without modification, are permitted provided that the following * conditions are met: * * - Redistributions of source code must retain the above * copyright notice, this list of conditions and the following * disclaimer. * * - Redistributions in binary form must reproduce the above * copyright notice, this list of conditions and the following * disclaimer in the documentation and/or other materials * provided with the distribution. * * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE * SOFTWARE. */ #include <linux/module.h> #include <linux/string.h> #include <linux/errno.h> #include <linux/kernel.h> #include <linux/slab.h> #include <linux/init.h> #include <linux/netdevice.h> #include <net/net_namespace.h> #include <linux/security.h> #include <linux/notifier.h> #include <linux/hashtable.h> #include <rdma/rdma_netlink.h> #include <rdma/ib_addr.h> #include <rdma/ib_cache.h> #include <rdma/rdma_counter.h> #include "core_priv.h" #include "restrack.h" MODULE_AUTHOR("Roland Dreier"); MODULE_DESCRIPTION("core kernel InfiniBand API"); MODULE_LICENSE("Dual BSD/GPL"); struct workqueue_struct *ib_comp_wq; struct workqueue_struct *ib_comp_unbound_wq; struct workqueue_struct *ib_wq; EXPORT_SYMBOL_GPL(ib_wq); static struct workqueue_struct *ib_unreg_wq; /* * Each of the three rwsem locks (devices, clients, client_data) protects the * xarray of the same name. Specifically it allows the caller to assert that * the MARK will/will not be changing under the lock, and for devices and * clients, that the value in the xarray is still a valid pointer. Change of * the MARK is linked to the object state, so holding the lock and testing the * MARK also asserts that the contained object is in a certain state. * * This is used to build a two stage register/unregister flow where objects * can continue to be in the xarray even though they are still in progress to * register/unregister. * * The xarray itself provides additional locking, and restartable iteration, * which is also relied on. * * Locks should not be nested, with the exception of client_data, which is * allowed to nest under the read side of the other two locks. * * The devices_rwsem also protects the device name list, any change or * assignment of device name must also hold the write side to guarantee unique * names. */ /* * devices contains devices that have had their names assigned. The * devices may not be registered. Users that care about the registration * status need to call ib_device_try_get() on the device to ensure it is * registered, and keep it registered, for the required duration. * */ static DEFINE_XARRAY_FLAGS(devices, XA_FLAGS_ALLOC); static DECLARE_RWSEM(devices_rwsem); #define DEVICE_REGISTERED XA_MARK_1 static u32 highest_client_id; #define CLIENT_REGISTERED XA_MARK_1 static DEFINE_XARRAY_FLAGS(clients, XA_FLAGS_ALLOC); static DECLARE_RWSEM(clients_rwsem); static void ib_client_put(struct ib_client *client) { if (refcount_dec_and_test(&client->uses)) complete(&client->uses_zero); } /* * If client_data is registered then the corresponding client must also still * be registered. */ #define CLIENT_DATA_REGISTERED XA_MARK_1 unsigned int rdma_dev_net_id; /* * A list of net namespaces is maintained in an xarray. This is necessary * because we can't get the locking right using the existing net ns list. We * would require a init_net callback after the list is updated. */ static DEFINE_XARRAY_FLAGS(rdma_nets, XA_FLAGS_ALLOC); /* * rwsem to protect accessing the rdma_nets xarray entries. */ static DECLARE_RWSEM(rdma_nets_rwsem); bool ib_devices_shared_netns = true; module_param_named(netns_mode, ib_devices_shared_netns, bool, 0444); MODULE_PARM_DESC(netns_mode, "Share device among net namespaces; default=1 (shared)"); /** * rdma_dev_access_netns() - Return whether an rdma device can be accessed * from a specified net namespace or not. * @dev: Pointer to rdma device which needs to be checked * @net: Pointer to net namesapce for which access to be checked * * When the rdma device is in shared mode, it ignores the net namespace. * When the rdma device is exclusive to a net namespace, rdma device net * namespace is checked against the specified one. */ bool rdma_dev_access_netns(const struct ib_device *dev, const struct net *net) { return (ib_devices_shared_netns || net_eq(read_pnet(&dev->coredev.rdma_net), net)); } EXPORT_SYMBOL(rdma_dev_access_netns); /* * xarray has this behavior where it won't iterate over NULL values stored in * allocated arrays. So we need our own iterator to see all values stored in * the array. This does the same thing as xa_for_each except that it also * returns NULL valued entries if the array is allocating. Simplified to only * work on simple xarrays. */ static void *xan_find_marked(struct xarray *xa, unsigned long *indexp, xa_mark_t filter) { XA_STATE(xas, xa, *indexp); void *entry; rcu_read_lock(); do { entry = xas_find_marked(&xas, ULONG_MAX, filter); if (xa_is_zero(entry)) break; } while (xas_retry(&xas, entry)); rcu_read_unlock(); if (entry) { *indexp = xas.xa_index; if (xa_is_zero(entry)) return NULL; return entry; } return XA_ERROR(-ENOENT); } #define xan_for_each_marked(xa, index, entry, filter) \ for (index = 0, entry = xan_find_marked(xa, &(index), filter); \ !xa_is_err(entry); \ (index)++, entry = xan_find_marked(xa, &(index), filter)) /* RCU hash table mapping netdevice pointers to struct ib_port_data */ static DEFINE_SPINLOCK(ndev_hash_lock); static DECLARE_HASHTABLE(ndev_hash, 5); static void free_netdevs(struct ib_device *ib_dev); static void ib_unregister_work(struct work_struct *work); static void __ib_unregister_device(struct ib_device *device); static int ib_security_change(struct notifier_block *nb, unsigned long event, void *lsm_data); static void ib_policy_change_task(struct work_struct *work); static DECLARE_WORK(ib_policy_change_work, ib_policy_change_task); static void __ibdev_printk(const char *level, const struct ib_device *ibdev, struct va_format *vaf) { if (ibdev && ibdev->dev.parent) dev_printk_emit(level[1] - '0', ibdev->dev.parent, "%s %s %s: %pV", dev_driver_string(ibdev->dev.parent), dev_name(ibdev->dev.parent), dev_name(&ibdev->dev), vaf); else if (ibdev) printk("%s%s: %pV", level, dev_name(&ibdev->dev), vaf); else printk("%s(NULL ib_device): %pV", level, vaf); } #define define_ibdev_printk_level(func, level) \ void func(const struct ib_device *ibdev, const char *fmt, ...) \ { \ struct va_format vaf; \ va_list args; \ \ va_start(args, fmt); \ \ vaf.fmt = fmt; \ vaf.va = &args; \ \ __ibdev_printk(level, ibdev, &vaf); \ \ va_end(args); \ } \ EXPORT_SYMBOL(func); define_ibdev_printk_level(ibdev_emerg, KERN_EMERG); define_ibdev_printk_level(ibdev_alert, KERN_ALERT); define_ibdev_printk_level(ibdev_crit, KERN_CRIT); define_ibdev_printk_level(ibdev_err, KERN_ERR); define_ibdev_printk_level(ibdev_warn, KERN_WARNING); define_ibdev_printk_level(ibdev_notice, KERN_NOTICE); define_ibdev_printk_level(ibdev_info, KERN_INFO); static struct notifier_block ibdev_lsm_nb = { .notifier_call = ib_security_change, }; static int rdma_dev_change_netns(struct ib_device *device, struct net *cur_net, struct net *net); /* Pointer to the RCU head at the start of the ib_port_data array */ struct ib_port_data_rcu { struct rcu_head rcu_head; struct ib_port_data pdata[]; }; static void ib_device_check_mandatory(struct ib_device *device) { #define IB_MANDATORY_FUNC(x) { offsetof(struct ib_device_ops, x), #x } static const struct { size_t offset; char *name; } mandatory_table[] = { IB_MANDATORY_FUNC(query_device), IB_MANDATORY_FUNC(query_port), IB_MANDATORY_FUNC(alloc_pd), IB_MANDATORY_FUNC(dealloc_pd), IB_MANDATORY_FUNC(create_qp), IB_MANDATORY_FUNC(modify_qp), IB_MANDATORY_FUNC(destroy_qp), IB_MANDATORY_FUNC(post_send), IB_MANDATORY_FUNC(post_recv), IB_MANDATORY_FUNC(create_cq), IB_MANDATORY_FUNC(destroy_cq), IB_MANDATORY_FUNC(poll_cq), IB_MANDATORY_FUNC(req_notify_cq), IB_MANDATORY_FUNC(get_dma_mr), IB_MANDATORY_FUNC(reg_user_mr), IB_MANDATORY_FUNC(dereg_mr), IB_MANDATORY_FUNC(get_port_immutable) }; int i; device->kverbs_provider = true; for (i = 0; i < ARRAY_SIZE(mandatory_table); ++i) { if (!*(void **) ((void *) &device->ops + mandatory_table[i].offset)) { device->kverbs_provider = false; break; } } } /* * Caller must perform ib_device_put() to return the device reference count * when ib_device_get_by_index() returns valid device pointer. */ struct ib_device *ib_device_get_by_index(const struct net *net, u32 index) { struct ib_device *device; down_read(&devices_rwsem); device = xa_load(&devices, index); if (device) { if (!rdma_dev_access_netns(device, net)) { device = NULL; goto out; } if (!ib_device_try_get(device)) device = NULL; } out: up_read(&devices_rwsem); return device; } /** * ib_device_put - Release IB device reference * @device: device whose reference to be released * * ib_device_put() releases reference to the IB device to allow it to be * unregistered and eventually free. */ void ib_device_put(struct ib_device *device) { if (refcount_dec_and_test(&device->refcount)) complete(&device->unreg_completion); } EXPORT_SYMBOL(ib_device_put); static struct ib_device *__ib_device_get_by_name(const char *name) { struct ib_device *device; unsigned long index; xa_for_each (&devices, index, device) if (!strcmp(name, dev_name(&device->dev))) return device; return NULL; } /** * ib_device_get_by_name - Find an IB device by name * @name: The name to look for * @driver_id: The driver ID that must match (RDMA_DRIVER_UNKNOWN matches all) * * Find and hold an ib_device by its name. The caller must call * ib_device_put() on the returned pointer. */ struct ib_device *ib_device_get_by_name(const char *name, enum rdma_driver_id driver_id) { struct ib_device *device; down_read(&devices_rwsem); device = __ib_device_get_by_name(name); if (device && driver_id != RDMA_DRIVER_UNKNOWN && device->ops.driver_id != driver_id) device = NULL; if (device) { if (!ib_device_try_get(device)) device = NULL; } up_read(&devices_rwsem); return device; } EXPORT_SYMBOL(ib_device_get_by_name); static int rename_compat_devs(struct ib_device *device) { struct ib_core_device *cdev; unsigned long index; int ret = 0; mutex_lock(&device->compat_devs_mutex); xa_for_each (&device->compat_devs, index, cdev) { ret = device_rename(&cdev->dev, dev_name(&device->dev)); if (ret) { dev_warn(&cdev->dev, "Fail to rename compatdev to new name %s\n", dev_name(&device->dev)); break; } } mutex_unlock(&device->compat_devs_mutex); return ret; } int ib_device_rename(struct ib_device *ibdev, const char *name) { unsigned long index; void *client_data; int ret; down_write(&devices_rwsem); if (!strcmp(name, dev_name(&ibdev->dev))) { up_write(&devices_rwsem); return 0; } if (__ib_device_get_by_name(name)) { up_write(&devices_rwsem); return -EEXIST; } ret = device_rename(&ibdev->dev, name); if (ret) { up_write(&devices_rwsem); return ret; } strscpy(ibdev->name, name, IB_DEVICE_NAME_MAX); ret = rename_compat_devs(ibdev); downgrade_write(&devices_rwsem); down_read(&ibdev->client_data_rwsem); xan_for_each_marked(&ibdev->client_data, index, client_data, CLIENT_DATA_REGISTERED) { struct ib_client *client = xa_load(&clients, index); if (!client || !client->rename) continue; client->rename(ibdev, client_data); } up_read(&ibdev->client_data_rwsem); rdma_nl_notify_event(ibdev, 0, RDMA_RENAME_EVENT); up_read(&devices_rwsem); return 0; } int ib_device_set_dim(struct ib_device *ibdev, u8 use_dim) { if (use_dim > 1) return -EINVAL; ibdev->use_cq_dim = use_dim; return 0; } static int alloc_name(struct ib_device *ibdev, const char *name) { struct ib_device *device; unsigned long index; struct ida inuse; int rc; int i; lockdep_assert_held_write(&devices_rwsem); ida_init(&inuse); xa_for_each (&devices, index, device) { char buf[IB_DEVICE_NAME_MAX]; if (sscanf(dev_name(&device->dev), name, &i) != 1) continue; if (i < 0 || i >= INT_MAX) continue; snprintf(buf, sizeof buf, name, i); if (strcmp(buf, dev_name(&device->dev)) != 0) continue; rc = ida_alloc_range(&inuse, i, i, GFP_KERNEL); if (rc < 0) goto out; } rc = ida_alloc(&inuse, GFP_KERNEL); if (rc < 0) goto out; rc = dev_set_name(&ibdev->dev, name, rc); out: ida_destroy(&inuse); return rc; } static void ib_device_release(struct device *device) { struct ib_device *dev = container_of(device, struct ib_device, dev); free_netdevs(dev); WARN_ON(refcount_read(&dev->refcount)); if (dev->hw_stats_data) ib_device_release_hw_stats(dev->hw_stats_data); if (dev->port_data) { ib_cache_release_one(dev); ib_security_release_port_pkey_list(dev); rdma_counter_release(dev); kfree_rcu(container_of(dev->port_data, struct ib_port_data_rcu, pdata[0]), rcu_head); } mutex_destroy(&dev->subdev_lock); mutex_destroy(&dev->unregistration_lock); mutex_destroy(&dev->compat_devs_mutex); xa_destroy(&dev->compat_devs); xa_destroy(&dev->client_data); kfree_rcu(dev, rcu_head); } static int ib_device_uevent(const struct device *device, struct kobj_uevent_env *env) { if (add_uevent_var(env, "NAME=%s", dev_name(device))) return -ENOMEM; /* * It would be nice to pass the node GUID with the event... */ return 0; } static const void *net_namespace(const struct device *d) { const struct ib_core_device *coredev = container_of(d, struct ib_core_device, dev); return read_pnet(&coredev->rdma_net); } static struct class ib_class = { .name = "infiniband", .dev_release = ib_device_release, .dev_uevent = ib_device_uevent, .ns_type = &net_ns_type_operations, .namespace = net_namespace, }; static void rdma_init_coredev(struct ib_core_device *coredev, struct ib_device *dev, struct net *net) { bool is_full_dev = &dev->coredev == coredev; /* This BUILD_BUG_ON is intended to catch layout change * of union of ib_core_device and device. * dev must be the first element as ib_core and providers * driver uses it. Adding anything in ib_core_device before * device will break this assumption. */ BUILD_BUG_ON(offsetof(struct ib_device, coredev.dev) != offsetof(struct ib_device, dev)); coredev->dev.class = &ib_class; coredev->dev.groups = dev->groups; /* * Don't expose hw counters outside of the init namespace. */ if (!is_full_dev && dev->hw_stats_attr_index) coredev->dev.groups[dev->hw_stats_attr_index] = NULL; device_initialize(&coredev->dev); coredev->owner = dev; INIT_LIST_HEAD(&coredev->port_list); write_pnet(&coredev->rdma_net, net); } /** * _ib_alloc_device - allocate an IB device struct * @size:size of structure to allocate * * Low-level drivers should use ib_alloc_device() to allocate &struct * ib_device. @size is the size of the structure to be allocated, * including any private data used by the low-level driver. * ib_dealloc_device() must be used to free structures allocated with * ib_alloc_device(). */ struct ib_device *_ib_alloc_device(size_t size) { struct ib_device *device; unsigned int i; if (WARN_ON(size < sizeof(struct ib_device))) return NULL; device = kzalloc(size, GFP_KERNEL); if (!device) return NULL; if (rdma_restrack_init(device)) { kfree(device); return NULL; } rdma_init_coredev(&device->coredev, device, &init_net); INIT_LIST_HEAD(&device->event_handler_list); spin_lock_init(&device->qp_open_list_lock); init_rwsem(&device->event_handler_rwsem); mutex_init(&device->unregistration_lock); /* * client_data needs to be alloc because we don't want our mark to be * destroyed if the user stores NULL in the client data. */ xa_init_flags(&device->client_data, XA_FLAGS_ALLOC); init_rwsem(&device->client_data_rwsem); xa_init_flags(&device->compat_devs, XA_FLAGS_ALLOC); mutex_init(&device->compat_devs_mutex); init_completion(&device->unreg_completion); INIT_WORK(&device->unregistration_work, ib_unregister_work); spin_lock_init(&device->cq_pools_lock); for (i = 0; i < ARRAY_SIZE(device->cq_pools); i++) INIT_LIST_HEAD(&device->cq_pools[i]); rwlock_init(&device->cache_lock); device->uverbs_cmd_mask = BIT_ULL(IB_USER_VERBS_CMD_ALLOC_MW) | BIT_ULL(IB_USER_VERBS_CMD_ALLOC_PD) | BIT_ULL(IB_USER_VERBS_CMD_ATTACH_MCAST) | BIT_ULL(IB_USER_VERBS_CMD_CLOSE_XRCD) | BIT_ULL(IB_USER_VERBS_CMD_CREATE_AH) | BIT_ULL(IB_USER_VERBS_CMD_CREATE_COMP_CHANNEL) | BIT_ULL(IB_USER_VERBS_CMD_CREATE_CQ) | BIT_ULL(IB_USER_VERBS_CMD_CREATE_QP) | BIT_ULL(IB_USER_VERBS_CMD_CREATE_SRQ) | BIT_ULL(IB_USER_VERBS_CMD_CREATE_XSRQ) | BIT_ULL(IB_USER_VERBS_CMD_DEALLOC_MW) | BIT_ULL(IB_USER_VERBS_CMD_DEALLOC_PD) | BIT_ULL(IB_USER_VERBS_CMD_DEREG_MR) | BIT_ULL(IB_USER_VERBS_CMD_DESTROY_AH) | BIT_ULL(IB_USER_VERBS_CMD_DESTROY_CQ) | BIT_ULL(IB_USER_VERBS_CMD_DESTROY_QP) | BIT_ULL(IB_USER_VERBS_CMD_DESTROY_SRQ) | BIT_ULL(IB_USER_VERBS_CMD_DETACH_MCAST) | BIT_ULL(IB_USER_VERBS_CMD_GET_CONTEXT) | BIT_ULL(IB_USER_VERBS_CMD_MODIFY_QP) | BIT_ULL(IB_USER_VERBS_CMD_MODIFY_SRQ) | BIT_ULL(IB_USER_VERBS_CMD_OPEN_QP) | BIT_ULL(IB_USER_VERBS_CMD_OPEN_XRCD) | BIT_ULL(IB_USER_VERBS_CMD_QUERY_DEVICE) | BIT_ULL(IB_USER_VERBS_CMD_QUERY_PORT) | BIT_ULL(IB_USER_VERBS_CMD_QUERY_QP) | BIT_ULL(IB_USER_VERBS_CMD_QUERY_SRQ) | BIT_ULL(IB_USER_VERBS_CMD_REG_MR) | BIT_ULL(IB_USER_VERBS_CMD_REREG_MR) | BIT_ULL(IB_USER_VERBS_CMD_RESIZE_CQ); mutex_init(&device->subdev_lock); INIT_LIST_HEAD(&device->subdev_list_head); INIT_LIST_HEAD(&device->subdev_list); return device; } EXPORT_SYMBOL(_ib_alloc_device); /** * ib_dealloc_device - free an IB device struct * @device:structure to free * * Free a structure allocated with ib_alloc_device(). */ void ib_dealloc_device(struct ib_device *device) { if (device->ops.dealloc_driver) device->ops.dealloc_driver(device); /* * ib_unregister_driver() requires all devices to remain in the xarray * while their ops are callable. The last op we call is dealloc_driver * above. This is needed to create a fence on op callbacks prior to * allowing the driver module to unload. */ down_write(&devices_rwsem); if (xa_load(&devices, device->index) == device) xa_erase(&devices, device->index); up_write(&devices_rwsem); /* Expedite releasing netdev references */ free_netdevs(device); WARN_ON(!xa_empty(&device->compat_devs)); WARN_ON(!xa_empty(&device->client_data)); WARN_ON(refcount_read(&device->refcount)); rdma_restrack_clean(device); /* Balances with device_initialize */ put_device(&device->dev); } EXPORT_SYMBOL(ib_dealloc_device); /* * add_client_context() and remove_client_context() must be safe against * parallel calls on the same device - registration/unregistration of both the * device and client can be occurring in parallel. * * The routines need to be a fence, any caller must not return until the add * or remove is fully completed. */ static int add_client_context(struct ib_device *device, struct ib_client *client) { int ret = 0; if (!device->kverbs_provider && !client->no_kverbs_req) return 0; down_write(&device->client_data_rwsem); /* * So long as the client is registered hold both the client and device * unregistration locks. */ if (!refcount_inc_not_zero(&client->uses)) goto out_unlock; refcount_inc(&device->refcount); /* * Another caller to add_client_context got here first and has already * completely initialized context. */ if (xa_get_mark(&device->client_data, client->client_id, CLIENT_DATA_REGISTERED)) goto out; ret = xa_err(xa_store(&device->client_data, client->client_id, NULL, GFP_KERNEL)); if (ret) goto out; downgrade_write(&device->client_data_rwsem); if (client->add) { if (client->add(device)) { /* * If a client fails to add then the error code is * ignored, but we won't call any more ops on this * client. */ xa_erase(&device->client_data, client->client_id); up_read(&device->client_data_rwsem); ib_device_put(device); ib_client_put(client); return 0; } } /* Readers shall not see a client until add has been completed */ xa_set_mark(&device->client_data, client->client_id, CLIENT_DATA_REGISTERED); up_read(&device->client_data_rwsem); return 0; out: ib_device_put(device); ib_client_put(client); out_unlock: up_write(&device->client_data_rwsem); return ret; } static void remove_client_context(struct ib_device *device, unsigned int client_id) { struct ib_client *client; void *client_data; down_write(&device->client_data_rwsem); if (!xa_get_mark(&device->client_data, client_id, CLIENT_DATA_REGISTERED)) { up_write(&device->client_data_rwsem); return; } client_data = xa_load(&device->client_data, client_id); xa_clear_mark(&device->client_data, client_id, CLIENT_DATA_REGISTERED); client = xa_load(&clients, client_id); up_write(&device->client_data_rwsem); /* * Notice we cannot be holding any exclusive locks when calling the * remove callback as the remove callback can recurse back into any * public functions in this module and thus try for any locks those * functions take. * * For this reason clients and drivers should not call the * unregistration functions will holdling any locks. */ if (client->remove) client->remove(device, client_data); xa_erase(&device->client_data, client_id); ib_device_put(device); ib_client_put(client); } static int alloc_port_data(struct ib_device *device) { struct ib_port_data_rcu *pdata_rcu; u32 port; if (device->port_data) return 0; /* This can only be called once the physical port range is defined */ if (WARN_ON(!device->phys_port_cnt)) return -EINVAL; /* Reserve U32_MAX so the logic to go over all the ports is sane */ if (WARN_ON(device->phys_port_cnt == U32_MAX)) return -EINVAL; /* * device->port_data is indexed directly by the port number to make * access to this data as efficient as possible. * * Therefore port_data is declared as a 1 based array with potential * empty slots at the beginning. */ pdata_rcu = kzalloc(struct_size(pdata_rcu, pdata, size_add(rdma_end_port(device), 1)), GFP_KERNEL); if (!pdata_rcu) return -ENOMEM; /* * The rcu_head is put in front of the port data array and the stored * pointer is adjusted since we never need to see that member until * kfree_rcu. */ device->port_data = pdata_rcu->pdata; rdma_for_each_port (device, port) { struct ib_port_data *pdata = &device->port_data[port]; pdata->ib_dev = device; spin_lock_init(&pdata->pkey_list_lock); INIT_LIST_HEAD(&pdata->pkey_list); spin_lock_init(&pdata->netdev_lock); INIT_HLIST_NODE(&pdata->ndev_hash_link); } return 0; } static int verify_immutable(const struct ib_device *dev, u32 port) { return WARN_ON(!rdma_cap_ib_mad(dev, port) && rdma_max_mad_size(dev, port) != 0); } static int setup_port_data(struct ib_device *device) { u32 port; int ret; ret = alloc_port_data(device); if (ret) return ret; rdma_for_each_port (device, port) { struct ib_port_data *pdata = &device->port_data[port]; ret = device->ops.get_port_immutable(device, port, &pdata->immutable); if (ret) return ret; if (verify_immutable(device, port)) return -EINVAL; } return 0; } /** * ib_port_immutable_read() - Read rdma port's immutable data * @dev: IB device * @port: port number whose immutable data to read. It starts with index 1 and * valid upto including rdma_end_port(). */ const struct ib_port_immutable* ib_port_immutable_read(struct ib_device *dev, unsigned int port) { WARN_ON(!rdma_is_port_valid(dev, port)); return &dev->port_data[port].immutable; } EXPORT_SYMBOL(ib_port_immutable_read); void ib_get_device_fw_str(struct ib_device *dev, char *str) { if (dev->ops.get_dev_fw_str) dev->ops.get_dev_fw_str(dev, str); else str[0] = '\0'; } EXPORT_SYMBOL(ib_get_device_fw_str); static void ib_policy_change_task(struct work_struct *work) { struct ib_device *dev; unsigned long index; down_read(&devices_rwsem); xa_for_each_marked (&devices, index, dev, DEVICE_REGISTERED) { unsigned int i; rdma_for_each_port (dev, i) { u64 sp; ib_get_cached_subnet_prefix(dev, i, &sp); ib_security_cache_change(dev, i, sp); } } up_read(&devices_rwsem); } static int ib_security_change(struct notifier_block *nb, unsigned long event, void *lsm_data) { if (event != LSM_POLICY_CHANGE) return NOTIFY_DONE; schedule_work(&ib_policy_change_work); ib_mad_agent_security_change(); return NOTIFY_OK; } static void compatdev_release(struct device *dev) { struct ib_core_device *cdev = container_of(dev, struct ib_core_device, dev); kfree(cdev); } static int add_one_compat_dev(struct ib_device *device, struct rdma_dev_net *rnet) { struct ib_core_device *cdev; int ret; lockdep_assert_held(&rdma_nets_rwsem); if (!ib_devices_shared_netns) return 0; /* * Create and add compat device in all namespaces other than where it * is currently bound to. */ if (net_eq(read_pnet(&rnet->net), read_pnet(&device->coredev.rdma_net))) return 0; /* * The first of init_net() or ib_register_device() to take the * compat_devs_mutex wins and gets to add the device. Others will wait * for completion here. */ mutex_lock(&device->compat_devs_mutex); cdev = xa_load(&device->compat_devs, rnet->id); if (cdev) { ret = 0; goto done; } ret = xa_reserve(&device->compat_devs, rnet->id, GFP_KERNEL); if (ret) goto done; cdev = kzalloc(sizeof(*cdev), GFP_KERNEL); if (!cdev) { ret = -ENOMEM; goto cdev_err; } cdev->dev.parent = device->dev.parent; rdma_init_coredev(cdev, device, read_pnet(&rnet->net)); cdev->dev.release = compatdev_release; ret = dev_set_name(&cdev->dev, "%s", dev_name(&device->dev)); if (ret) goto add_err; ret = device_add(&cdev->dev); if (ret) goto add_err; ret = ib_setup_port_attrs(cdev); if (ret) goto port_err; ret = xa_err(xa_store(&device->compat_devs, rnet->id, cdev, GFP_KERNEL)); if (ret) goto insert_err; mutex_unlock(&device->compat_devs_mutex); return 0; insert_err: ib_free_port_attrs(cdev); port_err: device_del(&cdev->dev); add_err: put_device(&cdev->dev); cdev_err: xa_release(&device->compat_devs, rnet->id); done: mutex_unlock(&device->compat_devs_mutex); return ret; } static void remove_one_compat_dev(struct ib_device *device, u32 id) { struct ib_core_device *cdev; mutex_lock(&device->compat_devs_mutex); cdev = xa_erase(&device->compat_devs, id); mutex_unlock(&device->compat_devs_mutex); if (cdev) { ib_free_port_attrs(cdev); device_del(&cdev->dev); put_device(&cdev->dev); } } static void remove_compat_devs(struct ib_device *device) { struct ib_core_device *cdev; unsigned long index; xa_for_each (&device->compat_devs, index, cdev) remove_one_compat_dev(device, index); } static int add_compat_devs(struct ib_device *device) { struct rdma_dev_net *rnet; unsigned long index; int ret = 0; lockdep_assert_held(&devices_rwsem); down_read(&rdma_nets_rwsem); xa_for_each (&rdma_nets, index, rnet) { ret = add_one_compat_dev(device, rnet); if (ret) break; } up_read(&rdma_nets_rwsem); return ret; } static void remove_all_compat_devs(void) { struct ib_compat_device *cdev; struct ib_device *dev; unsigned long index; down_read(&devices_rwsem); xa_for_each (&devices, index, dev) { unsigned long c_index = 0; /* Hold nets_rwsem so that any other thread modifying this * system param can sync with this thread. */ down_read(&rdma_nets_rwsem); xa_for_each (&dev->compat_devs, c_index, cdev) remove_one_compat_dev(dev, c_index); up_read(&rdma_nets_rwsem); } up_read(&devices_rwsem); } static int add_all_compat_devs(void) { struct rdma_dev_net *rnet; struct ib_device *dev; unsigned long index; int ret = 0; down_read(&devices_rwsem); xa_for_each_marked (&devices, index, dev, DEVICE_REGISTERED) { unsigned long net_index = 0; /* Hold nets_rwsem so that any other thread modifying this * system param can sync with this thread. */ down_read(&rdma_nets_rwsem); xa_for_each (&rdma_nets, net_index, rnet) { ret = add_one_compat_dev(dev, rnet); if (ret) break; } up_read(&rdma_nets_rwsem); } up_read(&devices_rwsem); if (ret) remove_all_compat_devs(); return ret; } int rdma_compatdev_set(u8 enable) { struct rdma_dev_net *rnet; unsigned long index; int ret = 0; down_write(&rdma_nets_rwsem); if (ib_devices_shared_netns == enable) { up_write(&rdma_nets_rwsem); return 0; } /* enable/disable of compat devices is not supported * when more than default init_net exists. */ xa_for_each (&rdma_nets, index, rnet) { ret++; break; } if (!ret) ib_devices_shared_netns = enable; up_write(&rdma_nets_rwsem); if (ret) return -EBUSY; if (enable) ret = add_all_compat_devs(); else remove_all_compat_devs(); return ret; } static void rdma_dev_exit_net(struct net *net) { struct rdma_dev_net *rnet = rdma_net_to_dev_net(net); struct ib_device *dev; unsigned long index; int ret; down_write(&rdma_nets_rwsem); /* * Prevent the ID from being re-used and hide the id from xa_for_each. */ ret = xa_err(xa_store(&rdma_nets, rnet->id, NULL, GFP_KERNEL)); WARN_ON(ret); up_write(&rdma_nets_rwsem); down_read(&devices_rwsem); xa_for_each (&devices, index, dev) { get_device(&dev->dev); /* * Release the devices_rwsem so that pontentially blocking * device_del, doesn't hold the devices_rwsem for too long. */ up_read(&devices_rwsem); remove_one_compat_dev(dev, rnet->id); /* * If the real device is in the NS then move it back to init. */ rdma_dev_change_netns(dev, net, &init_net); put_device(&dev->dev); down_read(&devices_rwsem); } up_read(&devices_rwsem); rdma_nl_net_exit(rnet); xa_erase(&rdma_nets, rnet->id); } static __net_init int rdma_dev_init_net(struct net *net) { struct rdma_dev_net *rnet = rdma_net_to_dev_net(net); unsigned long index; struct ib_device *dev; int ret; write_pnet(&rnet->net, net); ret = rdma_nl_net_init(rnet); if (ret) return ret; /* No need to create any compat devices in default init_net. */ if (net_eq(net, &init_net)) return 0; ret = xa_alloc(&rdma_nets, &rnet->id, rnet, xa_limit_32b, GFP_KERNEL); if (ret) { rdma_nl_net_exit(rnet); return ret; } down_read(&devices_rwsem); xa_for_each_marked (&devices, index, dev, DEVICE_REGISTERED) { /* Hold nets_rwsem so that netlink command cannot change * system configuration for device sharing mode. */ down_read(&rdma_nets_rwsem); ret = add_one_compat_dev(dev, rnet); up_read(&rdma_nets_rwsem); if (ret) break; } up_read(&devices_rwsem); if (ret) rdma_dev_exit_net(net); return ret; } /* * Assign the unique string device name and the unique device index. This is * undone by ib_dealloc_device. */ static int assign_name(struct ib_device *device, const char *name) { static u32 last_id; int ret; down_write(&devices_rwsem); /* Assign a unique name to the device */ if (strchr(name, '%')) ret = alloc_name(device, name); else ret = dev_set_name(&device->dev, name); if (ret) goto out; if (__ib_device_get_by_name(dev_name(&device->dev))) { ret = -ENFILE; goto out; } strscpy(device->name, dev_name(&device->dev), IB_DEVICE_NAME_MAX); ret = xa_alloc_cyclic(&devices, &device->index, device, xa_limit_31b, &last_id, GFP_KERNEL); if (ret > 0) ret = 0; out: up_write(&devices_rwsem); return ret; } /* * setup_device() allocates memory and sets up data that requires calling the * device ops, this is the only reason these actions are not done during * ib_alloc_device. It is undone by ib_dealloc_device(). */ static int setup_device(struct ib_device *device) { struct ib_udata uhw = {.outlen = 0, .inlen = 0}; int ret; ib_device_check_mandatory(device); ret = setup_port_data(device); if (ret) { dev_warn(&device->dev, "Couldn't create per-port data\n"); return ret; } memset(&device->attrs, 0, sizeof(device->attrs)); ret = device->ops.query_device(device, &device->attrs, &uhw); if (ret) { dev_warn(&device->dev, "Couldn't query the device attributes\n"); return ret; } return 0; } static void disable_device(struct ib_device *device) { u32 cid; WARN_ON(!refcount_read(&device->refcount)); down_write(&devices_rwsem); xa_clear_mark(&devices, device->index, DEVICE_REGISTERED); up_write(&devices_rwsem); /* * Remove clients in LIFO order, see assign_client_id. This could be * more efficient if xarray learns to reverse iterate. Since no new * clients can be added to this ib_device past this point we only need * the maximum possible client_id value here. */ down_read(&clients_rwsem); cid = highest_client_id; up_read(&clients_rwsem); while (cid) { cid--; remove_client_context(device, cid); } ib_cq_pool_cleanup(device); /* Pairs with refcount_set in enable_device */ ib_device_put(device); wait_for_completion(&device->unreg_completion); /* * compat devices must be removed after device refcount drops to zero. * Otherwise init_net() may add more compatdevs after removing compat * devices and before device is disabled. */ remove_compat_devs(device); } /* * An enabled device is visible to all clients and to all the public facing * APIs that return a device pointer. This always returns with a new get, even * if it fails. */ static int enable_device_and_get(struct ib_device *device) { struct ib_client *client; unsigned long index; int ret = 0; /* * One ref belongs to the xa and the other belongs to this * thread. This is needed to guard against parallel unregistration. */ refcount_set(&device->refcount, 2); down_write(&devices_rwsem); xa_set_mark(&devices, device->index, DEVICE_REGISTERED); /* * By using downgrade_write() we ensure that no other thread can clear * DEVICE_REGISTERED while we are completing the client setup. */ downgrade_write(&devices_rwsem); if (device->ops.enable_driver) { ret = device->ops.enable_driver(device); if (ret) goto out; } down_read(&clients_rwsem); xa_for_each_marked (&clients, index, client, CLIENT_REGISTERED) { ret = add_client_context(device, client); if (ret) break; } up_read(&clients_rwsem); if (!ret) ret = add_compat_devs(device); out: up_read(&devices_rwsem); return ret; } static void prevent_dealloc_device(struct ib_device *ib_dev) { } static void ib_device_notify_register(struct ib_device *device) { struct net_device *netdev; u32 port; int ret; down_read(&devices_rwsem); /* Mark for userspace that device is ready */ kobject_uevent(&device->dev.kobj, KOBJ_ADD); ret = rdma_nl_notify_event(device, 0, RDMA_REGISTER_EVENT); if (ret) goto out; rdma_for_each_port(device, port) { netdev = ib_device_get_netdev(device, port); if (!netdev) continue; ret = rdma_nl_notify_event(device, port, RDMA_NETDEV_ATTACH_EVENT); dev_put(netdev); if (ret) goto out; } out: up_read(&devices_rwsem); } /** * ib_register_device - Register an IB device with IB core * @device: Device to register * @name: unique string device name. This may include a '%' which will * cause a unique index to be added to the passed device name. * @dma_device: pointer to a DMA-capable device. If %NULL, then the IB * device will be used. In this case the caller should fully * setup the ibdev for DMA. This usually means using dma_virt_ops. * * Low-level drivers use ib_register_device() to register their * devices with the IB core. All registered clients will receive a * callback for each device that is added. @device must be allocated * with ib_alloc_device(). * * If the driver uses ops.dealloc_driver and calls any ib_unregister_device() * asynchronously then the device pointer may become freed as soon as this * function returns. */ int ib_register_device(struct ib_device *device, const char *name, struct device *dma_device) { int ret; ret = assign_name(device, name); if (ret) return ret; /* * If the caller does not provide a DMA capable device then the IB core * will set up ib_sge and scatterlist structures that stash the kernel * virtual address into the address field. */ WARN_ON(dma_device && !dma_device->dma_parms); device->dma_device = dma_device; ret = setup_device(device); if (ret) return ret; ret = ib_cache_setup_one(device); if (ret) { dev_warn(&device->dev, "Couldn't set up InfiniBand P_Key/GID cache\n"); return ret; } device->groups[0] = &ib_dev_attr_group; device->groups[1] = device->ops.device_group; ret = ib_setup_device_attrs(device); if (ret) goto cache_cleanup; ib_device_register_rdmacg(device); rdma_counter_init(device); /* * Ensure that ADD uevent is not fired because it * is too early amd device is not initialized yet. */ dev_set_uevent_suppress(&device->dev, true); ret = device_add(&device->dev); if (ret) goto cg_cleanup; ret = ib_setup_port_attrs(&device->coredev); if (ret) { dev_warn(&device->dev, "Couldn't register device with driver model\n"); goto dev_cleanup; } ret = enable_device_and_get(device); if (ret) { void (*dealloc_fn)(struct ib_device *); /* * If we hit this error flow then we don't want to * automatically dealloc the device since the caller is * expected to call ib_dealloc_device() after * ib_register_device() fails. This is tricky due to the * possibility for a parallel unregistration along with this * error flow. Since we have a refcount here we know any * parallel flow is stopped in disable_device and will see the * special dealloc_driver pointer, causing the responsibility to * ib_dealloc_device() to revert back to this thread. */ dealloc_fn = device->ops.dealloc_driver; device->ops.dealloc_driver = prevent_dealloc_device; ib_device_put(device); __ib_unregister_device(device); device->ops.dealloc_driver = dealloc_fn; dev_set_uevent_suppress(&device->dev, false); return ret; } dev_set_uevent_suppress(&device->dev, false); ib_device_notify_register(device); ib_device_put(device); return 0; dev_cleanup: device_del(&device->dev); cg_cleanup: dev_set_uevent_suppress(&device->dev, false); ib_device_unregister_rdmacg(device); cache_cleanup: ib_cache_cleanup_one(device); return ret; } EXPORT_SYMBOL(ib_register_device); /* Callers must hold a get on the device. */ static void __ib_unregister_device(struct ib_device *ib_dev) { struct ib_device *sub, *tmp; mutex_lock(&ib_dev->subdev_lock); list_for_each_entry_safe_reverse(sub, tmp, &ib_dev->subdev_list_head, subdev_list) { list_del(&sub->subdev_list); ib_dev->ops.del_sub_dev(sub); ib_device_put(ib_dev); } mutex_unlock(&ib_dev->subdev_lock); /* * We have a registration lock so that all the calls to unregister are * fully fenced, once any unregister returns the device is truely * unregistered even if multiple callers are unregistering it at the * same time. This also interacts with the registration flow and * provides sane semantics if register and unregister are racing. */ mutex_lock(&ib_dev->unregistration_lock); if (!refcount_read(&ib_dev->refcount)) goto out; disable_device(ib_dev); rdma_nl_notify_event(ib_dev, 0, RDMA_UNREGISTER_EVENT); /* Expedite removing unregistered pointers from the hash table */ free_netdevs(ib_dev); ib_free_port_attrs(&ib_dev->coredev); device_del(&ib_dev->dev); ib_device_unregister_rdmacg(ib_dev); ib_cache_cleanup_one(ib_dev); /* * Drivers using the new flow may not call ib_dealloc_device except * in error unwind prior to registration success. */ if (ib_dev->ops.dealloc_driver && ib_dev->ops.dealloc_driver != prevent_dealloc_device) { WARN_ON(kref_read(&ib_dev->dev.kobj.kref) <= 1); ib_dealloc_device(ib_dev); } out: mutex_unlock(&ib_dev->unregistration_lock); } /** * ib_unregister_device - Unregister an IB device * @ib_dev: The device to unregister * * Unregister an IB device. All clients will receive a remove callback. * * Callers should call this routine only once, and protect against races with * registration. Typically it should only be called as part of a remove * callback in an implementation of driver core's struct device_driver and * related. * * If ops.dealloc_driver is used then ib_dev will be freed upon return from * this function. */ void ib_unregister_device(struct ib_device *ib_dev) { get_device(&ib_dev->dev); __ib_unregister_device(ib_dev); put_device(&ib_dev->dev); } EXPORT_SYMBOL(ib_unregister_device); /** * ib_unregister_device_and_put - Unregister a device while holding a 'get' * @ib_dev: The device to unregister * * This is the same as ib_unregister_device(), except it includes an internal * ib_device_put() that should match a 'get' obtained by the caller. * * It is safe to call this routine concurrently from multiple threads while * holding the 'get'. When the function returns the device is fully * unregistered. * * Drivers using this flow MUST use the driver_unregister callback to clean up * their resources associated with the device and dealloc it. */ void ib_unregister_device_and_put(struct ib_device *ib_dev) { WARN_ON(!ib_dev->ops.dealloc_driver); get_device(&ib_dev->dev); ib_device_put(ib_dev); __ib_unregister_device(ib_dev); put_device(&ib_dev->dev); } EXPORT_SYMBOL(ib_unregister_device_and_put); /** * ib_unregister_driver - Unregister all IB devices for a driver * @driver_id: The driver to unregister * * This implements a fence for device unregistration. It only returns once all * devices associated with the driver_id have fully completed their * unregistration and returned from ib_unregister_device*(). * * If device's are not yet unregistered it goes ahead and starts unregistering * them. * * This does not block creation of new devices with the given driver_id, that * is the responsibility of the caller. */ void ib_unregister_driver(enum rdma_driver_id driver_id) { struct ib_device *ib_dev; unsigned long index; down_read(&devices_rwsem); xa_for_each (&devices, index, ib_dev) { if (ib_dev->ops.driver_id != driver_id) continue; get_device(&ib_dev->dev); up_read(&devices_rwsem); WARN_ON(!ib_dev->ops.dealloc_driver); __ib_unregister_device(ib_dev); put_device(&ib_dev->dev); down_read(&devices_rwsem); } up_read(&devices_rwsem); } EXPORT_SYMBOL(ib_unregister_driver); static void ib_unregister_work(struct work_struct *work) { struct ib_device *ib_dev = container_of(work, struct ib_device, unregistration_work); __ib_unregister_device(ib_dev); put_device(&ib_dev->dev); } /** * ib_unregister_device_queued - Unregister a device using a work queue * @ib_dev: The device to unregister * * This schedules an asynchronous unregistration using a WQ for the device. A * driver should use this to avoid holding locks while doing unregistration, * such as holding the RTNL lock. * * Drivers using this API must use ib_unregister_driver before module unload * to ensure that all scheduled unregistrations have completed. */ void ib_unregister_device_queued(struct ib_device *ib_dev) { WARN_ON(!refcount_read(&ib_dev->refcount)); WARN_ON(!ib_dev->ops.dealloc_driver); get_device(&ib_dev->dev); if (!queue_work(ib_unreg_wq, &ib_dev->unregistration_work)) put_device(&ib_dev->dev); } EXPORT_SYMBOL(ib_unregister_device_queued); /* * The caller must pass in a device that has the kref held and the refcount * released. If the device is in cur_net and still registered then it is moved * into net. */ static int rdma_dev_change_netns(struct ib_device *device, struct net *cur_net, struct net *net) { int ret2 = -EINVAL; int ret; mutex_lock(&device->unregistration_lock); /* * If a device not under ib_device_get() or if the unregistration_lock * is not held, the namespace can be changed, or it can be unregistered. * Check again under the lock. */ if (refcount_read(&device->refcount) == 0 || !net_eq(cur_net, read_pnet(&device->coredev.rdma_net))) { ret = -ENODEV; goto out; } kobject_uevent(&device->dev.kobj, KOBJ_REMOVE); disable_device(device); /* * At this point no one can be using the device, so it is safe to * change the namespace. */ write_pnet(&device->coredev.rdma_net, net); down_read(&devices_rwsem); /* * Currently rdma devices are system wide unique. So the device name * is guaranteed free in the new namespace. Publish the new namespace * at the sysfs level. */ ret = device_rename(&device->dev, dev_name(&device->dev)); up_read(&devices_rwsem); if (ret) { dev_warn(&device->dev, "%s: Couldn't rename device after namespace change\n", __func__); /* Try and put things back and re-enable the device */ write_pnet(&device->coredev.rdma_net, cur_net); } ret2 = enable_device_and_get(device); if (ret2) { /* * This shouldn't really happen, but if it does, let the user * retry at later point. So don't disable the device. */ dev_warn(&device->dev, "%s: Couldn't re-enable device after namespace change\n", __func__); } kobject_uevent(&device->dev.kobj, KOBJ_ADD); ib_device_put(device); out: mutex_unlock(&device->unregistration_lock); if (ret) return ret; return ret2; } int ib_device_set_netns_put(struct sk_buff *skb, struct ib_device *dev, u32 ns_fd) { struct net *net; int ret; net = get_net_ns_by_fd(ns_fd); if (IS_ERR(net)) { ret = PTR_ERR(net); goto net_err; } if (!netlink_ns_capable(skb, net->user_ns, CAP_NET_ADMIN)) { ret = -EPERM; goto ns_err; } /* * All the ib_clients, including uverbs, are reset when the namespace is * changed and this cannot be blocked waiting for userspace to do * something, so disassociation is mandatory. */ if (!dev->ops.disassociate_ucontext || ib_devices_shared_netns) { ret = -EOPNOTSUPP; goto ns_err; } get_device(&dev->dev); ib_device_put(dev); ret = rdma_dev_change_netns(dev, current->nsproxy->net_ns, net); put_device(&dev->dev); put_net(net); return ret; ns_err: put_net(net); net_err: ib_device_put(dev); return ret; } static struct pernet_operations rdma_dev_net_ops = { .init = rdma_dev_init_net, .exit = rdma_dev_exit_net, .id = &rdma_dev_net_id, .size = sizeof(struct rdma_dev_net), }; static int assign_client_id(struct ib_client *client) { int ret; lockdep_assert_held(&clients_rwsem); /* * The add/remove callbacks must be called in FIFO/LIFO order. To * achieve this we assign client_ids so they are sorted in * registration order. */ client->client_id = highest_client_id; ret = xa_insert(&clients, client->client_id, client, GFP_KERNEL); if (ret) return ret; highest_client_id++; xa_set_mark(&clients, client->client_id, CLIENT_REGISTERED); return 0; } static void remove_client_id(struct ib_client *client) { down_write(&clients_rwsem); xa_erase(&clients, client->client_id); for (; highest_client_id; highest_client_id--) if (xa_load(&clients, highest_client_id - 1)) break; up_write(&clients_rwsem); } /** * ib_register_client - Register an IB client * @client:Client to register * * Upper level users of the IB drivers can use ib_register_client() to * register callbacks for IB device addition and removal. When an IB * device is added, each registered client's add method will be called * (in the order the clients were registered), and when a device is * removed, each client's remove method will be called (in the reverse * order that clients were registered). In addition, when * ib_register_client() is called, the client will receive an add * callback for all devices already registered. */ int ib_register_client(struct ib_client *client) { struct ib_device *device; unsigned long index; bool need_unreg = false; int ret; refcount_set(&client->uses, 1); init_completion(&client->uses_zero); /* * The devices_rwsem is held in write mode to ensure that a racing * ib_register_device() sees a consisent view of clients and devices. */ down_write(&devices_rwsem); down_write(&clients_rwsem); ret = assign_client_id(client); if (ret) goto out; need_unreg = true; xa_for_each_marked (&devices, index, device, DEVICE_REGISTERED) { ret = add_client_context(device, client); if (ret) goto out; } ret = 0; out: up_write(&clients_rwsem); up_write(&devices_rwsem); if (need_unreg && ret) ib_unregister_client(client); return ret; } EXPORT_SYMBOL(ib_register_client); /** * ib_unregister_client - Unregister an IB client * @client:Client to unregister * * Upper level users use ib_unregister_client() to remove their client * registration. When ib_unregister_client() is called, the client * will receive a remove callback for each IB device still registered. * * This is a full fence, once it returns no client callbacks will be called, * or are running in another thread. */ void ib_unregister_client(struct ib_client *client) { struct ib_device *device; unsigned long index; down_write(&clients_rwsem); ib_client_put(client); xa_clear_mark(&clients, client->client_id, CLIENT_REGISTERED); up_write(&clients_rwsem); /* We do not want to have locks while calling client->remove() */ rcu_read_lock(); xa_for_each (&devices, index, device) { if (!ib_device_try_get(device)) continue; rcu_read_unlock(); remove_client_context(device, client->client_id); ib_device_put(device); rcu_read_lock(); } rcu_read_unlock(); /* * remove_client_context() is not a fence, it can return even though a * removal is ongoing. Wait until all removals are completed. */ wait_for_completion(&client->uses_zero); remove_client_id(client); } EXPORT_SYMBOL(ib_unregister_client); static int __ib_get_global_client_nl_info(const char *client_name, struct ib_client_nl_info *res) { struct ib_client *client; unsigned long index; int ret = -ENOENT; down_read(&clients_rwsem); xa_for_each_marked (&clients, index, client, CLIENT_REGISTERED) { if (strcmp(client->name, client_name) != 0) continue; if (!client->get_global_nl_info) { ret = -EOPNOTSUPP; break; } ret = client->get_global_nl_info(res); if (WARN_ON(ret == -ENOENT)) ret = -EINVAL; if (!ret && res->cdev) get_device(res->cdev); break; } up_read(&clients_rwsem); return ret; } static int __ib_get_client_nl_info(struct ib_device *ibdev, const char *client_name, struct ib_client_nl_info *res) { unsigned long index; void *client_data; int ret = -ENOENT; down_read(&ibdev->client_data_rwsem); xan_for_each_marked (&ibdev->client_data, index, client_data, CLIENT_DATA_REGISTERED) { struct ib_client *client = xa_load(&clients, index); if (!client || strcmp(client->name, client_name) != 0) continue; if (!client->get_nl_info) { ret = -EOPNOTSUPP; break; } ret = client->get_nl_info(ibdev, client_data, res); if (WARN_ON(ret == -ENOENT)) ret = -EINVAL; /* * The cdev is guaranteed valid as long as we are inside the * client_data_rwsem as remove_one can't be called. Keep it * valid for the caller. */ if (!ret && res->cdev) get_device(res->cdev); break; } up_read(&ibdev->client_data_rwsem); return ret; } /** * ib_get_client_nl_info - Fetch the nl_info from a client * @ibdev: IB device * @client_name: Name of the client * @res: Result of the query */ int ib_get_client_nl_info(struct ib_device *ibdev, const char *client_name, struct ib_client_nl_info *res) { int ret; if (ibdev) ret = __ib_get_client_nl_info(ibdev, client_name, res); else ret = __ib_get_global_client_nl_info(client_name, res); #ifdef CONFIG_MODULES if (ret == -ENOENT) { request_module("rdma-client-%s", client_name); if (ibdev) ret = __ib_get_client_nl_info(ibdev, client_name, res); else ret = __ib_get_global_client_nl_info(client_name, res); } #endif if (ret) { if (ret == -ENOENT) return -EOPNOTSUPP; return ret; } if (WARN_ON(!res->cdev)) return -EINVAL; return 0; } /** * ib_set_client_data - Set IB client context * @device:Device to set context for * @client:Client to set context for * @data:Context to set * * ib_set_client_data() sets client context data that can be retrieved with * ib_get_client_data(). This can only be called while the client is * registered to the device, once the ib_client remove() callback returns this * cannot be called. */ void ib_set_client_data(struct ib_device *device, struct ib_client *client, void *data) { void *rc; if (WARN_ON(IS_ERR(data))) data = NULL; rc = xa_store(&device->client_data, client->client_id, data, GFP_KERNEL); WARN_ON(xa_is_err(rc)); } EXPORT_SYMBOL(ib_set_client_data); /** * ib_register_event_handler - Register an IB event handler * @event_handler:Handler to register * * ib_register_event_handler() registers an event handler that will be * called back when asynchronous IB events occur (as defined in * chapter 11 of the InfiniBand Architecture Specification). This * callback occurs in workqueue context. */ void ib_register_event_handler(struct ib_event_handler *event_handler) { down_write(&event_handler->device->event_handler_rwsem); list_add_tail(&event_handler->list, &event_handler->device->event_handler_list); up_write(&event_handler->device->event_handler_rwsem); } EXPORT_SYMBOL(ib_register_event_handler); /** * ib_unregister_event_handler - Unregister an event handler * @event_handler:Handler to unregister * * Unregister an event handler registered with * ib_register_event_handler(). */ void ib_unregister_event_handler(struct ib_event_handler *event_handler) { down_write(&event_handler->device->event_handler_rwsem); list_del(&event_handler->list); up_write(&event_handler->device->event_handler_rwsem); } EXPORT_SYMBOL(ib_unregister_event_handler); void ib_dispatch_event_clients(struct ib_event *event) { struct ib_event_handler *handler; down_read(&event->device->event_handler_rwsem); list_for_each_entry(handler, &event->device->event_handler_list, list) handler->handler(handler, event); up_read(&event->device->event_handler_rwsem); } static int iw_query_port(struct ib_device *device, u32 port_num, struct ib_port_attr *port_attr) { struct in_device *inetdev; struct net_device *netdev; memset(port_attr, 0, sizeof(*port_attr)); netdev = ib_device_get_netdev(device, port_num); if (!netdev) return -ENODEV; port_attr->max_mtu = IB_MTU_4096; port_attr->active_mtu = ib_mtu_int_to_enum(netdev->mtu); if (!netif_carrier_ok(netdev)) { port_attr->state = IB_PORT_DOWN; port_attr->phys_state = IB_PORT_PHYS_STATE_DISABLED; } else { rcu_read_lock(); inetdev = __in_dev_get_rcu(netdev); if (inetdev && inetdev->ifa_list) { port_attr->state = IB_PORT_ACTIVE; port_attr->phys_state = IB_PORT_PHYS_STATE_LINK_UP; } else { port_attr->state = IB_PORT_INIT; port_attr->phys_state = IB_PORT_PHYS_STATE_PORT_CONFIGURATION_TRAINING; } rcu_read_unlock(); } dev_put(netdev); return device->ops.query_port(device, port_num, port_attr); } static int __ib_query_port(struct ib_device *device, u32 port_num, struct ib_port_attr *port_attr) { int err; memset(port_attr, 0, sizeof(*port_attr)); err = device->ops.query_port(device, port_num, port_attr); if (err || port_attr->subnet_prefix) return err; if (rdma_port_get_link_layer(device, port_num) != IB_LINK_LAYER_INFINIBAND) return 0; ib_get_cached_subnet_prefix(device, port_num, &port_attr->subnet_prefix); return 0; } /** * ib_query_port - Query IB port attributes * @device:Device to query * @port_num:Port number to query * @port_attr:Port attributes * * ib_query_port() returns the attributes of a port through the * @port_attr pointer. */ int ib_query_port(struct ib_device *device, u32 port_num, struct ib_port_attr *port_attr) { if (!rdma_is_port_valid(device, port_num)) return -EINVAL; if (rdma_protocol_iwarp(device, port_num)) return iw_query_port(device, port_num, port_attr); else return __ib_query_port(device, port_num, port_attr); } EXPORT_SYMBOL(ib_query_port); static void add_ndev_hash(struct ib_port_data *pdata) { unsigned long flags; might_sleep(); spin_lock_irqsave(&ndev_hash_lock, flags); if (hash_hashed(&pdata->ndev_hash_link)) { hash_del_rcu(&pdata->ndev_hash_link); spin_unlock_irqrestore(&ndev_hash_lock, flags); /* * We cannot do hash_add_rcu after a hash_del_rcu until the * grace period */ synchronize_rcu(); spin_lock_irqsave(&ndev_hash_lock, flags); } if (pdata->netdev) hash_add_rcu(ndev_hash, &pdata->ndev_hash_link, (uintptr_t)pdata->netdev); spin_unlock_irqrestore(&ndev_hash_lock, flags); } /** * ib_device_set_netdev - Associate the ib_dev with an underlying net_device * @ib_dev: Device to modify * @ndev: net_device to affiliate, may be NULL * @port: IB port the net_device is connected to * * Drivers should use this to link the ib_device to a netdev so the netdev * shows up in interfaces like ib_enum_roce_netdev. Only one netdev may be * affiliated with any port. * * The caller must ensure that the given ndev is not unregistered or * unregistering, and that either the ib_device is unregistered or * ib_device_set_netdev() is called with NULL when the ndev sends a * NETDEV_UNREGISTER event. */ int ib_device_set_netdev(struct ib_device *ib_dev, struct net_device *ndev, u32 port) { enum rdma_nl_notify_event_type etype; struct net_device *old_ndev; struct ib_port_data *pdata; unsigned long flags; int ret; if (!rdma_is_port_valid(ib_dev, port)) return -EINVAL; /* * Drivers wish to call this before ib_register_driver, so we have to * setup the port data early. */ ret = alloc_port_data(ib_dev); if (ret) return ret; pdata = &ib_dev->port_data[port]; spin_lock_irqsave(&pdata->netdev_lock, flags); old_ndev = rcu_dereference_protected( pdata->netdev, lockdep_is_held(&pdata->netdev_lock)); if (old_ndev == ndev) { spin_unlock_irqrestore(&pdata->netdev_lock, flags); return 0; } rcu_assign_pointer(pdata->netdev, ndev); netdev_put(old_ndev, &pdata->netdev_tracker); netdev_hold(ndev, &pdata->netdev_tracker, GFP_ATOMIC); spin_unlock_irqrestore(&pdata->netdev_lock, flags); add_ndev_hash(pdata); /* Make sure that the device is registered before we send events */ if (xa_load(&devices, ib_dev->index) != ib_dev) return 0; etype = ndev ? RDMA_NETDEV_ATTACH_EVENT : RDMA_NETDEV_DETACH_EVENT; rdma_nl_notify_event(ib_dev, port, etype); return 0; } EXPORT_SYMBOL(ib_device_set_netdev); static void free_netdevs(struct ib_device *ib_dev) { unsigned long flags; u32 port; if (!ib_dev->port_data) return; rdma_for_each_port (ib_dev, port) { struct ib_port_data *pdata = &ib_dev->port_data[port]; struct net_device *ndev; spin_lock_irqsave(&pdata->netdev_lock, flags); ndev = rcu_dereference_protected( pdata->netdev, lockdep_is_held(&pdata->netdev_lock)); if (ndev) { spin_lock(&ndev_hash_lock); hash_del_rcu(&pdata->ndev_hash_link); spin_unlock(&ndev_hash_lock); /* * If this is the last dev_put there is still a * synchronize_rcu before the netdev is kfreed, so we * can continue to rely on unlocked pointer * comparisons after the put */ rcu_assign_pointer(pdata->netdev, NULL); netdev_put(ndev, &pdata->netdev_tracker); } spin_unlock_irqrestore(&pdata->netdev_lock, flags); } } struct net_device *ib_device_get_netdev(struct ib_device *ib_dev, u32 port) { struct ib_port_data *pdata; struct net_device *res; if (!rdma_is_port_valid(ib_dev, port)) return NULL; if (!ib_dev->port_data) return NULL; pdata = &ib_dev->port_data[port]; /* * New drivers should use ib_device_set_netdev() not the legacy * get_netdev(). */ if (ib_dev->ops.get_netdev) res = ib_dev->ops.get_netdev(ib_dev, port); else { spin_lock(&pdata->netdev_lock); res = rcu_dereference_protected( pdata->netdev, lockdep_is_held(&pdata->netdev_lock)); dev_hold(res); spin_unlock(&pdata->netdev_lock); } return res; } EXPORT_SYMBOL(ib_device_get_netdev); /** * ib_query_netdev_port - Query the port number of a net_device * associated with an ibdev * @ibdev: IB device * @ndev: Network device * @port: IB port the net_device is connected to */ int ib_query_netdev_port(struct ib_device *ibdev, struct net_device *ndev, u32 *port) { struct net_device *ib_ndev; u32 port_num; rdma_for_each_port(ibdev, port_num) { ib_ndev = ib_device_get_netdev(ibdev, port_num); if (ndev == ib_ndev) { *port = port_num; dev_put(ib_ndev); return 0; } dev_put(ib_ndev); } return -ENOENT; } EXPORT_SYMBOL(ib_query_netdev_port); /** * ib_device_get_by_netdev - Find an IB device associated with a netdev * @ndev: netdev to locate * @driver_id: The driver ID that must match (RDMA_DRIVER_UNKNOWN matches all) * * Find and hold an ib_device that is associated with a netdev via * ib_device_set_netdev(). The caller must call ib_device_put() on the * returned pointer. */ struct ib_device *ib_device_get_by_netdev(struct net_device *ndev, enum rdma_driver_id driver_id) { struct ib_device *res = NULL; struct ib_port_data *cur; rcu_read_lock(); hash_for_each_possible_rcu (ndev_hash, cur, ndev_hash_link, (uintptr_t)ndev) { if (rcu_access_pointer(cur->netdev) == ndev && (driver_id == RDMA_DRIVER_UNKNOWN || cur->ib_dev->ops.driver_id == driver_id) && ib_device_try_get(cur->ib_dev)) { res = cur->ib_dev; break; } } rcu_read_unlock(); return res; } EXPORT_SYMBOL(ib_device_get_by_netdev); /** * ib_enum_roce_netdev - enumerate all RoCE ports * @ib_dev : IB device we want to query * @filter: Should we call the callback? * @filter_cookie: Cookie passed to filter * @cb: Callback to call for each found RoCE ports * @cookie: Cookie passed back to the callback * * Enumerates all of the physical RoCE ports of ib_dev * which are related to netdevice and calls callback() on each * device for which filter() function returns non zero. */ void ib_enum_roce_netdev(struct ib_device *ib_dev, roce_netdev_filter filter, void *filter_cookie, roce_netdev_callback cb, void *cookie) { u32 port; rdma_for_each_port (ib_dev, port) if (rdma_protocol_roce(ib_dev, port)) { struct net_device *idev = ib_device_get_netdev(ib_dev, port); if (filter(ib_dev, port, idev, filter_cookie)) cb(ib_dev, port, idev, cookie); dev_put(idev); } } /** * ib_enum_all_roce_netdevs - enumerate all RoCE devices * @filter: Should we call the callback? * @filter_cookie: Cookie passed to filter * @cb: Callback to call for each found RoCE ports * @cookie: Cookie passed back to the callback * * Enumerates all RoCE devices' physical ports which are related * to netdevices and calls callback() on each device for which * filter() function returns non zero. */ void ib_enum_all_roce_netdevs(roce_netdev_filter filter, void *filter_cookie, roce_netdev_callback cb, void *cookie) { struct ib_device *dev; unsigned long index; down_read(&devices_rwsem); xa_for_each_marked (&devices, index, dev, DEVICE_REGISTERED) ib_enum_roce_netdev(dev, filter, filter_cookie, cb, cookie); up_read(&devices_rwsem); } /* * ib_enum_all_devs - enumerate all ib_devices * @cb: Callback to call for each found ib_device * * Enumerates all ib_devices and calls callback() on each device. */ int ib_enum_all_devs(nldev_callback nldev_cb, struct sk_buff *skb, struct netlink_callback *cb) { unsigned long index; struct ib_device *dev; unsigned int idx = 0; int ret = 0; down_read(&devices_rwsem); xa_for_each_marked (&devices, index, dev, DEVICE_REGISTERED) { if (!rdma_dev_access_netns(dev, sock_net(skb->sk))) continue; ret = nldev_cb(dev, skb, cb, idx); if (ret) break; idx++; } up_read(&devices_rwsem); return ret; } /** * ib_query_pkey - Get P_Key table entry * @device:Device to query * @port_num:Port number to query * @index:P_Key table index to query * @pkey:Returned P_Key * * ib_query_pkey() fetches the specified P_Key table entry. */ int ib_query_pkey(struct ib_device *device, u32 port_num, u16 index, u16 *pkey) { if (!rdma_is_port_valid(device, port_num)) return -EINVAL; if (!device->ops.query_pkey) return -EOPNOTSUPP; return device->ops.query_pkey(device, port_num, index, pkey); } EXPORT_SYMBOL(ib_query_pkey); /** * ib_modify_device - Change IB device attributes * @device:Device to modify * @device_modify_mask:Mask of attributes to change * @device_modify:New attribute values * * ib_modify_device() changes a device's attributes as specified by * the @device_modify_mask and @device_modify structure. */ int ib_modify_device(struct ib_device *device, int device_modify_mask, struct ib_device_modify *device_modify) { if (!device->ops.modify_device) return -EOPNOTSUPP; return device->ops.modify_device(device, device_modify_mask, device_modify); } EXPORT_SYMBOL(ib_modify_device); /** * ib_modify_port - Modifies the attributes for the specified port. * @device: The device to modify. * @port_num: The number of the port to modify. * @port_modify_mask: Mask used to specify which attributes of the port * to change. * @port_modify: New attribute values for the port. * * ib_modify_port() changes a port's attributes as specified by the * @port_modify_mask and @port_modify structure. */ int ib_modify_port(struct ib_device *device, u32 port_num, int port_modify_mask, struct ib_port_modify *port_modify) { int rc; if (!rdma_is_port_valid(device, port_num)) return -EINVAL; if (device->ops.modify_port) rc = device->ops.modify_port(device, port_num, port_modify_mask, port_modify); else if (rdma_protocol_roce(device, port_num) && ((port_modify->set_port_cap_mask & ~IB_PORT_CM_SUP) == 0 || (port_modify->clr_port_cap_mask & ~IB_PORT_CM_SUP) == 0)) rc = 0; else rc = -EOPNOTSUPP; return rc; } EXPORT_SYMBOL(ib_modify_port); /** * ib_find_gid - Returns the port number and GID table index where * a specified GID value occurs. Its searches only for IB link layer. * @device: The device to query. * @gid: The GID value to search for. * @port_num: The port number of the device where the GID value was found. * @index: The index into the GID table where the GID was found. This * parameter may be NULL. */ int ib_find_gid(struct ib_device *device, union ib_gid *gid, u32 *port_num, u16 *index) { union ib_gid tmp_gid; u32 port; int ret, i; rdma_for_each_port (device, port) { if (!rdma_protocol_ib(device, port)) continue; for (i = 0; i < device->port_data[port].immutable.gid_tbl_len; ++i) { ret = rdma_query_gid(device, port, i, &tmp_gid); if (ret) continue; if (!memcmp(&tmp_gid, gid, sizeof *gid)) { *port_num = port; if (index) *index = i; return 0; } } } return -ENOENT; } EXPORT_SYMBOL(ib_find_gid); /** * ib_find_pkey - Returns the PKey table index where a specified * PKey value occurs. * @device: The device to query. * @port_num: The port number of the device to search for the PKey. * @pkey: The PKey value to search for. * @index: The index into the PKey table where the PKey was found. */ int ib_find_pkey(struct ib_device *device, u32 port_num, u16 pkey, u16 *index) { int ret, i; u16 tmp_pkey; int partial_ix = -1; for (i = 0; i < device->port_data[port_num].immutable.pkey_tbl_len; ++i) { ret = ib_query_pkey(device, port_num, i, &tmp_pkey); if (ret) return ret; if ((pkey & 0x7fff) == (tmp_pkey & 0x7fff)) { /* if there is full-member pkey take it.*/ if (tmp_pkey & 0x8000) { *index = i; return 0; } if (partial_ix < 0) partial_ix = i; } } /*no full-member, if exists take the limited*/ if (partial_ix >= 0) { *index = partial_ix; return 0; } return -ENOENT; } EXPORT_SYMBOL(ib_find_pkey); /** * ib_get_net_dev_by_params() - Return the appropriate net_dev * for a received CM request * @dev: An RDMA device on which the request has been received. * @port: Port number on the RDMA device. * @pkey: The Pkey the request came on. * @gid: A GID that the net_dev uses to communicate. * @addr: Contains the IP address that the request specified as its * destination. * */ struct net_device *ib_get_net_dev_by_params(struct ib_device *dev, u32 port, u16 pkey, const union ib_gid *gid, const struct sockaddr *addr) { struct net_device *net_dev = NULL; unsigned long index; void *client_data; if (!rdma_protocol_ib(dev, port)) return NULL; /* * Holding the read side guarantees that the client will not become * unregistered while we are calling get_net_dev_by_params() */ down_read(&dev->client_data_rwsem); xan_for_each_marked (&dev->client_data, index, client_data, CLIENT_DATA_REGISTERED) { struct ib_client *client = xa_load(&clients, index); if (!client || !client->get_net_dev_by_params) continue; net_dev = client->get_net_dev_by_params(dev, port, pkey, gid, addr, client_data); if (net_dev) break; } up_read(&dev->client_data_rwsem); return net_dev; } EXPORT_SYMBOL(ib_get_net_dev_by_params); void ib_set_device_ops(struct ib_device *dev, const struct ib_device_ops *ops) { struct ib_device_ops *dev_ops = &dev->ops; #define SET_DEVICE_OP(ptr, name) \ do { \ if (ops->name) \ if (!((ptr)->name)) \ (ptr)->name = ops->name; \ } while (0) #define SET_OBJ_SIZE(ptr, name) SET_DEVICE_OP(ptr, size_##name) if (ops->driver_id != RDMA_DRIVER_UNKNOWN) { WARN_ON(dev_ops->driver_id != RDMA_DRIVER_UNKNOWN && dev_ops->driver_id != ops->driver_id); dev_ops->driver_id = ops->driver_id; } if (ops->owner) { WARN_ON(dev_ops->owner && dev_ops->owner != ops->owner); dev_ops->owner = ops->owner; } if (ops->uverbs_abi_ver) dev_ops->uverbs_abi_ver = ops->uverbs_abi_ver; dev_ops->uverbs_no_driver_id_binding |= ops->uverbs_no_driver_id_binding; SET_DEVICE_OP(dev_ops, add_gid); SET_DEVICE_OP(dev_ops, add_sub_dev); SET_DEVICE_OP(dev_ops, advise_mr); SET_DEVICE_OP(dev_ops, alloc_dm); SET_DEVICE_OP(dev_ops, alloc_hw_device_stats); SET_DEVICE_OP(dev_ops, alloc_hw_port_stats); SET_DEVICE_OP(dev_ops, alloc_mr); SET_DEVICE_OP(dev_ops, alloc_mr_integrity); SET_DEVICE_OP(dev_ops, alloc_mw); SET_DEVICE_OP(dev_ops, alloc_pd); SET_DEVICE_OP(dev_ops, alloc_rdma_netdev); SET_DEVICE_OP(dev_ops, alloc_ucontext); SET_DEVICE_OP(dev_ops, alloc_xrcd); SET_DEVICE_OP(dev_ops, attach_mcast); SET_DEVICE_OP(dev_ops, check_mr_status); SET_DEVICE_OP(dev_ops, counter_alloc_stats); SET_DEVICE_OP(dev_ops, counter_bind_qp); SET_DEVICE_OP(dev_ops, counter_dealloc); SET_DEVICE_OP(dev_ops, counter_init); SET_DEVICE_OP(dev_ops, counter_unbind_qp); SET_DEVICE_OP(dev_ops, counter_update_stats); SET_DEVICE_OP(dev_ops, create_ah); SET_DEVICE_OP(dev_ops, create_counters); SET_DEVICE_OP(dev_ops, create_cq); SET_DEVICE_OP(dev_ops, create_flow); SET_DEVICE_OP(dev_ops, create_qp); SET_DEVICE_OP(dev_ops, create_rwq_ind_table); SET_DEVICE_OP(dev_ops, create_srq); SET_DEVICE_OP(dev_ops, create_user_ah); SET_DEVICE_OP(dev_ops, create_wq); SET_DEVICE_OP(dev_ops, dealloc_dm); SET_DEVICE_OP(dev_ops, dealloc_driver); SET_DEVICE_OP(dev_ops, dealloc_mw); SET_DEVICE_OP(dev_ops, dealloc_pd); SET_DEVICE_OP(dev_ops, dealloc_ucontext); SET_DEVICE_OP(dev_ops, dealloc_xrcd); SET_DEVICE_OP(dev_ops, del_gid); SET_DEVICE_OP(dev_ops, del_sub_dev); SET_DEVICE_OP(dev_ops, dereg_mr); SET_DEVICE_OP(dev_ops, destroy_ah); SET_DEVICE_OP(dev_ops, destroy_counters); SET_DEVICE_OP(dev_ops, destroy_cq); SET_DEVICE_OP(dev_ops, destroy_flow); SET_DEVICE_OP(dev_ops, destroy_flow_action); SET_DEVICE_OP(dev_ops, destroy_qp); SET_DEVICE_OP(dev_ops, destroy_rwq_ind_table); SET_DEVICE_OP(dev_ops, destroy_srq); SET_DEVICE_OP(dev_ops, destroy_wq); SET_DEVICE_OP(dev_ops, device_group); SET_DEVICE_OP(dev_ops, detach_mcast); SET_DEVICE_OP(dev_ops, disassociate_ucontext); SET_DEVICE_OP(dev_ops, drain_rq); SET_DEVICE_OP(dev_ops, drain_sq); SET_DEVICE_OP(dev_ops, enable_driver); SET_DEVICE_OP(dev_ops, fill_res_cm_id_entry); SET_DEVICE_OP(dev_ops, fill_res_cq_entry); SET_DEVICE_OP(dev_ops, fill_res_cq_entry_raw); SET_DEVICE_OP(dev_ops, fill_res_mr_entry); SET_DEVICE_OP(dev_ops, fill_res_mr_entry_raw); SET_DEVICE_OP(dev_ops, fill_res_qp_entry); SET_DEVICE_OP(dev_ops, fill_res_qp_entry_raw); SET_DEVICE_OP(dev_ops, fill_res_srq_entry); SET_DEVICE_OP(dev_ops, fill_res_srq_entry_raw); SET_DEVICE_OP(dev_ops, fill_stat_mr_entry); SET_DEVICE_OP(dev_ops, get_dev_fw_str); SET_DEVICE_OP(dev_ops, get_dma_mr); SET_DEVICE_OP(dev_ops, get_hw_stats); SET_DEVICE_OP(dev_ops, get_link_layer); SET_DEVICE_OP(dev_ops, get_netdev); SET_DEVICE_OP(dev_ops, get_numa_node); SET_DEVICE_OP(dev_ops, get_port_immutable); SET_DEVICE_OP(dev_ops, get_vector_affinity); SET_DEVICE_OP(dev_ops, get_vf_config); SET_DEVICE_OP(dev_ops, get_vf_guid); SET_DEVICE_OP(dev_ops, get_vf_stats); SET_DEVICE_OP(dev_ops, iw_accept); SET_DEVICE_OP(dev_ops, iw_add_ref); SET_DEVICE_OP(dev_ops, iw_connect); SET_DEVICE_OP(dev_ops, iw_create_listen); SET_DEVICE_OP(dev_ops, iw_destroy_listen); SET_DEVICE_OP(dev_ops, iw_get_qp); SET_DEVICE_OP(dev_ops, iw_reject); SET_DEVICE_OP(dev_ops, iw_rem_ref); SET_DEVICE_OP(dev_ops, map_mr_sg); SET_DEVICE_OP(dev_ops, map_mr_sg_pi); SET_DEVICE_OP(dev_ops, mmap); SET_DEVICE_OP(dev_ops, mmap_free); SET_DEVICE_OP(dev_ops, modify_ah); SET_DEVICE_OP(dev_ops, modify_cq); SET_DEVICE_OP(dev_ops, modify_device); SET_DEVICE_OP(dev_ops, modify_hw_stat); SET_DEVICE_OP(dev_ops, modify_port); SET_DEVICE_OP(dev_ops, modify_qp); SET_DEVICE_OP(dev_ops, modify_srq); SET_DEVICE_OP(dev_ops, modify_wq); SET_DEVICE_OP(dev_ops, peek_cq); SET_DEVICE_OP(dev_ops, poll_cq); SET_DEVICE_OP(dev_ops, port_groups); SET_DEVICE_OP(dev_ops, post_recv); SET_DEVICE_OP(dev_ops, post_send); SET_DEVICE_OP(dev_ops, post_srq_recv); SET_DEVICE_OP(dev_ops, process_mad); SET_DEVICE_OP(dev_ops, query_ah); SET_DEVICE_OP(dev_ops, query_device); SET_DEVICE_OP(dev_ops, query_gid); SET_DEVICE_OP(dev_ops, query_pkey); SET_DEVICE_OP(dev_ops, query_port); SET_DEVICE_OP(dev_ops, query_qp); SET_DEVICE_OP(dev_ops, query_srq); SET_DEVICE_OP(dev_ops, query_ucontext); SET_DEVICE_OP(dev_ops, rdma_netdev_get_params); SET_DEVICE_OP(dev_ops, read_counters); SET_DEVICE_OP(dev_ops, reg_dm_mr); SET_DEVICE_OP(dev_ops, reg_user_mr); SET_DEVICE_OP(dev_ops, reg_user_mr_dmabuf); SET_DEVICE_OP(dev_ops, req_notify_cq); SET_DEVICE_OP(dev_ops, rereg_user_mr); SET_DEVICE_OP(dev_ops, resize_cq); SET_DEVICE_OP(dev_ops, set_vf_guid); SET_DEVICE_OP(dev_ops, set_vf_link_state); SET_DEVICE_OP(dev_ops, ufile_hw_cleanup); SET_DEVICE_OP(dev_ops, report_port_event); SET_OBJ_SIZE(dev_ops, ib_ah); SET_OBJ_SIZE(dev_ops, ib_counters); SET_OBJ_SIZE(dev_ops, ib_cq); SET_OBJ_SIZE(dev_ops, ib_mw); SET_OBJ_SIZE(dev_ops, ib_pd); SET_OBJ_SIZE(dev_ops, ib_qp); SET_OBJ_SIZE(dev_ops, ib_rwq_ind_table); SET_OBJ_SIZE(dev_ops, ib_srq); SET_OBJ_SIZE(dev_ops, ib_ucontext); SET_OBJ_SIZE(dev_ops, ib_xrcd); SET_OBJ_SIZE(dev_ops, rdma_counter); } EXPORT_SYMBOL(ib_set_device_ops); int ib_add_sub_device(struct ib_device *parent, enum rdma_nl_dev_type type, const char *name) { struct ib_device *sub; int ret = 0; if (!parent->ops.add_sub_dev || !parent->ops.del_sub_dev) return -EOPNOTSUPP; if (!ib_device_try_get(parent)) return -EINVAL; sub = parent->ops.add_sub_dev(parent, type, name); if (IS_ERR(sub)) { ib_device_put(parent); return PTR_ERR(sub); } sub->type = type; sub->parent = parent; mutex_lock(&parent->subdev_lock); list_add_tail(&parent->subdev_list_head, &sub->subdev_list); mutex_unlock(&parent->subdev_lock); return ret; } EXPORT_SYMBOL(ib_add_sub_device); int ib_del_sub_device_and_put(struct ib_device *sub) { struct ib_device *parent = sub->parent; if (!parent) return -EOPNOTSUPP; mutex_lock(&parent->subdev_lock); list_del(&sub->subdev_list); mutex_unlock(&parent->subdev_lock); ib_device_put(sub); parent->ops.del_sub_dev(sub); ib_device_put(parent); return 0; } EXPORT_SYMBOL(ib_del_sub_device_and_put); #ifdef CONFIG_INFINIBAND_VIRT_DMA int ib_dma_virt_map_sg(struct ib_device *dev, struct scatterlist *sg, int nents) { struct scatterlist *s; int i; for_each_sg(sg, s, nents, i) { sg_dma_address(s) = (uintptr_t)sg_virt(s); sg_dma_len(s) = s->length; } return nents; } EXPORT_SYMBOL(ib_dma_virt_map_sg); #endif /* CONFIG_INFINIBAND_VIRT_DMA */ static const struct rdma_nl_cbs ibnl_ls_cb_table[RDMA_NL_LS_NUM_OPS] = { [RDMA_NL_LS_OP_RESOLVE] = { .doit = ib_nl_handle_resolve_resp, .flags = RDMA_NL_ADMIN_PERM, }, [RDMA_NL_LS_OP_SET_TIMEOUT] = { .doit = ib_nl_handle_set_timeout, .flags = RDMA_NL_ADMIN_PERM, }, [RDMA_NL_LS_OP_IP_RESOLVE] = { .doit = ib_nl_handle_ip_res_resp, .flags = RDMA_NL_ADMIN_PERM, }, }; void ib_dispatch_port_state_event(struct ib_device *ibdev, struct net_device *ndev) { enum ib_port_state curr_state; struct ib_event ibevent = {}; u32 port; if (ib_query_netdev_port(ibdev, ndev, &port)) return; curr_state = ib_get_curr_port_state(ndev); write_lock_irq(&ibdev->cache_lock); if (ibdev->port_data[port].cache.last_port_state == curr_state) { write_unlock_irq(&ibdev->cache_lock); return; } ibdev->port_data[port].cache.last_port_state = curr_state; write_unlock_irq(&ibdev->cache_lock); ibevent.event = (curr_state == IB_PORT_DOWN) ? IB_EVENT_PORT_ERR : IB_EVENT_PORT_ACTIVE; ibevent.device = ibdev; ibevent.element.port_num = port; ib_dispatch_event(&ibevent); } EXPORT_SYMBOL(ib_dispatch_port_state_event); static void handle_port_event(struct net_device *ndev, unsigned long event) { struct ib_device *ibdev; /* Currently, link events in bonding scenarios are still * reported by drivers that support bonding. */ if (netif_is_lag_master(ndev) || netif_is_lag_port(ndev)) return; ibdev = ib_device_get_by_netdev(ndev, RDMA_DRIVER_UNKNOWN); if (!ibdev) return; if (ibdev->ops.report_port_event) { ibdev->ops.report_port_event(ibdev, ndev, event); goto put_ibdev; } ib_dispatch_port_state_event(ibdev, ndev); put_ibdev: ib_device_put(ibdev); }; static int ib_netdevice_event(struct notifier_block *this, unsigned long event, void *ptr) { struct net_device *ndev = netdev_notifier_info_to_dev(ptr); struct ib_device *ibdev; u32 port; switch (event) { case NETDEV_CHANGENAME: ibdev = ib_device_get_by_netdev(ndev, RDMA_DRIVER_UNKNOWN); if (!ibdev) return NOTIFY_DONE; if (ib_query_netdev_port(ibdev, ndev, &port)) { ib_device_put(ibdev); break; } rdma_nl_notify_event(ibdev, port, RDMA_NETDEV_RENAME_EVENT); ib_device_put(ibdev); break; case NETDEV_UP: case NETDEV_CHANGE: case NETDEV_DOWN: handle_port_event(ndev, event); break; default: break; } return NOTIFY_DONE; } static struct notifier_block nb_netdevice = { .notifier_call = ib_netdevice_event, }; static int __init ib_core_init(void) { int ret = -ENOMEM; ib_wq = alloc_workqueue("infiniband", 0, 0); if (!ib_wq) return -ENOMEM; ib_unreg_wq = alloc_workqueue("ib-unreg-wq", WQ_UNBOUND, WQ_UNBOUND_MAX_ACTIVE); if (!ib_unreg_wq) goto err; ib_comp_wq = alloc_workqueue("ib-comp-wq", WQ_HIGHPRI | WQ_MEM_RECLAIM | WQ_SYSFS, 0); if (!ib_comp_wq) goto err_unbound; ib_comp_unbound_wq = alloc_workqueue("ib-comp-unb-wq", WQ_UNBOUND | WQ_HIGHPRI | WQ_MEM_RECLAIM | WQ_SYSFS, WQ_UNBOUND_MAX_ACTIVE); if (!ib_comp_unbound_wq) goto err_comp; ret = class_register(&ib_class); if (ret) { pr_warn("Couldn't create InfiniBand device class\n"); goto err_comp_unbound; } rdma_nl_init(); ret = addr_init(); if (ret) { pr_warn("Couldn't init IB address resolution\n"); goto err_ibnl; } ret = ib_mad_init(); if (ret) { pr_warn("Couldn't init IB MAD\n"); goto err_addr; } ret = ib_sa_init(); if (ret) { pr_warn("Couldn't init SA\n"); goto err_mad; } ret = register_blocking_lsm_notifier(&ibdev_lsm_nb); if (ret) { pr_warn("Couldn't register LSM notifier. ret %d\n", ret); goto err_sa; } ret = register_pernet_device(&rdma_dev_net_ops); if (ret) { pr_warn("Couldn't init compat dev. ret %d\n", ret); goto err_compat; } nldev_init(); rdma_nl_register(RDMA_NL_LS, ibnl_ls_cb_table); ret = roce_gid_mgmt_init(); if (ret) { pr_warn("Couldn't init RoCE GID management\n"); goto err_parent; } register_netdevice_notifier(&nb_netdevice); return 0; err_parent: rdma_nl_unregister(RDMA_NL_LS); nldev_exit(); unregister_pernet_device(&rdma_dev_net_ops); err_compat: unregister_blocking_lsm_notifier(&ibdev_lsm_nb); err_sa: ib_sa_cleanup(); err_mad: ib_mad_cleanup(); err_addr: addr_cleanup(); err_ibnl: class_unregister(&ib_class); err_comp_unbound: destroy_workqueue(ib_comp_unbound_wq); err_comp: destroy_workqueue(ib_comp_wq); err_unbound: destroy_workqueue(ib_unreg_wq); err: destroy_workqueue(ib_wq); return ret; } static void __exit ib_core_cleanup(void) { unregister_netdevice_notifier(&nb_netdevice); roce_gid_mgmt_cleanup(); rdma_nl_unregister(RDMA_NL_LS); nldev_exit(); unregister_pernet_device(&rdma_dev_net_ops); unregister_blocking_lsm_notifier(&ibdev_lsm_nb); ib_sa_cleanup(); ib_mad_cleanup(); addr_cleanup(); rdma_nl_exit(); class_unregister(&ib_class); destroy_workqueue(ib_comp_unbound_wq); destroy_workqueue(ib_comp_wq); /* Make sure that any pending umem accounting work is done. */ destroy_workqueue(ib_wq); destroy_workqueue(ib_unreg_wq); WARN_ON(!xa_empty(&clients)); WARN_ON(!xa_empty(&devices)); } MODULE_ALIAS_RDMA_NETLINK(RDMA_NL_LS, 4); /* ib core relies on netdev stack to first register net_ns_type_operations * ns kobject type before ib_core initialization. */ fs_initcall(ib_core_init); module_exit(ib_core_cleanup);
52 50 1 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 // SPDX-License-Identifier: GPL-2.0-or-later /* * Directory notifications for Linux. * * Copyright (C) 2000,2001,2002 Stephen Rothwell * * Copyright (C) 2009 Eric Paris <Red Hat Inc> * dnotify was largly rewritten to use the new fsnotify infrastructure */ #include <linux/fs.h> #include <linux/module.h> #include <linux/sched.h> #include <linux/sched/signal.h> #include <linux/dnotify.h> #include <linux/init.h> #include <linux/security.h> #include <linux/spinlock.h> #include <linux/slab.h> #include <linux/fsnotify_backend.h> static int dir_notify_enable __read_mostly = 1; #ifdef CONFIG_SYSCTL static const struct ctl_table dnotify_sysctls[] = { { .procname = "dir-notify-enable", .data = &dir_notify_enable, .maxlen = sizeof(int), .mode = 0644, .proc_handler = proc_dointvec, }, }; static void __init dnotify_sysctl_init(void) { register_sysctl_init("fs", dnotify_sysctls); } #else #define dnotify_sysctl_init() do { } while (0) #endif static struct kmem_cache *dnotify_struct_cache __ro_after_init; static struct kmem_cache *dnotify_mark_cache __ro_after_init; static struct fsnotify_group *dnotify_group __ro_after_init; /* * dnotify will attach one of these to each inode (i_fsnotify_marks) which * is being watched by dnotify. If multiple userspace applications are watching * the same directory with dnotify their information is chained in dn */ struct dnotify_mark { struct fsnotify_mark fsn_mark; struct dnotify_struct *dn; }; /* * When a process starts or stops watching an inode the set of events which * dnotify cares about for that inode may change. This function runs the * list of everything receiving dnotify events about this directory and calculates * the set of all those events. After it updates what dnotify is interested in * it calls the fsnotify function so it can update the set of all events relevant * to this inode. */ static void dnotify_recalc_inode_mask(struct fsnotify_mark *fsn_mark) { __u32 new_mask = 0; struct dnotify_struct *dn; struct dnotify_mark *dn_mark = container_of(fsn_mark, struct dnotify_mark, fsn_mark); assert_spin_locked(&fsn_mark->lock); for (dn = dn_mark->dn; dn != NULL; dn = dn->dn_next) new_mask |= (dn->dn_mask & ~FS_DN_MULTISHOT); if (fsn_mark->mask == new_mask) return; fsn_mark->mask = new_mask; fsnotify_recalc_mask(fsn_mark->connector); } /* * Mains fsnotify call where events are delivered to dnotify. * Find the dnotify mark on the relevant inode, run the list of dnotify structs * on that mark and determine which of them has expressed interest in receiving * events of this type. When found send the correct process and signal and * destroy the dnotify struct if it was not registered to receive multiple * events. */ static int dnotify_handle_event(struct fsnotify_mark *inode_mark, u32 mask, struct inode *inode, struct inode *dir, const struct qstr *name, u32 cookie) { struct dnotify_mark *dn_mark; struct dnotify_struct *dn; struct dnotify_struct **prev; struct fown_struct *fown; __u32 test_mask = mask & ~FS_EVENT_ON_CHILD; /* not a dir, dnotify doesn't care */ if (!dir && !(mask & FS_ISDIR)) return 0; dn_mark = container_of(inode_mark, struct dnotify_mark, fsn_mark); spin_lock(&inode_mark->lock); prev = &dn_mark->dn; while ((dn = *prev) != NULL) { if ((dn->dn_mask & test_mask) == 0) { prev = &dn->dn_next; continue; } fown = file_f_owner(dn->dn_filp); send_sigio(fown, dn->dn_fd, POLL_MSG); if (dn->dn_mask & FS_DN_MULTISHOT) prev = &dn->dn_next; else { *prev = dn->dn_next; kmem_cache_free(dnotify_struct_cache, dn); dnotify_recalc_inode_mask(inode_mark); } } spin_unlock(&inode_mark->lock); return 0; } static void dnotify_free_mark(struct fsnotify_mark *fsn_mark) { struct dnotify_mark *dn_mark = container_of(fsn_mark, struct dnotify_mark, fsn_mark); BUG_ON(dn_mark->dn); kmem_cache_free(dnotify_mark_cache, dn_mark); } static const struct fsnotify_ops dnotify_fsnotify_ops = { .handle_inode_event = dnotify_handle_event, .free_mark = dnotify_free_mark, }; /* * Called every time a file is closed. Looks first for a dnotify mark on the * inode. If one is found run all of the ->dn structures attached to that * mark for one relevant to this process closing the file and remove that * dnotify_struct. If that was the last dnotify_struct also remove the * fsnotify_mark. */ void dnotify_flush(struct file *filp, fl_owner_t id) { struct fsnotify_mark *fsn_mark; struct dnotify_mark *dn_mark; struct dnotify_struct *dn; struct dnotify_struct **prev; struct inode *inode; bool free = false; inode = file_inode(filp); if (!S_ISDIR(inode->i_mode)) return; fsn_mark = fsnotify_find_inode_mark(inode, dnotify_group); if (!fsn_mark) return; dn_mark = container_of(fsn_mark, struct dnotify_mark, fsn_mark); fsnotify_group_lock(dnotify_group); spin_lock(&fsn_mark->lock); prev = &dn_mark->dn; while ((dn = *prev) != NULL) { if ((dn->dn_owner == id) && (dn->dn_filp == filp)) { *prev = dn->dn_next; kmem_cache_free(dnotify_struct_cache, dn); dnotify_recalc_inode_mask(fsn_mark); break; } prev = &dn->dn_next; } spin_unlock(&fsn_mark->lock); /* nothing else could have found us thanks to the dnotify_groups mark_mutex */ if (dn_mark->dn == NULL) { fsnotify_detach_mark(fsn_mark); free = true; } fsnotify_group_unlock(dnotify_group); if (free) fsnotify_free_mark(fsn_mark); fsnotify_put_mark(fsn_mark); } /* this conversion is done only at watch creation */ static __u32 convert_arg(unsigned int arg) { __u32 new_mask = FS_EVENT_ON_CHILD; if (arg & DN_MULTISHOT) new_mask |= FS_DN_MULTISHOT; if (arg & DN_DELETE) new_mask |= (FS_DELETE | FS_MOVED_FROM); if (arg & DN_MODIFY) new_mask |= FS_MODIFY; if (arg & DN_ACCESS) new_mask |= FS_ACCESS; if (arg & DN_ATTRIB) new_mask |= FS_ATTRIB; if (arg & DN_RENAME) new_mask |= FS_RENAME; if (arg & DN_CREATE) new_mask |= (FS_CREATE | FS_MOVED_TO); return new_mask; } /* * If multiple processes watch the same inode with dnotify there is only one * dnotify mark in inode->i_fsnotify_marks but we chain a dnotify_struct * onto that mark. This function either attaches the new dnotify_struct onto * that list, or it |= the mask onto an existing dnofiy_struct. */ static int attach_dn(struct dnotify_struct *dn, struct dnotify_mark *dn_mark, fl_owner_t id, int fd, struct file *filp, __u32 mask) { struct dnotify_struct *odn; odn = dn_mark->dn; while (odn != NULL) { /* adding more events to existing dnofiy_struct? */ if ((odn->dn_owner == id) && (odn->dn_filp == filp)) { odn->dn_fd = fd; odn->dn_mask |= mask; return -EEXIST; } odn = odn->dn_next; } dn->dn_mask = mask; dn->dn_fd = fd; dn->dn_filp = filp; dn->dn_owner = id; dn->dn_next = dn_mark->dn; dn_mark->dn = dn; return 0; } /* * When a process calls fcntl to attach a dnotify watch to a directory it ends * up here. Allocate both a mark for fsnotify to add and a dnotify_struct to be * attached to the fsnotify_mark. */ int fcntl_dirnotify(int fd, struct file *filp, unsigned int arg) { struct dnotify_mark *new_dn_mark, *dn_mark; struct fsnotify_mark *new_fsn_mark, *fsn_mark; struct dnotify_struct *dn; struct inode *inode; fl_owner_t id = current->files; struct file *f = NULL; int destroy = 0, error = 0; __u32 mask; /* we use these to tell if we need to kfree */ new_fsn_mark = NULL; dn = NULL; if (!dir_notify_enable) { error = -EINVAL; goto out_err; } /* a 0 mask means we are explicitly removing the watch */ if ((arg & ~DN_MULTISHOT) == 0) { dnotify_flush(filp, id); error = 0; goto out_err; } /* dnotify only works on directories */ inode = file_inode(filp); if (!S_ISDIR(inode->i_mode)) { error = -ENOTDIR; goto out_err; } /* * convert the userspace DN_* "arg" to the internal FS_* * defined in fsnotify */ mask = convert_arg(arg); error = security_path_notify(&filp->f_path, mask, FSNOTIFY_OBJ_TYPE_INODE); if (error) goto out_err; /* expect most fcntl to add new rather than augment old */ dn = kmem_cache_alloc(dnotify_struct_cache, GFP_KERNEL); if (!dn) { error = -ENOMEM; goto out_err; } /* new fsnotify mark, we expect most fcntl calls to add a new mark */ new_dn_mark = kmem_cache_alloc(dnotify_mark_cache, GFP_KERNEL); if (!new_dn_mark) { error = -ENOMEM; goto out_err; } error = file_f_owner_allocate(filp); if (error) goto out_err; /* set up the new_fsn_mark and new_dn_mark */ new_fsn_mark = &new_dn_mark->fsn_mark; fsnotify_init_mark(new_fsn_mark, dnotify_group); new_fsn_mark->mask = mask; new_dn_mark->dn = NULL; /* this is needed to prevent the fcntl/close race described below */ fsnotify_group_lock(dnotify_group); /* add the new_fsn_mark or find an old one. */ fsn_mark = fsnotify_find_inode_mark(inode, dnotify_group); if (fsn_mark) { dn_mark = container_of(fsn_mark, struct dnotify_mark, fsn_mark); spin_lock(&fsn_mark->lock); } else { error = fsnotify_add_inode_mark_locked(new_fsn_mark, inode, 0); if (error) { fsnotify_group_unlock(dnotify_group); goto out_err; } spin_lock(&new_fsn_mark->lock); fsn_mark = new_fsn_mark; dn_mark = new_dn_mark; /* we used new_fsn_mark, so don't free it */ new_fsn_mark = NULL; } f = fget_raw(fd); /* if (f != filp) means that we lost a race and another task/thread * actually closed the fd we are still playing with before we grabbed * the dnotify_groups mark_mutex and fsn_mark->lock. Since closing the * fd is the only time we clean up the marks we need to get our mark * off the list. */ if (f != filp) { /* if we added ourselves, shoot ourselves, it's possible that * the flush actually did shoot this fsn_mark. That's fine too * since multiple calls to destroy_mark is perfectly safe, if * we found a dn_mark already attached to the inode, just sod * off silently as the flush at close time dealt with it. */ if (dn_mark == new_dn_mark) destroy = 1; error = 0; goto out; } __f_setown(filp, task_pid(current), PIDTYPE_TGID, 0); error = attach_dn(dn, dn_mark, id, fd, filp, mask); /* !error means that we attached the dn to the dn_mark, so don't free it */ if (!error) dn = NULL; /* -EEXIST means that we didn't add this new dn and used an old one. * that isn't an error (and the unused dn should be freed) */ else if (error == -EEXIST) error = 0; dnotify_recalc_inode_mask(fsn_mark); out: spin_unlock(&fsn_mark->lock); if (destroy) fsnotify_detach_mark(fsn_mark); fsnotify_group_unlock(dnotify_group); if (destroy) fsnotify_free_mark(fsn_mark); fsnotify_put_mark(fsn_mark); out_err: if (new_fsn_mark) fsnotify_put_mark(new_fsn_mark); if (dn) kmem_cache_free(dnotify_struct_cache, dn); if (f) fput(f); return error; } static int __init dnotify_init(void) { dnotify_struct_cache = KMEM_CACHE(dnotify_struct, SLAB_PANIC|SLAB_ACCOUNT); dnotify_mark_cache = KMEM_CACHE(dnotify_mark, SLAB_PANIC|SLAB_ACCOUNT); dnotify_group = fsnotify_alloc_group(&dnotify_fsnotify_ops, 0); if (IS_ERR(dnotify_group)) panic("unable to allocate fsnotify group for dnotify\n"); dnotify_sysctl_init(); return 0; } module_init(dnotify_init)
142 141 51 52 165 166 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 1517 1518 1519 1520 1521 1522 1523 1524 1525 1526 1527 1528 1529 1530 1531 1532 1533 1534 1535 1536 1537 1538 1539 1540 1541 1542 1543 1544 1545 1546 1547 1548 1549 1550 1551 1552 1553 1554 1555 1556 1557 1558 1559 1560 1561 1562 1563 1564 1565 1566 1567 1568 1569 1570 1571 1572 1573 1574 1575 1576 1577 1578 1579 1580 1581 1582 1583 1584 1585 1586 1587 1588 1589 1590 1591 1592 1593 1594 1595 1596 1597 1598 1599 1600 1601 1602 1603 1604 1605 1606 1607 1608 1609 1610 1611 1612 1613 1614 1615 1616 1617 1618 1619 1620 1621 1622 1623 1624 1625 1626 1627 1628 1629 1630 1631 1632 1633 1634 1635 1636 1637 1638 1639 1640 1641 1642 1643 1644 1645 1646 1647 1648 1649 1650 1651 1652 1653 1654 1655 1656 1657 1658 1659 1660 1661 1662 1663 1664 1665 1666 1667 1668 1669 1670 1671 1672 1673 1674 1675 1676 1677 1678 1679 1680 1681 1682 1683 1684 1685 1686 1687 1688 1689 1690 1691 1692 1693 1694 1695 1696 1697 1698 1699 1700 1701 1702 1703 1704 1705 1706 1707 1708 1709 1710 1711 1712 1713 1714 1715 1716 1717 1718 1719 1720 1721 1722 1723 1724 1725 1726 1727 1728 1729 1730 1731 1732 1733 1734 1735 1736 1737 1738 1739 1740 1741 1742 1743 1744 1745 1746 1747 1748 1749 1750 1751 1752 1753 1754 1755 1756 1757 1758 1759 1760 1761 1762 1763 1764 1765 1766 1767 1768 1769 1770 1771 1772 1773 1774 1775 1776 1777 1778 1779 1780 1781 1782 1783 1784 1785 1786 1787 1788 1789 1790 1791 1792 1793 1794 1795 1796 1797 1798 1799 1800 1801 1802 1803 1804 1805 1806 1807 1808 1809 1810 1811 1812 1813 1814 1815 1816 1817 1818 1819 1820 1821 1822 1823 1824 1825 1826 1827 1828 1829 1830 1831 1832 1833 1834 1835 1836 1837 1838 1839 1840 1841 1842 1843 1844 1845 1846 1847 1848 1849 1850 1851 1852 1853 1854 1855 1856 1857 1858 1859 1860 1861 1862 1863 1864 1865 1866 1867 1868 1869 1870 1871 1872 1873 1874 1875 1876 1877 1878 1879 1880 1881 1882 1883 1884 1885 1886 1887 1888 1889 1890 1891 1892 1893 1894 1895 1896 1897 1898 1899 1900 1901 1902 1903 1904 1905 1906 1907 1908 1909 1910 1911 1912 1913 1914 1915 1916 1917 1918 1919 1920 1921 1922 1923 1924 1925 1926 1927 1928 1929 1930 1931 1932 1933 1934 1935 1936 1937 1938 1939 1940 1941 1942 1943 1944 1945 1946 1947 1948 1949 1950 1951 1952 1953 1954 1955 1956 1957 1958 1959 1960 1961 1962 1963 1964 1965 1966 1967 1968 1969 1970 1971 1972 1973 1974 1975 1976 1977 1978 1979 1980 1981 1982 1983 1984 1985 1986 1987 1988 1989 1990 1991 1992 1993 1994 1995 1996 1997 1998 1999 2000 2001 2002 2003 2004 2005 2006 2007 2008 2009 2010 2011 2012 2013 2014 2015 2016 2017 2018 2019 2020 2021 2022 2023 2024 2025 2026 2027 2028 2029 2030 2031 2032 2033 2034 2035 2036 2037 2038 2039 2040 2041 2042 2043 2044 2045 2046 2047 2048 2049 2050 2051 2052 2053 2054 2055 2056 2057 2058 2059 2060 2061 2062 2063 2064 2065 2066 2067 2068 2069 2070 2071 2072 2073 2074 2075 2076 2077 2078 2079 2080 2081 2082 2083 2084 2085 2086 2087 2088 2089 2090 2091 2092 2093 2094 2095 2096 2097 2098 2099 2100 2101 2102 2103 2104 2105 2106 2107 2108 2109 2110 2111 2112 2113 2114 2115 2116 2117 2118 2119 2120 2121 2122 2123 2124 2125 2126 2127 2128 2129 2130 2131 2132 2133 2134 2135 2136 2137 2138 2139 2140 2141 2142 2143 2144 2145 2146 2147 2148 2149 2150 2151 2152 2153 2154 2155 2156 2157 2158 2159 2160 2161 2162 2163 2164 2165 2166 2167 2168 2169 2170 2171 2172 2173 2174 2175 2176 2177 2178 2179 2180 2181 2182 2183 2184 2185 2186 2187 2188 2189 2190 2191 2192 2193 2194 2195 2196 2197 2198 2199 2200 2201 2202 2203 2204 2205 2206 2207 2208 2209 2210 2211 2212 2213 2214 2215 2216 2217 2218 2219 2220 2221 2222 2223 2224 2225 2226 2227 2228 2229 2230 2231 2232 2233 2234 2235 2236 2237 2238 2239 2240 2241 2242 2243 2244 2245 2246 2247 2248 2249 2250 2251 2252 2253 2254 2255 2256 2257 2258 2259 2260 2261 2262 2263 2264 2265 2266 2267 2268 2269 2270 2271 2272 2273 2274 2275 2276 2277 2278 2279 2280 2281 2282 2283 2284 2285 2286 2287 2288 2289 2290 2291 2292 2293 2294 2295 2296 2297 2298 2299 2300 2301 2302 2303 2304 2305 2306 2307 2308 2309 2310 2311 2312 2313 2314 2315 2316 2317 2318 2319 2320 2321 2322 2323 2324 2325 2326 2327 2328 2329 2330 2331 2332 2333 2334 2335 2336 2337 2338 2339 2340 2341 2342 2343 2344 2345 2346 2347 2348 2349 2350 2351 2352 2353 2354 2355 2356 2357 2358 2359 2360 2361 2362 2363 2364 2365 2366 2367 2368 2369 2370 2371 2372 2373 2374 2375 2376 2377 2378 2379 2380 2381 2382 2383 2384 2385 2386 2387 2388 2389 2390 2391 2392 2393 2394 2395 2396 2397 2398 2399 2400 2401 2402 2403 2404 2405 2406 2407 2408 2409 2410 2411 2412 2413 2414 2415 2416 2417 2418 2419 2420 2421 2422 2423 2424 2425 2426 2427 2428 2429 2430 2431 2432 2433 2434 2435 2436 2437 2438 2439 2440 2441 2442 2443 2444 2445 2446 2447 2448 2449 2450 2451 2452 2453 2454 2455 2456 2457 2458 2459 2460 2461 2462 2463 2464 2465 2466 2467 2468 2469 2470 2471 2472 2473 2474 2475 2476 2477 2478 2479 2480 2481 2482 2483 2484 2485 2486 2487 2488 2489 2490 2491 2492 2493 2494 2495 2496 2497 2498 2499 2500 2501 2502 2503 2504 2505 2506 2507 2508 2509 2510 2511 2512 2513 2514 2515 2516 2517 2518 2519 2520 2521 2522 2523 2524 2525 2526 2527 2528 2529 2530 2531 2532 2533 2534 2535 2536 2537 2538 2539 2540 2541 2542 2543 2544 2545 2546 2547 2548 2549 2550 2551 2552 2553 2554 2555 2556 2557 2558 2559 2560 2561 2562 2563 2564 2565 2566 2567 2568 2569 2570 2571 2572 2573 2574 2575 2576 2577 2578 2579 2580 2581 2582 2583 2584 2585 2586 2587 2588 2589 2590 2591 2592 2593 2594 2595 2596 2597 2598 2599 2600 2601 2602 2603 2604 2605 2606 2607 2608 2609 2610 2611 2612 2613 2614 2615 2616 2617 2618 2619 2620 2621 2622 2623 2624 2625 2626 2627 2628 2629 2630 2631 2632 2633 2634 2635 2636 2637 2638 2639 2640 2641 2642 2643 2644 2645 2646 2647 2648 2649 2650 2651 2652 2653 2654 2655 2656 2657 2658 2659 2660 2661 2662 2663 2664 2665 2666 2667 2668 2669 2670 2671 2672 2673 2674 2675 2676 2677 2678 2679 2680 2681 2682 2683 2684 2685 2686 2687 2688 2689 2690 2691 2692 2693 2694 2695 2696 2697 2698 2699 2700 2701 2702 2703 2704 2705 2706 2707 2708 2709 2710 2711 2712 2713 2714 2715 2716 2717 2718 2719 2720 2721 2722 2723 2724 2725 2726 2727 2728 2729 2730 2731 2732 2733 2734 2735 2736 2737 2738 2739 2740 2741 2742 2743 2744 2745 2746 2747 2748 2749 2750 2751 2752 2753 2754 2755 2756 2757 2758 2759 2760 2761 2762 2763 2764 2765 2766 2767 2768 2769 2770 2771 2772 2773 2774 2775 2776 2777 2778 2779 2780 2781 2782 2783 2784 2785 2786 2787 2788 2789 2790 2791 2792 2793 2794 2795 2796 2797 2798 2799 2800 2801 2802 2803 2804 2805 2806 2807 2808 2809 2810 2811 2812 2813 2814 2815 2816 2817 2818 2819 2820 2821 2822 2823 2824 2825 2826 2827 2828 2829 2830 2831 2832 2833 2834 2835 2836 2837 2838 2839 2840 2841 2842 2843 2844 2845 2846 2847 2848 2849 2850 2851 2852 2853 2854 2855 2856 2857 2858 2859 2860 2861 2862 2863 2864 2865 2866 2867 2868 2869 2870 2871 2872 2873 2874 2875 2876 2877 2878 2879 2880 2881 2882 2883 2884 2885 2886 2887 2888 2889 2890 2891 2892 2893 2894 2895 2896 2897 2898 2899 2900 2901 2902 2903 2904 2905 2906 2907 2908 2909 2910 2911 2912 2913 2914 2915 2916 2917 2918 2919 2920 2921 2922 2923 2924 2925 2926 2927 2928 2929 2930 2931 2932 2933 2934 2935 2936 2937 2938 2939 2940 2941 2942 2943 2944 2945 2946 2947 2948 2949 2950 2951 2952 2953 2954 2955 2956 2957 2958 2959 2960 2961 2962 2963 2964 2965 2966 2967 2968 2969 2970 2971 2972 2973 2974 2975 2976 2977 2978 2979 2980 2981 2982 2983 2984 2985 2986 2987 2988 2989 2990 2991 2992 2993 2994 2995 2996 2997 2998 2999 // SPDX-License-Identifier: GPL-2.0-only /* * linux/fs/locks.c * * We implement four types of file locks: BSD locks, posix locks, open * file description locks, and leases. For details about BSD locks, * see the flock(2) man page; for details about the other three, see * fcntl(2). * * * Locking conflicts and dependencies: * If multiple threads attempt to lock the same byte (or flock the same file) * only one can be granted the lock, and other must wait their turn. * The first lock has been "applied" or "granted", the others are "waiting" * and are "blocked" by the "applied" lock.. * * Waiting and applied locks are all kept in trees whose properties are: * * - the root of a tree may be an applied or waiting lock. * - every other node in the tree is a waiting lock that * conflicts with every ancestor of that node. * * Every such tree begins life as a waiting singleton which obviously * satisfies the above properties. * * The only ways we modify trees preserve these properties: * * 1. We may add a new leaf node, but only after first verifying that it * conflicts with all of its ancestors. * 2. We may remove the root of a tree, creating a new singleton * tree from the root and N new trees rooted in the immediate * children. * 3. If the root of a tree is not currently an applied lock, we may * apply it (if possible). * 4. We may upgrade the root of the tree (either extend its range, * or upgrade its entire range from read to write). * * When an applied lock is modified in a way that reduces or downgrades any * part of its range, we remove all its children (2 above). This particularly * happens when a lock is unlocked. * * For each of those child trees we "wake up" the thread which is * waiting for the lock so it can continue handling as follows: if the * root of the tree applies, we do so (3). If it doesn't, it must * conflict with some applied lock. We remove (wake up) all of its children * (2), and add it is a new leaf to the tree rooted in the applied * lock (1). We then repeat the process recursively with those * children. * */ #include <linux/capability.h> #include <linux/file.h> #include <linux/fdtable.h> #include <linux/filelock.h> #include <linux/fs.h> #include <linux/init.h> #include <linux/security.h> #include <linux/slab.h> #include <linux/syscalls.h> #include <linux/time.h> #include <linux/rcupdate.h> #include <linux/pid_namespace.h> #include <linux/hashtable.h> #include <linux/percpu.h> #include <linux/sysctl.h> #define CREATE_TRACE_POINTS #include <trace/events/filelock.h> #include <linux/uaccess.h> static struct file_lock *file_lock(struct file_lock_core *flc) { return container_of(flc, struct file_lock, c); } static struct file_lease *file_lease(struct file_lock_core *flc) { return container_of(flc, struct file_lease, c); } static bool lease_breaking(struct file_lease *fl) { return fl->c.flc_flags & (FL_UNLOCK_PENDING | FL_DOWNGRADE_PENDING); } static int target_leasetype(struct file_lease *fl) { if (fl->c.flc_flags & FL_UNLOCK_PENDING) return F_UNLCK; if (fl->c.flc_flags & FL_DOWNGRADE_PENDING) return F_RDLCK; return fl->c.flc_type; } static int leases_enable = 1; static int lease_break_time = 45; #ifdef CONFIG_SYSCTL static const struct ctl_table locks_sysctls[] = { { .procname = "leases-enable", .data = &leases_enable, .maxlen = sizeof(int), .mode = 0644, .proc_handler = proc_dointvec, }, #ifdef CONFIG_MMU { .procname = "lease-break-time", .data = &lease_break_time, .maxlen = sizeof(int), .mode = 0644, .proc_handler = proc_dointvec, }, #endif /* CONFIG_MMU */ }; static int __init init_fs_locks_sysctls(void) { register_sysctl_init("fs", locks_sysctls); return 0; } early_initcall(init_fs_locks_sysctls); #endif /* CONFIG_SYSCTL */ /* * The global file_lock_list is only used for displaying /proc/locks, so we * keep a list on each CPU, with each list protected by its own spinlock. * Global serialization is done using file_rwsem. * * Note that alterations to the list also require that the relevant flc_lock is * held. */ struct file_lock_list_struct { spinlock_t lock; struct hlist_head hlist; }; static DEFINE_PER_CPU(struct file_lock_list_struct, file_lock_list); DEFINE_STATIC_PERCPU_RWSEM(file_rwsem); /* * The blocked_hash is used to find POSIX lock loops for deadlock detection. * It is protected by blocked_lock_lock. * * We hash locks by lockowner in order to optimize searching for the lock a * particular lockowner is waiting on. * * FIXME: make this value scale via some heuristic? We generally will want more * buckets when we have more lockowners holding locks, but that's a little * difficult to determine without knowing what the workload will look like. */ #define BLOCKED_HASH_BITS 7 static DEFINE_HASHTABLE(blocked_hash, BLOCKED_HASH_BITS); /* * This lock protects the blocked_hash. Generally, if you're accessing it, you * want to be holding this lock. * * In addition, it also protects the fl->fl_blocked_requests list, and the * fl->fl_blocker pointer for file_lock structures that are acting as lock * requests (in contrast to those that are acting as records of acquired locks). * * Note that when we acquire this lock in order to change the above fields, * we often hold the flc_lock as well. In certain cases, when reading the fields * protected by this lock, we can skip acquiring it iff we already hold the * flc_lock. */ static DEFINE_SPINLOCK(blocked_lock_lock); static struct kmem_cache *flctx_cache __ro_after_init; static struct kmem_cache *filelock_cache __ro_after_init; static struct kmem_cache *filelease_cache __ro_after_init; static struct file_lock_context * locks_get_lock_context(struct inode *inode, int type) { struct file_lock_context *ctx; /* paired with cmpxchg() below */ ctx = locks_inode_context(inode); if (likely(ctx) || type == F_UNLCK) goto out; ctx = kmem_cache_alloc(flctx_cache, GFP_KERNEL); if (!ctx) goto out; spin_lock_init(&ctx->flc_lock); INIT_LIST_HEAD(&ctx->flc_flock); INIT_LIST_HEAD(&ctx->flc_posix); INIT_LIST_HEAD(&ctx->flc_lease); /* * Assign the pointer if it's not already assigned. If it is, then * free the context we just allocated. */ if (cmpxchg(&inode->i_flctx, NULL, ctx)) { kmem_cache_free(flctx_cache, ctx); ctx = locks_inode_context(inode); } out: trace_locks_get_lock_context(inode, type, ctx); return ctx; } static void locks_dump_ctx_list(struct list_head *list, char *list_type) { struct file_lock_core *flc; list_for_each_entry(flc, list, flc_list) pr_warn("%s: fl_owner=%p fl_flags=0x%x fl_type=0x%x fl_pid=%u\n", list_type, flc->flc_owner, flc->flc_flags, flc->flc_type, flc->flc_pid); } static void locks_check_ctx_lists(struct inode *inode) { struct file_lock_context *ctx = inode->i_flctx; if (unlikely(!list_empty(&ctx->flc_flock) || !list_empty(&ctx->flc_posix) || !list_empty(&ctx->flc_lease))) { pr_warn("Leaked locks on dev=0x%x:0x%x ino=0x%lx:\n", MAJOR(inode->i_sb->s_dev), MINOR(inode->i_sb->s_dev), inode->i_ino); locks_dump_ctx_list(&ctx->flc_flock, "FLOCK"); locks_dump_ctx_list(&ctx->flc_posix, "POSIX"); locks_dump_ctx_list(&ctx->flc_lease, "LEASE"); } } static void locks_check_ctx_file_list(struct file *filp, struct list_head *list, char *list_type) { struct file_lock_core *flc; struct inode *inode = file_inode(filp); list_for_each_entry(flc, list, flc_list) if (flc->flc_file == filp) pr_warn("Leaked %s lock on dev=0x%x:0x%x ino=0x%lx " " fl_owner=%p fl_flags=0x%x fl_type=0x%x fl_pid=%u\n", list_type, MAJOR(inode->i_sb->s_dev), MINOR(inode->i_sb->s_dev), inode->i_ino, flc->flc_owner, flc->flc_flags, flc->flc_type, flc->flc_pid); } void locks_free_lock_context(struct inode *inode) { struct file_lock_context *ctx = locks_inode_context(inode); if (unlikely(ctx)) { locks_check_ctx_lists(inode); kmem_cache_free(flctx_cache, ctx); } } static void locks_init_lock_heads(struct file_lock_core *flc) { INIT_HLIST_NODE(&flc->flc_link); INIT_LIST_HEAD(&flc->flc_list); INIT_LIST_HEAD(&flc->flc_blocked_requests); INIT_LIST_HEAD(&flc->flc_blocked_member); init_waitqueue_head(&flc->flc_wait); } /* Allocate an empty lock structure. */ struct file_lock *locks_alloc_lock(void) { struct file_lock *fl = kmem_cache_zalloc(filelock_cache, GFP_KERNEL); if (fl) locks_init_lock_heads(&fl->c); return fl; } EXPORT_SYMBOL_GPL(locks_alloc_lock); /* Allocate an empty lock structure. */ struct file_lease *locks_alloc_lease(void) { struct file_lease *fl = kmem_cache_zalloc(filelease_cache, GFP_KERNEL); if (fl) locks_init_lock_heads(&fl->c); return fl; } EXPORT_SYMBOL_GPL(locks_alloc_lease); void locks_release_private(struct file_lock *fl) { struct file_lock_core *flc = &fl->c; BUG_ON(waitqueue_active(&flc->flc_wait)); BUG_ON(!list_empty(&flc->flc_list)); BUG_ON(!list_empty(&flc->flc_blocked_requests)); BUG_ON(!list_empty(&flc->flc_blocked_member)); BUG_ON(!hlist_unhashed(&flc->flc_link)); if (fl->fl_ops) { if (fl->fl_ops->fl_release_private) fl->fl_ops->fl_release_private(fl); fl->fl_ops = NULL; } if (fl->fl_lmops) { if (fl->fl_lmops->lm_put_owner) { fl->fl_lmops->lm_put_owner(flc->flc_owner); flc->flc_owner = NULL; } fl->fl_lmops = NULL; } } EXPORT_SYMBOL_GPL(locks_release_private); /** * locks_owner_has_blockers - Check for blocking lock requests * @flctx: file lock context * @owner: lock owner * * Return values: * %true: @owner has at least one blocker * %false: @owner has no blockers */ bool locks_owner_has_blockers(struct file_lock_context *flctx, fl_owner_t owner) { struct file_lock_core *flc; spin_lock(&flctx->flc_lock); list_for_each_entry(flc, &flctx->flc_posix, flc_list) { if (flc->flc_owner != owner) continue; if (!list_empty(&flc->flc_blocked_requests)) { spin_unlock(&flctx->flc_lock); return true; } } spin_unlock(&flctx->flc_lock); return false; } EXPORT_SYMBOL_GPL(locks_owner_has_blockers); /* Free a lock which is not in use. */ void locks_free_lock(struct file_lock *fl) { locks_release_private(fl); kmem_cache_free(filelock_cache, fl); } EXPORT_SYMBOL(locks_free_lock); /* Free a lease which is not in use. */ void locks_free_lease(struct file_lease *fl) { kmem_cache_free(filelease_cache, fl); } EXPORT_SYMBOL(locks_free_lease); static void locks_dispose_list(struct list_head *dispose) { struct file_lock_core *flc; while (!list_empty(dispose)) { flc = list_first_entry(dispose, struct file_lock_core, flc_list); list_del_init(&flc->flc_list); if (flc->flc_flags & (FL_LEASE|FL_DELEG|FL_LAYOUT)) locks_free_lease(file_lease(flc)); else locks_free_lock(file_lock(flc)); } } void locks_init_lock(struct file_lock *fl) { memset(fl, 0, sizeof(struct file_lock)); locks_init_lock_heads(&fl->c); } EXPORT_SYMBOL(locks_init_lock); void locks_init_lease(struct file_lease *fl) { memset(fl, 0, sizeof(*fl)); locks_init_lock_heads(&fl->c); } EXPORT_SYMBOL(locks_init_lease); /* * Initialize a new lock from an existing file_lock structure. */ void locks_copy_conflock(struct file_lock *new, struct file_lock *fl) { new->c.flc_owner = fl->c.flc_owner; new->c.flc_pid = fl->c.flc_pid; new->c.flc_file = NULL; new->c.flc_flags = fl->c.flc_flags; new->c.flc_type = fl->c.flc_type; new->fl_start = fl->fl_start; new->fl_end = fl->fl_end; new->fl_lmops = fl->fl_lmops; new->fl_ops = NULL; if (fl->fl_lmops) { if (fl->fl_lmops->lm_get_owner) fl->fl_lmops->lm_get_owner(fl->c.flc_owner); } } EXPORT_SYMBOL(locks_copy_conflock); void locks_copy_lock(struct file_lock *new, struct file_lock *fl) { /* "new" must be a freshly-initialized lock */ WARN_ON_ONCE(new->fl_ops); locks_copy_conflock(new, fl); new->c.flc_file = fl->c.flc_file; new->fl_ops = fl->fl_ops; if (fl->fl_ops) { if (fl->fl_ops->fl_copy_lock) fl->fl_ops->fl_copy_lock(new, fl); } } EXPORT_SYMBOL(locks_copy_lock); static void locks_move_blocks(struct file_lock *new, struct file_lock *fl) { struct file_lock *f; /* * As ctx->flc_lock is held, new requests cannot be added to * ->flc_blocked_requests, so we don't need a lock to check if it * is empty. */ if (list_empty(&fl->c.flc_blocked_requests)) return; spin_lock(&blocked_lock_lock); list_splice_init(&fl->c.flc_blocked_requests, &new->c.flc_blocked_requests); list_for_each_entry(f, &new->c.flc_blocked_requests, c.flc_blocked_member) f->c.flc_blocker = &new->c; spin_unlock(&blocked_lock_lock); } static inline int flock_translate_cmd(int cmd) { switch (cmd) { case LOCK_SH: return F_RDLCK; case LOCK_EX: return F_WRLCK; case LOCK_UN: return F_UNLCK; } return -EINVAL; } /* Fill in a file_lock structure with an appropriate FLOCK lock. */ static void flock_make_lock(struct file *filp, struct file_lock *fl, int type) { locks_init_lock(fl); fl->c.flc_file = filp; fl->c.flc_owner = filp; fl->c.flc_pid = current->tgid; fl->c.flc_flags = FL_FLOCK; fl->c.flc_type = type; fl->fl_end = OFFSET_MAX; } static int assign_type(struct file_lock_core *flc, int type) { switch (type) { case F_RDLCK: case F_WRLCK: case F_UNLCK: flc->flc_type = type; break; default: return -EINVAL; } return 0; } static int flock64_to_posix_lock(struct file *filp, struct file_lock *fl, struct flock64 *l) { switch (l->l_whence) { case SEEK_SET: fl->fl_start = 0; break; case SEEK_CUR: fl->fl_start = filp->f_pos; break; case SEEK_END: fl->fl_start = i_size_read(file_inode(filp)); break; default: return -EINVAL; } if (l->l_start > OFFSET_MAX - fl->fl_start) return -EOVERFLOW; fl->fl_start += l->l_start; if (fl->fl_start < 0) return -EINVAL; /* POSIX-1996 leaves the case l->l_len < 0 undefined; POSIX-2001 defines it. */ if (l->l_len > 0) { if (l->l_len - 1 > OFFSET_MAX - fl->fl_start) return -EOVERFLOW; fl->fl_end = fl->fl_start + (l->l_len - 1); } else if (l->l_len < 0) { if (fl->fl_start + l->l_len < 0) return -EINVAL; fl->fl_end = fl->fl_start - 1; fl->fl_start += l->l_len; } else fl->fl_end = OFFSET_MAX; fl->c.flc_owner = current->files; fl->c.flc_pid = current->tgid; fl->c.flc_file = filp; fl->c.flc_flags = FL_POSIX; fl->fl_ops = NULL; fl->fl_lmops = NULL; return assign_type(&fl->c, l->l_type); } /* Verify a "struct flock" and copy it to a "struct file_lock" as a POSIX * style lock. */ static int flock_to_posix_lock(struct file *filp, struct file_lock *fl, struct flock *l) { struct flock64 ll = { .l_type = l->l_type, .l_whence = l->l_whence, .l_start = l->l_start, .l_len = l->l_len, }; return flock64_to_posix_lock(filp, fl, &ll); } /* default lease lock manager operations */ static bool lease_break_callback(struct file_lease *fl) { kill_fasync(&fl->fl_fasync, SIGIO, POLL_MSG); return false; } static void lease_setup(struct file_lease *fl, void **priv) { struct file *filp = fl->c.flc_file; struct fasync_struct *fa = *priv; /* * fasync_insert_entry() returns the old entry if any. If there was no * old entry, then it used "priv" and inserted it into the fasync list. * Clear the pointer to indicate that it shouldn't be freed. */ if (!fasync_insert_entry(fa->fa_fd, filp, &fl->fl_fasync, fa)) *priv = NULL; __f_setown(filp, task_pid(current), PIDTYPE_TGID, 0); } static const struct lease_manager_operations lease_manager_ops = { .lm_break = lease_break_callback, .lm_change = lease_modify, .lm_setup = lease_setup, }; /* * Initialize a lease, use the default lock manager operations */ static int lease_init(struct file *filp, int type, struct file_lease *fl) { if (assign_type(&fl->c, type) != 0) return -EINVAL; fl->c.flc_owner = filp; fl->c.flc_pid = current->tgid; fl->c.flc_file = filp; fl->c.flc_flags = FL_LEASE; fl->fl_lmops = &lease_manager_ops; return 0; } /* Allocate a file_lock initialised to this type of lease */ static struct file_lease *lease_alloc(struct file *filp, int type) { struct file_lease *fl = locks_alloc_lease(); int error = -ENOMEM; if (fl == NULL) return ERR_PTR(error); error = lease_init(filp, type, fl); if (error) { locks_free_lease(fl); return ERR_PTR(error); } return fl; } /* Check if two locks overlap each other. */ static inline int locks_overlap(struct file_lock *fl1, struct file_lock *fl2) { return ((fl1->fl_end >= fl2->fl_start) && (fl2->fl_end >= fl1->fl_start)); } /* * Check whether two locks have the same owner. */ static int posix_same_owner(struct file_lock_core *fl1, struct file_lock_core *fl2) { return fl1->flc_owner == fl2->flc_owner; } /* Must be called with the flc_lock held! */ static void locks_insert_global_locks(struct file_lock_core *flc) { struct file_lock_list_struct *fll = this_cpu_ptr(&file_lock_list); percpu_rwsem_assert_held(&file_rwsem); spin_lock(&fll->lock); flc->flc_link_cpu = smp_processor_id(); hlist_add_head(&flc->flc_link, &fll->hlist); spin_unlock(&fll->lock); } /* Must be called with the flc_lock held! */ static void locks_delete_global_locks(struct file_lock_core *flc) { struct file_lock_list_struct *fll; percpu_rwsem_assert_held(&file_rwsem); /* * Avoid taking lock if already unhashed. This is safe since this check * is done while holding the flc_lock, and new insertions into the list * also require that it be held. */ if (hlist_unhashed(&flc->flc_link)) return; fll = per_cpu_ptr(&file_lock_list, flc->flc_link_cpu); spin_lock(&fll->lock); hlist_del_init(&flc->flc_link); spin_unlock(&fll->lock); } static unsigned long posix_owner_key(struct file_lock_core *flc) { return (unsigned long) flc->flc_owner; } static void locks_insert_global_blocked(struct file_lock_core *waiter) { lockdep_assert_held(&blocked_lock_lock); hash_add(blocked_hash, &waiter->flc_link, posix_owner_key(waiter)); } static void locks_delete_global_blocked(struct file_lock_core *waiter) { lockdep_assert_held(&blocked_lock_lock); hash_del(&waiter->flc_link); } /* Remove waiter from blocker's block list. * When blocker ends up pointing to itself then the list is empty. * * Must be called with blocked_lock_lock held. */ static void __locks_unlink_block(struct file_lock_core *waiter) { locks_delete_global_blocked(waiter); list_del_init(&waiter->flc_blocked_member); } static void __locks_wake_up_blocks(struct file_lock_core *blocker) { while (!list_empty(&blocker->flc_blocked_requests)) { struct file_lock_core *waiter; struct file_lock *fl; waiter = list_first_entry(&blocker->flc_blocked_requests, struct file_lock_core, flc_blocked_member); fl = file_lock(waiter); __locks_unlink_block(waiter); if ((waiter->flc_flags & (FL_POSIX | FL_FLOCK)) && fl->fl_lmops && fl->fl_lmops->lm_notify) fl->fl_lmops->lm_notify(fl); else locks_wake_up(fl); /* * The setting of flc_blocker to NULL marks the "done" * point in deleting a block. Paired with acquire at the top * of locks_delete_block(). */ smp_store_release(&waiter->flc_blocker, NULL); } } static int __locks_delete_block(struct file_lock_core *waiter) { int status = -ENOENT; /* * If fl_blocker is NULL, it won't be set again as this thread "owns" * the lock and is the only one that might try to claim the lock. * * We use acquire/release to manage fl_blocker so that we can * optimize away taking the blocked_lock_lock in many cases. * * The smp_load_acquire guarantees two things: * * 1/ that fl_blocked_requests can be tested locklessly. If something * was recently added to that list it must have been in a locked region * *before* the locked region when fl_blocker was set to NULL. * * 2/ that no other thread is accessing 'waiter', so it is safe to free * it. __locks_wake_up_blocks is careful not to touch waiter after * fl_blocker is released. * * If a lockless check of fl_blocker shows it to be NULL, we know that * no new locks can be inserted into its fl_blocked_requests list, and * can avoid doing anything further if the list is empty. */ if (!smp_load_acquire(&waiter->flc_blocker) && list_empty(&waiter->flc_blocked_requests)) return status; spin_lock(&blocked_lock_lock); if (waiter->flc_blocker) status = 0; __locks_wake_up_blocks(waiter); __locks_unlink_block(waiter); /* * The setting of fl_blocker to NULL marks the "done" point in deleting * a block. Paired with acquire at the top of this function. */ smp_store_release(&waiter->flc_blocker, NULL); spin_unlock(&blocked_lock_lock); return status; } /** * locks_delete_block - stop waiting for a file lock * @waiter: the lock which was waiting * * lockd/nfsd need to disconnect the lock while working on it. */ int locks_delete_block(struct file_lock *waiter) { return __locks_delete_block(&waiter->c); } EXPORT_SYMBOL(locks_delete_block); /* Insert waiter into blocker's block list. * We use a circular list so that processes can be easily woken up in * the order they blocked. The documentation doesn't require this but * it seems like the reasonable thing to do. * * Must be called with both the flc_lock and blocked_lock_lock held. The * fl_blocked_requests list itself is protected by the blocked_lock_lock, * but by ensuring that the flc_lock is also held on insertions we can avoid * taking the blocked_lock_lock in some cases when we see that the * fl_blocked_requests list is empty. * * Rather than just adding to the list, we check for conflicts with any existing * waiters, and add beneath any waiter that blocks the new waiter. * Thus wakeups don't happen until needed. */ static void __locks_insert_block(struct file_lock_core *blocker, struct file_lock_core *waiter, bool conflict(struct file_lock_core *, struct file_lock_core *)) { struct file_lock_core *flc; BUG_ON(!list_empty(&waiter->flc_blocked_member)); new_blocker: list_for_each_entry(flc, &blocker->flc_blocked_requests, flc_blocked_member) if (conflict(flc, waiter)) { blocker = flc; goto new_blocker; } waiter->flc_blocker = blocker; list_add_tail(&waiter->flc_blocked_member, &blocker->flc_blocked_requests); if ((blocker->flc_flags & (FL_POSIX|FL_OFDLCK)) == FL_POSIX) locks_insert_global_blocked(waiter); /* The requests in waiter->flc_blocked are known to conflict with * waiter, but might not conflict with blocker, or the requests * and lock which block it. So they all need to be woken. */ __locks_wake_up_blocks(waiter); } /* Must be called with flc_lock held. */ static void locks_insert_block(struct file_lock_core *blocker, struct file_lock_core *waiter, bool conflict(struct file_lock_core *, struct file_lock_core *)) { spin_lock(&blocked_lock_lock); __locks_insert_block(blocker, waiter, conflict); spin_unlock(&blocked_lock_lock); } /* * Wake up processes blocked waiting for blocker. * * Must be called with the inode->flc_lock held! */ static void locks_wake_up_blocks(struct file_lock_core *blocker) { /* * Avoid taking global lock if list is empty. This is safe since new * blocked requests are only added to the list under the flc_lock, and * the flc_lock is always held here. Note that removal from the * fl_blocked_requests list does not require the flc_lock, so we must * recheck list_empty() after acquiring the blocked_lock_lock. */ if (list_empty(&blocker->flc_blocked_requests)) return; spin_lock(&blocked_lock_lock); __locks_wake_up_blocks(blocker); spin_unlock(&blocked_lock_lock); } static void locks_insert_lock_ctx(struct file_lock_core *fl, struct list_head *before) { list_add_tail(&fl->flc_list, before); locks_insert_global_locks(fl); } static void locks_unlink_lock_ctx(struct file_lock_core *fl) { locks_delete_global_locks(fl); list_del_init(&fl->flc_list); locks_wake_up_blocks(fl); } static void locks_delete_lock_ctx(struct file_lock_core *fl, struct list_head *dispose) { locks_unlink_lock_ctx(fl); if (dispose) list_add(&fl->flc_list, dispose); else locks_free_lock(file_lock(fl)); } /* Determine if lock sys_fl blocks lock caller_fl. Common functionality * checks for shared/exclusive status of overlapping locks. */ static bool locks_conflict(struct file_lock_core *caller_flc, struct file_lock_core *sys_flc) { if (sys_flc->flc_type == F_WRLCK) return true; if (caller_flc->flc_type == F_WRLCK) return true; return false; } /* Determine if lock sys_fl blocks lock caller_fl. POSIX specific * checking before calling the locks_conflict(). */ static bool posix_locks_conflict(struct file_lock_core *caller_flc, struct file_lock_core *sys_flc) { struct file_lock *caller_fl = file_lock(caller_flc); struct file_lock *sys_fl = file_lock(sys_flc); /* POSIX locks owned by the same process do not conflict with * each other. */ if (posix_same_owner(caller_flc, sys_flc)) return false; /* Check whether they overlap */ if (!locks_overlap(caller_fl, sys_fl)) return false; return locks_conflict(caller_flc, sys_flc); } /* Determine if lock sys_fl blocks lock caller_fl. Used on xx_GETLK * path so checks for additional GETLK-specific things like F_UNLCK. */ static bool posix_test_locks_conflict(struct file_lock *caller_fl, struct file_lock *sys_fl) { struct file_lock_core *caller = &caller_fl->c; struct file_lock_core *sys = &sys_fl->c; /* F_UNLCK checks any locks on the same fd. */ if (lock_is_unlock(caller_fl)) { if (!posix_same_owner(caller, sys)) return false; return locks_overlap(caller_fl, sys_fl); } return posix_locks_conflict(caller, sys); } /* Determine if lock sys_fl blocks lock caller_fl. FLOCK specific * checking before calling the locks_conflict(). */ static bool flock_locks_conflict(struct file_lock_core *caller_flc, struct file_lock_core *sys_flc) { /* FLOCK locks referring to the same filp do not conflict with * each other. */ if (caller_flc->flc_file == sys_flc->flc_file) return false; return locks_conflict(caller_flc, sys_flc); } void posix_test_lock(struct file *filp, struct file_lock *fl) { struct file_lock *cfl; struct file_lock_context *ctx; struct inode *inode = file_inode(filp); void *owner; void (*func)(void); ctx = locks_inode_context(inode); if (!ctx || list_empty_careful(&ctx->flc_posix)) { fl->c.flc_type = F_UNLCK; return; } retry: spin_lock(&ctx->flc_lock); list_for_each_entry(cfl, &ctx->flc_posix, c.flc_list) { if (!posix_test_locks_conflict(fl, cfl)) continue; if (cfl->fl_lmops && cfl->fl_lmops->lm_lock_expirable && (*cfl->fl_lmops->lm_lock_expirable)(cfl)) { owner = cfl->fl_lmops->lm_mod_owner; func = cfl->fl_lmops->lm_expire_lock; __module_get(owner); spin_unlock(&ctx->flc_lock); (*func)(); module_put(owner); goto retry; } locks_copy_conflock(fl, cfl); goto out; } fl->c.flc_type = F_UNLCK; out: spin_unlock(&ctx->flc_lock); return; } EXPORT_SYMBOL(posix_test_lock); /* * Deadlock detection: * * We attempt to detect deadlocks that are due purely to posix file * locks. * * We assume that a task can be waiting for at most one lock at a time. * So for any acquired lock, the process holding that lock may be * waiting on at most one other lock. That lock in turns may be held by * someone waiting for at most one other lock. Given a requested lock * caller_fl which is about to wait for a conflicting lock block_fl, we * follow this chain of waiters to ensure we are not about to create a * cycle. * * Since we do this before we ever put a process to sleep on a lock, we * are ensured that there is never a cycle; that is what guarantees that * the while() loop in posix_locks_deadlock() eventually completes. * * Note: the above assumption may not be true when handling lock * requests from a broken NFS client. It may also fail in the presence * of tasks (such as posix threads) sharing the same open file table. * To handle those cases, we just bail out after a few iterations. * * For FL_OFDLCK locks, the owner is the filp, not the files_struct. * Because the owner is not even nominally tied to a thread of * execution, the deadlock detection below can't reasonably work well. Just * skip it for those. * * In principle, we could do a more limited deadlock detection on FL_OFDLCK * locks that just checks for the case where two tasks are attempting to * upgrade from read to write locks on the same inode. */ #define MAX_DEADLK_ITERATIONS 10 /* Find a lock that the owner of the given @blocker is blocking on. */ static struct file_lock_core *what_owner_is_waiting_for(struct file_lock_core *blocker) { struct file_lock_core *flc; hash_for_each_possible(blocked_hash, flc, flc_link, posix_owner_key(blocker)) { if (posix_same_owner(flc, blocker)) { while (flc->flc_blocker) flc = flc->flc_blocker; return flc; } } return NULL; } /* Must be called with the blocked_lock_lock held! */ static bool posix_locks_deadlock(struct file_lock *caller_fl, struct file_lock *block_fl) { struct file_lock_core *caller = &caller_fl->c; struct file_lock_core *blocker = &block_fl->c; int i = 0; lockdep_assert_held(&blocked_lock_lock); /* * This deadlock detector can't reasonably detect deadlocks with * FL_OFDLCK locks, since they aren't owned by a process, per-se. */ if (caller->flc_flags & FL_OFDLCK) return false; while ((blocker = what_owner_is_waiting_for(blocker))) { if (i++ > MAX_DEADLK_ITERATIONS) return false; if (posix_same_owner(caller, blocker)) return true; } return false; } /* Try to create a FLOCK lock on filp. We always insert new FLOCK locks * after any leases, but before any posix locks. * * Note that if called with an FL_EXISTS argument, the caller may determine * whether or not a lock was successfully freed by testing the return * value for -ENOENT. */ static int flock_lock_inode(struct inode *inode, struct file_lock *request) { struct file_lock *new_fl = NULL; struct file_lock *fl; struct file_lock_context *ctx; int error = 0; bool found = false; LIST_HEAD(dispose); ctx = locks_get_lock_context(inode, request->c.flc_type); if (!ctx) { if (request->c.flc_type != F_UNLCK) return -ENOMEM; return (request->c.flc_flags & FL_EXISTS) ? -ENOENT : 0; } if (!(request->c.flc_flags & FL_ACCESS) && (request->c.flc_type != F_UNLCK)) { new_fl = locks_alloc_lock(); if (!new_fl) return -ENOMEM; } percpu_down_read(&file_rwsem); spin_lock(&ctx->flc_lock); if (request->c.flc_flags & FL_ACCESS) goto find_conflict; list_for_each_entry(fl, &ctx->flc_flock, c.flc_list) { if (request->c.flc_file != fl->c.flc_file) continue; if (request->c.flc_type == fl->c.flc_type) goto out; found = true; locks_delete_lock_ctx(&fl->c, &dispose); break; } if (lock_is_unlock(request)) { if ((request->c.flc_flags & FL_EXISTS) && !found) error = -ENOENT; goto out; } find_conflict: list_for_each_entry(fl, &ctx->flc_flock, c.flc_list) { if (!flock_locks_conflict(&request->c, &fl->c)) continue; error = -EAGAIN; if (!(request->c.flc_flags & FL_SLEEP)) goto out; error = FILE_LOCK_DEFERRED; locks_insert_block(&fl->c, &request->c, flock_locks_conflict); goto out; } if (request->c.flc_flags & FL_ACCESS) goto out; locks_copy_lock(new_fl, request); locks_move_blocks(new_fl, request); locks_insert_lock_ctx(&new_fl->c, &ctx->flc_flock); new_fl = NULL; error = 0; out: spin_unlock(&ctx->flc_lock); percpu_up_read(&file_rwsem); if (new_fl) locks_free_lock(new_fl); locks_dispose_list(&dispose); trace_flock_lock_inode(inode, request, error); return error; } static int posix_lock_inode(struct inode *inode, struct file_lock *request, struct file_lock *conflock) { struct file_lock *fl, *tmp; struct file_lock *new_fl = NULL; struct file_lock *new_fl2 = NULL; struct file_lock *left = NULL; struct file_lock *right = NULL; struct file_lock_context *ctx; int error; bool added = false; LIST_HEAD(dispose); void *owner; void (*func)(void); ctx = locks_get_lock_context(inode, request->c.flc_type); if (!ctx) return lock_is_unlock(request) ? 0 : -ENOMEM; /* * We may need two file_lock structures for this operation, * so we get them in advance to avoid races. * * In some cases we can be sure, that no new locks will be needed */ if (!(request->c.flc_flags & FL_ACCESS) && (request->c.flc_type != F_UNLCK || request->fl_start != 0 || request->fl_end != OFFSET_MAX)) { new_fl = locks_alloc_lock(); new_fl2 = locks_alloc_lock(); } retry: percpu_down_read(&file_rwsem); spin_lock(&ctx->flc_lock); /* * New lock request. Walk all POSIX locks and look for conflicts. If * there are any, either return error or put the request on the * blocker's list of waiters and the global blocked_hash. */ if (request->c.flc_type != F_UNLCK) { list_for_each_entry(fl, &ctx->flc_posix, c.flc_list) { if (!posix_locks_conflict(&request->c, &fl->c)) continue; if (fl->fl_lmops && fl->fl_lmops->lm_lock_expirable && (*fl->fl_lmops->lm_lock_expirable)(fl)) { owner = fl->fl_lmops->lm_mod_owner; func = fl->fl_lmops->lm_expire_lock; __module_get(owner); spin_unlock(&ctx->flc_lock); percpu_up_read(&file_rwsem); (*func)(); module_put(owner); goto retry; } if (conflock) locks_copy_conflock(conflock, fl); error = -EAGAIN; if (!(request->c.flc_flags & FL_SLEEP)) goto out; /* * Deadlock detection and insertion into the blocked * locks list must be done while holding the same lock! */ error = -EDEADLK; spin_lock(&blocked_lock_lock); /* * Ensure that we don't find any locks blocked on this * request during deadlock detection. */ __locks_wake_up_blocks(&request->c); if (likely(!posix_locks_deadlock(request, fl))) { error = FILE_LOCK_DEFERRED; __locks_insert_block(&fl->c, &request->c, posix_locks_conflict); } spin_unlock(&blocked_lock_lock); goto out; } } /* If we're just looking for a conflict, we're done. */ error = 0; if (request->c.flc_flags & FL_ACCESS) goto out; /* Find the first old lock with the same owner as the new lock */ list_for_each_entry(fl, &ctx->flc_posix, c.flc_list) { if (posix_same_owner(&request->c, &fl->c)) break; } /* Process locks with this owner. */ list_for_each_entry_safe_from(fl, tmp, &ctx->flc_posix, c.flc_list) { if (!posix_same_owner(&request->c, &fl->c)) break; /* Detect adjacent or overlapping regions (if same lock type) */ if (request->c.flc_type == fl->c.flc_type) { /* In all comparisons of start vs end, use * "start - 1" rather than "end + 1". If end * is OFFSET_MAX, end + 1 will become negative. */ if (fl->fl_end < request->fl_start - 1) continue; /* If the next lock in the list has entirely bigger * addresses than the new one, insert the lock here. */ if (fl->fl_start - 1 > request->fl_end) break; /* If we come here, the new and old lock are of the * same type and adjacent or overlapping. Make one * lock yielding from the lower start address of both * locks to the higher end address. */ if (fl->fl_start > request->fl_start) fl->fl_start = request->fl_start; else request->fl_start = fl->fl_start; if (fl->fl_end < request->fl_end) fl->fl_end = request->fl_end; else request->fl_end = fl->fl_end; if (added) { locks_delete_lock_ctx(&fl->c, &dispose); continue; } request = fl; added = true; } else { /* Processing for different lock types is a bit * more complex. */ if (fl->fl_end < request->fl_start) continue; if (fl->fl_start > request->fl_end) break; if (lock_is_unlock(request)) added = true; if (fl->fl_start < request->fl_start) left = fl; /* If the next lock in the list has a higher end * address than the new one, insert the new one here. */ if (fl->fl_end > request->fl_end) { right = fl; break; } if (fl->fl_start >= request->fl_start) { /* The new lock completely replaces an old * one (This may happen several times). */ if (added) { locks_delete_lock_ctx(&fl->c, &dispose); continue; } /* * Replace the old lock with new_fl, and * remove the old one. It's safe to do the * insert here since we know that we won't be * using new_fl later, and that the lock is * just replacing an existing lock. */ error = -ENOLCK; if (!new_fl) goto out; locks_copy_lock(new_fl, request); locks_move_blocks(new_fl, request); request = new_fl; new_fl = NULL; locks_insert_lock_ctx(&request->c, &fl->c.flc_list); locks_delete_lock_ctx(&fl->c, &dispose); added = true; } } } /* * The above code only modifies existing locks in case of merging or * replacing. If new lock(s) need to be inserted all modifications are * done below this, so it's safe yet to bail out. */ error = -ENOLCK; /* "no luck" */ if (right && left == right && !new_fl2) goto out; error = 0; if (!added) { if (lock_is_unlock(request)) { if (request->c.flc_flags & FL_EXISTS) error = -ENOENT; goto out; } if (!new_fl) { error = -ENOLCK; goto out; } locks_copy_lock(new_fl, request); locks_move_blocks(new_fl, request); locks_insert_lock_ctx(&new_fl->c, &fl->c.flc_list); fl = new_fl; new_fl = NULL; } if (right) { if (left == right) { /* The new lock breaks the old one in two pieces, * so we have to use the second new lock. */ left = new_fl2; new_fl2 = NULL; locks_copy_lock(left, right); locks_insert_lock_ctx(&left->c, &fl->c.flc_list); } right->fl_start = request->fl_end + 1; locks_wake_up_blocks(&right->c); } if (left) { left->fl_end = request->fl_start - 1; locks_wake_up_blocks(&left->c); } out: trace_posix_lock_inode(inode, request, error); spin_unlock(&ctx->flc_lock); percpu_up_read(&file_rwsem); /* * Free any unused locks. */ if (new_fl) locks_free_lock(new_fl); if (new_fl2) locks_free_lock(new_fl2); locks_dispose_list(&dispose); return error; } /** * posix_lock_file - Apply a POSIX-style lock to a file * @filp: The file to apply the lock to * @fl: The lock to be applied * @conflock: Place to return a copy of the conflicting lock, if found. * * Add a POSIX style lock to a file. * We merge adjacent & overlapping locks whenever possible. * POSIX locks are sorted by owner task, then by starting address * * Note that if called with an FL_EXISTS argument, the caller may determine * whether or not a lock was successfully freed by testing the return * value for -ENOENT. */ int posix_lock_file(struct file *filp, struct file_lock *fl, struct file_lock *conflock) { return posix_lock_inode(file_inode(filp), fl, conflock); } EXPORT_SYMBOL(posix_lock_file); /** * posix_lock_inode_wait - Apply a POSIX-style lock to a file * @inode: inode of file to which lock request should be applied * @fl: The lock to be applied * * Apply a POSIX style lock request to an inode. */ static int posix_lock_inode_wait(struct inode *inode, struct file_lock *fl) { int error; might_sleep (); for (;;) { error = posix_lock_inode(inode, fl, NULL); if (error != FILE_LOCK_DEFERRED) break; error = wait_event_interruptible(fl->c.flc_wait, list_empty(&fl->c.flc_blocked_member)); if (error) break; } locks_delete_block(fl); return error; } static void lease_clear_pending(struct file_lease *fl, int arg) { switch (arg) { case F_UNLCK: fl->c.flc_flags &= ~FL_UNLOCK_PENDING; fallthrough; case F_RDLCK: fl->c.flc_flags &= ~FL_DOWNGRADE_PENDING; } } /* We already had a lease on this file; just change its type */ int lease_modify(struct file_lease *fl, int arg, struct list_head *dispose) { int error = assign_type(&fl->c, arg); if (error) return error; lease_clear_pending(fl, arg); locks_wake_up_blocks(&fl->c); if (arg == F_UNLCK) { struct file *filp = fl->c.flc_file; f_delown(filp); file_f_owner(filp)->signum = 0; fasync_helper(0, fl->c.flc_file, 0, &fl->fl_fasync); if (fl->fl_fasync != NULL) { printk(KERN_ERR "locks_delete_lock: fasync == %p\n", fl->fl_fasync); fl->fl_fasync = NULL; } locks_delete_lock_ctx(&fl->c, dispose); } return 0; } EXPORT_SYMBOL(lease_modify); static bool past_time(unsigned long then) { if (!then) /* 0 is a special value meaning "this never expires": */ return false; return time_after(jiffies, then); } static void time_out_leases(struct inode *inode, struct list_head *dispose) { struct file_lock_context *ctx = inode->i_flctx; struct file_lease *fl, *tmp; lockdep_assert_held(&ctx->flc_lock); list_for_each_entry_safe(fl, tmp, &ctx->flc_lease, c.flc_list) { trace_time_out_leases(inode, fl); if (past_time(fl->fl_downgrade_time)) lease_modify(fl, F_RDLCK, dispose); if (past_time(fl->fl_break_time)) lease_modify(fl, F_UNLCK, dispose); } } static bool leases_conflict(struct file_lock_core *lc, struct file_lock_core *bc) { bool rc; struct file_lease *lease = file_lease(lc); struct file_lease *breaker = file_lease(bc); if (lease->fl_lmops->lm_breaker_owns_lease && lease->fl_lmops->lm_breaker_owns_lease(lease)) return false; if ((bc->flc_flags & FL_LAYOUT) != (lc->flc_flags & FL_LAYOUT)) { rc = false; goto trace; } if ((bc->flc_flags & FL_DELEG) && (lc->flc_flags & FL_LEASE)) { rc = false; goto trace; } rc = locks_conflict(bc, lc); trace: trace_leases_conflict(rc, lease, breaker); return rc; } static bool any_leases_conflict(struct inode *inode, struct file_lease *breaker) { struct file_lock_context *ctx = inode->i_flctx; struct file_lock_core *flc; lockdep_assert_held(&ctx->flc_lock); list_for_each_entry(flc, &ctx->flc_lease, flc_list) { if (leases_conflict(flc, &breaker->c)) return true; } return false; } /** * __break_lease - revoke all outstanding leases on file * @inode: the inode of the file to return * @mode: O_RDONLY: break only write leases; O_WRONLY or O_RDWR: * break all leases * @type: FL_LEASE: break leases and delegations; FL_DELEG: break * only delegations * * break_lease (inlined for speed) has checked there already is at least * some kind of lock (maybe a lease) on this file. Leases are broken on * a call to open() or truncate(). This function can sleep unless you * specified %O_NONBLOCK to your open(). */ int __break_lease(struct inode *inode, unsigned int mode, unsigned int type) { int error = 0; struct file_lock_context *ctx; struct file_lease *new_fl, *fl, *tmp; unsigned long break_time; int want_write = (mode & O_ACCMODE) != O_RDONLY; LIST_HEAD(dispose); new_fl = lease_alloc(NULL, want_write ? F_WRLCK : F_RDLCK); if (IS_ERR(new_fl)) return PTR_ERR(new_fl); new_fl->c.flc_flags = type; /* typically we will check that ctx is non-NULL before calling */ ctx = locks_inode_context(inode); if (!ctx) { WARN_ON_ONCE(1); goto free_lock; } percpu_down_read(&file_rwsem); spin_lock(&ctx->flc_lock); time_out_leases(inode, &dispose); if (!any_leases_conflict(inode, new_fl)) goto out; break_time = 0; if (lease_break_time > 0) { break_time = jiffies + lease_break_time * HZ; if (break_time == 0) break_time++; /* so that 0 means no break time */ } list_for_each_entry_safe(fl, tmp, &ctx->flc_lease, c.flc_list) { if (!leases_conflict(&fl->c, &new_fl->c)) continue; if (want_write) { if (fl->c.flc_flags & FL_UNLOCK_PENDING) continue; fl->c.flc_flags |= FL_UNLOCK_PENDING; fl->fl_break_time = break_time; } else { if (lease_breaking(fl)) continue; fl->c.flc_flags |= FL_DOWNGRADE_PENDING; fl->fl_downgrade_time = break_time; } if (fl->fl_lmops->lm_break(fl)) locks_delete_lock_ctx(&fl->c, &dispose); } if (list_empty(&ctx->flc_lease)) goto out; if (mode & O_NONBLOCK) { trace_break_lease_noblock(inode, new_fl); error = -EWOULDBLOCK; goto out; } restart: fl = list_first_entry(&ctx->flc_lease, struct file_lease, c.flc_list); break_time = fl->fl_break_time; if (break_time != 0) break_time -= jiffies; if (break_time == 0) break_time++; locks_insert_block(&fl->c, &new_fl->c, leases_conflict); trace_break_lease_block(inode, new_fl); spin_unlock(&ctx->flc_lock); percpu_up_read(&file_rwsem); locks_dispose_list(&dispose); error = wait_event_interruptible_timeout(new_fl->c.flc_wait, list_empty(&new_fl->c.flc_blocked_member), break_time); percpu_down_read(&file_rwsem); spin_lock(&ctx->flc_lock); trace_break_lease_unblock(inode, new_fl); __locks_delete_block(&new_fl->c); if (error >= 0) { /* * Wait for the next conflicting lease that has not been * broken yet */ if (error == 0) time_out_leases(inode, &dispose); if (any_leases_conflict(inode, new_fl)) goto restart; error = 0; } out: spin_unlock(&ctx->flc_lock); percpu_up_read(&file_rwsem); locks_dispose_list(&dispose); free_lock: locks_free_lease(new_fl); return error; } EXPORT_SYMBOL(__break_lease); /** * lease_get_mtime - update modified time of an inode with exclusive lease * @inode: the inode * @time: pointer to a timespec which contains the last modified time * * This is to force NFS clients to flush their caches for files with * exclusive leases. The justification is that if someone has an * exclusive lease, then they could be modifying it. */ void lease_get_mtime(struct inode *inode, struct timespec64 *time) { bool has_lease = false; struct file_lock_context *ctx; struct file_lock_core *flc; ctx = locks_inode_context(inode); if (ctx && !list_empty_careful(&ctx->flc_lease)) { spin_lock(&ctx->flc_lock); flc = list_first_entry_or_null(&ctx->flc_lease, struct file_lock_core, flc_list); if (flc && flc->flc_type == F_WRLCK) has_lease = true; spin_unlock(&ctx->flc_lock); } if (has_lease) *time = current_time(inode); } EXPORT_SYMBOL(lease_get_mtime); /** * fcntl_getlease - Enquire what lease is currently active * @filp: the file * * The value returned by this function will be one of * (if no lease break is pending): * * %F_RDLCK to indicate a shared lease is held. * * %F_WRLCK to indicate an exclusive lease is held. * * %F_UNLCK to indicate no lease is held. * * (if a lease break is pending): * * %F_RDLCK to indicate an exclusive lease needs to be * changed to a shared lease (or removed). * * %F_UNLCK to indicate the lease needs to be removed. * * XXX: sfr & willy disagree over whether F_INPROGRESS * should be returned to userspace. */ int fcntl_getlease(struct file *filp) { struct file_lease *fl; struct inode *inode = file_inode(filp); struct file_lock_context *ctx; int type = F_UNLCK; LIST_HEAD(dispose); ctx = locks_inode_context(inode); if (ctx && !list_empty_careful(&ctx->flc_lease)) { percpu_down_read(&file_rwsem); spin_lock(&ctx->flc_lock); time_out_leases(inode, &dispose); list_for_each_entry(fl, &ctx->flc_lease, c.flc_list) { if (fl->c.flc_file != filp) continue; type = target_leasetype(fl); break; } spin_unlock(&ctx->flc_lock); percpu_up_read(&file_rwsem); locks_dispose_list(&dispose); } return type; } /** * check_conflicting_open - see if the given file points to an inode that has * an existing open that would conflict with the * desired lease. * @filp: file to check * @arg: type of lease that we're trying to acquire * @flags: current lock flags * * Check to see if there's an existing open fd on this file that would * conflict with the lease we're trying to set. */ static int check_conflicting_open(struct file *filp, const int arg, int flags) { struct inode *inode = file_inode(filp); int self_wcount = 0, self_rcount = 0; if (flags & FL_LAYOUT) return 0; if (flags & FL_DELEG) /* We leave these checks to the caller */ return 0; if (arg == F_RDLCK) return inode_is_open_for_write(inode) ? -EAGAIN : 0; else if (arg != F_WRLCK) return 0; /* * Make sure that only read/write count is from lease requestor. * Note that this will result in denying write leases when i_writecount * is negative, which is what we want. (We shouldn't grant write leases * on files open for execution.) */ if (filp->f_mode & FMODE_WRITE) self_wcount = 1; else if (filp->f_mode & FMODE_READ) self_rcount = 1; if (atomic_read(&inode->i_writecount) != self_wcount || atomic_read(&inode->i_readcount) != self_rcount) return -EAGAIN; return 0; } static int generic_add_lease(struct file *filp, int arg, struct file_lease **flp, void **priv) { struct file_lease *fl, *my_fl = NULL, *lease; struct inode *inode = file_inode(filp); struct file_lock_context *ctx; bool is_deleg = (*flp)->c.flc_flags & FL_DELEG; int error; LIST_HEAD(dispose); lease = *flp; trace_generic_add_lease(inode, lease); error = file_f_owner_allocate(filp); if (error) return error; /* Note that arg is never F_UNLCK here */ ctx = locks_get_lock_context(inode, arg); if (!ctx) return -ENOMEM; /* * In the delegation case we need mutual exclusion with * a number of operations that take the i_mutex. We trylock * because delegations are an optional optimization, and if * there's some chance of a conflict--we'd rather not * bother, maybe that's a sign this just isn't a good file to * hand out a delegation on. */ if (is_deleg && !inode_trylock(inode)) return -EAGAIN; percpu_down_read(&file_rwsem); spin_lock(&ctx->flc_lock); time_out_leases(inode, &dispose); error = check_conflicting_open(filp, arg, lease->c.flc_flags); if (error) goto out; /* * At this point, we know that if there is an exclusive * lease on this file, then we hold it on this filp * (otherwise our open of this file would have blocked). * And if we are trying to acquire an exclusive lease, * then the file is not open by anyone (including us) * except for this filp. */ error = -EAGAIN; list_for_each_entry(fl, &ctx->flc_lease, c.flc_list) { if (fl->c.flc_file == filp && fl->c.flc_owner == lease->c.flc_owner) { my_fl = fl; continue; } /* * No exclusive leases if someone else has a lease on * this file: */ if (arg == F_WRLCK) goto out; /* * Modifying our existing lease is OK, but no getting a * new lease if someone else is opening for write: */ if (fl->c.flc_flags & FL_UNLOCK_PENDING) goto out; } if (my_fl != NULL) { lease = my_fl; error = lease->fl_lmops->lm_change(lease, arg, &dispose); if (error) goto out; goto out_setup; } error = -EINVAL; if (!leases_enable) goto out; locks_insert_lock_ctx(&lease->c, &ctx->flc_lease); /* * The check in break_lease() is lockless. It's possible for another * open to race in after we did the earlier check for a conflicting * open but before the lease was inserted. Check again for a * conflicting open and cancel the lease if there is one. * * We also add a barrier here to ensure that the insertion of the lock * precedes these checks. */ smp_mb(); error = check_conflicting_open(filp, arg, lease->c.flc_flags); if (error) { locks_unlink_lock_ctx(&lease->c); goto out; } out_setup: if (lease->fl_lmops->lm_setup) lease->fl_lmops->lm_setup(lease, priv); out: spin_unlock(&ctx->flc_lock); percpu_up_read(&file_rwsem); locks_dispose_list(&dispose); if (is_deleg) inode_unlock(inode); if (!error && !my_fl) *flp = NULL; return error; } static int generic_delete_lease(struct file *filp, void *owner) { int error = -EAGAIN; struct file_lease *fl, *victim = NULL; struct inode *inode = file_inode(filp); struct file_lock_context *ctx; LIST_HEAD(dispose); ctx = locks_inode_context(inode); if (!ctx) { trace_generic_delete_lease(inode, NULL); return error; } percpu_down_read(&file_rwsem); spin_lock(&ctx->flc_lock); list_for_each_entry(fl, &ctx->flc_lease, c.flc_list) { if (fl->c.flc_file == filp && fl->c.flc_owner == owner) { victim = fl; break; } } trace_generic_delete_lease(inode, victim); if (victim) error = fl->fl_lmops->lm_change(victim, F_UNLCK, &dispose); spin_unlock(&ctx->flc_lock); percpu_up_read(&file_rwsem); locks_dispose_list(&dispose); return error; } /** * generic_setlease - sets a lease on an open file * @filp: file pointer * @arg: type of lease to obtain * @flp: input - file_lock to use, output - file_lock inserted * @priv: private data for lm_setup (may be NULL if lm_setup * doesn't require it) * * The (input) flp->fl_lmops->lm_break function is required * by break_lease(). */ int generic_setlease(struct file *filp, int arg, struct file_lease **flp, void **priv) { switch (arg) { case F_UNLCK: return generic_delete_lease(filp, *priv); case F_RDLCK: case F_WRLCK: if (!(*flp)->fl_lmops->lm_break) { WARN_ON_ONCE(1); return -ENOLCK; } return generic_add_lease(filp, arg, flp, priv); default: return -EINVAL; } } EXPORT_SYMBOL(generic_setlease); /* * Kernel subsystems can register to be notified on any attempt to set * a new lease with the lease_notifier_chain. This is used by (e.g.) nfsd * to close files that it may have cached when there is an attempt to set a * conflicting lease. */ static struct srcu_notifier_head lease_notifier_chain; static inline void lease_notifier_chain_init(void) { srcu_init_notifier_head(&lease_notifier_chain); } static inline void setlease_notifier(int arg, struct file_lease *lease) { if (arg != F_UNLCK) srcu_notifier_call_chain(&lease_notifier_chain, arg, lease); } int lease_register_notifier(struct notifier_block *nb) { return srcu_notifier_chain_register(&lease_notifier_chain, nb); } EXPORT_SYMBOL_GPL(lease_register_notifier); void lease_unregister_notifier(struct notifier_block *nb) { srcu_notifier_chain_unregister(&lease_notifier_chain, nb); } EXPORT_SYMBOL_GPL(lease_unregister_notifier); int kernel_setlease(struct file *filp, int arg, struct file_lease **lease, void **priv) { if (lease) setlease_notifier(arg, *lease); if (filp->f_op->setlease) return filp->f_op->setlease(filp, arg, lease, priv); else return generic_setlease(filp, arg, lease, priv); } EXPORT_SYMBOL_GPL(kernel_setlease); /** * vfs_setlease - sets a lease on an open file * @filp: file pointer * @arg: type of lease to obtain * @lease: file_lock to use when adding a lease * @priv: private info for lm_setup when adding a lease (may be * NULL if lm_setup doesn't require it) * * Call this to establish a lease on the file. The "lease" argument is not * used for F_UNLCK requests and may be NULL. For commands that set or alter * an existing lease, the ``(*lease)->fl_lmops->lm_break`` operation must be * set; if not, this function will return -ENOLCK (and generate a scary-looking * stack trace). * * The "priv" pointer is passed directly to the lm_setup function as-is. It * may be NULL if the lm_setup operation doesn't require it. */ int vfs_setlease(struct file *filp, int arg, struct file_lease **lease, void **priv) { struct inode *inode = file_inode(filp); vfsuid_t vfsuid = i_uid_into_vfsuid(file_mnt_idmap(filp), inode); int error; if ((!vfsuid_eq_kuid(vfsuid, current_fsuid())) && !capable(CAP_LEASE)) return -EACCES; if (!S_ISREG(inode->i_mode)) return -EINVAL; error = security_file_lock(filp, arg); if (error) return error; return kernel_setlease(filp, arg, lease, priv); } EXPORT_SYMBOL_GPL(vfs_setlease); static int do_fcntl_add_lease(unsigned int fd, struct file *filp, int arg) { struct file_lease *fl; struct fasync_struct *new; int error; fl = lease_alloc(filp, arg); if (IS_ERR(fl)) return PTR_ERR(fl); new = fasync_alloc(); if (!new) { locks_free_lease(fl); return -ENOMEM; } new->fa_fd = fd; error = vfs_setlease(filp, arg, &fl, (void **)&new); if (fl) locks_free_lease(fl); if (new) fasync_free(new); return error; } /** * fcntl_setlease - sets a lease on an open file * @fd: open file descriptor * @filp: file pointer * @arg: type of lease to obtain * * Call this fcntl to establish a lease on the file. * Note that you also need to call %F_SETSIG to * receive a signal when the lease is broken. */ int fcntl_setlease(unsigned int fd, struct file *filp, int arg) { if (arg == F_UNLCK) return vfs_setlease(filp, F_UNLCK, NULL, (void **)&filp); return do_fcntl_add_lease(fd, filp, arg); } /** * flock_lock_inode_wait - Apply a FLOCK-style lock to a file * @inode: inode of the file to apply to * @fl: The lock to be applied * * Apply a FLOCK style lock request to an inode. */ static int flock_lock_inode_wait(struct inode *inode, struct file_lock *fl) { int error; might_sleep(); for (;;) { error = flock_lock_inode(inode, fl); if (error != FILE_LOCK_DEFERRED) break; error = wait_event_interruptible(fl->c.flc_wait, list_empty(&fl->c.flc_blocked_member)); if (error) break; } locks_delete_block(fl); return error; } /** * locks_lock_inode_wait - Apply a lock to an inode * @inode: inode of the file to apply to * @fl: The lock to be applied * * Apply a POSIX or FLOCK style lock request to an inode. */ int locks_lock_inode_wait(struct inode *inode, struct file_lock *fl) { int res = 0; switch (fl->c.flc_flags & (FL_POSIX|FL_FLOCK)) { case FL_POSIX: res = posix_lock_inode_wait(inode, fl); break; case FL_FLOCK: res = flock_lock_inode_wait(inode, fl); break; default: BUG(); } return res; } EXPORT_SYMBOL(locks_lock_inode_wait); /** * sys_flock: - flock() system call. * @fd: the file descriptor to lock. * @cmd: the type of lock to apply. * * Apply a %FL_FLOCK style lock to an open file descriptor. * The @cmd can be one of: * * - %LOCK_SH -- a shared lock. * - %LOCK_EX -- an exclusive lock. * - %LOCK_UN -- remove an existing lock. * - %LOCK_MAND -- a 'mandatory' flock. (DEPRECATED) * * %LOCK_MAND support has been removed from the kernel. */ SYSCALL_DEFINE2(flock, unsigned int, fd, unsigned int, cmd) { int can_sleep, error, type; struct file_lock fl; /* * LOCK_MAND locks were broken for a long time in that they never * conflicted with one another and didn't prevent any sort of open, * read or write activity. * * Just ignore these requests now, to preserve legacy behavior, but * throw a warning to let people know that they don't actually work. */ if (cmd & LOCK_MAND) { pr_warn_once("%s(%d): Attempt to set a LOCK_MAND lock via flock(2). This support has been removed and the request ignored.\n", current->comm, current->pid); return 0; } type = flock_translate_cmd(cmd & ~LOCK_NB); if (type < 0) return type; CLASS(fd, f)(fd); if (fd_empty(f)) return -EBADF; if (type != F_UNLCK && !(fd_file(f)->f_mode & (FMODE_READ | FMODE_WRITE))) return -EBADF; flock_make_lock(fd_file(f), &fl, type); error = security_file_lock(fd_file(f), fl.c.flc_type); if (error) return error; can_sleep = !(cmd & LOCK_NB); if (can_sleep) fl.c.flc_flags |= FL_SLEEP; if (fd_file(f)->f_op->flock) error = fd_file(f)->f_op->flock(fd_file(f), (can_sleep) ? F_SETLKW : F_SETLK, &fl); else error = locks_lock_file_wait(fd_file(f), &fl); locks_release_private(&fl); return error; } /** * vfs_test_lock - test file byte range lock * @filp: The file to test lock for * @fl: The lock to test; also used to hold result * * Returns -ERRNO on failure. Indicates presence of conflicting lock by * setting conf->fl_type to something other than F_UNLCK. */ int vfs_test_lock(struct file *filp, struct file_lock *fl) { WARN_ON_ONCE(filp != fl->c.flc_file); if (filp->f_op->lock) return filp->f_op->lock(filp, F_GETLK, fl); posix_test_lock(filp, fl); return 0; } EXPORT_SYMBOL_GPL(vfs_test_lock); /** * locks_translate_pid - translate a file_lock's fl_pid number into a namespace * @fl: The file_lock who's fl_pid should be translated * @ns: The namespace into which the pid should be translated * * Used to translate a fl_pid into a namespace virtual pid number */ static pid_t locks_translate_pid(struct file_lock_core *fl, struct pid_namespace *ns) { pid_t vnr; struct pid *pid; if (fl->flc_flags & FL_OFDLCK) return -1; /* Remote locks report a negative pid value */ if (fl->flc_pid <= 0) return fl->flc_pid; /* * If the flock owner process is dead and its pid has been already * freed, the translation below won't work, but we still want to show * flock owner pid number in init pidns. */ if (ns == &init_pid_ns) return (pid_t) fl->flc_pid; rcu_read_lock(); pid = find_pid_ns(fl->flc_pid, &init_pid_ns); vnr = pid_nr_ns(pid, ns); rcu_read_unlock(); return vnr; } static int posix_lock_to_flock(struct flock *flock, struct file_lock *fl) { flock->l_pid = locks_translate_pid(&fl->c, task_active_pid_ns(current)); #if BITS_PER_LONG == 32 /* * Make sure we can represent the posix lock via * legacy 32bit flock. */ if (fl->fl_start > OFFT_OFFSET_MAX) return -EOVERFLOW; if (fl->fl_end != OFFSET_MAX && fl->fl_end > OFFT_OFFSET_MAX) return -EOVERFLOW; #endif flock->l_start = fl->fl_start; flock->l_len = fl->fl_end == OFFSET_MAX ? 0 : fl->fl_end - fl->fl_start + 1; flock->l_whence = 0; flock->l_type = fl->c.flc_type; return 0; } #if BITS_PER_LONG == 32 static void posix_lock_to_flock64(struct flock64 *flock, struct file_lock *fl) { flock->l_pid = locks_translate_pid(&fl->c, task_active_pid_ns(current)); flock->l_start = fl->fl_start; flock->l_len = fl->fl_end == OFFSET_MAX ? 0 : fl->fl_end - fl->fl_start + 1; flock->l_whence = 0; flock->l_type = fl->c.flc_type; } #endif /* Report the first existing lock that would conflict with l. * This implements the F_GETLK command of fcntl(). */ int fcntl_getlk(struct file *filp, unsigned int cmd, struct flock *flock) { struct file_lock *fl; int error; fl = locks_alloc_lock(); if (fl == NULL) return -ENOMEM; error = -EINVAL; if (cmd != F_OFD_GETLK && flock->l_type != F_RDLCK && flock->l_type != F_WRLCK) goto out; error = flock_to_posix_lock(filp, fl, flock); if (error) goto out; if (cmd == F_OFD_GETLK) { error = -EINVAL; if (flock->l_pid != 0) goto out; fl->c.flc_flags |= FL_OFDLCK; fl->c.flc_owner = filp; } error = vfs_test_lock(filp, fl); if (error) goto out; flock->l_type = fl->c.flc_type; if (fl->c.flc_type != F_UNLCK) { error = posix_lock_to_flock(flock, fl); if (error) goto out; } out: locks_free_lock(fl); return error; } /** * vfs_lock_file - file byte range lock * @filp: The file to apply the lock to * @cmd: type of locking operation (F_SETLK, F_GETLK, etc.) * @fl: The lock to be applied * @conf: Place to return a copy of the conflicting lock, if found. * * A caller that doesn't care about the conflicting lock may pass NULL * as the final argument. * * If the filesystem defines a private ->lock() method, then @conf will * be left unchanged; so a caller that cares should initialize it to * some acceptable default. * * To avoid blocking kernel daemons, such as lockd, that need to acquire POSIX * locks, the ->lock() interface may return asynchronously, before the lock has * been granted or denied by the underlying filesystem, if (and only if) * lm_grant is set. Additionally EXPORT_OP_ASYNC_LOCK in export_operations * flags need to be set. * * Callers expecting ->lock() to return asynchronously will only use F_SETLK, * not F_SETLKW; they will set FL_SLEEP if (and only if) the request is for a * blocking lock. When ->lock() does return asynchronously, it must return * FILE_LOCK_DEFERRED, and call ->lm_grant() when the lock request completes. * If the request is for non-blocking lock the file system should return * FILE_LOCK_DEFERRED then try to get the lock and call the callback routine * with the result. If the request timed out the callback routine will return a * nonzero return code and the file system should release the lock. The file * system is also responsible to keep a corresponding posix lock when it * grants a lock so the VFS can find out which locks are locally held and do * the correct lock cleanup when required. * The underlying filesystem must not drop the kernel lock or call * ->lm_grant() before returning to the caller with a FILE_LOCK_DEFERRED * return code. */ int vfs_lock_file(struct file *filp, unsigned int cmd, struct file_lock *fl, struct file_lock *conf) { WARN_ON_ONCE(filp != fl->c.flc_file); if (filp->f_op->lock) return filp->f_op->lock(filp, cmd, fl); else return posix_lock_file(filp, fl, conf); } EXPORT_SYMBOL_GPL(vfs_lock_file); static int do_lock_file_wait(struct file *filp, unsigned int cmd, struct file_lock *fl) { int error; error = security_file_lock(filp, fl->c.flc_type); if (error) return error; for (;;) { error = vfs_lock_file(filp, cmd, fl, NULL); if (error != FILE_LOCK_DEFERRED) break; error = wait_event_interruptible(fl->c.flc_wait, list_empty(&fl->c.flc_blocked_member)); if (error) break; } locks_delete_block(fl); return error; } /* Ensure that fl->fl_file has compatible f_mode for F_SETLK calls */ static int check_fmode_for_setlk(struct file_lock *fl) { switch (fl->c.flc_type) { case F_RDLCK: if (!(fl->c.flc_file->f_mode & FMODE_READ)) return -EBADF; break; case F_WRLCK: if (!(fl->c.flc_file->f_mode & FMODE_WRITE)) return -EBADF; } return 0; } /* Apply the lock described by l to an open file descriptor. * This implements both the F_SETLK and F_SETLKW commands of fcntl(). */ int fcntl_setlk(unsigned int fd, struct file *filp, unsigned int cmd, struct flock *flock) { struct file_lock *file_lock = locks_alloc_lock(); struct inode *inode = file_inode(filp); struct file *f; int error; if (file_lock == NULL) return -ENOLCK; error = flock_to_posix_lock(filp, file_lock, flock); if (error) goto out; error = check_fmode_for_setlk(file_lock); if (error) goto out; /* * If the cmd is requesting file-private locks, then set the * FL_OFDLCK flag and override the owner. */ switch (cmd) { case F_OFD_SETLK: error = -EINVAL; if (flock->l_pid != 0) goto out; cmd = F_SETLK; file_lock->c.flc_flags |= FL_OFDLCK; file_lock->c.flc_owner = filp; break; case F_OFD_SETLKW: error = -EINVAL; if (flock->l_pid != 0) goto out; cmd = F_SETLKW; file_lock->c.flc_flags |= FL_OFDLCK; file_lock->c.flc_owner = filp; fallthrough; case F_SETLKW: file_lock->c.flc_flags |= FL_SLEEP; } error = do_lock_file_wait(filp, cmd, file_lock); /* * Detect close/fcntl races and recover by zapping all POSIX locks * associated with this file and our files_struct, just like on * filp_flush(). There is no need to do that when we're * unlocking though, or for OFD locks. */ if (!error && file_lock->c.flc_type != F_UNLCK && !(file_lock->c.flc_flags & FL_OFDLCK)) { struct files_struct *files = current->files; /* * We need that spin_lock here - it prevents reordering between * update of i_flctx->flc_posix and check for it done in * close(). rcu_read_lock() wouldn't do. */ spin_lock(&files->file_lock); f = files_lookup_fd_locked(files, fd); spin_unlock(&files->file_lock); if (f != filp) { locks_remove_posix(filp, files); error = -EBADF; } } out: trace_fcntl_setlk(inode, file_lock, error); locks_free_lock(file_lock); return error; } #if BITS_PER_LONG == 32 /* Report the first existing lock that would conflict with l. * This implements the F_GETLK command of fcntl(). */ int fcntl_getlk64(struct file *filp, unsigned int cmd, struct flock64 *flock) { struct file_lock *fl; int error; fl = locks_alloc_lock(); if (fl == NULL) return -ENOMEM; error = -EINVAL; if (cmd != F_OFD_GETLK && flock->l_type != F_RDLCK && flock->l_type != F_WRLCK) goto out; error = flock64_to_posix_lock(filp, fl, flock); if (error) goto out; if (cmd == F_OFD_GETLK) { error = -EINVAL; if (flock->l_pid != 0) goto out; fl->c.flc_flags |= FL_OFDLCK; fl->c.flc_owner = filp; } error = vfs_test_lock(filp, fl); if (error) goto out; flock->l_type = fl->c.flc_type; if (fl->c.flc_type != F_UNLCK) posix_lock_to_flock64(flock, fl); out: locks_free_lock(fl); return error; } /* Apply the lock described by l to an open file descriptor. * This implements both the F_SETLK and F_SETLKW commands of fcntl(). */ int fcntl_setlk64(unsigned int fd, struct file *filp, unsigned int cmd, struct flock64 *flock) { struct file_lock *file_lock = locks_alloc_lock(); struct file *f; int error; if (file_lock == NULL) return -ENOLCK; error = flock64_to_posix_lock(filp, file_lock, flock); if (error) goto out; error = check_fmode_for_setlk(file_lock); if (error) goto out; /* * If the cmd is requesting file-private locks, then set the * FL_OFDLCK flag and override the owner. */ switch (cmd) { case F_OFD_SETLK: error = -EINVAL; if (flock->l_pid != 0) goto out; cmd = F_SETLK64; file_lock->c.flc_flags |= FL_OFDLCK; file_lock->c.flc_owner = filp; break; case F_OFD_SETLKW: error = -EINVAL; if (flock->l_pid != 0) goto out; cmd = F_SETLKW64; file_lock->c.flc_flags |= FL_OFDLCK; file_lock->c.flc_owner = filp; fallthrough; case F_SETLKW64: file_lock->c.flc_flags |= FL_SLEEP; } error = do_lock_file_wait(filp, cmd, file_lock); /* * Detect close/fcntl races and recover by zapping all POSIX locks * associated with this file and our files_struct, just like on * filp_flush(). There is no need to do that when we're * unlocking though, or for OFD locks. */ if (!error && file_lock->c.flc_type != F_UNLCK && !(file_lock->c.flc_flags & FL_OFDLCK)) { struct files_struct *files = current->files; /* * We need that spin_lock here - it prevents reordering between * update of i_flctx->flc_posix and check for it done in * close(). rcu_read_lock() wouldn't do. */ spin_lock(&files->file_lock); f = files_lookup_fd_locked(files, fd); spin_unlock(&files->file_lock); if (f != filp) { locks_remove_posix(filp, files); error = -EBADF; } } out: locks_free_lock(file_lock); return error; } #endif /* BITS_PER_LONG == 32 */ /* * This function is called when the file is being removed * from the task's fd array. POSIX locks belonging to this task * are deleted at this time. */ void locks_remove_posix(struct file *filp, fl_owner_t owner) { int error; struct inode *inode = file_inode(filp); struct file_lock lock; struct file_lock_context *ctx; /* * If there are no locks held on this file, we don't need to call * posix_lock_file(). Another process could be setting a lock on this * file at the same time, but we wouldn't remove that lock anyway. */ ctx = locks_inode_context(inode); if (!ctx || list_empty(&ctx->flc_posix)) return; locks_init_lock(&lock); lock.c.flc_type = F_UNLCK; lock.c.flc_flags = FL_POSIX | FL_CLOSE; lock.fl_start = 0; lock.fl_end = OFFSET_MAX; lock.c.flc_owner = owner; lock.c.flc_pid = current->tgid; lock.c.flc_file = filp; lock.fl_ops = NULL; lock.fl_lmops = NULL; error = vfs_lock_file(filp, F_SETLK, &lock, NULL); if (lock.fl_ops && lock.fl_ops->fl_release_private) lock.fl_ops->fl_release_private(&lock); trace_locks_remove_posix(inode, &lock, error); } EXPORT_SYMBOL(locks_remove_posix); /* The i_flctx must be valid when calling into here */ static void locks_remove_flock(struct file *filp, struct file_lock_context *flctx) { struct file_lock fl; struct inode *inode = file_inode(filp); if (list_empty(&flctx->flc_flock)) return; flock_make_lock(filp, &fl, F_UNLCK); fl.c.flc_flags |= FL_CLOSE; if (filp->f_op->flock) filp->f_op->flock(filp, F_SETLKW, &fl); else flock_lock_inode(inode, &fl); if (fl.fl_ops && fl.fl_ops->fl_release_private) fl.fl_ops->fl_release_private(&fl); } /* The i_flctx must be valid when calling into here */ static void locks_remove_lease(struct file *filp, struct file_lock_context *ctx) { struct file_lease *fl, *tmp; LIST_HEAD(dispose); if (list_empty(&ctx->flc_lease)) return; percpu_down_read(&file_rwsem); spin_lock(&ctx->flc_lock); list_for_each_entry_safe(fl, tmp, &ctx->flc_lease, c.flc_list) if (filp == fl->c.flc_file) lease_modify(fl, F_UNLCK, &dispose); spin_unlock(&ctx->flc_lock); percpu_up_read(&file_rwsem); locks_dispose_list(&dispose); } /* * This function is called on the last close of an open file. */ void locks_remove_file(struct file *filp) { struct file_lock_context *ctx; ctx = locks_inode_context(file_inode(filp)); if (!ctx) return; /* remove any OFD locks */ locks_remove_posix(filp, filp); /* remove flock locks */ locks_remove_flock(filp, ctx); /* remove any leases */ locks_remove_lease(filp, ctx); spin_lock(&ctx->flc_lock); locks_check_ctx_file_list(filp, &ctx->flc_posix, "POSIX"); locks_check_ctx_file_list(filp, &ctx->flc_flock, "FLOCK"); locks_check_ctx_file_list(filp, &ctx->flc_lease, "LEASE"); spin_unlock(&ctx->flc_lock); } /** * vfs_cancel_lock - file byte range unblock lock * @filp: The file to apply the unblock to * @fl: The lock to be unblocked * * Used by lock managers to cancel blocked requests */ int vfs_cancel_lock(struct file *filp, struct file_lock *fl) { WARN_ON_ONCE(filp != fl->c.flc_file); if (filp->f_op->lock) return filp->f_op->lock(filp, F_CANCELLK, fl); return 0; } EXPORT_SYMBOL_GPL(vfs_cancel_lock); /** * vfs_inode_has_locks - are any file locks held on @inode? * @inode: inode to check for locks * * Return true if there are any FL_POSIX or FL_FLOCK locks currently * set on @inode. */ bool vfs_inode_has_locks(struct inode *inode) { struct file_lock_context *ctx; bool ret; ctx = locks_inode_context(inode); if (!ctx) return false; spin_lock(&ctx->flc_lock); ret = !list_empty(&ctx->flc_posix) || !list_empty(&ctx->flc_flock); spin_unlock(&ctx->flc_lock); return ret; } EXPORT_SYMBOL_GPL(vfs_inode_has_locks); #ifdef CONFIG_PROC_FS #include <linux/proc_fs.h> #include <linux/seq_file.h> struct locks_iterator { int li_cpu; loff_t li_pos; }; static void lock_get_status(struct seq_file *f, struct file_lock_core *flc, loff_t id, char *pfx, int repeat) { struct inode *inode = NULL; unsigned int pid; struct pid_namespace *proc_pidns = proc_pid_ns(file_inode(f->file)->i_sb); int type = flc->flc_type; struct file_lock *fl = file_lock(flc); pid = locks_translate_pid(flc, proc_pidns); /* * If lock owner is dead (and pid is freed) or not visible in current * pidns, zero is shown as a pid value. Check lock info from * init_pid_ns to get saved lock pid value. */ if (flc->flc_file != NULL) inode = file_inode(flc->flc_file); seq_printf(f, "%lld: ", id); if (repeat) seq_printf(f, "%*s", repeat - 1 + (int)strlen(pfx), pfx); if (flc->flc_flags & FL_POSIX) { if (flc->flc_flags & FL_ACCESS) seq_puts(f, "ACCESS"); else if (flc->flc_flags & FL_OFDLCK) seq_puts(f, "OFDLCK"); else seq_puts(f, "POSIX "); seq_printf(f, " %s ", (inode == NULL) ? "*NOINODE*" : "ADVISORY "); } else if (flc->flc_flags & FL_FLOCK) { seq_puts(f, "FLOCK ADVISORY "); } else if (flc->flc_flags & (FL_LEASE|FL_DELEG|FL_LAYOUT)) { struct file_lease *lease = file_lease(flc); type = target_leasetype(lease); if (flc->flc_flags & FL_DELEG) seq_puts(f, "DELEG "); else seq_puts(f, "LEASE "); if (lease_breaking(lease)) seq_puts(f, "BREAKING "); else if (flc->flc_file) seq_puts(f, "ACTIVE "); else seq_puts(f, "BREAKER "); } else { seq_puts(f, "UNKNOWN UNKNOWN "); } seq_printf(f, "%s ", (type == F_WRLCK) ? "WRITE" : (type == F_RDLCK) ? "READ" : "UNLCK"); if (inode) { /* userspace relies on this representation of dev_t */ seq_printf(f, "%d %02x:%02x:%lu ", pid, MAJOR(inode->i_sb->s_dev), MINOR(inode->i_sb->s_dev), inode->i_ino); } else { seq_printf(f, "%d <none>:0 ", pid); } if (flc->flc_flags & FL_POSIX) { if (fl->fl_end == OFFSET_MAX) seq_printf(f, "%Ld EOF\n", fl->fl_start); else seq_printf(f, "%Ld %Ld\n", fl->fl_start, fl->fl_end); } else { seq_puts(f, "0 EOF\n"); } } static struct file_lock_core *get_next_blocked_member(struct file_lock_core *node) { struct file_lock_core *tmp; /* NULL node or root node */ if (node == NULL || node->flc_blocker == NULL) return NULL; /* Next member in the linked list could be itself */ tmp = list_next_entry(node, flc_blocked_member); if (list_entry_is_head(tmp, &node->flc_blocker->flc_blocked_requests, flc_blocked_member) || tmp == node) { return NULL; } return tmp; } static int locks_show(struct seq_file *f, void *v) { struct locks_iterator *iter = f->private; struct file_lock_core *cur, *tmp; struct pid_namespace *proc_pidns = proc_pid_ns(file_inode(f->file)->i_sb); int level = 0; cur = hlist_entry(v, struct file_lock_core, flc_link); if (locks_translate_pid(cur, proc_pidns) == 0) return 0; /* View this crossed linked list as a binary tree, the first member of flc_blocked_requests * is the left child of current node, the next silibing in flc_blocked_member is the * right child, we can alse get the parent of current node from flc_blocker, so this * question becomes traversal of a binary tree */ while (cur != NULL) { if (level) lock_get_status(f, cur, iter->li_pos, "-> ", level); else lock_get_status(f, cur, iter->li_pos, "", level); if (!list_empty(&cur->flc_blocked_requests)) { /* Turn left */ cur = list_first_entry_or_null(&cur->flc_blocked_requests, struct file_lock_core, flc_blocked_member); level++; } else { /* Turn right */ tmp = get_next_blocked_member(cur); /* Fall back to parent node */ while (tmp == NULL && cur->flc_blocker != NULL) { cur = cur->flc_blocker; level--; tmp = get_next_blocked_member(cur); } cur = tmp; } } return 0; } static void __show_fd_locks(struct seq_file *f, struct list_head *head, int *id, struct file *filp, struct files_struct *files) { struct file_lock_core *fl; list_for_each_entry(fl, head, flc_list) { if (filp != fl->flc_file) continue; if (fl->flc_owner != files && fl->flc_owner != filp) continue; (*id)++; seq_puts(f, "lock:\t"); lock_get_status(f, fl, *id, "", 0); } } void show_fd_locks(struct seq_file *f, struct file *filp, struct files_struct *files) { struct inode *inode = file_inode(filp); struct file_lock_context *ctx; int id = 0; ctx = locks_inode_context(inode); if (!ctx) return; spin_lock(&ctx->flc_lock); __show_fd_locks(f, &ctx->flc_flock, &id, filp, files); __show_fd_locks(f, &ctx->flc_posix, &id, filp, files); __show_fd_locks(f, &ctx->flc_lease, &id, filp, files); spin_unlock(&ctx->flc_lock); } static void *locks_start(struct seq_file *f, loff_t *pos) __acquires(&blocked_lock_lock) { struct locks_iterator *iter = f->private; iter->li_pos = *pos + 1; percpu_down_write(&file_rwsem); spin_lock(&blocked_lock_lock); return seq_hlist_start_percpu(&file_lock_list.hlist, &iter->li_cpu, *pos); } static void *locks_next(struct seq_file *f, void *v, loff_t *pos) { struct locks_iterator *iter = f->private; ++iter->li_pos; return seq_hlist_next_percpu(v, &file_lock_list.hlist, &iter->li_cpu, pos); } static void locks_stop(struct seq_file *f, void *v) __releases(&blocked_lock_lock) { spin_unlock(&blocked_lock_lock); percpu_up_write(&file_rwsem); } static const struct seq_operations locks_seq_operations = { .start = locks_start, .next = locks_next, .stop = locks_stop, .show = locks_show, }; static int __init proc_locks_init(void) { proc_create_seq_private("locks", 0, NULL, &locks_seq_operations, sizeof(struct locks_iterator), NULL); return 0; } fs_initcall(proc_locks_init); #endif static int __init filelock_init(void) { int i; flctx_cache = kmem_cache_create("file_lock_ctx", sizeof(struct file_lock_context), 0, SLAB_PANIC, NULL); filelock_cache = kmem_cache_create("file_lock_cache", sizeof(struct file_lock), 0, SLAB_PANIC, NULL); filelease_cache = kmem_cache_create("file_lease_cache", sizeof(struct file_lease), 0, SLAB_PANIC, NULL); for_each_possible_cpu(i) { struct file_lock_list_struct *fll = per_cpu_ptr(&file_lock_list, i); spin_lock_init(&fll->lock); INIT_HLIST_HEAD(&fll->hlist); } lease_notifier_chain_init(); return 0; } core_initcall(filelock_init);
13 13 13 13 13 24 24 529 529 166 166 9 156 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 // SPDX-License-Identifier: GPL-2.0-only /* * Copyright (C) 2005-2010 IBM Corporation * * Author: * Mimi Zohar <zohar@us.ibm.com> * Kylene Hall <kjhall@us.ibm.com> * * File: evm_main.c * implements evm_inode_setxattr, evm_inode_post_setxattr, * evm_inode_removexattr, evm_verifyxattr, and evm_inode_set_acl. */ #define pr_fmt(fmt) "EVM: "fmt #include <linux/init.h> #include <linux/audit.h> #include <linux/xattr.h> #include <linux/integrity.h> #include <linux/evm.h> #include <linux/magic.h> #include <linux/posix_acl_xattr.h> #include <linux/lsm_hooks.h> #include <crypto/hash.h> #include <crypto/hash_info.h> #include <crypto/utils.h> #include "evm.h" int evm_initialized; static const char * const integrity_status_msg[] = { "pass", "pass_immutable", "fail", "fail_immutable", "no_label", "no_xattrs", "unknown" }; int evm_hmac_attrs; static struct xattr_list evm_config_default_xattrnames[] = { { .name = XATTR_NAME_SELINUX, .enabled = IS_ENABLED(CONFIG_SECURITY_SELINUX) }, { .name = XATTR_NAME_SMACK, .enabled = IS_ENABLED(CONFIG_SECURITY_SMACK) }, { .name = XATTR_NAME_SMACKEXEC, .enabled = IS_ENABLED(CONFIG_EVM_EXTRA_SMACK_XATTRS) }, { .name = XATTR_NAME_SMACKTRANSMUTE, .enabled = IS_ENABLED(CONFIG_EVM_EXTRA_SMACK_XATTRS) }, { .name = XATTR_NAME_SMACKMMAP, .enabled = IS_ENABLED(CONFIG_EVM_EXTRA_SMACK_XATTRS) }, { .name = XATTR_NAME_APPARMOR, .enabled = IS_ENABLED(CONFIG_SECURITY_APPARMOR) }, { .name = XATTR_NAME_IMA, .enabled = IS_ENABLED(CONFIG_IMA_APPRAISE) }, { .name = XATTR_NAME_CAPS, .enabled = true }, }; LIST_HEAD(evm_config_xattrnames); static int evm_fixmode __ro_after_init; static int __init evm_set_fixmode(char *str) { if (strncmp(str, "fix", 3) == 0) evm_fixmode = 1; else pr_err("invalid \"%s\" mode", str); return 1; } __setup("evm=", evm_set_fixmode); static void __init evm_init_config(void) { int i, xattrs; xattrs = ARRAY_SIZE(evm_config_default_xattrnames); pr_info("Initialising EVM extended attributes:\n"); for (i = 0; i < xattrs; i++) { pr_info("%s%s\n", evm_config_default_xattrnames[i].name, !evm_config_default_xattrnames[i].enabled ? " (disabled)" : ""); list_add_tail(&evm_config_default_xattrnames[i].list, &evm_config_xattrnames); } #ifdef CONFIG_EVM_ATTR_FSUUID evm_hmac_attrs |= EVM_ATTR_FSUUID; #endif pr_info("HMAC attrs: 0x%x\n", evm_hmac_attrs); } static bool evm_key_loaded(void) { return (bool)(evm_initialized & EVM_KEY_MASK); } /* * This function determines whether or not it is safe to ignore verification * errors, based on the ability of EVM to calculate HMACs. If the HMAC key * is not loaded, and it cannot be loaded in the future due to the * EVM_SETUP_COMPLETE initialization flag, allowing an operation despite the * attrs/xattrs being found invalid will not make them valid. */ static bool evm_hmac_disabled(void) { if (evm_initialized & EVM_INIT_HMAC) return false; if (!(evm_initialized & EVM_SETUP_COMPLETE)) return false; return true; } static int evm_find_protected_xattrs(struct dentry *dentry) { struct inode *inode = d_backing_inode(dentry); struct xattr_list *xattr; int error; int count = 0; if (!(inode->i_opflags & IOP_XATTR)) return -EOPNOTSUPP; list_for_each_entry_lockless(xattr, &evm_config_xattrnames, list) { error = __vfs_getxattr(dentry, inode, xattr->name, NULL, 0); if (error < 0) { if (error == -ENODATA) continue; return error; } count++; } return count; } static int is_unsupported_hmac_fs(struct dentry *dentry) { struct inode *inode = d_backing_inode(dentry); if (inode->i_sb->s_iflags & SB_I_EVM_HMAC_UNSUPPORTED) { pr_info_once("%s not supported\n", inode->i_sb->s_type->name); return 1; } return 0; } /* * evm_verify_hmac - calculate and compare the HMAC with the EVM xattr * * Compute the HMAC on the dentry's protected set of extended attributes * and compare it against the stored security.evm xattr. * * For performance: * - use the previously retrieved xattr value and length to calculate the * HMAC.) * - cache the verification result in the iint, when available. * * Returns integrity status */ static enum integrity_status evm_verify_hmac(struct dentry *dentry, const char *xattr_name, char *xattr_value, size_t xattr_value_len) { struct evm_ima_xattr_data *xattr_data = NULL; struct signature_v2_hdr *hdr; enum integrity_status evm_status = INTEGRITY_PASS; struct evm_digest digest; struct inode *inode = d_backing_inode(dentry); struct evm_iint_cache *iint = evm_iint_inode(inode); int rc, xattr_len, evm_immutable = 0; if (iint && (iint->evm_status == INTEGRITY_PASS || iint->evm_status == INTEGRITY_PASS_IMMUTABLE)) return iint->evm_status; /* * On unsupported filesystems without EVM_INIT_X509 enabled, skip * signature verification. */ if (!(evm_initialized & EVM_INIT_X509) && is_unsupported_hmac_fs(dentry)) return INTEGRITY_UNKNOWN; /* if status is not PASS, try to check again - against -ENOMEM */ /* first need to know the sig type */ rc = vfs_getxattr_alloc(&nop_mnt_idmap, dentry, XATTR_NAME_EVM, (char **)&xattr_data, 0, GFP_NOFS); if (rc <= 0) { evm_status = INTEGRITY_FAIL; if (rc == -ENODATA) { rc = evm_find_protected_xattrs(dentry); if (rc > 0) evm_status = INTEGRITY_NOLABEL; else if (rc == 0) evm_status = INTEGRITY_NOXATTRS; /* new file */ } else if (rc == -EOPNOTSUPP) { evm_status = INTEGRITY_UNKNOWN; } goto out; } xattr_len = rc; /* check value type */ switch (xattr_data->type) { case EVM_XATTR_HMAC: if (xattr_len != sizeof(struct evm_xattr)) { evm_status = INTEGRITY_FAIL; goto out; } digest.hdr.algo = HASH_ALGO_SHA1; rc = evm_calc_hmac(dentry, xattr_name, xattr_value, xattr_value_len, &digest, iint); if (rc) break; rc = crypto_memneq(xattr_data->data, digest.digest, SHA1_DIGEST_SIZE); if (rc) rc = -EINVAL; break; case EVM_XATTR_PORTABLE_DIGSIG: evm_immutable = 1; fallthrough; case EVM_IMA_XATTR_DIGSIG: /* accept xattr with non-empty signature field */ if (xattr_len <= sizeof(struct signature_v2_hdr)) { evm_status = INTEGRITY_FAIL; goto out; } hdr = (struct signature_v2_hdr *)xattr_data; digest.hdr.algo = hdr->hash_algo; rc = evm_calc_hash(dentry, xattr_name, xattr_value, xattr_value_len, xattr_data->type, &digest, iint); if (rc) break; rc = integrity_digsig_verify(INTEGRITY_KEYRING_EVM, (const char *)xattr_data, xattr_len, digest.digest, digest.hdr.length); if (!rc) { if (xattr_data->type == EVM_XATTR_PORTABLE_DIGSIG) { if (iint) iint->flags |= EVM_IMMUTABLE_DIGSIG; evm_status = INTEGRITY_PASS_IMMUTABLE; } else if (!IS_RDONLY(inode) && !(inode->i_sb->s_readonly_remount) && !IS_IMMUTABLE(inode) && !is_unsupported_hmac_fs(dentry)) { evm_update_evmxattr(dentry, xattr_name, xattr_value, xattr_value_len); } } break; default: rc = -EINVAL; break; } if (rc) { if (rc == -ENODATA) evm_status = INTEGRITY_NOXATTRS; else if (evm_immutable) evm_status = INTEGRITY_FAIL_IMMUTABLE; else evm_status = INTEGRITY_FAIL; } pr_debug("digest: (%d) [%*phN]\n", digest.hdr.length, digest.hdr.length, digest.digest); out: if (iint) iint->evm_status = evm_status; kfree(xattr_data); return evm_status; } static int evm_protected_xattr_common(const char *req_xattr_name, bool all_xattrs) { int namelen; int found = 0; struct xattr_list *xattr; namelen = strlen(req_xattr_name); list_for_each_entry_lockless(xattr, &evm_config_xattrnames, list) { if (!all_xattrs && !xattr->enabled) continue; if ((strlen(xattr->name) == namelen) && (strncmp(req_xattr_name, xattr->name, namelen) == 0)) { found = 1; break; } if (strncmp(req_xattr_name, xattr->name + XATTR_SECURITY_PREFIX_LEN, strlen(req_xattr_name)) == 0) { found = 1; break; } } return found; } int evm_protected_xattr(const char *req_xattr_name) { return evm_protected_xattr_common(req_xattr_name, false); } int evm_protected_xattr_if_enabled(const char *req_xattr_name) { return evm_protected_xattr_common(req_xattr_name, true); } /** * evm_read_protected_xattrs - read EVM protected xattr names, lengths, values * @dentry: dentry of the read xattrs * @buffer: buffer xattr names, lengths or values are copied to * @buffer_size: size of buffer * @type: n: names, l: lengths, v: values * @canonical_fmt: data format (true: little endian, false: native format) * * Read protected xattr names (separated by |), lengths (u32) or values for a * given dentry and return the total size of copied data. If buffer is NULL, * just return the total size. * * Returns the total size on success, a negative value on error. */ int evm_read_protected_xattrs(struct dentry *dentry, u8 *buffer, int buffer_size, char type, bool canonical_fmt) { struct xattr_list *xattr; int rc, size, total_size = 0; list_for_each_entry_lockless(xattr, &evm_config_xattrnames, list) { rc = __vfs_getxattr(dentry, d_backing_inode(dentry), xattr->name, NULL, 0); if (rc < 0 && rc == -ENODATA) continue; else if (rc < 0) return rc; switch (type) { case 'n': size = strlen(xattr->name) + 1; if (buffer) { if (total_size) *(buffer + total_size - 1) = '|'; memcpy(buffer + total_size, xattr->name, size); } break; case 'l': size = sizeof(u32); if (buffer) { if (canonical_fmt) rc = (__force int)cpu_to_le32(rc); *(u32 *)(buffer + total_size) = rc; } break; case 'v': size = rc; if (buffer) { rc = __vfs_getxattr(dentry, d_backing_inode(dentry), xattr->name, buffer + total_size, buffer_size - total_size); if (rc < 0) return rc; } break; default: return -EINVAL; } total_size += size; } return total_size; } /** * evm_verifyxattr - verify the integrity of the requested xattr * @dentry: object of the verify xattr * @xattr_name: requested xattr * @xattr_value: requested xattr value * @xattr_value_len: requested xattr value length * * Calculate the HMAC for the given dentry and verify it against the stored * security.evm xattr. For performance, use the xattr value and length * previously retrieved to calculate the HMAC. * * Returns the xattr integrity status. * * This function requires the caller to lock the inode's i_mutex before it * is executed. */ enum integrity_status evm_verifyxattr(struct dentry *dentry, const char *xattr_name, void *xattr_value, size_t xattr_value_len) { if (!evm_key_loaded() || !evm_protected_xattr(xattr_name)) return INTEGRITY_UNKNOWN; return evm_verify_hmac(dentry, xattr_name, xattr_value, xattr_value_len); } EXPORT_SYMBOL_GPL(evm_verifyxattr); /* * evm_verify_current_integrity - verify the dentry's metadata integrity * @dentry: pointer to the affected dentry * * Verify and return the dentry's metadata integrity. The exceptions are * before EVM is initialized or in 'fix' mode. */ static enum integrity_status evm_verify_current_integrity(struct dentry *dentry) { struct inode *inode = d_backing_inode(dentry); if (!evm_key_loaded() || !S_ISREG(inode->i_mode) || evm_fixmode) return INTEGRITY_PASS; return evm_verify_hmac(dentry, NULL, NULL, 0); } /* * evm_xattr_change - check if passed xattr value differs from current value * @idmap: idmap of the mount * @dentry: pointer to the affected dentry * @xattr_name: requested xattr * @xattr_value: requested xattr value * @xattr_value_len: requested xattr value length * * Check if passed xattr value differs from current value. * * Returns 1 if passed xattr value differs from current value, 0 otherwise. */ static int evm_xattr_change(struct mnt_idmap *idmap, struct dentry *dentry, const char *xattr_name, const void *xattr_value, size_t xattr_value_len) { char *xattr_data = NULL; int rc = 0; rc = vfs_getxattr_alloc(&nop_mnt_idmap, dentry, xattr_name, &xattr_data, 0, GFP_NOFS); if (rc < 0) { rc = 1; goto out; } if (rc == xattr_value_len) rc = !!memcmp(xattr_value, xattr_data, rc); else rc = 1; out: kfree(xattr_data); return rc; } /* * evm_protect_xattr - protect the EVM extended attribute * * Prevent security.evm from being modified or removed without the * necessary permissions or when the existing value is invalid. * * The posix xattr acls are 'system' prefixed, which normally would not * affect security.evm. An interesting side affect of writing posix xattr * acls is their modifying of the i_mode, which is included in security.evm. * For posix xattr acls only, permit security.evm, even if it currently * doesn't exist, to be updated unless the EVM signature is immutable. */ static int evm_protect_xattr(struct mnt_idmap *idmap, struct dentry *dentry, const char *xattr_name, const void *xattr_value, size_t xattr_value_len) { enum integrity_status evm_status; if (strcmp(xattr_name, XATTR_NAME_EVM) == 0) { if (!capable(CAP_SYS_ADMIN)) return -EPERM; if (is_unsupported_hmac_fs(dentry)) return -EPERM; } else if (!evm_protected_xattr(xattr_name)) { if (!posix_xattr_acl(xattr_name)) return 0; if (is_unsupported_hmac_fs(dentry)) return 0; evm_status = evm_verify_current_integrity(dentry); if ((evm_status == INTEGRITY_PASS) || (evm_status == INTEGRITY_NOXATTRS)) return 0; goto out; } else if (is_unsupported_hmac_fs(dentry)) return 0; evm_status = evm_verify_current_integrity(dentry); if (evm_status == INTEGRITY_NOXATTRS) { struct evm_iint_cache *iint; /* Exception if the HMAC is not going to be calculated. */ if (evm_hmac_disabled()) return 0; iint = evm_iint_inode(d_backing_inode(dentry)); if (iint && (iint->flags & EVM_NEW_FILE)) return 0; /* exception for pseudo filesystems */ if (dentry->d_sb->s_magic == TMPFS_MAGIC || dentry->d_sb->s_magic == SYSFS_MAGIC) return 0; integrity_audit_msg(AUDIT_INTEGRITY_METADATA, dentry->d_inode, dentry->d_name.name, "update_metadata", integrity_status_msg[evm_status], -EPERM, 0); } out: /* Exception if the HMAC is not going to be calculated. */ if (evm_hmac_disabled() && (evm_status == INTEGRITY_NOLABEL || evm_status == INTEGRITY_UNKNOWN)) return 0; /* * Writing other xattrs is safe for portable signatures, as portable * signatures are immutable and can never be updated. */ if (evm_status == INTEGRITY_FAIL_IMMUTABLE) return 0; if (evm_status == INTEGRITY_PASS_IMMUTABLE && !evm_xattr_change(idmap, dentry, xattr_name, xattr_value, xattr_value_len)) return 0; if (evm_status != INTEGRITY_PASS && evm_status != INTEGRITY_PASS_IMMUTABLE) integrity_audit_msg(AUDIT_INTEGRITY_METADATA, d_backing_inode(dentry), dentry->d_name.name, "appraise_metadata", integrity_status_msg[evm_status], -EPERM, 0); return evm_status == INTEGRITY_PASS ? 0 : -EPERM; } /** * evm_inode_setxattr - protect the EVM extended attribute * @idmap: idmap of the mount * @dentry: pointer to the affected dentry * @xattr_name: pointer to the affected extended attribute name * @xattr_value: pointer to the new extended attribute value * @xattr_value_len: pointer to the new extended attribute value length * @flags: flags to pass into filesystem operations * * Before allowing the 'security.evm' protected xattr to be updated, * verify the existing value is valid. As only the kernel should have * access to the EVM encrypted key needed to calculate the HMAC, prevent * userspace from writing HMAC value. Writing 'security.evm' requires * requires CAP_SYS_ADMIN privileges. */ static int evm_inode_setxattr(struct mnt_idmap *idmap, struct dentry *dentry, const char *xattr_name, const void *xattr_value, size_t xattr_value_len, int flags) { const struct evm_ima_xattr_data *xattr_data = xattr_value; /* Policy permits modification of the protected xattrs even though * there's no HMAC key loaded */ if (evm_initialized & EVM_ALLOW_METADATA_WRITES) return 0; if (strcmp(xattr_name, XATTR_NAME_EVM) == 0) { if (!xattr_value_len) return -EINVAL; if (xattr_data->type != EVM_IMA_XATTR_DIGSIG && xattr_data->type != EVM_XATTR_PORTABLE_DIGSIG) return -EPERM; } return evm_protect_xattr(idmap, dentry, xattr_name, xattr_value, xattr_value_len); } /** * evm_inode_removexattr - protect the EVM extended attribute * @idmap: idmap of the mount * @dentry: pointer to the affected dentry * @xattr_name: pointer to the affected extended attribute name * * Removing 'security.evm' requires CAP_SYS_ADMIN privileges and that * the current value is valid. */ static int evm_inode_removexattr(struct mnt_idmap *idmap, struct dentry *dentry, const char *xattr_name) { /* Policy permits modification of the protected xattrs even though * there's no HMAC key loaded */ if (evm_initialized & EVM_ALLOW_METADATA_WRITES) return 0; return evm_protect_xattr(idmap, dentry, xattr_name, NULL, 0); } #ifdef CONFIG_FS_POSIX_ACL static int evm_inode_set_acl_change(struct mnt_idmap *idmap, struct dentry *dentry, const char *name, struct posix_acl *kacl) { int rc; umode_t mode; struct inode *inode = d_backing_inode(dentry); if (!kacl) return 1; rc = posix_acl_update_mode(idmap, inode, &mode, &kacl); if (rc || (inode->i_mode != mode)) return 1; return 0; } #else static inline int evm_inode_set_acl_change(struct mnt_idmap *idmap, struct dentry *dentry, const char *name, struct posix_acl *kacl) { return 0; } #endif /** * evm_inode_set_acl - protect the EVM extended attribute from posix acls * @idmap: idmap of the idmapped mount * @dentry: pointer to the affected dentry * @acl_name: name of the posix acl * @kacl: pointer to the posix acls * * Prevent modifying posix acls causing the EVM HMAC to be re-calculated * and 'security.evm' xattr updated, unless the existing 'security.evm' is * valid. * * Return: zero on success, -EPERM on failure. */ static int evm_inode_set_acl(struct mnt_idmap *idmap, struct dentry *dentry, const char *acl_name, struct posix_acl *kacl) { enum integrity_status evm_status; /* Policy permits modification of the protected xattrs even though * there's no HMAC key loaded */ if (evm_initialized & EVM_ALLOW_METADATA_WRITES) return 0; evm_status = evm_verify_current_integrity(dentry); if ((evm_status == INTEGRITY_PASS) || (evm_status == INTEGRITY_NOXATTRS)) return 0; /* Exception if the HMAC is not going to be calculated. */ if (evm_hmac_disabled() && (evm_status == INTEGRITY_NOLABEL || evm_status == INTEGRITY_UNKNOWN)) return 0; /* * Writing other xattrs is safe for portable signatures, as portable * signatures are immutable and can never be updated. */ if (evm_status == INTEGRITY_FAIL_IMMUTABLE) return 0; if (evm_status == INTEGRITY_PASS_IMMUTABLE && !evm_inode_set_acl_change(idmap, dentry, acl_name, kacl)) return 0; if (evm_status != INTEGRITY_PASS_IMMUTABLE) integrity_audit_msg(AUDIT_INTEGRITY_METADATA, d_backing_inode(dentry), dentry->d_name.name, "appraise_metadata", integrity_status_msg[evm_status], -EPERM, 0); return -EPERM; } /** * evm_inode_remove_acl - Protect the EVM extended attribute from posix acls * @idmap: idmap of the mount * @dentry: pointer to the affected dentry * @acl_name: name of the posix acl * * Prevent removing posix acls causing the EVM HMAC to be re-calculated * and 'security.evm' xattr updated, unless the existing 'security.evm' is * valid. * * Return: zero on success, -EPERM on failure. */ static int evm_inode_remove_acl(struct mnt_idmap *idmap, struct dentry *dentry, const char *acl_name) { return evm_inode_set_acl(idmap, dentry, acl_name, NULL); } static void evm_reset_status(struct inode *inode) { struct evm_iint_cache *iint; iint = evm_iint_inode(inode); if (iint) iint->evm_status = INTEGRITY_UNKNOWN; } /** * evm_metadata_changed: Detect changes to the metadata * @inode: a file's inode * @metadata_inode: metadata inode * * On a stacked filesystem detect whether the metadata has changed. If this is * the case reset the evm_status associated with the inode that represents the * file. */ bool evm_metadata_changed(struct inode *inode, struct inode *metadata_inode) { struct evm_iint_cache *iint = evm_iint_inode(inode); bool ret = false; if (iint) { ret = (!IS_I_VERSION(metadata_inode) || integrity_inode_attrs_changed(&iint->metadata_inode, metadata_inode)); if (ret) iint->evm_status = INTEGRITY_UNKNOWN; } return ret; } /** * evm_revalidate_status - report whether EVM status re-validation is necessary * @xattr_name: pointer to the affected extended attribute name * * Report whether callers of evm_verifyxattr() should re-validate the * EVM status. * * Return true if re-validation is necessary, false otherwise. */ bool evm_revalidate_status(const char *xattr_name) { if (!evm_key_loaded()) return false; /* evm_inode_post_setattr() passes NULL */ if (!xattr_name) return true; if (!evm_protected_xattr(xattr_name) && !posix_xattr_acl(xattr_name) && strcmp(xattr_name, XATTR_NAME_EVM)) return false; return true; } /** * evm_inode_post_setxattr - update 'security.evm' to reflect the changes * @dentry: pointer to the affected dentry * @xattr_name: pointer to the affected extended attribute name * @xattr_value: pointer to the new extended attribute value * @xattr_value_len: pointer to the new extended attribute value length * @flags: flags to pass into filesystem operations * * Update the HMAC stored in 'security.evm' to reflect the change. * * No need to take the i_mutex lock here, as this function is called from * __vfs_setxattr_noperm(). The caller of which has taken the inode's * i_mutex lock. */ static void evm_inode_post_setxattr(struct dentry *dentry, const char *xattr_name, const void *xattr_value, size_t xattr_value_len, int flags) { if (!evm_revalidate_status(xattr_name)) return; evm_reset_status(dentry->d_inode); if (!strcmp(xattr_name, XATTR_NAME_EVM)) return; if (!(evm_initialized & EVM_INIT_HMAC)) return; if (is_unsupported_hmac_fs(dentry)) return; evm_update_evmxattr(dentry, xattr_name, xattr_value, xattr_value_len); } /** * evm_inode_post_set_acl - Update the EVM extended attribute from posix acls * @dentry: pointer to the affected dentry * @acl_name: name of the posix acl * @kacl: pointer to the posix acls * * Update the 'security.evm' xattr with the EVM HMAC re-calculated after setting * posix acls. */ static void evm_inode_post_set_acl(struct dentry *dentry, const char *acl_name, struct posix_acl *kacl) { return evm_inode_post_setxattr(dentry, acl_name, NULL, 0, 0); } /** * evm_inode_post_removexattr - update 'security.evm' after removing the xattr * @dentry: pointer to the affected dentry * @xattr_name: pointer to the affected extended attribute name * * Update the HMAC stored in 'security.evm' to reflect removal of the xattr. * * No need to take the i_mutex lock here, as this function is called from * vfs_removexattr() which takes the i_mutex. */ static void evm_inode_post_removexattr(struct dentry *dentry, const char *xattr_name) { if (!evm_revalidate_status(xattr_name)) return; evm_reset_status(dentry->d_inode); if (!strcmp(xattr_name, XATTR_NAME_EVM)) return; if (!(evm_initialized & EVM_INIT_HMAC)) return; evm_update_evmxattr(dentry, xattr_name, NULL, 0); } /** * evm_inode_post_remove_acl - Update the EVM extended attribute from posix acls * @idmap: idmap of the mount * @dentry: pointer to the affected dentry * @acl_name: name of the posix acl * * Update the 'security.evm' xattr with the EVM HMAC re-calculated after * removing posix acls. */ static inline void evm_inode_post_remove_acl(struct mnt_idmap *idmap, struct dentry *dentry, const char *acl_name) { evm_inode_post_removexattr(dentry, acl_name); } static int evm_attr_change(struct mnt_idmap *idmap, struct dentry *dentry, struct iattr *attr) { struct inode *inode = d_backing_inode(dentry); unsigned int ia_valid = attr->ia_valid; if (!i_uid_needs_update(idmap, attr, inode) && !i_gid_needs_update(idmap, attr, inode) && (!(ia_valid & ATTR_MODE) || attr->ia_mode == inode->i_mode)) return 0; return 1; } /** * evm_inode_setattr - prevent updating an invalid EVM extended attribute * @idmap: idmap of the mount * @dentry: pointer to the affected dentry * @attr: iattr structure containing the new file attributes * * Permit update of file attributes when files have a valid EVM signature, * except in the case of them having an immutable portable signature. */ static int evm_inode_setattr(struct mnt_idmap *idmap, struct dentry *dentry, struct iattr *attr) { unsigned int ia_valid = attr->ia_valid; enum integrity_status evm_status; /* Policy permits modification of the protected attrs even though * there's no HMAC key loaded */ if (evm_initialized & EVM_ALLOW_METADATA_WRITES) return 0; if (is_unsupported_hmac_fs(dentry)) return 0; if (!(ia_valid & (ATTR_MODE | ATTR_UID | ATTR_GID))) return 0; evm_status = evm_verify_current_integrity(dentry); /* * Writing attrs is safe for portable signatures, as portable signatures * are immutable and can never be updated. */ if ((evm_status == INTEGRITY_PASS) || (evm_status == INTEGRITY_NOXATTRS) || (evm_status == INTEGRITY_FAIL_IMMUTABLE) || (evm_hmac_disabled() && (evm_status == INTEGRITY_NOLABEL || evm_status == INTEGRITY_UNKNOWN))) return 0; if (evm_status == INTEGRITY_PASS_IMMUTABLE && !evm_attr_change(idmap, dentry, attr)) return 0; integrity_audit_msg(AUDIT_INTEGRITY_METADATA, d_backing_inode(dentry), dentry->d_name.name, "appraise_metadata", integrity_status_msg[evm_status], -EPERM, 0); return -EPERM; } /** * evm_inode_post_setattr - update 'security.evm' after modifying metadata * @idmap: idmap of the idmapped mount * @dentry: pointer to the affected dentry * @ia_valid: for the UID and GID status * * For now, update the HMAC stored in 'security.evm' to reflect UID/GID * changes. * * This function is called from notify_change(), which expects the caller * to lock the inode's i_mutex. */ static void evm_inode_post_setattr(struct mnt_idmap *idmap, struct dentry *dentry, int ia_valid) { if (!evm_revalidate_status(NULL)) return; evm_reset_status(dentry->d_inode); if (!(evm_initialized & EVM_INIT_HMAC)) return; if (is_unsupported_hmac_fs(dentry)) return; if (ia_valid & (ATTR_MODE | ATTR_UID | ATTR_GID)) evm_update_evmxattr(dentry, NULL, NULL, 0); } static int evm_inode_copy_up_xattr(struct dentry *src, const char *name) { struct evm_ima_xattr_data *xattr_data = NULL; int rc; if (strcmp(name, XATTR_NAME_EVM) != 0) return -EOPNOTSUPP; /* first need to know the sig type */ rc = vfs_getxattr_alloc(&nop_mnt_idmap, src, XATTR_NAME_EVM, (char **)&xattr_data, 0, GFP_NOFS); if (rc <= 0) return -EPERM; if (rc < offsetof(struct evm_ima_xattr_data, type) + sizeof(xattr_data->type)) return -EPERM; switch (xattr_data->type) { case EVM_XATTR_PORTABLE_DIGSIG: rc = 0; /* allow copy-up */ break; case EVM_XATTR_HMAC: case EVM_IMA_XATTR_DIGSIG: default: rc = -ECANCELED; /* discard */ } kfree(xattr_data); return rc; } /* * evm_inode_init_security - initializes security.evm HMAC value */ int evm_inode_init_security(struct inode *inode, struct inode *dir, const struct qstr *qstr, struct xattr *xattrs, int *xattr_count) { struct evm_xattr *xattr_data; struct xattr *xattr, *evm_xattr; bool evm_protected_xattrs = false; int rc; if (!(evm_initialized & EVM_INIT_HMAC) || !xattrs) return 0; /* * security_inode_init_security() makes sure that the xattrs array is * contiguous, there is enough space for security.evm, and that there is * a terminator at the end of the array. */ for (xattr = xattrs; xattr->name; xattr++) { if (evm_protected_xattr(xattr->name)) evm_protected_xattrs = true; } /* EVM xattr not needed. */ if (!evm_protected_xattrs) return 0; evm_xattr = lsm_get_xattr_slot(xattrs, xattr_count); /* * Array terminator (xattr name = NULL) must be the first non-filled * xattr slot. */ WARN_ONCE(evm_xattr != xattr, "%s: xattrs terminator is not the first non-filled slot\n", __func__); xattr_data = kzalloc(sizeof(*xattr_data), GFP_NOFS); if (!xattr_data) return -ENOMEM; xattr_data->data.type = EVM_XATTR_HMAC; rc = evm_init_hmac(inode, xattrs, xattr_data->digest); if (rc < 0) goto out; evm_xattr->value = xattr_data; evm_xattr->value_len = sizeof(*xattr_data); evm_xattr->name = XATTR_EVM_SUFFIX; return 0; out: kfree(xattr_data); return rc; } EXPORT_SYMBOL_GPL(evm_inode_init_security); static int evm_inode_alloc_security(struct inode *inode) { struct evm_iint_cache *iint = evm_iint_inode(inode); /* Called by security_inode_alloc(), it cannot be NULL. */ iint->flags = 0UL; iint->evm_status = INTEGRITY_UNKNOWN; return 0; } static void evm_file_release(struct file *file) { struct inode *inode = file_inode(file); struct evm_iint_cache *iint = evm_iint_inode(inode); fmode_t mode = file->f_mode; if (!S_ISREG(inode->i_mode) || !(mode & FMODE_WRITE)) return; if (iint && iint->flags & EVM_NEW_FILE && atomic_read(&inode->i_writecount) == 1) iint->flags &= ~EVM_NEW_FILE; } static void evm_post_path_mknod(struct mnt_idmap *idmap, struct dentry *dentry) { struct inode *inode = d_backing_inode(dentry); struct evm_iint_cache *iint = evm_iint_inode(inode); if (!S_ISREG(inode->i_mode)) return; if (iint) iint->flags |= EVM_NEW_FILE; } #ifdef CONFIG_EVM_LOAD_X509 void __init evm_load_x509(void) { int rc; rc = integrity_load_x509(INTEGRITY_KEYRING_EVM, CONFIG_EVM_X509_PATH); if (!rc) evm_initialized |= EVM_INIT_X509; } #endif static int __init init_evm(void) { int error; struct list_head *pos, *q; evm_init_config(); error = integrity_init_keyring(INTEGRITY_KEYRING_EVM); if (error) goto error; error = evm_init_secfs(); if (error < 0) { pr_info("Error registering secfs\n"); goto error; } error: if (error != 0) { if (!list_empty(&evm_config_xattrnames)) { list_for_each_safe(pos, q, &evm_config_xattrnames) list_del(pos); } } return error; } static struct security_hook_list evm_hooks[] __ro_after_init = { LSM_HOOK_INIT(inode_setattr, evm_inode_setattr), LSM_HOOK_INIT(inode_post_setattr, evm_inode_post_setattr), LSM_HOOK_INIT(inode_copy_up_xattr, evm_inode_copy_up_xattr), LSM_HOOK_INIT(inode_setxattr, evm_inode_setxattr), LSM_HOOK_INIT(inode_post_setxattr, evm_inode_post_setxattr), LSM_HOOK_INIT(inode_set_acl, evm_inode_set_acl), LSM_HOOK_INIT(inode_post_set_acl, evm_inode_post_set_acl), LSM_HOOK_INIT(inode_remove_acl, evm_inode_remove_acl), LSM_HOOK_INIT(inode_post_remove_acl, evm_inode_post_remove_acl), LSM_HOOK_INIT(inode_removexattr, evm_inode_removexattr), LSM_HOOK_INIT(inode_post_removexattr, evm_inode_post_removexattr), LSM_HOOK_INIT(inode_init_security, evm_inode_init_security), LSM_HOOK_INIT(inode_alloc_security, evm_inode_alloc_security), LSM_HOOK_INIT(file_release, evm_file_release), LSM_HOOK_INIT(path_post_mknod, evm_post_path_mknod), }; static const struct lsm_id evm_lsmid = { .name = "evm", .id = LSM_ID_EVM, }; static int __init init_evm_lsm(void) { security_add_hooks(evm_hooks, ARRAY_SIZE(evm_hooks), &evm_lsmid); return 0; } struct lsm_blob_sizes evm_blob_sizes __ro_after_init = { .lbs_inode = sizeof(struct evm_iint_cache), .lbs_xattr_count = 1, }; DEFINE_LSM(evm) = { .name = "evm", .init = init_evm_lsm, .order = LSM_ORDER_LAST, .blobs = &evm_blob_sizes, }; late_initcall(init_evm);
371 369 6 6 365 314 57 6 366 368 367 366 367 1 6 188 30 218 6 215 344 344 219 218 12 219 344 344 343 346 321 30 345 346 346 346 56 28 56 56 28 320 321 321 317 317 315 317 316 317 316 317 317 316 317 5 6 6 6 6 6 6 370 371 8 8 6 6 3 3 3 3 3 3 3 3 5 5 5 3 3 340 6 6 6 6 6 6 6 7 7 7 6 6 6 6 6 25 1 25 2 14 2 15 15 15 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 1517 1518 1519 1520 1521 1522 1523 1524 1525 1526 1527 1528 1529 1530 1531 1532 1533 1534 1535 1536 1537 1538 1539 1540 1541 1542 1543 1544 1545 1546 1547 1548 1549 1550 1551 1552 1553 1554 1555 1556 1557 1558 1559 1560 1561 1562 1563 1564 1565 1566 1567 1568 1569 1570 1571 1572 1573 1574 1575 1576 1577 1578 1579 1580 1581 1582 1583 1584 1585 1586 1587 1588 1589 1590 1591 1592 1593 1594 1595 1596 1597 1598 1599 1600 1601 1602 1603 1604 1605 1606 1607 1608 1609 1610 1611 1612 1613 1614 1615 1616 1617 1618 1619 1620 1621 1622 1623 1624 1625 1626 1627 1628 1629 1630 1631 1632 1633 1634 1635 1636 1637 1638 1639 1640 1641 1642 1643 1644 1645 1646 1647 1648 1649 1650 1651 1652 1653 1654 1655 1656 1657 1658 1659 1660 1661 1662 1663 1664 1665 1666 1667 1668 1669 1670 1671 1672 1673 1674 1675 1676 1677 1678 1679 1680 1681 1682 1683 1684 1685 1686 1687 1688 1689 1690 1691 1692 1693 1694 1695 1696 1697 1698 1699 1700 1701 1702 1703 1704 1705 1706 1707 1708 1709 1710 1711 1712 1713 1714 1715 1716 1717 1718 1719 1720 1721 1722 1723 1724 1725 1726 1727 1728 1729 1730 1731 1732 1733 1734 1735 1736 1737 1738 1739 1740 1741 1742 1743 1744 1745 1746 1747 1748 1749 1750 1751 1752 1753 1754 1755 1756 1757 1758 1759 1760 1761 1762 1763 1764 1765 1766 1767 1768 1769 1770 1771 1772 1773 1774 1775 1776 1777 1778 1779 1780 1781 1782 1783 1784 1785 1786 1787 1788 1789 1790 1791 1792 1793 1794 1795 1796 1797 1798 1799 1800 1801 1802 1803 1804 1805 1806 1807 1808 1809 1810 1811 1812 1813 1814 1815 1816 1817 1818 1819 1820 1821 1822 1823 1824 1825 1826 1827 1828 1829 1830 1831 1832 1833 1834 1835 1836 1837 1838 1839 1840 1841 1842 1843 1844 1845 1846 1847 1848 1849 1850 1851 1852 1853 1854 1855 1856 1857 1858 1859 1860 1861 1862 1863 1864 1865 1866 1867 1868 1869 1870 1871 1872 1873 1874 1875 1876 1877 1878 1879 1880 1881 1882 1883 1884 1885 1886 1887 1888 1889 1890 1891 1892 1893 1894 1895 1896 1897 1898 1899 1900 1901 1902 1903 1904 1905 1906 1907 1908 1909 1910 1911 1912 1913 1914 1915 1916 1917 1918 1919 1920 1921 1922 1923 1924 1925 1926 1927 1928 1929 1930 1931 1932 1933 1934 1935 1936 1937 1938 1939 1940 1941 1942 1943 1944 1945 1946 1947 1948 1949 1950 1951 1952 1953 1954 1955 1956 1957 1958 1959 1960 1961 1962 1963 1964 1965 1966 1967 1968 1969 1970 1971 1972 1973 1974 1975 1976 1977 1978 1979 1980 1981 1982 1983 1984 1985 1986 1987 1988 1989 1990 1991 1992 1993 1994 1995 1996 1997 1998 1999 2000 2001 2002 2003 2004 2005 2006 2007 2008 2009 2010 2011 2012 2013 2014 2015 2016 2017 2018 2019 2020 2021 2022 2023 2024 2025 2026 2027 2028 2029 2030 2031 2032 2033 2034 2035 2036 2037 2038 2039 2040 2041 2042 2043 2044 2045 2046 2047 2048 2049 2050 2051 2052 2053 2054 2055 2056 2057 2058 2059 2060 2061 2062 2063 2064 2065 2066 2067 2068 2069 2070 2071 2072 2073 2074 2075 2076 2077 2078 2079 2080 2081 2082 2083 2084 2085 2086 2087 2088 2089 2090 2091 2092 2093 2094 2095 2096 2097 2098 2099 2100 2101 2102 2103 2104 2105 2106 2107 2108 2109 2110 2111 2112 2113 2114 2115 2116 2117 2118 2119 2120 2121 2122 2123 2124 2125 2126 2127 2128 2129 2130 2131 2132 2133 2134 2135 2136 2137 2138 2139 2140 2141 2142 2143 2144 2145 2146 2147 2148 2149 2150 2151 2152 2153 2154 2155 2156 2157 2158 2159 2160 2161 2162 2163 2164 2165 2166 2167 2168 2169 2170 2171 2172 2173 2174 2175 2176 2177 2178 2179 2180 2181 2182 2183 2184 2185 2186 2187 2188 2189 2190 2191 2192 2193 2194 2195 2196 2197 2198 2199 2200 2201 2202 2203 2204 2205 2206 2207 2208 2209 2210 2211 2212 2213 2214 2215 2216 2217 2218 2219 2220 2221 2222 2223 2224 2225 2226 2227 2228 2229 2230 2231 2232 2233 2234 2235 2236 2237 2238 2239 2240 2241 2242 2243 2244 2245 2246 2247 2248 2249 2250 2251 2252 2253 2254 2255 2256 2257 2258 2259 2260 2261 2262 2263 2264 2265 2266 2267 2268 2269 2270 2271 2272 2273 2274 2275 2276 2277 2278 2279 2280 2281 2282 2283 2284 2285 2286 2287 2288 2289 2290 2291 2292 2293 2294 2295 2296 2297 2298 2299 2300 2301 2302 2303 2304 2305 2306 2307 2308 2309 2310 2311 2312 2313 2314 2315 2316 2317 2318 2319 2320 2321 2322 2323 2324 2325 2326 2327 2328 2329 2330 2331 2332 2333 2334 2335 2336 2337 2338 2339 2340 2341 2342 2343 2344 2345 2346 2347 2348 2349 2350 2351 2352 2353 2354 2355 2356 2357 2358 2359 2360 2361 2362 2363 2364 2365 2366 2367 2368 2369 2370 2371 2372 2373 2374 2375 2376 2377 2378 2379 2380 2381 2382 2383 2384 2385 2386 2387 2388 2389 2390 2391 2392 2393 2394 2395 2396 2397 2398 2399 2400 2401 2402 2403 2404 2405 2406 2407 2408 2409 2410 2411 2412 2413 2414 2415 2416 2417 2418 2419 2420 2421 2422 2423 2424 2425 2426 2427 2428 2429 2430 2431 2432 2433 2434 2435 2436 2437 2438 2439 2440 2441 2442 2443 2444 2445 2446 2447 2448 2449 2450 2451 2452 2453 2454 2455 2456 2457 2458 2459 2460 2461 2462 2463 2464 2465 2466 2467 2468 2469 2470 2471 2472 2473 2474 2475 2476 2477 2478 2479 2480 2481 2482 2483 2484 2485 2486 2487 2488 2489 2490 2491 2492 2493 2494 2495 2496 2497 2498 2499 2500 2501 2502 2503 2504 2505 2506 2507 2508 2509 2510 2511 2512 2513 2514 2515 2516 2517 2518 2519 2520 2521 2522 2523 2524 2525 2526 2527 2528 2529 2530 2531 2532 2533 2534 2535 2536 2537 2538 2539 2540 2541 2542 2543 2544 2545 2546 2547 2548 2549 2550 2551 2552 2553 2554 2555 2556 2557 2558 2559 2560 2561 2562 2563 2564 2565 2566 2567 2568 2569 2570 2571 2572 2573 2574 2575 2576 2577 2578 2579 2580 2581 2582 2583 2584 2585 2586 2587 2588 2589 2590 2591 2592 2593 2594 2595 2596 2597 2598 2599 2600 2601 2602 2603 2604 2605 2606 2607 2608 2609 2610 2611 2612 2613 2614 2615 2616 2617 2618 2619 2620 2621 2622 2623 2624 2625 2626 2627 2628 2629 2630 2631 2632 2633 2634 2635 2636 2637 2638 2639 2640 2641 2642 2643 2644 2645 2646 2647 2648 2649 2650 2651 2652 2653 2654 2655 2656 2657 2658 2659 2660 2661 2662 2663 2664 2665 2666 2667 2668 2669 2670 2671 2672 2673 2674 2675 2676 2677 2678 2679 2680 2681 2682 2683 2684 2685 2686 2687 2688 2689 2690 2691 2692 2693 2694 2695 2696 2697 2698 2699 2700 2701 2702 2703 2704 2705 2706 2707 2708 2709 2710 2711 2712 2713 2714 2715 2716 2717 2718 2719 2720 2721 2722 2723 2724 2725 2726 2727 2728 2729 2730 2731 2732 2733 2734 2735 2736 2737 2738 2739 2740 2741 2742 2743 2744 2745 2746 2747 2748 2749 2750 2751 2752 2753 2754 2755 2756 2757 2758 2759 2760 2761 2762 2763 2764 2765 2766 2767 2768 2769 2770 2771 2772 2773 2774 2775 2776 2777 2778 2779 2780 2781 2782 2783 2784 2785 2786 2787 2788 2789 2790 2791 2792 2793 2794 2795 2796 2797 2798 2799 2800 2801 2802 2803 2804 2805 2806 2807 2808 2809 2810 2811 2812 2813 2814 2815 2816 2817 2818 2819 2820 2821 2822 2823 2824 2825 2826 2827 2828 2829 2830 2831 2832 2833 2834 2835 2836 2837 2838 2839 2840 2841 2842 2843 2844 2845 2846 2847 2848 2849 2850 2851 2852 2853 2854 2855 2856 2857 2858 2859 2860 2861 2862 2863 2864 2865 2866 2867 2868 2869 2870 2871 2872 2873 2874 2875 2876 2877 2878 2879 2880 2881 2882 2883 2884 2885 2886 2887 2888 2889 2890 2891 2892 2893 2894 2895 2896 2897 2898 2899 2900 2901 2902 2903 2904 2905 2906 2907 2908 2909 2910 2911 2912 2913 2914 2915 2916 2917 2918 2919 2920 2921 2922 2923 2924 2925 2926 2927 2928 2929 2930 2931 2932 2933 2934 2935 2936 2937 2938 2939 2940 2941 2942 2943 2944 2945 2946 2947 2948 2949 2950 2951 2952 2953 2954 2955 2956 2957 2958 2959 2960 2961 2962 2963 2964 2965 2966 2967 2968 2969 2970 2971 2972 2973 2974 2975 2976 2977 2978 2979 2980 2981 2982 2983 2984 2985 2986 2987 2988 2989 2990 2991 2992 2993 2994 2995 2996 2997 2998 2999 3000 3001 3002 3003 3004 3005 3006 3007 3008 3009 3010 3011 3012 3013 3014 3015 3016 3017 3018 3019 3020 3021 3022 3023 3024 3025 3026 3027 3028 3029 3030 3031 3032 3033 3034 3035 3036 3037 3038 3039 3040 3041 3042 3043 3044 3045 3046 3047 3048 3049 3050 3051 3052 3053 3054 3055 3056 3057 3058 3059 3060 3061 3062 3063 3064 3065 3066 3067 3068 3069 3070 3071 3072 3073 3074 3075 3076 3077 3078 3079 3080 3081 3082 3083 3084 3085 3086 3087 3088 3089 3090 3091 3092 3093 3094 3095 3096 3097 3098 3099 3100 3101 3102 3103 3104 3105 3106 3107 3108 3109 3110 3111 3112 3113 3114 3115 3116 3117 3118 3119 3120 3121 3122 3123 3124 3125 3126 3127 3128 3129 3130 3131 3132 3133 3134 3135 3136 3137 3138 3139 3140 3141 3142 3143 3144 3145 3146 3147 3148 3149 3150 3151 3152 3153 3154 3155 3156 3157 3158 3159 3160 3161 3162 3163 3164 3165 3166 3167 3168 3169 3170 3171 3172 3173 3174 3175 3176 3177 3178 3179 3180 3181 3182 3183 3184 3185 3186 3187 3188 3189 3190 3191 3192 3193 3194 3195 3196 3197 3198 3199 3200 3201 3202 3203 3204 3205 3206 3207 3208 3209 3210 3211 3212 3213 3214 3215 3216 3217 3218 3219 3220 3221 3222 3223 3224 3225 3226 3227 3228 3229 3230 3231 3232 3233 3234 3235 3236 3237 3238 3239 3240 3241 3242 3243 3244 3245 3246 3247 3248 3249 3250 3251 3252 3253 3254 3255 3256 3257 3258 3259 3260 3261 3262 3263 3264 3265 3266 3267 3268 3269 3270 3271 3272 3273 3274 3275 3276 3277 3278 3279 3280 3281 3282 3283 3284 3285 3286 3287 3288 3289 3290 3291 3292 3293 3294 3295 3296 3297 3298 3299 3300 3301 3302 3303 3304 3305 3306 3307 3308 3309 3310 3311 3312 3313 3314 3315 3316 3317 3318 3319 3320 3321 3322 3323 3324 3325 3326 3327 3328 3329 3330 3331 3332 3333 3334 3335 3336 3337 3338 3339 3340 3341 3342 3343 3344 3345 3346 3347 3348 3349 3350 3351 3352 3353 3354 3355 3356 3357 3358 3359 3360 3361 3362 3363 3364 3365 3366 3367 3368 3369 3370 3371 3372 3373 3374 3375 3376 3377 3378 3379 3380 3381 3382 3383 3384 3385 3386 3387 3388 3389 3390 3391 3392 3393 3394 3395 3396 3397 3398 3399 3400 3401 3402 3403 3404 3405 3406 3407 3408 3409 3410 3411 3412 3413 3414 3415 3416 3417 3418 3419 3420 3421 3422 3423 3424 3425 3426 3427 3428 3429 3430 3431 3432 3433 3434 3435 3436 3437 3438 3439 3440 3441 3442 3443 3444 3445 3446 3447 3448 3449 3450 3451 3452 3453 3454 3455 3456 3457 3458 3459 3460 3461 3462 3463 3464 3465 3466 3467 3468 3469 3470 3471 3472 3473 3474 3475 3476 3477 3478 3479 3480 3481 3482 3483 3484 3485 3486 3487 3488 3489 3490 3491 3492 3493 3494 3495 3496 3497 3498 3499 3500 3501 3502 3503 3504 3505 3506 3507 3508 3509 3510 3511 3512 3513 3514 3515 3516 3517 3518 3519 3520 3521 3522 3523 3524 3525 3526 3527 3528 3529 3530 3531 3532 3533 3534 3535 3536 3537 3538 3539 3540 3541 3542 3543 3544 3545 3546 3547 3548 3549 3550 3551 3552 3553 3554 3555 3556 3557 3558 3559 3560 3561 3562 3563 3564 3565 3566 3567 3568 3569 3570 3571 3572 3573 3574 3575 3576 3577 3578 3579 3580 3581 3582 3583 3584 3585 3586 3587 3588 3589 3590 3591 3592 3593 3594 3595 3596 3597 3598 3599 3600 3601 3602 3603 3604 3605 3606 3607 3608 3609 3610 3611 3612 3613 3614 3615 3616 3617 3618 3619 3620 3621 3622 3623 3624 3625 3626 3627 3628 3629 3630 3631 3632 3633 3634 3635 3636 3637 3638 3639 3640 3641 3642 3643 3644 3645 3646 3647 3648 3649 3650 3651 3652 3653 3654 3655 3656 3657 3658 3659 3660 3661 3662 3663 3664 3665 3666 3667 3668 3669 3670 3671 3672 3673 3674 3675 3676 3677 3678 3679 3680 3681 3682 3683 3684 3685 3686 3687 3688 3689 3690 3691 3692 3693 3694 3695 3696 3697 3698 3699 3700 3701 3702 3703 3704 3705 3706 3707 3708 3709 3710 3711 3712 3713 3714 3715 3716 3717 3718 3719 3720 3721 3722 3723 3724 3725 3726 3727 3728 3729 3730 3731 3732 3733 3734 3735 3736 3737 3738 3739 3740 3741 3742 3743 3744 3745 3746 3747 3748 3749 3750 3751 3752 3753 3754 3755 3756 3757 3758 3759 3760 3761 3762 3763 3764 3765 3766 3767 3768 3769 3770 3771 3772 3773 3774 3775 3776 3777 3778 3779 3780 3781 3782 3783 3784 3785 3786 3787 3788 3789 3790 3791 3792 3793 3794 3795 3796 3797 3798 3799 3800 3801 3802 3803 3804 3805 3806 3807 3808 3809 3810 3811 3812 3813 3814 3815 3816 3817 3818 3819 3820 3821 3822 3823 3824 3825 3826 3827 3828 3829 3830 3831 3832 3833 3834 3835 3836 3837 3838 3839 3840 3841 3842 3843 3844 3845 3846 3847 3848 3849 3850 3851 3852 3853 3854 3855 3856 3857 3858 3859 3860 3861 3862 3863 3864 3865 3866 3867 3868 3869 3870 3871 3872 3873 3874 3875 3876 3877 3878 3879 3880 3881 3882 3883 3884 3885 3886 3887 3888 3889 3890 3891 3892 3893 3894 3895 3896 3897 3898 3899 3900 3901 3902 3903 3904 3905 3906 3907 3908 3909 3910 3911 3912 3913 3914 3915 3916 3917 3918 3919 3920 3921 3922 3923 3924 3925 3926 3927 3928 3929 3930 3931 3932 3933 3934 3935 3936 3937 3938 3939 3940 3941 3942 3943 3944 3945 3946 3947 3948 3949 3950 3951 3952 3953 3954 3955 3956 3957 3958 3959 3960 3961 3962 3963 3964 3965 3966 3967 3968 3969 3970 3971 3972 3973 3974 3975 3976 3977 3978 3979 3980 3981 3982 3983 3984 3985 3986 3987 3988 3989 3990 3991 3992 3993 3994 3995 3996 3997 3998 3999 4000 4001 4002 4003 4004 4005 4006 4007 4008 4009 4010 4011 4012 4013 4014 4015 4016 4017 4018 4019 4020 4021 4022 4023 4024 4025 4026 4027 4028 4029 4030 4031 4032 4033 4034 4035 4036 4037 4038 4039 4040 4041 4042 4043 4044 4045 4046 4047 4048 4049 4050 4051 4052 4053 4054 4055 4056 4057 4058 4059 4060 4061 4062 4063 4064 4065 4066 4067 4068 4069 4070 4071 4072 4073 4074 4075 4076 4077 4078 4079 4080 4081 4082 4083 4084 4085 4086 4087 4088 4089 4090 4091 4092 4093 4094 4095 4096 4097 4098 4099 4100 4101 4102 4103 4104 4105 4106 4107 4108 4109 4110 4111 4112 4113 4114 4115 4116 4117 4118 4119 4120 4121 4122 4123 4124 4125 4126 4127 4128 4129 4130 4131 4132 4133 4134 4135 4136 4137 4138 4139 4140 4141 4142 4143 4144 4145 4146 4147 4148 4149 4150 4151 4152 4153 4154 4155 4156 4157 4158 4159 4160 4161 4162 4163 4164 4165 4166 4167 4168 4169 4170 4171 4172 4173 4174 4175 4176 4177 4178 4179 4180 4181 4182 4183 4184 4185 4186 4187 4188 4189 4190 4191 4192 4193 4194 4195 4196 4197 4198 4199 4200 4201 4202 4203 4204 4205 4206 4207 4208 4209 4210 4211 4212 4213 4214 4215 4216 4217 4218 4219 4220 4221 4222 4223 4224 4225 4226 4227 4228 4229 4230 4231 4232 4233 4234 4235 4236 4237 4238 4239 4240 4241 4242 4243 4244 4245 4246 4247 4248 4249 4250 4251 4252 4253 4254 4255 4256 4257 4258 4259 4260 4261 4262 4263 4264 4265 4266 4267 4268 4269 4270 4271 4272 4273 4274 4275 4276 4277 4278 4279 4280 4281 4282 4283 4284 4285 4286 4287 4288 4289 4290 4291 4292 4293 4294 4295 4296 4297 4298 4299 4300 4301 4302 4303 4304 4305 4306 4307 4308 4309 4310 4311 4312 4313 4314 4315 4316 4317 4318 4319 4320 4321 4322 4323 4324 4325 4326 4327 4328 4329 4330 4331 4332 4333 4334 4335 4336 4337 4338 4339 4340 4341 4342 4343 4344 4345 4346 4347 4348 4349 4350 4351 4352 4353 4354 4355 4356 4357 4358 4359 4360 4361 4362 4363 4364 4365 4366 4367 4368 4369 4370 4371 4372 4373 4374 4375 4376 4377 4378 4379 4380 4381 4382 4383 4384 4385 4386 4387 4388 4389 4390 4391 4392 4393 4394 4395 4396 4397 4398 4399 4400 4401 4402 4403 4404 4405 4406 4407 4408 4409 4410 4411 4412 4413 4414 4415 4416 4417 4418 4419 4420 4421 4422 4423 4424 4425 4426 4427 4428 4429 4430 4431 4432 4433 4434 4435 4436 4437 4438 4439 4440 4441 4442 4443 4444 4445 4446 4447 4448 4449 4450 4451 4452 4453 4454 4455 4456 4457 4458 4459 4460 4461 4462 4463 4464 4465 4466 4467 4468 4469 4470 4471 4472 4473 4474 4475 4476 4477 4478 4479 4480 4481 4482 4483 4484 4485 4486 4487 4488 4489 4490 4491 4492 4493 4494 4495 4496 4497 4498 4499 4500 4501 4502 4503 4504 4505 4506 4507 4508 4509 4510 4511 4512 4513 4514 4515 4516 4517 4518 4519 4520 4521 4522 4523 4524 4525 4526 4527 4528 4529 4530 4531 4532 4533 4534 4535 4536 4537 4538 4539 4540 4541 4542 4543 4544 4545 4546 4547 4548 4549 4550 4551 4552 4553 4554 4555 4556 4557 4558 4559 4560 4561 4562 4563 4564 4565 4566 4567 4568 4569 4570 4571 4572 4573 4574 4575 4576 4577 4578 4579 4580 4581 4582 4583 4584 4585 4586 4587 4588 4589 4590 4591 4592 4593 4594 4595 4596 4597 4598 4599 4600 4601 4602 4603 4604 4605 4606 4607 4608 4609 4610 4611 4612 4613 4614 4615 4616 4617 4618 4619 4620 4621 4622 4623 4624 4625 4626 4627 4628 4629 4630 4631 4632 4633 4634 4635 4636 4637 4638 4639 4640 4641 4642 4643 4644 4645 4646 4647 4648 4649 4650 4651 4652 4653 4654 4655 4656 4657 4658 4659 4660 4661 4662 4663 4664 4665 4666 4667 4668 4669 4670 4671 4672 4673 4674 4675 4676 4677 4678 4679 4680 4681 4682 4683 4684 4685 4686 4687 4688 4689 4690 4691 4692 4693 4694 4695 4696 4697 4698 4699 4700 4701 4702 4703 4704 4705 4706 4707 4708 4709 4710 4711 4712 4713 4714 4715 4716 4717 4718 4719 4720 4721 4722 4723 4724 4725 4726 4727 4728 4729 4730 4731 4732 4733 4734 4735 4736 4737 4738 4739 4740 4741 4742 4743 4744 4745 4746 4747 4748 4749 4750 4751 4752 4753 4754 4755 4756 4757 4758 4759 4760 4761 4762 4763 4764 4765 4766 4767 4768 4769 4770 4771 4772 4773 4774 4775 4776 4777 4778 4779 4780 4781 4782 4783 4784 4785 4786 4787 4788 4789 4790 4791 4792 4793 4794 4795 4796 4797 4798 4799 4800 4801 4802 4803 4804 4805 4806 4807 4808 4809 4810 4811 4812 4813 4814 4815 4816 4817 4818 4819 4820 4821 4822 4823 4824 4825 4826 4827 4828 4829 4830 4831 4832 4833 4834 4835 4836 4837 4838 4839 4840 4841 4842 4843 4844 4845 4846 4847 4848 4849 4850 4851 4852 4853 4854 4855 4856 4857 4858 4859 4860 4861 4862 4863 4864 4865 4866 4867 4868 4869 4870 4871 4872 4873 4874 4875 4876 4877 4878 4879 4880 4881 4882 4883 4884 4885 4886 4887 4888 4889 4890 4891 4892 4893 4894 4895 4896 4897 4898 4899 4900 4901 4902 4903 4904 4905 4906 4907 4908 4909 4910 4911 4912 4913 4914 4915 4916 4917 4918 4919 4920 4921 4922 4923 4924 4925 4926 4927 4928 4929 4930 4931 4932 4933 4934 4935 4936 4937 4938 4939 4940 4941 4942 4943 4944 4945 4946 4947 4948 4949 4950 4951 4952 4953 4954 4955 4956 4957 4958 4959 4960 4961 4962 4963 4964 4965 4966 4967 4968 4969 4970 4971 4972 4973 4974 4975 4976 4977 4978 4979 4980 4981 4982 4983 4984 4985 4986 4987 4988 4989 4990 4991 4992 4993 4994 4995 4996 4997 4998 4999 5000 5001 5002 5003 5004 5005 5006 5007 5008 5009 5010 5011 5012 5013 5014 5015 5016 5017 5018 5019 5020 5021 5022 5023 5024 5025 5026 5027 5028 5029 5030 5031 5032 5033 5034 5035 5036 5037 5038 5039 5040 5041 5042 5043 5044 5045 5046 5047 5048 5049 5050 5051 5052 5053 5054 5055 5056 5057 5058 5059 5060 5061 5062 5063 5064 5065 5066 5067 5068 5069 5070 5071 5072 5073 5074 5075 5076 5077 5078 5079 5080 5081 5082 5083 5084 5085 5086 5087 5088 5089 5090 5091 5092 5093 5094 5095 5096 5097 5098 5099 5100 5101 5102 5103 5104 5105 5106 5107 5108 5109 5110 5111 5112 5113 5114 5115 5116 5117 5118 5119 5120 5121 5122 5123 5124 5125 5126 5127 5128 5129 5130 5131 5132 5133 5134 5135 5136 5137 5138 5139 5140 5141 5142 5143 5144 5145 5146 5147 5148 5149 5150 5151 5152 5153 5154 5155 5156 5157 5158 5159 5160 5161 5162 5163 5164 5165 5166 5167 5168 5169 5170 5171 5172 5173 5174 5175 5176 5177 5178 5179 5180 5181 5182 5183 5184 5185 5186 5187 5188 5189 5190 5191 5192 5193 5194 5195 5196 5197 5198 5199 5200 5201 5202 5203 5204 5205 5206 5207 5208 5209 5210 5211 5212 5213 5214 5215 5216 5217 5218 5219 5220 5221 5222 5223 5224 5225 5226 5227 5228 5229 5230 5231 5232 5233 5234 5235 5236 5237 5238 5239 5240 5241 5242 5243 5244 5245 5246 5247 5248 5249 5250 5251 5252 5253 5254 5255 5256 5257 5258 5259 5260 5261 5262 5263 5264 5265 5266 5267 5268 5269 5270 5271 5272 5273 5274 5275 5276 5277 5278 5279 5280 5281 5282 5283 5284 5285 5286 5287 5288 5289 5290 5291 5292 5293 5294 5295 5296 5297 5298 5299 5300 5301 5302 5303 5304 5305 5306 5307 5308 5309 5310 5311 5312 5313 5314 5315 5316 5317 5318 5319 5320 5321 5322 5323 5324 5325 5326 5327 5328 5329 5330 5331 5332 5333 5334 5335 5336 5337 5338 5339 5340 5341 5342 5343 5344 5345 5346 5347 5348 5349 5350 5351 5352 5353 5354 5355 5356 5357 5358 5359 5360 5361 5362 5363 5364 5365 5366 5367 5368 5369 5370 5371 5372 5373 5374 5375 5376 5377 5378 5379 5380 5381 5382 5383 5384 5385 5386 5387 5388 5389 5390 5391 5392 5393 5394 5395 5396 5397 5398 5399 5400 5401 5402 5403 5404 5405 5406 5407 5408 5409 5410 5411 5412 5413 5414 5415 5416 5417 5418 5419 5420 5421 5422 5423 5424 5425 5426 5427 5428 5429 5430 5431 5432 5433 5434 5435 5436 5437 5438 5439 5440 5441 5442 5443 5444 5445 5446 5447 5448 5449 5450 5451 5452 5453 5454 5455 5456 5457 5458 5459 5460 5461 5462 5463 5464 5465 5466 5467 5468 5469 5470 5471 5472 5473 5474 5475 5476 5477 5478 5479 5480 5481 5482 5483 5484 5485 5486 5487 5488 5489 5490 5491 5492 5493 5494 5495 5496 5497 5498 5499 5500 5501 5502 5503 5504 5505 5506 5507 5508 5509 5510 5511 5512 5513 5514 5515 5516 5517 5518 5519 5520 5521 5522 5523 5524 5525 5526 5527 5528 5529 5530 5531 5532 5533 5534 5535 5536 5537 5538 5539 5540 5541 5542 5543 5544 5545 5546 5547 5548 5549 5550 5551 5552 5553 5554 5555 5556 5557 5558 5559 5560 5561 5562 5563 5564 5565 5566 5567 5568 5569 5570 5571 5572 5573 5574 5575 5576 5577 5578 5579 5580 5581 5582 5583 5584 5585 5586 5587 5588 5589 5590 5591 5592 5593 5594 5595 5596 5597 5598 5599 5600 5601 5602 5603 5604 5605 5606 5607 5608 5609 5610 5611 5612 5613 5614 5615 5616 5617 5618 5619 5620 5621 5622 5623 5624 5625 5626 5627 5628 5629 5630 5631 5632 5633 5634 5635 5636 5637 5638 5639 5640 5641 5642 5643 5644 5645 5646 5647 5648 5649 5650 5651 5652 5653 5654 5655 5656 5657 5658 5659 5660 5661 5662 5663 5664 5665 5666 5667 5668 5669 5670 5671 5672 5673 5674 5675 5676 5677 5678 5679 5680 5681 5682 5683 5684 5685 5686 5687 5688 5689 5690 5691 5692 5693 5694 5695 5696 5697 5698 5699 5700 5701 5702 5703 5704 5705 5706 5707 5708 5709 5710 5711 5712 5713 5714 5715 5716 5717 5718 5719 5720 5721 5722 5723 5724 5725 5726 5727 5728 5729 5730 5731 5732 5733 5734 5735 5736 5737 5738 5739 5740 5741 5742 5743 5744 5745 5746 5747 5748 5749 5750 5751 5752 5753 5754 5755 5756 5757 5758 5759 5760 5761 5762 5763 5764 5765 5766 5767 5768 5769 5770 5771 5772 5773 5774 5775 5776 5777 5778 5779 5780 5781 5782 5783 5784 5785 5786 5787 5788 5789 5790 5791 5792 5793 5794 5795 5796 5797 5798 5799 5800 5801 5802 5803 5804 5805 5806 5807 5808 5809 5810 5811 5812 5813 5814 5815 5816 5817 5818 5819 5820 5821 5822 5823 5824 5825 5826 5827 5828 5829 5830 5831 5832 5833 5834 5835 5836 5837 5838 5839 5840 5841 5842 5843 5844 5845 5846 5847 5848 5849 5850 5851 5852 5853 5854 5855 5856 5857 5858 5859 5860 5861 5862 5863 5864 5865 5866 5867 5868 5869 5870 5871 5872 5873 5874 5875 5876 5877 5878 5879 5880 5881 5882 5883 5884 5885 5886 5887 5888 5889 5890 5891 5892 5893 5894 5895 5896 5897 5898 5899 5900 5901 5902 5903 5904 5905 5906 5907 5908 5909 5910 5911 5912 5913 5914 5915 5916 5917 5918 5919 5920 5921 5922 5923 5924 5925 5926 5927 5928 5929 5930 5931 5932 5933 5934 5935 5936 5937 5938 5939 5940 5941 5942 5943 5944 5945 5946 5947 5948 5949 5950 5951 5952 5953 5954 5955 5956 5957 5958 5959 5960 5961 5962 5963 5964 5965 5966 5967 5968 5969 5970 5971 5972 5973 5974 5975 5976 5977 5978 5979 5980 5981 5982 5983 5984 5985 5986 5987 5988 5989 5990 5991 5992 5993 5994 5995 5996 5997 5998 5999 6000 6001 6002 6003 6004 6005 6006 6007 6008 6009 6010 6011 6012 6013 6014 6015 6016 6017 6018 6019 6020 6021 6022 6023 6024 6025 6026 6027 6028 6029 6030 6031 6032 6033 6034 6035 6036 6037 6038 6039 6040 6041 6042 6043 6044 6045 6046 6047 6048 6049 6050 6051 6052 6053 6054 6055 6056 6057 6058 6059 6060 6061 6062 6063 6064 6065 6066 6067 6068 6069 6070 6071 6072 6073 6074 6075 6076 6077 6078 6079 6080 6081 6082 6083 6084 6085 6086 6087 6088 6089 6090 6091 6092 6093 6094 6095 6096 6097 6098 6099 6100 6101 6102 6103 6104 6105 6106 6107 6108 6109 6110 6111 6112 6113 6114 6115 6116 6117 6118 6119 6120 6121 6122 6123 6124 6125 6126 6127 6128 6129 6130 6131 6132 6133 6134 6135 6136 6137 6138 6139 6140 6141 6142 6143 6144 6145 6146 6147 6148 6149 6150 6151 6152 6153 6154 6155 6156 6157 6158 6159 6160 6161 6162 6163 6164 6165 6166 6167 6168 6169 6170 6171 6172 6173 6174 6175 6176 6177 6178 6179 6180 6181 6182 6183 6184 6185 6186 6187 6188 6189 6190 6191 6192 6193 6194 6195 6196 6197 6198 6199 6200 6201 6202 6203 6204 6205 6206 6207 6208 6209 6210 6211 6212 6213 6214 6215 6216 6217 6218 6219 6220 6221 6222 6223 6224 6225 6226 6227 6228 6229 6230 6231 6232 6233 6234 6235 6236 6237 6238 6239 6240 6241 6242 6243 6244 6245 6246 6247 6248 6249 6250 6251 6252 6253 6254 6255 6256 6257 6258 6259 6260 6261 6262 6263 6264 6265 6266 6267 6268 6269 6270 6271 6272 6273 6274 6275 6276 6277 6278 6279 6280 6281 6282 6283 6284 6285 6286 6287 6288 6289 6290 6291 6292 6293 6294 6295 6296 6297 6298 6299 6300 6301 6302 6303 6304 6305 6306 6307 6308 6309 6310 6311 6312 6313 6314 6315 6316 6317 6318 6319 6320 6321 6322 6323 6324 6325 6326 6327 6328 6329 6330 6331 6332 6333 6334 6335 6336 6337 6338 6339 6340 6341 6342 6343 6344 6345 6346 6347 6348 6349 6350 6351 6352 6353 6354 6355 6356 6357 6358 6359 6360 6361 6362 6363 6364 6365 6366 6367 6368 6369 6370 6371 6372 6373 6374 6375 6376 6377 6378 6379 6380 6381 6382 6383 6384 6385 6386 6387 6388 6389 6390 6391 6392 6393 6394 6395 6396 6397 6398 6399 6400 6401 6402 6403 6404 6405 6406 6407 6408 6409 6410 6411 6412 6413 6414 6415 6416 6417 6418 6419 6420 6421 6422 6423 6424 6425 6426 6427 6428 6429 6430 6431 6432 6433 6434 6435 6436 6437 6438 6439 6440 6441 6442 6443 6444 6445 6446 6447 6448 6449 6450 6451 6452 6453 6454 6455 6456 6457 6458 6459 6460 6461 6462 6463 6464 6465 6466 6467 6468 6469 6470 6471 6472 6473 6474 6475 6476 6477 6478 6479 6480 6481 6482 6483 6484 6485 6486 6487 6488 6489 6490 6491 6492 6493 6494 6495 6496 6497 6498 6499 6500 6501 6502 6503 6504 6505 6506 6507 6508 6509 6510 6511 6512 6513 6514 6515 6516 6517 6518 6519 6520 6521 6522 6523 6524 6525 6526 6527 6528 6529 6530 6531 6532 6533 6534 6535 6536 6537 6538 6539 6540 6541 6542 6543 6544 6545 6546 6547 6548 6549 6550 6551 6552 6553 6554 6555 6556 6557 6558 6559 6560 6561 6562 6563 6564 6565 6566 6567 6568 6569 6570 6571 6572 6573 6574 6575 6576 6577 6578 6579 6580 6581 6582 6583 6584 6585 6586 6587 6588 6589 6590 6591 6592 6593 6594 6595 6596 6597 6598 6599 6600 6601 6602 6603 6604 6605 6606 6607 6608 6609 6610 6611 6612 6613 6614 6615 6616 6617 6618 6619 6620 6621 6622 6623 6624 6625 6626 6627 6628 6629 6630 6631 6632 6633 6634 6635 6636 6637 6638 6639 6640 6641 6642 6643 6644 6645 6646 6647 6648 6649 6650 6651 6652 6653 6654 6655 6656 6657 6658 6659 6660 6661 6662 6663 6664 6665 6666 6667 6668 6669 6670 6671 6672 6673 6674 6675 6676 6677 6678 6679 6680 6681 6682 6683 6684 6685 6686 6687 6688 6689 6690 6691 6692 6693 6694 6695 6696 6697 6698 6699 6700 6701 6702 6703 6704 6705 6706 6707 6708 6709 6710 6711 6712 6713 6714 6715 6716 6717 6718 6719 6720 6721 6722 6723 6724 6725 6726 6727 6728 6729 6730 6731 6732 6733 6734 6735 6736 6737 6738 6739 6740 6741 6742 6743 6744 6745 6746 6747 6748 6749 6750 6751 6752 6753 6754 6755 6756 6757 6758 6759 6760 6761 6762 6763 6764 6765 6766 6767 6768 6769 6770 6771 6772 6773 6774 6775 6776 6777 6778 6779 6780 6781 6782 6783 6784 6785 6786 6787 6788 6789 6790 6791 6792 6793 6794 6795 6796 6797 6798 6799 6800 6801 6802 6803 6804 6805 6806 6807 6808 6809 6810 6811 6812 6813 6814 6815 6816 6817 6818 6819 6820 6821 6822 6823 6824 6825 6826 6827 6828 6829 6830 6831 6832 6833 6834 6835 6836 6837 6838 6839 6840 6841 6842 6843 6844 6845 6846 6847 6848 6849 6850 6851 6852 6853 6854 6855 6856 6857 6858 6859 6860 6861 6862 6863 6864 6865 6866 6867 6868 6869 6870 6871 6872 6873 6874 6875 6876 6877 6878 6879 6880 6881 6882 6883 6884 6885 6886 6887 6888 6889 6890 6891 6892 6893 6894 6895 6896 6897 6898 6899 6900 6901 6902 6903 6904 6905 6906 6907 6908 6909 6910 6911 6912 6913 6914 6915 6916 6917 6918 6919 6920 6921 6922 6923 6924 6925 6926 6927 6928 6929 6930 6931 6932 6933 6934 6935 6936 6937 6938 6939 6940 6941 6942 6943 6944 6945 6946 6947 6948 6949 6950 6951 6952 6953 6954 6955 6956 6957 6958 6959 6960 6961 6962 6963 6964 6965 6966 6967 6968 6969 6970 6971 6972 6973 6974 6975 6976 6977 6978 6979 6980 6981 6982 6983 6984 6985 6986 6987 6988 6989 6990 6991 6992 6993 6994 6995 6996 6997 6998 6999 7000 7001 7002 7003 7004 7005 7006 7007 7008 7009 7010 7011 7012 7013 7014 7015 7016 7017 7018 7019 7020 7021 7022 7023 7024 7025 7026 7027 7028 7029 7030 7031 7032 7033 7034 7035 7036 7037 7038 7039 7040 7041 7042 7043 7044 7045 7046 7047 7048 7049 7050 7051 7052 7053 7054 7055 7056 7057 7058 7059 7060 7061 7062 7063 7064 7065 7066 7067 7068 7069 7070 7071 7072 7073 7074 7075 7076 7077 7078 7079 7080 7081 7082 7083 7084 7085 7086 7087 7088 7089 7090 7091 7092 7093 7094 7095 7096 7097 7098 7099 7100 7101 7102 7103 7104 7105 7106 7107 7108 7109 7110 7111 7112 7113 7114 7115 7116 7117 7118 7119 7120 7121 7122 7123 7124 7125 7126 7127 7128 7129 7130 7131 7132 7133 7134 7135 7136 7137 7138 7139 7140 7141 7142 7143 7144 7145 7146 7147 7148 7149 7150 7151 7152 7153 7154 7155 7156 7157 7158 7159 7160 7161 7162 7163 7164 7165 7166 7167 7168 7169 7170 7171 7172 7173 7174 7175 7176 7177 7178 7179 7180 7181 7182 7183 7184 7185 7186 7187 7188 7189 7190 7191 7192 7193 7194 7195 7196 7197 7198 7199 7200 7201 7202 7203 7204 7205 7206 7207 7208 7209 7210 7211 7212 7213 7214 7215 7216 7217 7218 7219 7220 7221 7222 7223 7224 7225 7226 7227 7228 7229 7230 7231 7232 7233 7234 7235 7236 7237 7238 7239 7240 7241 7242 7243 7244 7245 7246 7247 7248 7249 7250 7251 7252 7253 7254 7255 7256 7257 7258 7259 7260 7261 7262 7263 7264 7265 7266 7267 7268 7269 7270 7271 7272 7273 7274 7275 7276 7277 7278 7279 7280 7281 7282 7283 7284 7285 7286 7287 7288 7289 7290 7291 7292 7293 7294 7295 7296 7297 7298 7299 7300 7301 7302 7303 7304 7305 7306 7307 7308 7309 7310 7311 7312 7313 7314 7315 7316 7317 7318 7319 7320 7321 7322 7323 7324 7325 7326 7327 7328 7329 7330 7331 7332 7333 7334 7335 7336 7337 7338 7339 7340 7341 7342 7343 7344 7345 7346 7347 7348 7349 7350 7351 7352 7353 7354 7355 7356 7357 7358 7359 7360 7361 7362 7363 7364 7365 7366 7367 7368 7369 7370 7371 7372 7373 7374 // SPDX-License-Identifier: GPL-2.0-or-later /* * Routines having to do with the 'struct sk_buff' memory handlers. * * Authors: Alan Cox <alan@lxorguk.ukuu.org.uk> * Florian La Roche <rzsfl@rz.uni-sb.de> * * Fixes: * Alan Cox : Fixed the worst of the load * balancer bugs. * Dave Platt : Interrupt stacking fix. * Richard Kooijman : Timestamp fixes. * Alan Cox : Changed buffer format. * Alan Cox : destructor hook for AF_UNIX etc. * Linus Torvalds : Better skb_clone. * Alan Cox : Added skb_copy. * Alan Cox : Added all the changed routines Linus * only put in the headers * Ray VanTassle : Fixed --skb->lock in free * Alan Cox : skb_copy copy arp field * Andi Kleen : slabified it. * Robert Olsson : Removed skb_head_pool * * NOTE: * The __skb_ routines should be called with interrupts * disabled, or you better be *real* sure that the operation is atomic * with respect to whatever list is being frobbed (e.g. via lock_sock() * or via disabling bottom half handlers, etc). */ /* * The functions in this file will not compile correctly with gcc 2.4.x */ #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt #include <linux/module.h> #include <linux/types.h> #include <linux/kernel.h> #include <linux/mm.h> #include <linux/interrupt.h> #include <linux/in.h> #include <linux/inet.h> #include <linux/slab.h> #include <linux/tcp.h> #include <linux/udp.h> #include <linux/sctp.h> #include <linux/netdevice.h> #ifdef CONFIG_NET_CLS_ACT #include <net/pkt_sched.h> #endif #include <linux/string.h> #include <linux/skbuff.h> #include <linux/skbuff_ref.h> #include <linux/splice.h> #include <linux/cache.h> #include <linux/rtnetlink.h> #include <linux/init.h> #include <linux/scatterlist.h> #include <linux/errqueue.h> #include <linux/prefetch.h> #include <linux/bitfield.h> #include <linux/if_vlan.h> #include <linux/mpls.h> #include <linux/kcov.h> #include <linux/iov_iter.h> #include <linux/crc32.h> #include <net/protocol.h> #include <net/dst.h> #include <net/sock.h> #include <net/checksum.h> #include <net/gro.h> #include <net/gso.h> #include <net/hotdata.h> #include <net/ip6_checksum.h> #include <net/xfrm.h> #include <net/mpls.h> #include <net/mptcp.h> #include <net/mctp.h> #include <net/page_pool/helpers.h> #include <net/dropreason.h> #include <linux/uaccess.h> #include <trace/events/skb.h> #include <linux/highmem.h> #include <linux/capability.h> #include <linux/user_namespace.h> #include <linux/indirect_call_wrapper.h> #include <linux/textsearch.h> #include "dev.h" #include "devmem.h" #include "netmem_priv.h" #include "sock_destructor.h" #ifdef CONFIG_SKB_EXTENSIONS static struct kmem_cache *skbuff_ext_cache __ro_after_init; #endif #define GRO_MAX_HEAD_PAD (GRO_MAX_HEAD + NET_SKB_PAD + NET_IP_ALIGN) #define SKB_SMALL_HEAD_SIZE SKB_HEAD_ALIGN(max(MAX_TCP_HEADER, \ GRO_MAX_HEAD_PAD)) /* We want SKB_SMALL_HEAD_CACHE_SIZE to not be a power of two. * This should ensure that SKB_SMALL_HEAD_HEADROOM is a unique * size, and we can differentiate heads from skb_small_head_cache * vs system slabs by looking at their size (skb_end_offset()). */ #define SKB_SMALL_HEAD_CACHE_SIZE \ (is_power_of_2(SKB_SMALL_HEAD_SIZE) ? \ (SKB_SMALL_HEAD_SIZE + L1_CACHE_BYTES) : \ SKB_SMALL_HEAD_SIZE) #define SKB_SMALL_HEAD_HEADROOM \ SKB_WITH_OVERHEAD(SKB_SMALL_HEAD_CACHE_SIZE) /* kcm_write_msgs() relies on casting paged frags to bio_vec to use * iov_iter_bvec(). These static asserts ensure the cast is valid is long as the * netmem is a page. */ static_assert(offsetof(struct bio_vec, bv_page) == offsetof(skb_frag_t, netmem)); static_assert(sizeof_field(struct bio_vec, bv_page) == sizeof_field(skb_frag_t, netmem)); static_assert(offsetof(struct bio_vec, bv_len) == offsetof(skb_frag_t, len)); static_assert(sizeof_field(struct bio_vec, bv_len) == sizeof_field(skb_frag_t, len)); static_assert(offsetof(struct bio_vec, bv_offset) == offsetof(skb_frag_t, offset)); static_assert(sizeof_field(struct bio_vec, bv_offset) == sizeof_field(skb_frag_t, offset)); #undef FN #define FN(reason) [SKB_DROP_REASON_##reason] = #reason, static const char * const drop_reasons[] = { [SKB_CONSUMED] = "CONSUMED", DEFINE_DROP_REASON(FN, FN) }; static const struct drop_reason_list drop_reasons_core = { .reasons = drop_reasons, .n_reasons = ARRAY_SIZE(drop_reasons), }; const struct drop_reason_list __rcu * drop_reasons_by_subsys[SKB_DROP_REASON_SUBSYS_NUM] = { [SKB_DROP_REASON_SUBSYS_CORE] = RCU_INITIALIZER(&drop_reasons_core), }; EXPORT_SYMBOL(drop_reasons_by_subsys); /** * drop_reasons_register_subsys - register another drop reason subsystem * @subsys: the subsystem to register, must not be the core * @list: the list of drop reasons within the subsystem, must point to * a statically initialized list */ void drop_reasons_register_subsys(enum skb_drop_reason_subsys subsys, const struct drop_reason_list *list) { if (WARN(subsys <= SKB_DROP_REASON_SUBSYS_CORE || subsys >= ARRAY_SIZE(drop_reasons_by_subsys), "invalid subsystem %d\n", subsys)) return; /* must point to statically allocated memory, so INIT is OK */ RCU_INIT_POINTER(drop_reasons_by_subsys[subsys], list); } EXPORT_SYMBOL_GPL(drop_reasons_register_subsys); /** * drop_reasons_unregister_subsys - unregister a drop reason subsystem * @subsys: the subsystem to remove, must not be the core * * Note: This will synchronize_rcu() to ensure no users when it returns. */ void drop_reasons_unregister_subsys(enum skb_drop_reason_subsys subsys) { if (WARN(subsys <= SKB_DROP_REASON_SUBSYS_CORE || subsys >= ARRAY_SIZE(drop_reasons_by_subsys), "invalid subsystem %d\n", subsys)) return; RCU_INIT_POINTER(drop_reasons_by_subsys[subsys], NULL); synchronize_rcu(); } EXPORT_SYMBOL_GPL(drop_reasons_unregister_subsys); /** * skb_panic - private function for out-of-line support * @skb: buffer * @sz: size * @addr: address * @msg: skb_over_panic or skb_under_panic * * Out-of-line support for skb_put() and skb_push(). * Called via the wrapper skb_over_panic() or skb_under_panic(). * Keep out of line to prevent kernel bloat. * __builtin_return_address is not used because it is not always reliable. */ static void skb_panic(struct sk_buff *skb, unsigned int sz, void *addr, const char msg[]) { pr_emerg("%s: text:%px len:%d put:%d head:%px data:%px tail:%#lx end:%#lx dev:%s\n", msg, addr, skb->len, sz, skb->head, skb->data, (unsigned long)skb->tail, (unsigned long)skb->end, skb->dev ? skb->dev->name : "<NULL>"); BUG(); } static void skb_over_panic(struct sk_buff *skb, unsigned int sz, void *addr) { skb_panic(skb, sz, addr, __func__); } static void skb_under_panic(struct sk_buff *skb, unsigned int sz, void *addr) { skb_panic(skb, sz, addr, __func__); } #define NAPI_SKB_CACHE_SIZE 64 #define NAPI_SKB_CACHE_BULK 16 #define NAPI_SKB_CACHE_HALF (NAPI_SKB_CACHE_SIZE / 2) struct napi_alloc_cache { local_lock_t bh_lock; struct page_frag_cache page; unsigned int skb_count; void *skb_cache[NAPI_SKB_CACHE_SIZE]; }; static DEFINE_PER_CPU(struct page_frag_cache, netdev_alloc_cache); static DEFINE_PER_CPU(struct napi_alloc_cache, napi_alloc_cache) = { .bh_lock = INIT_LOCAL_LOCK(bh_lock), }; void *__napi_alloc_frag_align(unsigned int fragsz, unsigned int align_mask) { struct napi_alloc_cache *nc = this_cpu_ptr(&napi_alloc_cache); void *data; fragsz = SKB_DATA_ALIGN(fragsz); local_lock_nested_bh(&napi_alloc_cache.bh_lock); data = __page_frag_alloc_align(&nc->page, fragsz, GFP_ATOMIC | __GFP_NOWARN, align_mask); local_unlock_nested_bh(&napi_alloc_cache.bh_lock); return data; } EXPORT_SYMBOL(__napi_alloc_frag_align); void *__netdev_alloc_frag_align(unsigned int fragsz, unsigned int align_mask) { void *data; if (in_hardirq() || irqs_disabled()) { struct page_frag_cache *nc = this_cpu_ptr(&netdev_alloc_cache); fragsz = SKB_DATA_ALIGN(fragsz); data = __page_frag_alloc_align(nc, fragsz, GFP_ATOMIC | __GFP_NOWARN, align_mask); } else { local_bh_disable(); data = __napi_alloc_frag_align(fragsz, align_mask); local_bh_enable(); } return data; } EXPORT_SYMBOL(__netdev_alloc_frag_align); static struct sk_buff *napi_skb_cache_get(void) { struct napi_alloc_cache *nc = this_cpu_ptr(&napi_alloc_cache); struct sk_buff *skb; local_lock_nested_bh(&napi_alloc_cache.bh_lock); if (unlikely(!nc->skb_count)) { nc->skb_count = kmem_cache_alloc_bulk(net_hotdata.skbuff_cache, GFP_ATOMIC | __GFP_NOWARN, NAPI_SKB_CACHE_BULK, nc->skb_cache); if (unlikely(!nc->skb_count)) { local_unlock_nested_bh(&napi_alloc_cache.bh_lock); return NULL; } } skb = nc->skb_cache[--nc->skb_count]; local_unlock_nested_bh(&napi_alloc_cache.bh_lock); kasan_mempool_unpoison_object(skb, kmem_cache_size(net_hotdata.skbuff_cache)); return skb; } /** * napi_skb_cache_get_bulk - obtain a number of zeroed skb heads from the cache * @skbs: pointer to an at least @n-sized array to fill with skb pointers * @n: number of entries to provide * * Tries to obtain @n &sk_buff entries from the NAPI percpu cache and writes * the pointers into the provided array @skbs. If there are less entries * available, tries to replenish the cache and bulk-allocates the diff from * the MM layer if needed. * The heads are being zeroed with either memset() or %__GFP_ZERO, so they are * ready for {,__}build_skb_around() and don't have any data buffers attached. * Must be called *only* from the BH context. * * Return: number of successfully allocated skbs (@n if no actual allocation * needed or kmem_cache_alloc_bulk() didn't fail). */ u32 napi_skb_cache_get_bulk(void **skbs, u32 n) { struct napi_alloc_cache *nc = this_cpu_ptr(&napi_alloc_cache); u32 bulk, total = n; local_lock_nested_bh(&napi_alloc_cache.bh_lock); if (nc->skb_count >= n) goto get; /* No enough cached skbs. Try refilling the cache first */ bulk = min(NAPI_SKB_CACHE_SIZE - nc->skb_count, NAPI_SKB_CACHE_BULK); nc->skb_count += kmem_cache_alloc_bulk(net_hotdata.skbuff_cache, GFP_ATOMIC | __GFP_NOWARN, bulk, &nc->skb_cache[nc->skb_count]); if (likely(nc->skb_count >= n)) goto get; /* Still not enough. Bulk-allocate the missing part directly, zeroed */ n -= kmem_cache_alloc_bulk(net_hotdata.skbuff_cache, GFP_ATOMIC | __GFP_ZERO | __GFP_NOWARN, n - nc->skb_count, &skbs[nc->skb_count]); if (likely(nc->skb_count >= n)) goto get; /* kmem_cache didn't allocate the number we need, limit the output */ total -= n - nc->skb_count; n = nc->skb_count; get: for (u32 base = nc->skb_count - n, i = 0; i < n; i++) { u32 cache_size = kmem_cache_size(net_hotdata.skbuff_cache); skbs[i] = nc->skb_cache[base + i]; kasan_mempool_unpoison_object(skbs[i], cache_size); memset(skbs[i], 0, offsetof(struct sk_buff, tail)); } nc->skb_count -= n; local_unlock_nested_bh(&napi_alloc_cache.bh_lock); return total; } EXPORT_SYMBOL_GPL(napi_skb_cache_get_bulk); static inline void __finalize_skb_around(struct sk_buff *skb, void *data, unsigned int size) { struct skb_shared_info *shinfo; size -= SKB_DATA_ALIGN(sizeof(struct skb_shared_info)); /* Assumes caller memset cleared SKB */ skb->truesize = SKB_TRUESIZE(size); refcount_set(&skb->users, 1); skb->head = data; skb->data = data; skb_reset_tail_pointer(skb); skb_set_end_offset(skb, size); skb->mac_header = (typeof(skb->mac_header))~0U; skb->transport_header = (typeof(skb->transport_header))~0U; skb->alloc_cpu = raw_smp_processor_id(); /* make sure we initialize shinfo sequentially */ shinfo = skb_shinfo(skb); memset(shinfo, 0, offsetof(struct skb_shared_info, dataref)); atomic_set(&shinfo->dataref, 1); skb_set_kcov_handle(skb, kcov_common_handle()); } static inline void *__slab_build_skb(struct sk_buff *skb, void *data, unsigned int *size) { void *resized; /* Must find the allocation size (and grow it to match). */ *size = ksize(data); /* krealloc() will immediately return "data" when * "ksize(data)" is requested: it is the existing upper * bounds. As a result, GFP_ATOMIC will be ignored. Note * that this "new" pointer needs to be passed back to the * caller for use so the __alloc_size hinting will be * tracked correctly. */ resized = krealloc(data, *size, GFP_ATOMIC); WARN_ON_ONCE(resized != data); return resized; } /* build_skb() variant which can operate on slab buffers. * Note that this should be used sparingly as slab buffers * cannot be combined efficiently by GRO! */ struct sk_buff *slab_build_skb(void *data) { struct sk_buff *skb; unsigned int size; skb = kmem_cache_alloc(net_hotdata.skbuff_cache, GFP_ATOMIC | __GFP_NOWARN); if (unlikely(!skb)) return NULL; memset(skb, 0, offsetof(struct sk_buff, tail)); data = __slab_build_skb(skb, data, &size); __finalize_skb_around(skb, data, size); return skb; } EXPORT_SYMBOL(slab_build_skb); /* Caller must provide SKB that is memset cleared */ static void __build_skb_around(struct sk_buff *skb, void *data, unsigned int frag_size) { unsigned int size = frag_size; /* frag_size == 0 is considered deprecated now. Callers * using slab buffer should use slab_build_skb() instead. */ if (WARN_ONCE(size == 0, "Use slab_build_skb() instead")) data = __slab_build_skb(skb, data, &size); __finalize_skb_around(skb, data, size); } /** * __build_skb - build a network buffer * @data: data buffer provided by caller * @frag_size: size of data (must not be 0) * * Allocate a new &sk_buff. Caller provides space holding head and * skb_shared_info. @data must have been allocated from the page * allocator or vmalloc(). (A @frag_size of 0 to indicate a kmalloc() * allocation is deprecated, and callers should use slab_build_skb() * instead.) * The return is the new skb buffer. * On a failure the return is %NULL, and @data is not freed. * Notes : * Before IO, driver allocates only data buffer where NIC put incoming frame * Driver should add room at head (NET_SKB_PAD) and * MUST add room at tail (SKB_DATA_ALIGN(skb_shared_info)) * After IO, driver calls build_skb(), to allocate sk_buff and populate it * before giving packet to stack. * RX rings only contains data buffers, not full skbs. */ struct sk_buff *__build_skb(void *data, unsigned int frag_size) { struct sk_buff *skb; skb = kmem_cache_alloc(net_hotdata.skbuff_cache, GFP_ATOMIC | __GFP_NOWARN); if (unlikely(!skb)) return NULL; memset(skb, 0, offsetof(struct sk_buff, tail)); __build_skb_around(skb, data, frag_size); return skb; } /* build_skb() is wrapper over __build_skb(), that specifically * takes care of skb->head and skb->pfmemalloc */ struct sk_buff *build_skb(void *data, unsigned int frag_size) { struct sk_buff *skb = __build_skb(data, frag_size); if (likely(skb && frag_size)) { skb->head_frag = 1; skb_propagate_pfmemalloc(virt_to_head_page(data), skb); } return skb; } EXPORT_SYMBOL(build_skb); /** * build_skb_around - build a network buffer around provided skb * @skb: sk_buff provide by caller, must be memset cleared * @data: data buffer provided by caller * @frag_size: size of data */ struct sk_buff *build_skb_around(struct sk_buff *skb, void *data, unsigned int frag_size) { if (unlikely(!skb)) return NULL; __build_skb_around(skb, data, frag_size); if (frag_size) { skb->head_frag = 1; skb_propagate_pfmemalloc(virt_to_head_page(data), skb); } return skb; } EXPORT_SYMBOL(build_skb_around); /** * __napi_build_skb - build a network buffer * @data: data buffer provided by caller * @frag_size: size of data * * Version of __build_skb() that uses NAPI percpu caches to obtain * skbuff_head instead of inplace allocation. * * Returns a new &sk_buff on success, %NULL on allocation failure. */ static struct sk_buff *__napi_build_skb(void *data, unsigned int frag_size) { struct sk_buff *skb; skb = napi_skb_cache_get(); if (unlikely(!skb)) return NULL; memset(skb, 0, offsetof(struct sk_buff, tail)); __build_skb_around(skb, data, frag_size); return skb; } /** * napi_build_skb - build a network buffer * @data: data buffer provided by caller * @frag_size: size of data * * Version of __napi_build_skb() that takes care of skb->head_frag * and skb->pfmemalloc when the data is a page or page fragment. * * Returns a new &sk_buff on success, %NULL on allocation failure. */ struct sk_buff *napi_build_skb(void *data, unsigned int frag_size) { struct sk_buff *skb = __napi_build_skb(data, frag_size); if (likely(skb) && frag_size) { skb->head_frag = 1; skb_propagate_pfmemalloc(virt_to_head_page(data), skb); } return skb; } EXPORT_SYMBOL(napi_build_skb); /* * kmalloc_reserve is a wrapper around kmalloc_node_track_caller that tells * the caller if emergency pfmemalloc reserves are being used. If it is and * the socket is later found to be SOCK_MEMALLOC then PFMEMALLOC reserves * may be used. Otherwise, the packet data may be discarded until enough * memory is free */ static void *kmalloc_reserve(unsigned int *size, gfp_t flags, int node, bool *pfmemalloc) { bool ret_pfmemalloc = false; size_t obj_size; void *obj; obj_size = SKB_HEAD_ALIGN(*size); if (obj_size <= SKB_SMALL_HEAD_CACHE_SIZE && !(flags & KMALLOC_NOT_NORMAL_BITS)) { obj = kmem_cache_alloc_node(net_hotdata.skb_small_head_cache, flags | __GFP_NOMEMALLOC | __GFP_NOWARN, node); *size = SKB_SMALL_HEAD_CACHE_SIZE; if (obj || !(gfp_pfmemalloc_allowed(flags))) goto out; /* Try again but now we are using pfmemalloc reserves */ ret_pfmemalloc = true; obj = kmem_cache_alloc_node(net_hotdata.skb_small_head_cache, flags, node); goto out; } obj_size = kmalloc_size_roundup(obj_size); /* The following cast might truncate high-order bits of obj_size, this * is harmless because kmalloc(obj_size >= 2^32) will fail anyway. */ *size = (unsigned int)obj_size; /* * Try a regular allocation, when that fails and we're not entitled * to the reserves, fail. */ obj = kmalloc_node_track_caller(obj_size, flags | __GFP_NOMEMALLOC | __GFP_NOWARN, node); if (obj || !(gfp_pfmemalloc_allowed(flags))) goto out; /* Try again but now we are using pfmemalloc reserves */ ret_pfmemalloc = true; obj = kmalloc_node_track_caller(obj_size, flags, node); out: if (pfmemalloc) *pfmemalloc = ret_pfmemalloc; return obj; } /* Allocate a new skbuff. We do this ourselves so we can fill in a few * 'private' fields and also do memory statistics to find all the * [BEEP] leaks. * */ /** * __alloc_skb - allocate a network buffer * @size: size to allocate * @gfp_mask: allocation mask * @flags: If SKB_ALLOC_FCLONE is set, allocate from fclone cache * instead of head cache and allocate a cloned (child) skb. * If SKB_ALLOC_RX is set, __GFP_MEMALLOC will be used for * allocations in case the data is required for writeback * @node: numa node to allocate memory on * * Allocate a new &sk_buff. The returned buffer has no headroom and a * tail room of at least size bytes. The object has a reference count * of one. The return is the buffer. On a failure the return is %NULL. * * Buffers may only be allocated from interrupts using a @gfp_mask of * %GFP_ATOMIC. */ struct sk_buff *__alloc_skb(unsigned int size, gfp_t gfp_mask, int flags, int node) { struct kmem_cache *cache; struct sk_buff *skb; bool pfmemalloc; u8 *data; cache = (flags & SKB_ALLOC_FCLONE) ? net_hotdata.skbuff_fclone_cache : net_hotdata.skbuff_cache; if (sk_memalloc_socks() && (flags & SKB_ALLOC_RX)) gfp_mask |= __GFP_MEMALLOC; /* Get the HEAD */ if ((flags & (SKB_ALLOC_FCLONE | SKB_ALLOC_NAPI)) == SKB_ALLOC_NAPI && likely(node == NUMA_NO_NODE || node == numa_mem_id())) skb = napi_skb_cache_get(); else skb = kmem_cache_alloc_node(cache, gfp_mask & ~GFP_DMA, node); if (unlikely(!skb)) return NULL; prefetchw(skb); /* We do our best to align skb_shared_info on a separate cache * line. It usually works because kmalloc(X > SMP_CACHE_BYTES) gives * aligned memory blocks, unless SLUB/SLAB debug is enabled. * Both skb->head and skb_shared_info are cache line aligned. */ data = kmalloc_reserve(&size, gfp_mask, node, &pfmemalloc); if (unlikely(!data)) goto nodata; /* kmalloc_size_roundup() might give us more room than requested. * Put skb_shared_info exactly at the end of allocated zone, * to allow max possible filling before reallocation. */ prefetchw(data + SKB_WITH_OVERHEAD(size)); /* * Only clear those fields we need to clear, not those that we will * actually initialise below. Hence, don't put any more fields after * the tail pointer in struct sk_buff! */ memset(skb, 0, offsetof(struct sk_buff, tail)); __build_skb_around(skb, data, size); skb->pfmemalloc = pfmemalloc; if (flags & SKB_ALLOC_FCLONE) { struct sk_buff_fclones *fclones; fclones = container_of(skb, struct sk_buff_fclones, skb1); skb->fclone = SKB_FCLONE_ORIG; refcount_set(&fclones->fclone_ref, 1); } return skb; nodata: kmem_cache_free(cache, skb); return NULL; } EXPORT_SYMBOL(__alloc_skb); /** * __netdev_alloc_skb - allocate an skbuff for rx on a specific device * @dev: network device to receive on * @len: length to allocate * @gfp_mask: get_free_pages mask, passed to alloc_skb * * Allocate a new &sk_buff and assign it a usage count of one. The * buffer has NET_SKB_PAD headroom built in. Users should allocate * the headroom they think they need without accounting for the * built in space. The built in space is used for optimisations. * * %NULL is returned if there is no free memory. */ struct sk_buff *__netdev_alloc_skb(struct net_device *dev, unsigned int len, gfp_t gfp_mask) { struct page_frag_cache *nc; struct sk_buff *skb; bool pfmemalloc; void *data; len += NET_SKB_PAD; /* If requested length is either too small or too big, * we use kmalloc() for skb->head allocation. */ if (len <= SKB_WITH_OVERHEAD(SKB_SMALL_HEAD_CACHE_SIZE) || len > SKB_WITH_OVERHEAD(PAGE_SIZE) || (gfp_mask & (__GFP_DIRECT_RECLAIM | GFP_DMA))) { skb = __alloc_skb(len, gfp_mask, SKB_ALLOC_RX, NUMA_NO_NODE); if (!skb) goto skb_fail; goto skb_success; } len = SKB_HEAD_ALIGN(len); if (sk_memalloc_socks()) gfp_mask |= __GFP_MEMALLOC; if (in_hardirq() || irqs_disabled()) { nc = this_cpu_ptr(&netdev_alloc_cache); data = page_frag_alloc(nc, len, gfp_mask); pfmemalloc = page_frag_cache_is_pfmemalloc(nc); } else { local_bh_disable(); local_lock_nested_bh(&napi_alloc_cache.bh_lock); nc = this_cpu_ptr(&napi_alloc_cache.page); data = page_frag_alloc(nc, len, gfp_mask); pfmemalloc = page_frag_cache_is_pfmemalloc(nc); local_unlock_nested_bh(&napi_alloc_cache.bh_lock); local_bh_enable(); } if (unlikely(!data)) return NULL; skb = __build_skb(data, len); if (unlikely(!skb)) { skb_free_frag(data); return NULL; } if (pfmemalloc) skb->pfmemalloc = 1; skb->head_frag = 1; skb_success: skb_reserve(skb, NET_SKB_PAD); skb->dev = dev; skb_fail: return skb; } EXPORT_SYMBOL(__netdev_alloc_skb); /** * napi_alloc_skb - allocate skbuff for rx in a specific NAPI instance * @napi: napi instance this buffer was allocated for * @len: length to allocate * * Allocate a new sk_buff for use in NAPI receive. This buffer will * attempt to allocate the head from a special reserved region used * only for NAPI Rx allocation. By doing this we can save several * CPU cycles by avoiding having to disable and re-enable IRQs. * * %NULL is returned if there is no free memory. */ struct sk_buff *napi_alloc_skb(struct napi_struct *napi, unsigned int len) { gfp_t gfp_mask = GFP_ATOMIC | __GFP_NOWARN; struct napi_alloc_cache *nc; struct sk_buff *skb; bool pfmemalloc; void *data; DEBUG_NET_WARN_ON_ONCE(!in_softirq()); len += NET_SKB_PAD + NET_IP_ALIGN; /* If requested length is either too small or too big, * we use kmalloc() for skb->head allocation. */ if (len <= SKB_WITH_OVERHEAD(SKB_SMALL_HEAD_CACHE_SIZE) || len > SKB_WITH_OVERHEAD(PAGE_SIZE) || (gfp_mask & (__GFP_DIRECT_RECLAIM | GFP_DMA))) { skb = __alloc_skb(len, gfp_mask, SKB_ALLOC_RX | SKB_ALLOC_NAPI, NUMA_NO_NODE); if (!skb) goto skb_fail; goto skb_success; } len = SKB_HEAD_ALIGN(len); if (sk_memalloc_socks()) gfp_mask |= __GFP_MEMALLOC; local_lock_nested_bh(&napi_alloc_cache.bh_lock); nc = this_cpu_ptr(&napi_alloc_cache); data = page_frag_alloc(&nc->page, len, gfp_mask); pfmemalloc = page_frag_cache_is_pfmemalloc(&nc->page); local_unlock_nested_bh(&napi_alloc_cache.bh_lock); if (unlikely(!data)) return NULL; skb = __napi_build_skb(data, len); if (unlikely(!skb)) { skb_free_frag(data); return NULL; } if (pfmemalloc) skb->pfmemalloc = 1; skb->head_frag = 1; skb_success: skb_reserve(skb, NET_SKB_PAD + NET_IP_ALIGN); skb->dev = napi->dev; skb_fail: return skb; } EXPORT_SYMBOL(napi_alloc_skb); void skb_add_rx_frag_netmem(struct sk_buff *skb, int i, netmem_ref netmem, int off, int size, unsigned int truesize) { DEBUG_NET_WARN_ON_ONCE(size > truesize); skb_fill_netmem_desc(skb, i, netmem, off, size); skb->len += size; skb->data_len += size; skb->truesize += truesize; } EXPORT_SYMBOL(skb_add_rx_frag_netmem); void skb_coalesce_rx_frag(struct sk_buff *skb, int i, int size, unsigned int truesize) { skb_frag_t *frag = &skb_shinfo(skb)->frags[i]; DEBUG_NET_WARN_ON_ONCE(size > truesize); skb_frag_size_add(frag, size); skb->len += size; skb->data_len += size; skb->truesize += truesize; } EXPORT_SYMBOL(skb_coalesce_rx_frag); static void skb_drop_list(struct sk_buff **listp) { kfree_skb_list(*listp); *listp = NULL; } static inline void skb_drop_fraglist(struct sk_buff *skb) { skb_drop_list(&skb_shinfo(skb)->frag_list); } static void skb_clone_fraglist(struct sk_buff *skb) { struct sk_buff *list; skb_walk_frags(skb, list) skb_get(list); } int skb_pp_cow_data(struct page_pool *pool, struct sk_buff **pskb, unsigned int headroom) { #if IS_ENABLED(CONFIG_PAGE_POOL) u32 size, truesize, len, max_head_size, off; struct sk_buff *skb = *pskb, *nskb; int err, i, head_off; void *data; /* XDP does not support fraglist so we need to linearize * the skb. */ if (skb_has_frag_list(skb)) return -EOPNOTSUPP; max_head_size = SKB_WITH_OVERHEAD(PAGE_SIZE - headroom); if (skb->len > max_head_size + MAX_SKB_FRAGS * PAGE_SIZE) return -ENOMEM; size = min_t(u32, skb->len, max_head_size); truesize = SKB_HEAD_ALIGN(size) + headroom; data = page_pool_dev_alloc_va(pool, &truesize); if (!data) return -ENOMEM; nskb = napi_build_skb(data, truesize); if (!nskb) { page_pool_free_va(pool, data, true); return -ENOMEM; } skb_reserve(nskb, headroom); skb_copy_header(nskb, skb); skb_mark_for_recycle(nskb); err = skb_copy_bits(skb, 0, nskb->data, size); if (err) { consume_skb(nskb); return err; } skb_put(nskb, size); head_off = skb_headroom(nskb) - skb_headroom(skb); skb_headers_offset_update(nskb, head_off); off = size; len = skb->len - off; for (i = 0; i < MAX_SKB_FRAGS && off < skb->len; i++) { struct page *page; u32 page_off; size = min_t(u32, len, PAGE_SIZE); truesize = size; page = page_pool_dev_alloc(pool, &page_off, &truesize); if (!page) { consume_skb(nskb); return -ENOMEM; } skb_add_rx_frag(nskb, i, page, page_off, size, truesize); err = skb_copy_bits(skb, off, page_address(page) + page_off, size); if (err) { consume_skb(nskb); return err; } len -= size; off += size; } consume_skb(skb); *pskb = nskb; return 0; #else return -EOPNOTSUPP; #endif } EXPORT_SYMBOL(skb_pp_cow_data); int skb_cow_data_for_xdp(struct page_pool *pool, struct sk_buff **pskb, const struct bpf_prog *prog) { if (!prog->aux->xdp_has_frags) return -EINVAL; return skb_pp_cow_data(pool, pskb, XDP_PACKET_HEADROOM); } EXPORT_SYMBOL(skb_cow_data_for_xdp); #if IS_ENABLED(CONFIG_PAGE_POOL) bool napi_pp_put_page(netmem_ref netmem) { netmem = netmem_compound_head(netmem); if (unlikely(!netmem_is_pp(netmem))) return false; page_pool_put_full_netmem(netmem_get_pp(netmem), netmem, false); return true; } EXPORT_SYMBOL(napi_pp_put_page); #endif static bool skb_pp_recycle(struct sk_buff *skb, void *data) { if (!IS_ENABLED(CONFIG_PAGE_POOL) || !skb->pp_recycle) return false; return napi_pp_put_page(page_to_netmem(virt_to_page(data))); } /** * skb_pp_frag_ref() - Increase fragment references of a page pool aware skb * @skb: page pool aware skb * * Increase the fragment reference count (pp_ref_count) of a skb. This is * intended to gain fragment references only for page pool aware skbs, * i.e. when skb->pp_recycle is true, and not for fragments in a * non-pp-recycling skb. It has a fallback to increase references on normal * pages, as page pool aware skbs may also have normal page fragments. */ static int skb_pp_frag_ref(struct sk_buff *skb) { struct skb_shared_info *shinfo; netmem_ref head_netmem; int i; if (!skb->pp_recycle) return -EINVAL; shinfo = skb_shinfo(skb); for (i = 0; i < shinfo->nr_frags; i++) { head_netmem = netmem_compound_head(shinfo->frags[i].netmem); if (likely(netmem_is_pp(head_netmem))) page_pool_ref_netmem(head_netmem); else page_ref_inc(netmem_to_page(head_netmem)); } return 0; } static void skb_kfree_head(void *head, unsigned int end_offset) { if (end_offset == SKB_SMALL_HEAD_HEADROOM) kmem_cache_free(net_hotdata.skb_small_head_cache, head); else kfree(head); } static void skb_free_head(struct sk_buff *skb) { unsigned char *head = skb->head; if (skb->head_frag) { if (skb_pp_recycle(skb, head)) return; skb_free_frag(head); } else { skb_kfree_head(head, skb_end_offset(skb)); } } static void skb_release_data(struct sk_buff *skb, enum skb_drop_reason reason) { struct skb_shared_info *shinfo = skb_shinfo(skb); int i; if (!skb_data_unref(skb, shinfo)) goto exit; if (skb_zcopy(skb)) { bool skip_unref = shinfo->flags & SKBFL_MANAGED_FRAG_REFS; skb_zcopy_clear(skb, true); if (skip_unref) goto free_head; } for (i = 0; i < shinfo->nr_frags; i++) __skb_frag_unref(&shinfo->frags[i], skb->pp_recycle); free_head: if (shinfo->frag_list) kfree_skb_list_reason(shinfo->frag_list, reason); skb_free_head(skb); exit: /* When we clone an SKB we copy the reycling bit. The pp_recycle * bit is only set on the head though, so in order to avoid races * while trying to recycle fragments on __skb_frag_unref() we need * to make one SKB responsible for triggering the recycle path. * So disable the recycling bit if an SKB is cloned and we have * additional references to the fragmented part of the SKB. * Eventually the last SKB will have the recycling bit set and it's * dataref set to 0, which will trigger the recycling */ skb->pp_recycle = 0; } /* * Free an skbuff by memory without cleaning the state. */ static void kfree_skbmem(struct sk_buff *skb) { struct sk_buff_fclones *fclones; switch (skb->fclone) { case SKB_FCLONE_UNAVAILABLE: kmem_cache_free(net_hotdata.skbuff_cache, skb); return; case SKB_FCLONE_ORIG: fclones = container_of(skb, struct sk_buff_fclones, skb1); /* We usually free the clone (TX completion) before original skb * This test would have no chance to be true for the clone, * while here, branch prediction will be good. */ if (refcount_read(&fclones->fclone_ref) == 1) goto fastpath; break; default: /* SKB_FCLONE_CLONE */ fclones = container_of(skb, struct sk_buff_fclones, skb2); break; } if (!refcount_dec_and_test(&fclones->fclone_ref)) return; fastpath: kmem_cache_free(net_hotdata.skbuff_fclone_cache, fclones); } void skb_release_head_state(struct sk_buff *skb) { skb_dst_drop(skb); if (skb->destructor) { DEBUG_NET_WARN_ON_ONCE(in_hardirq()); skb->destructor(skb); } #if IS_ENABLED(CONFIG_NF_CONNTRACK) nf_conntrack_put(skb_nfct(skb)); #endif skb_ext_put(skb); } /* Free everything but the sk_buff shell. */ static void skb_release_all(struct sk_buff *skb, enum skb_drop_reason reason) { skb_release_head_state(skb); if (likely(skb->head)) skb_release_data(skb, reason); } /** * __kfree_skb - private function * @skb: buffer * * Free an sk_buff. Release anything attached to the buffer. * Clean the state. This is an internal helper function. Users should * always call kfree_skb */ void __kfree_skb(struct sk_buff *skb) { skb_release_all(skb, SKB_DROP_REASON_NOT_SPECIFIED); kfree_skbmem(skb); } EXPORT_SYMBOL(__kfree_skb); static __always_inline bool __sk_skb_reason_drop(struct sock *sk, struct sk_buff *skb, enum skb_drop_reason reason) { if (unlikely(!skb_unref(skb))) return false; DEBUG_NET_WARN_ON_ONCE(reason == SKB_NOT_DROPPED_YET || u32_get_bits(reason, SKB_DROP_REASON_SUBSYS_MASK) >= SKB_DROP_REASON_SUBSYS_NUM); if (reason == SKB_CONSUMED) trace_consume_skb(skb, __builtin_return_address(0)); else trace_kfree_skb(skb, __builtin_return_address(0), reason, sk); return true; } /** * sk_skb_reason_drop - free an sk_buff with special reason * @sk: the socket to receive @skb, or NULL if not applicable * @skb: buffer to free * @reason: reason why this skb is dropped * * Drop a reference to the buffer and free it if the usage count has hit * zero. Meanwhile, pass the receiving socket and drop reason to * 'kfree_skb' tracepoint. */ void __fix_address sk_skb_reason_drop(struct sock *sk, struct sk_buff *skb, enum skb_drop_reason reason) { if (__sk_skb_reason_drop(sk, skb, reason)) __kfree_skb(skb); } EXPORT_SYMBOL(sk_skb_reason_drop); #define KFREE_SKB_BULK_SIZE 16 struct skb_free_array { unsigned int skb_count; void *skb_array[KFREE_SKB_BULK_SIZE]; }; static void kfree_skb_add_bulk(struct sk_buff *skb, struct skb_free_array *sa, enum skb_drop_reason reason) { /* if SKB is a clone, don't handle this case */ if (unlikely(skb->fclone != SKB_FCLONE_UNAVAILABLE)) { __kfree_skb(skb); return; } skb_release_all(skb, reason); sa->skb_array[sa->skb_count++] = skb; if (unlikely(sa->skb_count == KFREE_SKB_BULK_SIZE)) { kmem_cache_free_bulk(net_hotdata.skbuff_cache, KFREE_SKB_BULK_SIZE, sa->skb_array); sa->skb_count = 0; } } void __fix_address kfree_skb_list_reason(struct sk_buff *segs, enum skb_drop_reason reason) { struct skb_free_array sa; sa.skb_count = 0; while (segs) { struct sk_buff *next = segs->next; if (__sk_skb_reason_drop(NULL, segs, reason)) { skb_poison_list(segs); kfree_skb_add_bulk(segs, &sa, reason); } segs = next; } if (sa.skb_count) kmem_cache_free_bulk(net_hotdata.skbuff_cache, sa.skb_count, sa.skb_array); } EXPORT_SYMBOL(kfree_skb_list_reason); /* Dump skb information and contents. * * Must only be called from net_ratelimit()-ed paths. * * Dumps whole packets if full_pkt, only headers otherwise. */ void skb_dump(const char *level, const struct sk_buff *skb, bool full_pkt) { struct skb_shared_info *sh = skb_shinfo(skb); struct net_device *dev = skb->dev; struct sock *sk = skb->sk; struct sk_buff *list_skb; bool has_mac, has_trans; int headroom, tailroom; int i, len, seg_len; if (full_pkt) len = skb->len; else len = min_t(int, skb->len, MAX_HEADER + 128); headroom = skb_headroom(skb); tailroom = skb_tailroom(skb); has_mac = skb_mac_header_was_set(skb); has_trans = skb_transport_header_was_set(skb); printk("%sskb len=%u headroom=%u headlen=%u tailroom=%u\n" "mac=(%d,%d) mac_len=%u net=(%d,%d) trans=%d\n" "shinfo(txflags=%u nr_frags=%u gso(size=%hu type=%u segs=%hu))\n" "csum(0x%x start=%u offset=%u ip_summed=%u complete_sw=%u valid=%u level=%u)\n" "hash(0x%x sw=%u l4=%u) proto=0x%04x pkttype=%u iif=%d\n" "priority=0x%x mark=0x%x alloc_cpu=%u vlan_all=0x%x\n" "encapsulation=%d inner(proto=0x%04x, mac=%u, net=%u, trans=%u)\n", level, skb->len, headroom, skb_headlen(skb), tailroom, has_mac ? skb->mac_header : -1, has_mac ? skb_mac_header_len(skb) : -1, skb->mac_len, skb->network_header, has_trans ? skb_network_header_len(skb) : -1, has_trans ? skb->transport_header : -1, sh->tx_flags, sh->nr_frags, sh->gso_size, sh->gso_type, sh->gso_segs, skb->csum, skb->csum_start, skb->csum_offset, skb->ip_summed, skb->csum_complete_sw, skb->csum_valid, skb->csum_level, skb->hash, skb->sw_hash, skb->l4_hash, ntohs(skb->protocol), skb->pkt_type, skb->skb_iif, skb->priority, skb->mark, skb->alloc_cpu, skb->vlan_all, skb->encapsulation, skb->inner_protocol, skb->inner_mac_header, skb->inner_network_header, skb->inner_transport_header); if (dev) printk("%sdev name=%s feat=%pNF\n", level, dev->name, &dev->features); if (sk) printk("%ssk family=%hu type=%u proto=%u\n", level, sk->sk_family, sk->sk_type, sk->sk_protocol); if (full_pkt && headroom) print_hex_dump(level, "skb headroom: ", DUMP_PREFIX_OFFSET, 16, 1, skb->head, headroom, false); seg_len = min_t(int, skb_headlen(skb), len); if (seg_len) print_hex_dump(level, "skb linear: ", DUMP_PREFIX_OFFSET, 16, 1, skb->data, seg_len, false); len -= seg_len; if (full_pkt && tailroom) print_hex_dump(level, "skb tailroom: ", DUMP_PREFIX_OFFSET, 16, 1, skb_tail_pointer(skb), tailroom, false); for (i = 0; len && i < skb_shinfo(skb)->nr_frags; i++) { skb_frag_t *frag = &skb_shinfo(skb)->frags[i]; u32 p_off, p_len, copied; struct page *p; u8 *vaddr; if (skb_frag_is_net_iov(frag)) { printk("%sskb frag %d: not readable\n", level, i); len -= skb_frag_size(frag); if (!len) break; continue; } skb_frag_foreach_page(frag, skb_frag_off(frag), skb_frag_size(frag), p, p_off, p_len, copied) { seg_len = min_t(int, p_len, len); vaddr = kmap_atomic(p); print_hex_dump(level, "skb frag: ", DUMP_PREFIX_OFFSET, 16, 1, vaddr + p_off, seg_len, false); kunmap_atomic(vaddr); len -= seg_len; if (!len) break; } } if (full_pkt && skb_has_frag_list(skb)) { printk("skb fraglist:\n"); skb_walk_frags(skb, list_skb) skb_dump(level, list_skb, true); } } EXPORT_SYMBOL(skb_dump); /** * skb_tx_error - report an sk_buff xmit error * @skb: buffer that triggered an error * * Report xmit error if a device callback is tracking this skb. * skb must be freed afterwards. */ void skb_tx_error(struct sk_buff *skb) { if (skb) { skb_zcopy_downgrade_managed(skb); skb_zcopy_clear(skb, true); } } EXPORT_SYMBOL(skb_tx_error); #ifdef CONFIG_TRACEPOINTS /** * consume_skb - free an skbuff * @skb: buffer to free * * Drop a ref to the buffer and free it if the usage count has hit zero * Functions identically to kfree_skb, but kfree_skb assumes that the frame * is being dropped after a failure and notes that */ void consume_skb(struct sk_buff *skb) { if (!skb_unref(skb)) return; trace_consume_skb(skb, __builtin_return_address(0)); __kfree_skb(skb); } EXPORT_SYMBOL(consume_skb); #endif /** * __consume_stateless_skb - free an skbuff, assuming it is stateless * @skb: buffer to free * * Alike consume_skb(), but this variant assumes that this is the last * skb reference and all the head states have been already dropped */ void __consume_stateless_skb(struct sk_buff *skb) { trace_consume_skb(skb, __builtin_return_address(0)); skb_release_data(skb, SKB_CONSUMED); kfree_skbmem(skb); } static void napi_skb_cache_put(struct sk_buff *skb) { struct napi_alloc_cache *nc = this_cpu_ptr(&napi_alloc_cache); u32 i; if (!kasan_mempool_poison_object(skb)) return; local_lock_nested_bh(&napi_alloc_cache.bh_lock); nc->skb_cache[nc->skb_count++] = skb; if (unlikely(nc->skb_count == NAPI_SKB_CACHE_SIZE)) { for (i = NAPI_SKB_CACHE_HALF; i < NAPI_SKB_CACHE_SIZE; i++) kasan_mempool_unpoison_object(nc->skb_cache[i], kmem_cache_size(net_hotdata.skbuff_cache)); kmem_cache_free_bulk(net_hotdata.skbuff_cache, NAPI_SKB_CACHE_HALF, nc->skb_cache + NAPI_SKB_CACHE_HALF); nc->skb_count = NAPI_SKB_CACHE_HALF; } local_unlock_nested_bh(&napi_alloc_cache.bh_lock); } void __napi_kfree_skb(struct sk_buff *skb, enum skb_drop_reason reason) { skb_release_all(skb, reason); napi_skb_cache_put(skb); } void napi_skb_free_stolen_head(struct sk_buff *skb) { if (unlikely(skb->slow_gro)) { nf_reset_ct(skb); skb_dst_drop(skb); skb_ext_put(skb); skb_orphan(skb); skb->slow_gro = 0; } napi_skb_cache_put(skb); } void napi_consume_skb(struct sk_buff *skb, int budget) { /* Zero budget indicate non-NAPI context called us, like netpoll */ if (unlikely(!budget)) { dev_consume_skb_any(skb); return; } DEBUG_NET_WARN_ON_ONCE(!in_softirq()); if (!skb_unref(skb)) return; /* if reaching here SKB is ready to free */ trace_consume_skb(skb, __builtin_return_address(0)); /* if SKB is a clone, don't handle this case */ if (skb->fclone != SKB_FCLONE_UNAVAILABLE) { __kfree_skb(skb); return; } skb_release_all(skb, SKB_CONSUMED); napi_skb_cache_put(skb); } EXPORT_SYMBOL(napi_consume_skb); /* Make sure a field is contained by headers group */ #define CHECK_SKB_FIELD(field) \ BUILD_BUG_ON(offsetof(struct sk_buff, field) != \ offsetof(struct sk_buff, headers.field)); \ static void __copy_skb_header(struct sk_buff *new, const struct sk_buff *old) { new->tstamp = old->tstamp; /* We do not copy old->sk */ new->dev = old->dev; memcpy(new->cb, old->cb, sizeof(old->cb)); skb_dst_copy(new, old); __skb_ext_copy(new, old); __nf_copy(new, old, false); /* Note : this field could be in the headers group. * It is not yet because we do not want to have a 16 bit hole */ new->queue_mapping = old->queue_mapping; memcpy(&new->headers, &old->headers, sizeof(new->headers)); CHECK_SKB_FIELD(protocol); CHECK_SKB_FIELD(csum); CHECK_SKB_FIELD(hash); CHECK_SKB_FIELD(priority); CHECK_SKB_FIELD(skb_iif); CHECK_SKB_FIELD(vlan_proto); CHECK_SKB_FIELD(vlan_tci); CHECK_SKB_FIELD(transport_header); CHECK_SKB_FIELD(network_header); CHECK_SKB_FIELD(mac_header); CHECK_SKB_FIELD(inner_protocol); CHECK_SKB_FIELD(inner_transport_header); CHECK_SKB_FIELD(inner_network_header); CHECK_SKB_FIELD(inner_mac_header); CHECK_SKB_FIELD(mark); #ifdef CONFIG_NETWORK_SECMARK CHECK_SKB_FIELD(secmark); #endif #ifdef CONFIG_NET_RX_BUSY_POLL CHECK_SKB_FIELD(napi_id); #endif CHECK_SKB_FIELD(alloc_cpu); #ifdef CONFIG_XPS CHECK_SKB_FIELD(sender_cpu); #endif #ifdef CONFIG_NET_SCHED CHECK_SKB_FIELD(tc_index); #endif } /* * You should not add any new code to this function. Add it to * __copy_skb_header above instead. */ static struct sk_buff *__skb_clone(struct sk_buff *n, struct sk_buff *skb) { #define C(x) n->x = skb->x n->next = n->prev = NULL; n->sk = NULL; __copy_skb_header(n, skb); C(len); C(data_len); C(mac_len); n->hdr_len = skb->nohdr ? skb_headroom(skb) : skb->hdr_len; n->cloned = 1; n->nohdr = 0; n->peeked = 0; C(pfmemalloc); C(pp_recycle); n->destructor = NULL; C(tail); C(end); C(head); C(head_frag); C(data); C(truesize); refcount_set(&n->users, 1); atomic_inc(&(skb_shinfo(skb)->dataref)); skb->cloned = 1; return n; #undef C } /** * alloc_skb_for_msg() - allocate sk_buff to wrap frag list forming a msg * @first: first sk_buff of the msg */ struct sk_buff *alloc_skb_for_msg(struct sk_buff *first) { struct sk_buff *n; n = alloc_skb(0, GFP_ATOMIC); if (!n) return NULL; n->len = first->len; n->data_len = first->len; n->truesize = first->truesize; skb_shinfo(n)->frag_list = first; __copy_skb_header(n, first); n->destructor = NULL; return n; } EXPORT_SYMBOL_GPL(alloc_skb_for_msg); /** * skb_morph - morph one skb into another * @dst: the skb to receive the contents * @src: the skb to supply the contents * * This is identical to skb_clone except that the target skb is * supplied by the user. * * The target skb is returned upon exit. */ struct sk_buff *skb_morph(struct sk_buff *dst, struct sk_buff *src) { skb_release_all(dst, SKB_CONSUMED); return __skb_clone(dst, src); } EXPORT_SYMBOL_GPL(skb_morph); int mm_account_pinned_pages(struct mmpin *mmp, size_t size) { unsigned long max_pg, num_pg, new_pg, old_pg, rlim; struct user_struct *user; if (capable(CAP_IPC_LOCK) || !size) return 0; rlim = rlimit(RLIMIT_MEMLOCK); if (rlim == RLIM_INFINITY) return 0; num_pg = (size >> PAGE_SHIFT) + 2; /* worst case */ max_pg = rlim >> PAGE_SHIFT; user = mmp->user ? : current_user(); old_pg = atomic_long_read(&user->locked_vm); do { new_pg = old_pg + num_pg; if (new_pg > max_pg) return -ENOBUFS; } while (!atomic_long_try_cmpxchg(&user->locked_vm, &old_pg, new_pg)); if (!mmp->user) { mmp->user = get_uid(user); mmp->num_pg = num_pg; } else { mmp->num_pg += num_pg; } return 0; } EXPORT_SYMBOL_GPL(mm_account_pinned_pages); void mm_unaccount_pinned_pages(struct mmpin *mmp) { if (mmp->user) { atomic_long_sub(mmp->num_pg, &mmp->user->locked_vm); free_uid(mmp->user); } } EXPORT_SYMBOL_GPL(mm_unaccount_pinned_pages); static struct ubuf_info *msg_zerocopy_alloc(struct sock *sk, size_t size, bool devmem) { struct ubuf_info_msgzc *uarg; struct sk_buff *skb; WARN_ON_ONCE(!in_task()); skb = sock_omalloc(sk, 0, GFP_KERNEL); if (!skb) return NULL; BUILD_BUG_ON(sizeof(*uarg) > sizeof(skb->cb)); uarg = (void *)skb->cb; uarg->mmp.user = NULL; if (likely(!devmem) && mm_account_pinned_pages(&uarg->mmp, size)) { kfree_skb(skb); return NULL; } uarg->ubuf.ops = &msg_zerocopy_ubuf_ops; uarg->id = ((u32)atomic_inc_return(&sk->sk_zckey)) - 1; uarg->len = 1; uarg->bytelen = size; uarg->zerocopy = 1; uarg->ubuf.flags = SKBFL_ZEROCOPY_FRAG | SKBFL_DONT_ORPHAN; refcount_set(&uarg->ubuf.refcnt, 1); sock_hold(sk); return &uarg->ubuf; } static inline struct sk_buff *skb_from_uarg(struct ubuf_info_msgzc *uarg) { return container_of((void *)uarg, struct sk_buff, cb); } struct ubuf_info *msg_zerocopy_realloc(struct sock *sk, size_t size, struct ubuf_info *uarg, bool devmem) { if (uarg) { struct ubuf_info_msgzc *uarg_zc; const u32 byte_limit = 1 << 19; /* limit to a few TSO */ u32 bytelen, next; /* there might be non MSG_ZEROCOPY users */ if (uarg->ops != &msg_zerocopy_ubuf_ops) return NULL; /* realloc only when socket is locked (TCP, UDP cork), * so uarg->len and sk_zckey access is serialized */ if (!sock_owned_by_user(sk)) { WARN_ON_ONCE(1); return NULL; } uarg_zc = uarg_to_msgzc(uarg); bytelen = uarg_zc->bytelen + size; if (uarg_zc->len == USHRT_MAX - 1 || bytelen > byte_limit) { /* TCP can create new skb to attach new uarg */ if (sk->sk_type == SOCK_STREAM) goto new_alloc; return NULL; } next = (u32)atomic_read(&sk->sk_zckey); if ((u32)(uarg_zc->id + uarg_zc->len) == next) { if (likely(!devmem) && mm_account_pinned_pages(&uarg_zc->mmp, size)) return NULL; uarg_zc->len++; uarg_zc->bytelen = bytelen; atomic_set(&sk->sk_zckey, ++next); /* no extra ref when appending to datagram (MSG_MORE) */ if (sk->sk_type == SOCK_STREAM) net_zcopy_get(uarg); return uarg; } } new_alloc: return msg_zerocopy_alloc(sk, size, devmem); } EXPORT_SYMBOL_GPL(msg_zerocopy_realloc); static bool skb_zerocopy_notify_extend(struct sk_buff *skb, u32 lo, u16 len) { struct sock_exterr_skb *serr = SKB_EXT_ERR(skb); u32 old_lo, old_hi; u64 sum_len; old_lo = serr->ee.ee_info; old_hi = serr->ee.ee_data; sum_len = old_hi - old_lo + 1ULL + len; if (sum_len >= (1ULL << 32)) return false; if (lo != old_hi + 1) return false; serr->ee.ee_data += len; return true; } static void __msg_zerocopy_callback(struct ubuf_info_msgzc *uarg) { struct sk_buff *tail, *skb = skb_from_uarg(uarg); struct sock_exterr_skb *serr; struct sock *sk = skb->sk; struct sk_buff_head *q; unsigned long flags; bool is_zerocopy; u32 lo, hi; u16 len; mm_unaccount_pinned_pages(&uarg->mmp); /* if !len, there was only 1 call, and it was aborted * so do not queue a completion notification */ if (!uarg->len || sock_flag(sk, SOCK_DEAD)) goto release; len = uarg->len; lo = uarg->id; hi = uarg->id + len - 1; is_zerocopy = uarg->zerocopy; serr = SKB_EXT_ERR(skb); memset(serr, 0, sizeof(*serr)); serr->ee.ee_errno = 0; serr->ee.ee_origin = SO_EE_ORIGIN_ZEROCOPY; serr->ee.ee_data = hi; serr->ee.ee_info = lo; if (!is_zerocopy) serr->ee.ee_code |= SO_EE_CODE_ZEROCOPY_COPIED; q = &sk->sk_error_queue; spin_lock_irqsave(&q->lock, flags); tail = skb_peek_tail(q); if (!tail || SKB_EXT_ERR(tail)->ee.ee_origin != SO_EE_ORIGIN_ZEROCOPY || !skb_zerocopy_notify_extend(tail, lo, len)) { __skb_queue_tail(q, skb); skb = NULL; } spin_unlock_irqrestore(&q->lock, flags); sk_error_report(sk); release: consume_skb(skb); sock_put(sk); } static void msg_zerocopy_complete(struct sk_buff *skb, struct ubuf_info *uarg, bool success) { struct ubuf_info_msgzc *uarg_zc = uarg_to_msgzc(uarg); uarg_zc->zerocopy = uarg_zc->zerocopy & success; if (refcount_dec_and_test(&uarg->refcnt)) __msg_zerocopy_callback(uarg_zc); } void msg_zerocopy_put_abort(struct ubuf_info *uarg, bool have_uref) { struct sock *sk = skb_from_uarg(uarg_to_msgzc(uarg))->sk; atomic_dec(&sk->sk_zckey); uarg_to_msgzc(uarg)->len--; if (have_uref) msg_zerocopy_complete(NULL, uarg, true); } EXPORT_SYMBOL_GPL(msg_zerocopy_put_abort); const struct ubuf_info_ops msg_zerocopy_ubuf_ops = { .complete = msg_zerocopy_complete, }; EXPORT_SYMBOL_GPL(msg_zerocopy_ubuf_ops); int skb_zerocopy_iter_stream(struct sock *sk, struct sk_buff *skb, struct msghdr *msg, int len, struct ubuf_info *uarg, struct net_devmem_dmabuf_binding *binding) { int err, orig_len = skb->len; if (uarg->ops->link_skb) { err = uarg->ops->link_skb(skb, uarg); if (err) return err; } else { struct ubuf_info *orig_uarg = skb_zcopy(skb); /* An skb can only point to one uarg. This edge case happens * when TCP appends to an skb, but zerocopy_realloc triggered * a new alloc. */ if (orig_uarg && uarg != orig_uarg) return -EEXIST; } err = __zerocopy_sg_from_iter(msg, sk, skb, &msg->msg_iter, len, binding); if (err == -EFAULT || (err == -EMSGSIZE && skb->len == orig_len)) { struct sock *save_sk = skb->sk; /* Streams do not free skb on error. Reset to prev state. */ iov_iter_revert(&msg->msg_iter, skb->len - orig_len); skb->sk = sk; ___pskb_trim(skb, orig_len); skb->sk = save_sk; return err; } skb_zcopy_set(skb, uarg, NULL); return skb->len - orig_len; } EXPORT_SYMBOL_GPL(skb_zerocopy_iter_stream); void __skb_zcopy_downgrade_managed(struct sk_buff *skb) { int i; skb_shinfo(skb)->flags &= ~SKBFL_MANAGED_FRAG_REFS; for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) skb_frag_ref(skb, i); } EXPORT_SYMBOL_GPL(__skb_zcopy_downgrade_managed); static int skb_zerocopy_clone(struct sk_buff *nskb, struct sk_buff *orig, gfp_t gfp_mask) { if (skb_zcopy(orig)) { if (skb_zcopy(nskb)) { /* !gfp_mask callers are verified to !skb_zcopy(nskb) */ if (!gfp_mask) { WARN_ON_ONCE(1); return -ENOMEM; } if (skb_uarg(nskb) == skb_uarg(orig)) return 0; if (skb_copy_ubufs(nskb, GFP_ATOMIC)) return -EIO; } skb_zcopy_set(nskb, skb_uarg(orig), NULL); } return 0; } /** * skb_copy_ubufs - copy userspace skb frags buffers to kernel * @skb: the skb to modify * @gfp_mask: allocation priority * * This must be called on skb with SKBFL_ZEROCOPY_ENABLE. * It will copy all frags into kernel and drop the reference * to userspace pages. * * If this function is called from an interrupt gfp_mask() must be * %GFP_ATOMIC. * * Returns 0 on success or a negative error code on failure * to allocate kernel memory to copy to. */ int skb_copy_ubufs(struct sk_buff *skb, gfp_t gfp_mask) { int num_frags = skb_shinfo(skb)->nr_frags; struct page *page, *head = NULL; int i, order, psize, new_frags; u32 d_off; if (skb_shared(skb) || skb_unclone(skb, gfp_mask)) return -EINVAL; if (!skb_frags_readable(skb)) return -EFAULT; if (!num_frags) goto release; /* We might have to allocate high order pages, so compute what minimum * page order is needed. */ order = 0; while ((PAGE_SIZE << order) * MAX_SKB_FRAGS < __skb_pagelen(skb)) order++; psize = (PAGE_SIZE << order); new_frags = (__skb_pagelen(skb) + psize - 1) >> (PAGE_SHIFT + order); for (i = 0; i < new_frags; i++) { page = alloc_pages(gfp_mask | __GFP_COMP, order); if (!page) { while (head) { struct page *next = (struct page *)page_private(head); put_page(head); head = next; } return -ENOMEM; } set_page_private(page, (unsigned long)head); head = page; } page = head; d_off = 0; for (i = 0; i < num_frags; i++) { skb_frag_t *f = &skb_shinfo(skb)->frags[i]; u32 p_off, p_len, copied; struct page *p; u8 *vaddr; skb_frag_foreach_page(f, skb_frag_off(f), skb_frag_size(f), p, p_off, p_len, copied) { u32 copy, done = 0; vaddr = kmap_atomic(p); while (done < p_len) { if (d_off == psize) { d_off = 0; page = (struct page *)page_private(page); } copy = min_t(u32, psize - d_off, p_len - done); memcpy(page_address(page) + d_off, vaddr + p_off + done, copy); done += copy; d_off += copy; } kunmap_atomic(vaddr); } } /* skb frags release userspace buffers */ for (i = 0; i < num_frags; i++) skb_frag_unref(skb, i); /* skb frags point to kernel buffers */ for (i = 0; i < new_frags - 1; i++) { __skb_fill_netmem_desc(skb, i, page_to_netmem(head), 0, psize); head = (struct page *)page_private(head); } __skb_fill_netmem_desc(skb, new_frags - 1, page_to_netmem(head), 0, d_off); skb_shinfo(skb)->nr_frags = new_frags; release: skb_zcopy_clear(skb, false); return 0; } EXPORT_SYMBOL_GPL(skb_copy_ubufs); /** * skb_clone - duplicate an sk_buff * @skb: buffer to clone * @gfp_mask: allocation priority * * Duplicate an &sk_buff. The new one is not owned by a socket. Both * copies share the same packet data but not structure. The new * buffer has a reference count of 1. If the allocation fails the * function returns %NULL otherwise the new buffer is returned. * * If this function is called from an interrupt gfp_mask() must be * %GFP_ATOMIC. */ struct sk_buff *skb_clone(struct sk_buff *skb, gfp_t gfp_mask) { struct sk_buff_fclones *fclones = container_of(skb, struct sk_buff_fclones, skb1); struct sk_buff *n; if (skb_orphan_frags(skb, gfp_mask)) return NULL; if (skb->fclone == SKB_FCLONE_ORIG && refcount_read(&fclones->fclone_ref) == 1) { n = &fclones->skb2; refcount_set(&fclones->fclone_ref, 2); n->fclone = SKB_FCLONE_CLONE; } else { if (skb_pfmemalloc(skb)) gfp_mask |= __GFP_MEMALLOC; n = kmem_cache_alloc(net_hotdata.skbuff_cache, gfp_mask); if (!n) return NULL; n->fclone = SKB_FCLONE_UNAVAILABLE; } return __skb_clone(n, skb); } EXPORT_SYMBOL(skb_clone); void skb_headers_offset_update(struct sk_buff *skb, int off) { /* Only adjust this if it actually is csum_start rather than csum */ if (skb->ip_summed == CHECKSUM_PARTIAL) skb->csum_start += off; /* {transport,network,mac}_header and tail are relative to skb->head */ skb->transport_header += off; skb->network_header += off; if (skb_mac_header_was_set(skb)) skb->mac_header += off; skb->inner_transport_header += off; skb->inner_network_header += off; skb->inner_mac_header += off; } EXPORT_SYMBOL(skb_headers_offset_update); void skb_copy_header(struct sk_buff *new, const struct sk_buff *old) { __copy_skb_header(new, old); skb_shinfo(new)->gso_size = skb_shinfo(old)->gso_size; skb_shinfo(new)->gso_segs = skb_shinfo(old)->gso_segs; skb_shinfo(new)->gso_type = skb_shinfo(old)->gso_type; } EXPORT_SYMBOL(skb_copy_header); static inline int skb_alloc_rx_flag(const struct sk_buff *skb) { if (skb_pfmemalloc(skb)) return SKB_ALLOC_RX; return 0; } /** * skb_copy - create private copy of an sk_buff * @skb: buffer to copy * @gfp_mask: allocation priority * * Make a copy of both an &sk_buff and its data. This is used when the * caller wishes to modify the data and needs a private copy of the * data to alter. Returns %NULL on failure or the pointer to the buffer * on success. The returned buffer has a reference count of 1. * * As by-product this function converts non-linear &sk_buff to linear * one, so that &sk_buff becomes completely private and caller is allowed * to modify all the data of returned buffer. This means that this * function is not recommended for use in circumstances when only * header is going to be modified. Use pskb_copy() instead. */ struct sk_buff *skb_copy(const struct sk_buff *skb, gfp_t gfp_mask) { struct sk_buff *n; unsigned int size; int headerlen; if (!skb_frags_readable(skb)) return NULL; if (WARN_ON_ONCE(skb_shinfo(skb)->gso_type & SKB_GSO_FRAGLIST)) return NULL; headerlen = skb_headroom(skb); size = skb_end_offset(skb) + skb->data_len; n = __alloc_skb(size, gfp_mask, skb_alloc_rx_flag(skb), NUMA_NO_NODE); if (!n) return NULL; /* Set the data pointer */ skb_reserve(n, headerlen); /* Set the tail pointer and length */ skb_put(n, skb->len); BUG_ON(skb_copy_bits(skb, -headerlen, n->head, headerlen + skb->len)); skb_copy_header(n, skb); return n; } EXPORT_SYMBOL(skb_copy); /** * __pskb_copy_fclone - create copy of an sk_buff with private head. * @skb: buffer to copy * @headroom: headroom of new skb * @gfp_mask: allocation priority * @fclone: if true allocate the copy of the skb from the fclone * cache instead of the head cache; it is recommended to set this * to true for the cases where the copy will likely be cloned * * Make a copy of both an &sk_buff and part of its data, located * in header. Fragmented data remain shared. This is used when * the caller wishes to modify only header of &sk_buff and needs * private copy of the header to alter. Returns %NULL on failure * or the pointer to the buffer on success. * The returned buffer has a reference count of 1. */ struct sk_buff *__pskb_copy_fclone(struct sk_buff *skb, int headroom, gfp_t gfp_mask, bool fclone) { unsigned int size = skb_headlen(skb) + headroom; int flags = skb_alloc_rx_flag(skb) | (fclone ? SKB_ALLOC_FCLONE : 0); struct sk_buff *n = __alloc_skb(size, gfp_mask, flags, NUMA_NO_NODE); if (!n) goto out; /* Set the data pointer */ skb_reserve(n, headroom); /* Set the tail pointer and length */ skb_put(n, skb_headlen(skb)); /* Copy the bytes */ skb_copy_from_linear_data(skb, n->data, n->len); n->truesize += skb->data_len; n->data_len = skb->data_len; n->len = skb->len; if (skb_shinfo(skb)->nr_frags) { int i; if (skb_orphan_frags(skb, gfp_mask) || skb_zerocopy_clone(n, skb, gfp_mask)) { kfree_skb(n); n = NULL; goto out; } for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) { skb_shinfo(n)->frags[i] = skb_shinfo(skb)->frags[i]; skb_frag_ref(skb, i); } skb_shinfo(n)->nr_frags = i; } if (skb_has_frag_list(skb)) { skb_shinfo(n)->frag_list = skb_shinfo(skb)->frag_list; skb_clone_fraglist(n); } skb_copy_header(n, skb); out: return n; } EXPORT_SYMBOL(__pskb_copy_fclone); /** * pskb_expand_head - reallocate header of &sk_buff * @skb: buffer to reallocate * @nhead: room to add at head * @ntail: room to add at tail * @gfp_mask: allocation priority * * Expands (or creates identical copy, if @nhead and @ntail are zero) * header of @skb. &sk_buff itself is not changed. &sk_buff MUST have * reference count of 1. Returns zero in the case of success or error, * if expansion failed. In the last case, &sk_buff is not changed. * * All the pointers pointing into skb header may change and must be * reloaded after call to this function. */ int pskb_expand_head(struct sk_buff *skb, int nhead, int ntail, gfp_t gfp_mask) { unsigned int osize = skb_end_offset(skb); unsigned int size = osize + nhead + ntail; long off; u8 *data; int i; BUG_ON(nhead < 0); BUG_ON(skb_shared(skb)); skb_zcopy_downgrade_managed(skb); if (skb_pfmemalloc(skb)) gfp_mask |= __GFP_MEMALLOC; data = kmalloc_reserve(&size, gfp_mask, NUMA_NO_NODE, NULL); if (!data) goto nodata; size = SKB_WITH_OVERHEAD(size); /* Copy only real data... and, alas, header. This should be * optimized for the cases when header is void. */ memcpy(data + nhead, skb->head, skb_tail_pointer(skb) - skb->head); memcpy((struct skb_shared_info *)(data + size), skb_shinfo(skb), offsetof(struct skb_shared_info, frags[skb_shinfo(skb)->nr_frags])); /* * if shinfo is shared we must drop the old head gracefully, but if it * is not we can just drop the old head and let the existing refcount * be since all we did is relocate the values */ if (skb_cloned(skb)) { if (skb_orphan_frags(skb, gfp_mask)) goto nofrags; if (skb_zcopy(skb)) refcount_inc(&skb_uarg(skb)->refcnt); for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) skb_frag_ref(skb, i); if (skb_has_frag_list(skb)) skb_clone_fraglist(skb); skb_release_data(skb, SKB_CONSUMED); } else { skb_free_head(skb); } off = (data + nhead) - skb->head; skb->head = data; skb->head_frag = 0; skb->data += off; skb_set_end_offset(skb, size); #ifdef NET_SKBUFF_DATA_USES_OFFSET off = nhead; #endif skb->tail += off; skb_headers_offset_update(skb, nhead); skb->cloned = 0; skb->hdr_len = 0; skb->nohdr = 0; atomic_set(&skb_shinfo(skb)->dataref, 1); skb_metadata_clear(skb); /* It is not generally safe to change skb->truesize. * For the moment, we really care of rx path, or * when skb is orphaned (not attached to a socket). */ if (!skb->sk || skb->destructor == sock_edemux) skb->truesize += size - osize; return 0; nofrags: skb_kfree_head(data, size); nodata: return -ENOMEM; } EXPORT_SYMBOL(pskb_expand_head); /* Make private copy of skb with writable head and some headroom */ struct sk_buff *skb_realloc_headroom(struct sk_buff *skb, unsigned int headroom) { struct sk_buff *skb2; int delta = headroom - skb_headroom(skb); if (delta <= 0) skb2 = pskb_copy(skb, GFP_ATOMIC); else { skb2 = skb_clone(skb, GFP_ATOMIC); if (skb2 && pskb_expand_head(skb2, SKB_DATA_ALIGN(delta), 0, GFP_ATOMIC)) { kfree_skb(skb2); skb2 = NULL; } } return skb2; } EXPORT_SYMBOL(skb_realloc_headroom); /* Note: We plan to rework this in linux-6.4 */ int __skb_unclone_keeptruesize(struct sk_buff *skb, gfp_t pri) { unsigned int saved_end_offset, saved_truesize; struct skb_shared_info *shinfo; int res; saved_end_offset = skb_end_offset(skb); saved_truesize = skb->truesize; res = pskb_expand_head(skb, 0, 0, pri); if (res) return res; skb->truesize = saved_truesize; if (likely(skb_end_offset(skb) == saved_end_offset)) return 0; /* We can not change skb->end if the original or new value * is SKB_SMALL_HEAD_HEADROOM, as it might break skb_kfree_head(). */ if (saved_end_offset == SKB_SMALL_HEAD_HEADROOM || skb_end_offset(skb) == SKB_SMALL_HEAD_HEADROOM) { /* We think this path should not be taken. * Add a temporary trace to warn us just in case. */ pr_err_once("__skb_unclone_keeptruesize() skb_end_offset() %u -> %u\n", saved_end_offset, skb_end_offset(skb)); WARN_ON_ONCE(1); return 0; } shinfo = skb_shinfo(skb); /* We are about to change back skb->end, * we need to move skb_shinfo() to its new location. */ memmove(skb->head + saved_end_offset, shinfo, offsetof(struct skb_shared_info, frags[shinfo->nr_frags])); skb_set_end_offset(skb, saved_end_offset); return 0; } /** * skb_expand_head - reallocate header of &sk_buff * @skb: buffer to reallocate * @headroom: needed headroom * * Unlike skb_realloc_headroom, this one does not allocate a new skb * if possible; copies skb->sk to new skb as needed * and frees original skb in case of failures. * * It expect increased headroom and generates warning otherwise. */ struct sk_buff *skb_expand_head(struct sk_buff *skb, unsigned int headroom) { int delta = headroom - skb_headroom(skb); int osize = skb_end_offset(skb); struct sock *sk = skb->sk; if (WARN_ONCE(delta <= 0, "%s is expecting an increase in the headroom", __func__)) return skb; delta = SKB_DATA_ALIGN(delta); /* pskb_expand_head() might crash, if skb is shared. */ if (skb_shared(skb) || !is_skb_wmem(skb)) { struct sk_buff *nskb = skb_clone(skb, GFP_ATOMIC); if (unlikely(!nskb)) goto fail; if (sk) skb_set_owner_w(nskb, sk); consume_skb(skb); skb = nskb; } if (pskb_expand_head(skb, delta, 0, GFP_ATOMIC)) goto fail; if (sk && is_skb_wmem(skb)) { delta = skb_end_offset(skb) - osize; refcount_add(delta, &sk->sk_wmem_alloc); skb->truesize += delta; } return skb; fail: kfree_skb(skb); return NULL; } EXPORT_SYMBOL(skb_expand_head); /** * skb_copy_expand - copy and expand sk_buff * @skb: buffer to copy * @newheadroom: new free bytes at head * @newtailroom: new free bytes at tail * @gfp_mask: allocation priority * * Make a copy of both an &sk_buff and its data and while doing so * allocate additional space. * * This is used when the caller wishes to modify the data and needs a * private copy of the data to alter as well as more space for new fields. * Returns %NULL on failure or the pointer to the buffer * on success. The returned buffer has a reference count of 1. * * You must pass %GFP_ATOMIC as the allocation priority if this function * is called from an interrupt. */ struct sk_buff *skb_copy_expand(const struct sk_buff *skb, int newheadroom, int newtailroom, gfp_t gfp_mask) { /* * Allocate the copy buffer */ int head_copy_len, head_copy_off; struct sk_buff *n; int oldheadroom; if (!skb_frags_readable(skb)) return NULL; if (WARN_ON_ONCE(skb_shinfo(skb)->gso_type & SKB_GSO_FRAGLIST)) return NULL; oldheadroom = skb_headroom(skb); n = __alloc_skb(newheadroom + skb->len + newtailroom, gfp_mask, skb_alloc_rx_flag(skb), NUMA_NO_NODE); if (!n) return NULL; skb_reserve(n, newheadroom); /* Set the tail pointer and length */ skb_put(n, skb->len); head_copy_len = oldheadroom; head_copy_off = 0; if (newheadroom <= head_copy_len) head_copy_len = newheadroom; else head_copy_off = newheadroom - head_copy_len; /* Copy the linear header and data. */ BUG_ON(skb_copy_bits(skb, -head_copy_len, n->head + head_copy_off, skb->len + head_copy_len)); skb_copy_header(n, skb); skb_headers_offset_update(n, newheadroom - oldheadroom); return n; } EXPORT_SYMBOL(skb_copy_expand); /** * __skb_pad - zero pad the tail of an skb * @skb: buffer to pad * @pad: space to pad * @free_on_error: free buffer on error * * Ensure that a buffer is followed by a padding area that is zero * filled. Used by network drivers which may DMA or transfer data * beyond the buffer end onto the wire. * * May return error in out of memory cases. The skb is freed on error * if @free_on_error is true. */ int __skb_pad(struct sk_buff *skb, int pad, bool free_on_error) { int err; int ntail; /* If the skbuff is non linear tailroom is always zero.. */ if (!skb_cloned(skb) && skb_tailroom(skb) >= pad) { memset(skb->data+skb->len, 0, pad); return 0; } ntail = skb->data_len + pad - (skb->end - skb->tail); if (likely(skb_cloned(skb) || ntail > 0)) { err = pskb_expand_head(skb, 0, ntail, GFP_ATOMIC); if (unlikely(err)) goto free_skb; } /* FIXME: The use of this function with non-linear skb's really needs * to be audited. */ err = skb_linearize(skb); if (unlikely(err)) goto free_skb; memset(skb->data + skb->len, 0, pad); return 0; free_skb: if (free_on_error) kfree_skb(skb); return err; } EXPORT_SYMBOL(__skb_pad); /** * pskb_put - add data to the tail of a potentially fragmented buffer * @skb: start of the buffer to use * @tail: tail fragment of the buffer to use * @len: amount of data to add * * This function extends the used data area of the potentially * fragmented buffer. @tail must be the last fragment of @skb -- or * @skb itself. If this would exceed the total buffer size the kernel * will panic. A pointer to the first byte of the extra data is * returned. */ void *pskb_put(struct sk_buff *skb, struct sk_buff *tail, int len) { if (tail != skb) { skb->data_len += len; skb->len += len; } return skb_put(tail, len); } EXPORT_SYMBOL_GPL(pskb_put); /** * skb_put - add data to a buffer * @skb: buffer to use * @len: amount of data to add * * This function extends the used data area of the buffer. If this would * exceed the total buffer size the kernel will panic. A pointer to the * first byte of the extra data is returned. */ void *skb_put(struct sk_buff *skb, unsigned int len) { void *tmp = skb_tail_pointer(skb); SKB_LINEAR_ASSERT(skb); skb->tail += len; skb->len += len; if (unlikely(skb->tail > skb->end)) skb_over_panic(skb, len, __builtin_return_address(0)); return tmp; } EXPORT_SYMBOL(skb_put); /** * skb_push - add data to the start of a buffer * @skb: buffer to use * @len: amount of data to add * * This function extends the used data area of the buffer at the buffer * start. If this would exceed the total buffer headroom the kernel will * panic. A pointer to the first byte of the extra data is returned. */ void *skb_push(struct sk_buff *skb, unsigned int len) { skb->data -= len; skb->len += len; if (unlikely(skb->data < skb->head)) skb_under_panic(skb, len, __builtin_return_address(0)); return skb->data; } EXPORT_SYMBOL(skb_push); /** * skb_pull - remove data from the start of a buffer * @skb: buffer to use * @len: amount of data to remove * * This function removes data from the start of a buffer, returning * the memory to the headroom. A pointer to the next data in the buffer * is returned. Once the data has been pulled future pushes will overwrite * the old data. */ void *skb_pull(struct sk_buff *skb, unsigned int len) { return skb_pull_inline(skb, len); } EXPORT_SYMBOL(skb_pull); /** * skb_pull_data - remove data from the start of a buffer returning its * original position. * @skb: buffer to use * @len: amount of data to remove * * This function removes data from the start of a buffer, returning * the memory to the headroom. A pointer to the original data in the buffer * is returned after checking if there is enough data to pull. Once the * data has been pulled future pushes will overwrite the old data. */ void *skb_pull_data(struct sk_buff *skb, size_t len) { void *data = skb->data; if (skb->len < len) return NULL; skb_pull(skb, len); return data; } EXPORT_SYMBOL(skb_pull_data); /** * skb_trim - remove end from a buffer * @skb: buffer to alter * @len: new length * * Cut the length of a buffer down by removing data from the tail. If * the buffer is already under the length specified it is not modified. * The skb must be linear. */ void skb_trim(struct sk_buff *skb, unsigned int len) { if (skb->len > len) __skb_trim(skb, len); } EXPORT_SYMBOL(skb_trim); /* Trims skb to length len. It can change skb pointers. */ int ___pskb_trim(struct sk_buff *skb, unsigned int len) { struct sk_buff **fragp; struct sk_buff *frag; int offset = skb_headlen(skb); int nfrags = skb_shinfo(skb)->nr_frags; int i; int err; if (skb_cloned(skb) && unlikely((err = pskb_expand_head(skb, 0, 0, GFP_ATOMIC)))) return err; i = 0; if (offset >= len) goto drop_pages; for (; i < nfrags; i++) { int end = offset + skb_frag_size(&skb_shinfo(skb)->frags[i]); if (end < len) { offset = end; continue; } skb_frag_size_set(&skb_shinfo(skb)->frags[i++], len - offset); drop_pages: skb_shinfo(skb)->nr_frags = i; for (; i < nfrags; i++) skb_frag_unref(skb, i); if (skb_has_frag_list(skb)) skb_drop_fraglist(skb); goto done; } for (fragp = &skb_shinfo(skb)->frag_list; (frag = *fragp); fragp = &frag->next) { int end = offset + frag->len; if (skb_shared(frag)) { struct sk_buff *nfrag; nfrag = skb_clone(frag, GFP_ATOMIC); if (unlikely(!nfrag)) return -ENOMEM; nfrag->next = frag->next; consume_skb(frag); frag = nfrag; *fragp = frag; } if (end < len) { offset = end; continue; } if (end > len && unlikely((err = pskb_trim(frag, len - offset)))) return err; if (frag->next) skb_drop_list(&frag->next); break; } done: if (len > skb_headlen(skb)) { skb->data_len -= skb->len - len; skb->len = len; } else { skb->len = len; skb->data_len = 0; skb_set_tail_pointer(skb, len); } if (!skb->sk || skb->destructor == sock_edemux) skb_condense(skb); return 0; } EXPORT_SYMBOL(___pskb_trim); /* Note : use pskb_trim_rcsum() instead of calling this directly */ int pskb_trim_rcsum_slow(struct sk_buff *skb, unsigned int len) { if (skb->ip_summed == CHECKSUM_COMPLETE) { int delta = skb->len - len; skb->csum = csum_block_sub(skb->csum, skb_checksum(skb, len, delta, 0), len); } else if (skb->ip_summed == CHECKSUM_PARTIAL) { int hdlen = (len > skb_headlen(skb)) ? skb_headlen(skb) : len; int offset = skb_checksum_start_offset(skb) + skb->csum_offset; if (offset + sizeof(__sum16) > hdlen) return -EINVAL; } return __pskb_trim(skb, len); } EXPORT_SYMBOL(pskb_trim_rcsum_slow); /** * __pskb_pull_tail - advance tail of skb header * @skb: buffer to reallocate * @delta: number of bytes to advance tail * * The function makes a sense only on a fragmented &sk_buff, * it expands header moving its tail forward and copying necessary * data from fragmented part. * * &sk_buff MUST have reference count of 1. * * Returns %NULL (and &sk_buff does not change) if pull failed * or value of new tail of skb in the case of success. * * All the pointers pointing into skb header may change and must be * reloaded after call to this function. */ /* Moves tail of skb head forward, copying data from fragmented part, * when it is necessary. * 1. It may fail due to malloc failure. * 2. It may change skb pointers. * * It is pretty complicated. Luckily, it is called only in exceptional cases. */ void *__pskb_pull_tail(struct sk_buff *skb, int delta) { /* If skb has not enough free space at tail, get new one * plus 128 bytes for future expansions. If we have enough * room at tail, reallocate without expansion only if skb is cloned. */ int i, k, eat = (skb->tail + delta) - skb->end; if (!skb_frags_readable(skb)) return NULL; if (eat > 0 || skb_cloned(skb)) { if (pskb_expand_head(skb, 0, eat > 0 ? eat + 128 : 0, GFP_ATOMIC)) return NULL; } BUG_ON(skb_copy_bits(skb, skb_headlen(skb), skb_tail_pointer(skb), delta)); /* Optimization: no fragments, no reasons to preestimate * size of pulled pages. Superb. */ if (!skb_has_frag_list(skb)) goto pull_pages; /* Estimate size of pulled pages. */ eat = delta; for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) { int size = skb_frag_size(&skb_shinfo(skb)->frags[i]); if (size >= eat) goto pull_pages; eat -= size; } /* If we need update frag list, we are in troubles. * Certainly, it is possible to add an offset to skb data, * but taking into account that pulling is expected to * be very rare operation, it is worth to fight against * further bloating skb head and crucify ourselves here instead. * Pure masohism, indeed. 8)8) */ if (eat) { struct sk_buff *list = skb_shinfo(skb)->frag_list; struct sk_buff *clone = NULL; struct sk_buff *insp = NULL; do { if (list->len <= eat) { /* Eaten as whole. */ eat -= list->len; list = list->next; insp = list; } else { /* Eaten partially. */ if (skb_is_gso(skb) && !list->head_frag && skb_headlen(list)) skb_shinfo(skb)->gso_type |= SKB_GSO_DODGY; if (skb_shared(list)) { /* Sucks! We need to fork list. :-( */ clone = skb_clone(list, GFP_ATOMIC); if (!clone) return NULL; insp = list->next; list = clone; } else { /* This may be pulled without * problems. */ insp = list; } if (!pskb_pull(list, eat)) { kfree_skb(clone); return NULL; } break; } } while (eat); /* Free pulled out fragments. */ while ((list = skb_shinfo(skb)->frag_list) != insp) { skb_shinfo(skb)->frag_list = list->next; consume_skb(list); } /* And insert new clone at head. */ if (clone) { clone->next = list; skb_shinfo(skb)->frag_list = clone; } } /* Success! Now we may commit changes to skb data. */ pull_pages: eat = delta; k = 0; for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) { int size = skb_frag_size(&skb_shinfo(skb)->frags[i]); if (size <= eat) { skb_frag_unref(skb, i); eat -= size; } else { skb_frag_t *frag = &skb_shinfo(skb)->frags[k]; *frag = skb_shinfo(skb)->frags[i]; if (eat) { skb_frag_off_add(frag, eat); skb_frag_size_sub(frag, eat); if (!i) goto end; eat = 0; } k++; } } skb_shinfo(skb)->nr_frags = k; end: skb->tail += delta; skb->data_len -= delta; if (!skb->data_len) skb_zcopy_clear(skb, false); return skb_tail_pointer(skb); } EXPORT_SYMBOL(__pskb_pull_tail); /** * skb_copy_bits - copy bits from skb to kernel buffer * @skb: source skb * @offset: offset in source * @to: destination buffer * @len: number of bytes to copy * * Copy the specified number of bytes from the source skb to the * destination buffer. * * CAUTION ! : * If its prototype is ever changed, * check arch/{*}/net/{*}.S files, * since it is called from BPF assembly code. */ int skb_copy_bits(const struct sk_buff *skb, int offset, void *to, int len) { int start = skb_headlen(skb); struct sk_buff *frag_iter; int i, copy; if (offset > (int)skb->len - len) goto fault; /* Copy header. */ if ((copy = start - offset) > 0) { if (copy > len) copy = len; skb_copy_from_linear_data_offset(skb, offset, to, copy); if ((len -= copy) == 0) return 0; offset += copy; to += copy; } if (!skb_frags_readable(skb)) goto fault; for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) { int end; skb_frag_t *f = &skb_shinfo(skb)->frags[i]; WARN_ON(start > offset + len); end = start + skb_frag_size(f); if ((copy = end - offset) > 0) { u32 p_off, p_len, copied; struct page *p; u8 *vaddr; if (copy > len) copy = len; skb_frag_foreach_page(f, skb_frag_off(f) + offset - start, copy, p, p_off, p_len, copied) { vaddr = kmap_atomic(p); memcpy(to + copied, vaddr + p_off, p_len); kunmap_atomic(vaddr); } if ((len -= copy) == 0) return 0; offset += copy; to += copy; } start = end; } skb_walk_frags(skb, frag_iter) { int end; WARN_ON(start > offset + len); end = start + frag_iter->len; if ((copy = end - offset) > 0) { if (copy > len) copy = len; if (skb_copy_bits(frag_iter, offset - start, to, copy)) goto fault; if ((len -= copy) == 0) return 0; offset += copy; to += copy; } start = end; } if (!len) return 0; fault: return -EFAULT; } EXPORT_SYMBOL(skb_copy_bits); /* * Callback from splice_to_pipe(), if we need to release some pages * at the end of the spd in case we error'ed out in filling the pipe. */ static void sock_spd_release(struct splice_pipe_desc *spd, unsigned int i) { put_page(spd->pages[i]); } static struct page *linear_to_page(struct page *page, unsigned int *len, unsigned int *offset, struct sock *sk) { struct page_frag *pfrag = sk_page_frag(sk); if (!sk_page_frag_refill(sk, pfrag)) return NULL; *len = min_t(unsigned int, *len, pfrag->size - pfrag->offset); memcpy(page_address(pfrag->page) + pfrag->offset, page_address(page) + *offset, *len); *offset = pfrag->offset; pfrag->offset += *len; return pfrag->page; } static bool spd_can_coalesce(const struct splice_pipe_desc *spd, struct page *page, unsigned int offset) { return spd->nr_pages && spd->pages[spd->nr_pages - 1] == page && (spd->partial[spd->nr_pages - 1].offset + spd->partial[spd->nr_pages - 1].len == offset); } /* * Fill page/offset/length into spd, if it can hold more pages. */ static bool spd_fill_page(struct splice_pipe_desc *spd, struct pipe_inode_info *pipe, struct page *page, unsigned int *len, unsigned int offset, bool linear, struct sock *sk) { if (unlikely(spd->nr_pages == MAX_SKB_FRAGS)) return true; if (linear) { page = linear_to_page(page, len, &offset, sk); if (!page) return true; } if (spd_can_coalesce(spd, page, offset)) { spd->partial[spd->nr_pages - 1].len += *len; return false; } get_page(page); spd->pages[spd->nr_pages] = page; spd->partial[spd->nr_pages].len = *len; spd->partial[spd->nr_pages].offset = offset; spd->nr_pages++; return false; } static bool __splice_segment(struct page *page, unsigned int poff, unsigned int plen, unsigned int *off, unsigned int *len, struct splice_pipe_desc *spd, bool linear, struct sock *sk, struct pipe_inode_info *pipe) { if (!*len) return true; /* skip this segment if already processed */ if (*off >= plen) { *off -= plen; return false; } /* ignore any bits we already processed */ poff += *off; plen -= *off; *off = 0; do { unsigned int flen = min(*len, plen); if (spd_fill_page(spd, pipe, page, &flen, poff, linear, sk)) return true; poff += flen; plen -= flen; *len -= flen; } while (*len && plen); return false; } /* * Map linear and fragment data from the skb to spd. It reports true if the * pipe is full or if we already spliced the requested length. */ static bool __skb_splice_bits(struct sk_buff *skb, struct pipe_inode_info *pipe, unsigned int *offset, unsigned int *len, struct splice_pipe_desc *spd, struct sock *sk) { int seg; struct sk_buff *iter; /* map the linear part : * If skb->head_frag is set, this 'linear' part is backed by a * fragment, and if the head is not shared with any clones then * we can avoid a copy since we own the head portion of this page. */ if (__splice_segment(virt_to_page(skb->data), (unsigned long) skb->data & (PAGE_SIZE - 1), skb_headlen(skb), offset, len, spd, skb_head_is_locked(skb), sk, pipe)) return true; /* * then map the fragments */ if (!skb_frags_readable(skb)) return false; for (seg = 0; seg < skb_shinfo(skb)->nr_frags; seg++) { const skb_frag_t *f = &skb_shinfo(skb)->frags[seg]; if (WARN_ON_ONCE(!skb_frag_page(f))) return false; if (__splice_segment(skb_frag_page(f), skb_frag_off(f), skb_frag_size(f), offset, len, spd, false, sk, pipe)) return true; } skb_walk_frags(skb, iter) { if (*offset >= iter->len) { *offset -= iter->len; continue; } /* __skb_splice_bits() only fails if the output has no room * left, so no point in going over the frag_list for the error * case. */ if (__skb_splice_bits(iter, pipe, offset, len, spd, sk)) return true; } return false; } /* * Map data from the skb to a pipe. Should handle both the linear part, * the fragments, and the frag list. */ int skb_splice_bits(struct sk_buff *skb, struct sock *sk, unsigned int offset, struct pipe_inode_info *pipe, unsigned int tlen, unsigned int flags) { struct partial_page partial[MAX_SKB_FRAGS]; struct page *pages[MAX_SKB_FRAGS]; struct splice_pipe_desc spd = { .pages = pages, .partial = partial, .nr_pages_max = MAX_SKB_FRAGS, .ops = &nosteal_pipe_buf_ops, .spd_release = sock_spd_release, }; int ret = 0; __skb_splice_bits(skb, pipe, &offset, &tlen, &spd, sk); if (spd.nr_pages) ret = splice_to_pipe(pipe, &spd); return ret; } EXPORT_SYMBOL_GPL(skb_splice_bits); static int sendmsg_locked(struct sock *sk, struct msghdr *msg) { struct socket *sock = sk->sk_socket; size_t size = msg_data_left(msg); if (!sock) return -EINVAL; if (!sock->ops->sendmsg_locked) return sock_no_sendmsg_locked(sk, msg, size); return sock->ops->sendmsg_locked(sk, msg, size); } static int sendmsg_unlocked(struct sock *sk, struct msghdr *msg) { struct socket *sock = sk->sk_socket; if (!sock) return -EINVAL; return sock_sendmsg(sock, msg); } typedef int (*sendmsg_func)(struct sock *sk, struct msghdr *msg); static int __skb_send_sock(struct sock *sk, struct sk_buff *skb, int offset, int len, sendmsg_func sendmsg, int flags) { unsigned int orig_len = len; struct sk_buff *head = skb; unsigned short fragidx; int slen, ret; do_frag_list: /* Deal with head data */ while (offset < skb_headlen(skb) && len) { struct kvec kv; struct msghdr msg; slen = min_t(int, len, skb_headlen(skb) - offset); kv.iov_base = skb->data + offset; kv.iov_len = slen; memset(&msg, 0, sizeof(msg)); msg.msg_flags = MSG_DONTWAIT | flags; iov_iter_kvec(&msg.msg_iter, ITER_SOURCE, &kv, 1, slen); ret = INDIRECT_CALL_2(sendmsg, sendmsg_locked, sendmsg_unlocked, sk, &msg); if (ret <= 0) goto error; offset += ret; len -= ret; } /* All the data was skb head? */ if (!len) goto out; /* Make offset relative to start of frags */ offset -= skb_headlen(skb); /* Find where we are in frag list */ for (fragidx = 0; fragidx < skb_shinfo(skb)->nr_frags; fragidx++) { skb_frag_t *frag = &skb_shinfo(skb)->frags[fragidx]; if (offset < skb_frag_size(frag)) break; offset -= skb_frag_size(frag); } for (; len && fragidx < skb_shinfo(skb)->nr_frags; fragidx++) { skb_frag_t *frag = &skb_shinfo(skb)->frags[fragidx]; slen = min_t(size_t, len, skb_frag_size(frag) - offset); while (slen) { struct bio_vec bvec; struct msghdr msg = { .msg_flags = MSG_SPLICE_PAGES | MSG_DONTWAIT | flags, }; bvec_set_page(&bvec, skb_frag_page(frag), slen, skb_frag_off(frag) + offset); iov_iter_bvec(&msg.msg_iter, ITER_SOURCE, &bvec, 1, slen); ret = INDIRECT_CALL_2(sendmsg, sendmsg_locked, sendmsg_unlocked, sk, &msg); if (ret <= 0) goto error; len -= ret; offset += ret; slen -= ret; } offset = 0; } if (len) { /* Process any frag lists */ if (skb == head) { if (skb_has_frag_list(skb)) { skb = skb_shinfo(skb)->frag_list; goto do_frag_list; } } else if (skb->next) { skb = skb->next; goto do_frag_list; } } out: return orig_len - len; error: return orig_len == len ? ret : orig_len - len; } /* Send skb data on a socket. Socket must be locked. */ int skb_send_sock_locked(struct sock *sk, struct sk_buff *skb, int offset, int len) { return __skb_send_sock(sk, skb, offset, len, sendmsg_locked, 0); } EXPORT_SYMBOL_GPL(skb_send_sock_locked); int skb_send_sock_locked_with_flags(struct sock *sk, struct sk_buff *skb, int offset, int len, int flags) { return __skb_send_sock(sk, skb, offset, len, sendmsg_locked, flags); } EXPORT_SYMBOL_GPL(skb_send_sock_locked_with_flags); /* Send skb data on a socket. Socket must be unlocked. */ int skb_send_sock(struct sock *sk, struct sk_buff *skb, int offset, int len) { return __skb_send_sock(sk, skb, offset, len, sendmsg_unlocked, 0); } /** * skb_store_bits - store bits from kernel buffer to skb * @skb: destination buffer * @offset: offset in destination * @from: source buffer * @len: number of bytes to copy * * Copy the specified number of bytes from the source buffer to the * destination skb. This function handles all the messy bits of * traversing fragment lists and such. */ int skb_store_bits(struct sk_buff *skb, int offset, const void *from, int len) { int start = skb_headlen(skb); struct sk_buff *frag_iter; int i, copy; if (offset > (int)skb->len - len) goto fault; if ((copy = start - offset) > 0) { if (copy > len) copy = len; skb_copy_to_linear_data_offset(skb, offset, from, copy); if ((len -= copy) == 0) return 0; offset += copy; from += copy; } if (!skb_frags_readable(skb)) goto fault; for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) { skb_frag_t *frag = &skb_shinfo(skb)->frags[i]; int end; WARN_ON(start > offset + len); end = start + skb_frag_size(frag); if ((copy = end - offset) > 0) { u32 p_off, p_len, copied; struct page *p; u8 *vaddr; if (copy > len) copy = len; skb_frag_foreach_page(frag, skb_frag_off(frag) + offset - start, copy, p, p_off, p_len, copied) { vaddr = kmap_atomic(p); memcpy(vaddr + p_off, from + copied, p_len); kunmap_atomic(vaddr); } if ((len -= copy) == 0) return 0; offset += copy; from += copy; } start = end; } skb_walk_frags(skb, frag_iter) { int end; WARN_ON(start > offset + len); end = start + frag_iter->len; if ((copy = end - offset) > 0) { if (copy > len) copy = len; if (skb_store_bits(frag_iter, offset - start, from, copy)) goto fault; if ((len -= copy) == 0) return 0; offset += copy; from += copy; } start = end; } if (!len) return 0; fault: return -EFAULT; } EXPORT_SYMBOL(skb_store_bits); /* Checksum skb data. */ __wsum skb_checksum(const struct sk_buff *skb, int offset, int len, __wsum csum) { int start = skb_headlen(skb); int i, copy = start - offset; struct sk_buff *frag_iter; int pos = 0; /* Checksum header. */ if (copy > 0) { if (copy > len) copy = len; csum = csum_partial(skb->data + offset, copy, csum); if ((len -= copy) == 0) return csum; offset += copy; pos = copy; } if (WARN_ON_ONCE(!skb_frags_readable(skb))) return 0; for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) { int end; skb_frag_t *frag = &skb_shinfo(skb)->frags[i]; WARN_ON(start > offset + len); end = start + skb_frag_size(frag); if ((copy = end - offset) > 0) { u32 p_off, p_len, copied; struct page *p; __wsum csum2; u8 *vaddr; if (copy > len) copy = len; skb_frag_foreach_page(frag, skb_frag_off(frag) + offset - start, copy, p, p_off, p_len, copied) { vaddr = kmap_atomic(p); csum2 = csum_partial(vaddr + p_off, p_len, 0); kunmap_atomic(vaddr); csum = csum_block_add(csum, csum2, pos); pos += p_len; } if (!(len -= copy)) return csum; offset += copy; } start = end; } skb_walk_frags(skb, frag_iter) { int end; WARN_ON(start > offset + len); end = start + frag_iter->len; if ((copy = end - offset) > 0) { __wsum csum2; if (copy > len) copy = len; csum2 = skb_checksum(frag_iter, offset - start, copy, 0); csum = csum_block_add(csum, csum2, pos); if ((len -= copy) == 0) return csum; offset += copy; pos += copy; } start = end; } BUG_ON(len); return csum; } EXPORT_SYMBOL(skb_checksum); /* Both of above in one bottle. */ __wsum skb_copy_and_csum_bits(const struct sk_buff *skb, int offset, u8 *to, int len) { int start = skb_headlen(skb); int i, copy = start - offset; struct sk_buff *frag_iter; int pos = 0; __wsum csum = 0; /* Copy header. */ if (copy > 0) { if (copy > len) copy = len; csum = csum_partial_copy_nocheck(skb->data + offset, to, copy); if ((len -= copy) == 0) return csum; offset += copy; to += copy; pos = copy; } if (!skb_frags_readable(skb)) return 0; for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) { int end; WARN_ON(start > offset + len); end = start + skb_frag_size(&skb_shinfo(skb)->frags[i]); if ((copy = end - offset) > 0) { skb_frag_t *frag = &skb_shinfo(skb)->frags[i]; u32 p_off, p_len, copied; struct page *p; __wsum csum2; u8 *vaddr; if (copy > len) copy = len; skb_frag_foreach_page(frag, skb_frag_off(frag) + offset - start, copy, p, p_off, p_len, copied) { vaddr = kmap_atomic(p); csum2 = csum_partial_copy_nocheck(vaddr + p_off, to + copied, p_len); kunmap_atomic(vaddr); csum = csum_block_add(csum, csum2, pos); pos += p_len; } if (!(len -= copy)) return csum; offset += copy; to += copy; } start = end; } skb_walk_frags(skb, frag_iter) { __wsum csum2; int end; WARN_ON(start > offset + len); end = start + frag_iter->len; if ((copy = end - offset) > 0) { if (copy > len) copy = len; csum2 = skb_copy_and_csum_bits(frag_iter, offset - start, to, copy); csum = csum_block_add(csum, csum2, pos); if ((len -= copy) == 0) return csum; offset += copy; to += copy; pos += copy; } start = end; } BUG_ON(len); return csum; } EXPORT_SYMBOL(skb_copy_and_csum_bits); #ifdef CONFIG_NET_CRC32C u32 skb_crc32c(const struct sk_buff *skb, int offset, int len, u32 crc) { int start = skb_headlen(skb); int i, copy = start - offset; struct sk_buff *frag_iter; if (copy > 0) { copy = min(copy, len); crc = crc32c(crc, skb->data + offset, copy); len -= copy; if (len == 0) return crc; offset += copy; } if (WARN_ON_ONCE(!skb_frags_readable(skb))) return 0; for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) { int end; skb_frag_t *frag = &skb_shinfo(skb)->frags[i]; WARN_ON(start > offset + len); end = start + skb_frag_size(frag); copy = end - offset; if (copy > 0) { u32 p_off, p_len, copied; struct page *p; u8 *vaddr; copy = min(copy, len); skb_frag_foreach_page(frag, skb_frag_off(frag) + offset - start, copy, p, p_off, p_len, copied) { vaddr = kmap_atomic(p); crc = crc32c(crc, vaddr + p_off, p_len); kunmap_atomic(vaddr); } len -= copy; if (len == 0) return crc; offset += copy; } start = end; } skb_walk_frags(skb, frag_iter) { int end; WARN_ON(start > offset + len); end = start + frag_iter->len; copy = end - offset; if (copy > 0) { copy = min(copy, len); crc = skb_crc32c(frag_iter, offset - start, copy, crc); len -= copy; if (len == 0) return crc; offset += copy; } start = end; } BUG_ON(len); return crc; } EXPORT_SYMBOL(skb_crc32c); #endif /* CONFIG_NET_CRC32C */ __sum16 __skb_checksum_complete_head(struct sk_buff *skb, int len) { __sum16 sum; sum = csum_fold(skb_checksum(skb, 0, len, skb->csum)); /* See comments in __skb_checksum_complete(). */ if (likely(!sum)) { if (unlikely(skb->ip_summed == CHECKSUM_COMPLETE) && !skb->csum_complete_sw) netdev_rx_csum_fault(skb->dev, skb); } if (!skb_shared(skb)) skb->csum_valid = !sum; return sum; } EXPORT_SYMBOL(__skb_checksum_complete_head); /* This function assumes skb->csum already holds pseudo header's checksum, * which has been changed from the hardware checksum, for example, by * __skb_checksum_validate_complete(). And, the original skb->csum must * have been validated unsuccessfully for CHECKSUM_COMPLETE case. * * It returns non-zero if the recomputed checksum is still invalid, otherwise * zero. The new checksum is stored back into skb->csum unless the skb is * shared. */ __sum16 __skb_checksum_complete(struct sk_buff *skb) { __wsum csum; __sum16 sum; csum = skb_checksum(skb, 0, skb->len, 0); sum = csum_fold(csum_add(skb->csum, csum)); /* This check is inverted, because we already knew the hardware * checksum is invalid before calling this function. So, if the * re-computed checksum is valid instead, then we have a mismatch * between the original skb->csum and skb_checksum(). This means either * the original hardware checksum is incorrect or we screw up skb->csum * when moving skb->data around. */ if (likely(!sum)) { if (unlikely(skb->ip_summed == CHECKSUM_COMPLETE) && !skb->csum_complete_sw) netdev_rx_csum_fault(skb->dev, skb); } if (!skb_shared(skb)) { /* Save full packet checksum */ skb->csum = csum; skb->ip_summed = CHECKSUM_COMPLETE; skb->csum_complete_sw = 1; skb->csum_valid = !sum; } return sum; } EXPORT_SYMBOL(__skb_checksum_complete); /** * skb_zerocopy_headlen - Calculate headroom needed for skb_zerocopy() * @from: source buffer * * Calculates the amount of linear headroom needed in the 'to' skb passed * into skb_zerocopy(). */ unsigned int skb_zerocopy_headlen(const struct sk_buff *from) { unsigned int hlen = 0; if (!from->head_frag || skb_headlen(from) < L1_CACHE_BYTES || skb_shinfo(from)->nr_frags >= MAX_SKB_FRAGS) { hlen = skb_headlen(from); if (!hlen) hlen = from->len; } if (skb_has_frag_list(from)) hlen = from->len; return hlen; } EXPORT_SYMBOL_GPL(skb_zerocopy_headlen); /** * skb_zerocopy - Zero copy skb to skb * @to: destination buffer * @from: source buffer * @len: number of bytes to copy from source buffer * @hlen: size of linear headroom in destination buffer * * Copies up to `len` bytes from `from` to `to` by creating references * to the frags in the source buffer. * * The `hlen` as calculated by skb_zerocopy_headlen() specifies the * headroom in the `to` buffer. * * Return value: * 0: everything is OK * -ENOMEM: couldn't orphan frags of @from due to lack of memory * -EFAULT: skb_copy_bits() found some problem with skb geometry */ int skb_zerocopy(struct sk_buff *to, struct sk_buff *from, int len, int hlen) { int i, j = 0; int plen = 0; /* length of skb->head fragment */ int ret; struct page *page; unsigned int offset; BUG_ON(!from->head_frag && !hlen); /* dont bother with small payloads */ if (len <= skb_tailroom(to)) return skb_copy_bits(from, 0, skb_put(to, len), len); if (hlen) { ret = skb_copy_bits(from, 0, skb_put(to, hlen), hlen); if (unlikely(ret)) return ret; len -= hlen; } else { plen = min_t(int, skb_headlen(from), len); if (plen) { page = virt_to_head_page(from->head); offset = from->data - (unsigned char *)page_address(page); __skb_fill_netmem_desc(to, 0, page_to_netmem(page), offset, plen); get_page(page); j = 1; len -= plen; } } skb_len_add(to, len + plen); if (unlikely(skb_orphan_frags(from, GFP_ATOMIC))) { skb_tx_error(from); return -ENOMEM; } skb_zerocopy_clone(to, from, GFP_ATOMIC); for (i = 0; i < skb_shinfo(from)->nr_frags; i++) { int size; if (!len) break; skb_shinfo(to)->frags[j] = skb_shinfo(from)->frags[i]; size = min_t(int, skb_frag_size(&skb_shinfo(to)->frags[j]), len); skb_frag_size_set(&skb_shinfo(to)->frags[j], size); len -= size; skb_frag_ref(to, j); j++; } skb_shinfo(to)->nr_frags = j; return 0; } EXPORT_SYMBOL_GPL(skb_zerocopy); void skb_copy_and_csum_dev(const struct sk_buff *skb, u8 *to) { __wsum csum; long csstart; if (skb->ip_summed == CHECKSUM_PARTIAL) csstart = skb_checksum_start_offset(skb); else csstart = skb_headlen(skb); BUG_ON(csstart > skb_headlen(skb)); skb_copy_from_linear_data(skb, to, csstart); csum = 0; if (csstart != skb->len) csum = skb_copy_and_csum_bits(skb, csstart, to + csstart, skb->len - csstart); if (skb->ip_summed == CHECKSUM_PARTIAL) { long csstuff = csstart + skb->csum_offset; *((__sum16 *)(to + csstuff)) = csum_fold(csum); } } EXPORT_SYMBOL(skb_copy_and_csum_dev); /** * skb_dequeue - remove from the head of the queue * @list: list to dequeue from * * Remove the head of the list. The list lock is taken so the function * may be used safely with other locking list functions. The head item is * returned or %NULL if the list is empty. */ struct sk_buff *skb_dequeue(struct sk_buff_head *list) { unsigned long flags; struct sk_buff *result; spin_lock_irqsave(&list->lock, flags); result = __skb_dequeue(list); spin_unlock_irqrestore(&list->lock, flags); return result; } EXPORT_SYMBOL(skb_dequeue); /** * skb_dequeue_tail - remove from the tail of the queue * @list: list to dequeue from * * Remove the tail of the list. The list lock is taken so the function * may be used safely with other locking list functions. The tail item is * returned or %NULL if the list is empty. */ struct sk_buff *skb_dequeue_tail(struct sk_buff_head *list) { unsigned long flags; struct sk_buff *result; spin_lock_irqsave(&list->lock, flags); result = __skb_dequeue_tail(list); spin_unlock_irqrestore(&list->lock, flags); return result; } EXPORT_SYMBOL(skb_dequeue_tail); /** * skb_queue_purge_reason - empty a list * @list: list to empty * @reason: drop reason * * Delete all buffers on an &sk_buff list. Each buffer is removed from * the list and one reference dropped. This function takes the list * lock and is atomic with respect to other list locking functions. */ void skb_queue_purge_reason(struct sk_buff_head *list, enum skb_drop_reason reason) { struct sk_buff_head tmp; unsigned long flags; if (skb_queue_empty_lockless(list)) return; __skb_queue_head_init(&tmp); spin_lock_irqsave(&list->lock, flags); skb_queue_splice_init(list, &tmp); spin_unlock_irqrestore(&list->lock, flags); __skb_queue_purge_reason(&tmp, reason); } EXPORT_SYMBOL(skb_queue_purge_reason); /** * skb_rbtree_purge - empty a skb rbtree * @root: root of the rbtree to empty * Return value: the sum of truesizes of all purged skbs. * * Delete all buffers on an &sk_buff rbtree. Each buffer is removed from * the list and one reference dropped. This function does not take * any lock. Synchronization should be handled by the caller (e.g., TCP * out-of-order queue is protected by the socket lock). */ unsigned int skb_rbtree_purge(struct rb_root *root) { struct rb_node *p = rb_first(root); unsigned int sum = 0; while (p) { struct sk_buff *skb = rb_entry(p, struct sk_buff, rbnode); p = rb_next(p); rb_erase(&skb->rbnode, root); sum += skb->truesize; kfree_skb(skb); } return sum; } void skb_errqueue_purge(struct sk_buff_head *list) { struct sk_buff *skb, *next; struct sk_buff_head kill; unsigned long flags; __skb_queue_head_init(&kill); spin_lock_irqsave(&list->lock, flags); skb_queue_walk_safe(list, skb, next) { if (SKB_EXT_ERR(skb)->ee.ee_origin == SO_EE_ORIGIN_ZEROCOPY || SKB_EXT_ERR(skb)->ee.ee_origin == SO_EE_ORIGIN_TIMESTAMPING) continue; __skb_unlink(skb, list); __skb_queue_tail(&kill, skb); } spin_unlock_irqrestore(&list->lock, flags); __skb_queue_purge(&kill); } EXPORT_SYMBOL(skb_errqueue_purge); /** * skb_queue_head - queue a buffer at the list head * @list: list to use * @newsk: buffer to queue * * Queue a buffer at the start of the list. This function takes the * list lock and can be used safely with other locking &sk_buff functions * safely. * * A buffer cannot be placed on two lists at the same time. */ void skb_queue_head(struct sk_buff_head *list, struct sk_buff *newsk) { unsigned long flags; spin_lock_irqsave(&list->lock, flags); __skb_queue_head(list, newsk); spin_unlock_irqrestore(&list->lock, flags); } EXPORT_SYMBOL(skb_queue_head); /** * skb_queue_tail - queue a buffer at the list tail * @list: list to use * @newsk: buffer to queue * * Queue a buffer at the tail of the list. This function takes the * list lock and can be used safely with other locking &sk_buff functions * safely. * * A buffer cannot be placed on two lists at the same time. */ void skb_queue_tail(struct sk_buff_head *list, struct sk_buff *newsk) { unsigned long flags; spin_lock_irqsave(&list->lock, flags); __skb_queue_tail(list, newsk); spin_unlock_irqrestore(&list->lock, flags); } EXPORT_SYMBOL(skb_queue_tail); /** * skb_unlink - remove a buffer from a list * @skb: buffer to remove * @list: list to use * * Remove a packet from a list. The list locks are taken and this * function is atomic with respect to other list locked calls * * You must know what list the SKB is on. */ void skb_unlink(struct sk_buff *skb, struct sk_buff_head *list) { unsigned long flags; spin_lock_irqsave(&list->lock, flags); __skb_unlink(skb, list); spin_unlock_irqrestore(&list->lock, flags); } EXPORT_SYMBOL(skb_unlink); /** * skb_append - append a buffer * @old: buffer to insert after * @newsk: buffer to insert * @list: list to use * * Place a packet after a given packet in a list. The list locks are taken * and this function is atomic with respect to other list locked calls. * A buffer cannot be placed on two lists at the same time. */ void skb_append(struct sk_buff *old, struct sk_buff *newsk, struct sk_buff_head *list) { unsigned long flags; spin_lock_irqsave(&list->lock, flags); __skb_queue_after(list, old, newsk); spin_unlock_irqrestore(&list->lock, flags); } EXPORT_SYMBOL(skb_append); static inline void skb_split_inside_header(struct sk_buff *skb, struct sk_buff* skb1, const u32 len, const int pos) { int i; skb_copy_from_linear_data_offset(skb, len, skb_put(skb1, pos - len), pos - len); /* And move data appendix as is. */ for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) skb_shinfo(skb1)->frags[i] = skb_shinfo(skb)->frags[i]; skb_shinfo(skb1)->nr_frags = skb_shinfo(skb)->nr_frags; skb1->unreadable = skb->unreadable; skb_shinfo(skb)->nr_frags = 0; skb1->data_len = skb->data_len; skb1->len += skb1->data_len; skb->data_len = 0; skb->len = len; skb_set_tail_pointer(skb, len); } static inline void skb_split_no_header(struct sk_buff *skb, struct sk_buff* skb1, const u32 len, int pos) { int i, k = 0; const int nfrags = skb_shinfo(skb)->nr_frags; skb_shinfo(skb)->nr_frags = 0; skb1->len = skb1->data_len = skb->len - len; skb->len = len; skb->data_len = len - pos; for (i = 0; i < nfrags; i++) { int size = skb_frag_size(&skb_shinfo(skb)->frags[i]); if (pos + size > len) { skb_shinfo(skb1)->frags[k] = skb_shinfo(skb)->frags[i]; if (pos < len) { /* Split frag. * We have two variants in this case: * 1. Move all the frag to the second * part, if it is possible. F.e. * this approach is mandatory for TUX, * where splitting is expensive. * 2. Split is accurately. We make this. */ skb_frag_ref(skb, i); skb_frag_off_add(&skb_shinfo(skb1)->frags[0], len - pos); skb_frag_size_sub(&skb_shinfo(skb1)->frags[0], len - pos); skb_frag_size_set(&skb_shinfo(skb)->frags[i], len - pos); skb_shinfo(skb)->nr_frags++; } k++; } else skb_shinfo(skb)->nr_frags++; pos += size; } skb_shinfo(skb1)->nr_frags = k; skb1->unreadable = skb->unreadable; } /** * skb_split - Split fragmented skb to two parts at length len. * @skb: the buffer to split * @skb1: the buffer to receive the second part * @len: new length for skb */ void skb_split(struct sk_buff *skb, struct sk_buff *skb1, const u32 len) { int pos = skb_headlen(skb); const int zc_flags = SKBFL_SHARED_FRAG | SKBFL_PURE_ZEROCOPY; skb_zcopy_downgrade_managed(skb); skb_shinfo(skb1)->flags |= skb_shinfo(skb)->flags & zc_flags; skb_zerocopy_clone(skb1, skb, 0); if (len < pos) /* Split line is inside header. */ skb_split_inside_header(skb, skb1, len, pos); else /* Second chunk has no header, nothing to copy. */ skb_split_no_header(skb, skb1, len, pos); } EXPORT_SYMBOL(skb_split); /* Shifting from/to a cloned skb is a no-go. * * Caller cannot keep skb_shinfo related pointers past calling here! */ static int skb_prepare_for_shift(struct sk_buff *skb) { return skb_unclone_keeptruesize(skb, GFP_ATOMIC); } /** * skb_shift - Shifts paged data partially from skb to another * @tgt: buffer into which tail data gets added * @skb: buffer from which the paged data comes from * @shiftlen: shift up to this many bytes * * Attempts to shift up to shiftlen worth of bytes, which may be less than * the length of the skb, from skb to tgt. Returns number bytes shifted. * It's up to caller to free skb if everything was shifted. * * If @tgt runs out of frags, the whole operation is aborted. * * Skb cannot include anything else but paged data while tgt is allowed * to have non-paged data as well. * * TODO: full sized shift could be optimized but that would need * specialized skb free'er to handle frags without up-to-date nr_frags. */ int skb_shift(struct sk_buff *tgt, struct sk_buff *skb, int shiftlen) { int from, to, merge, todo; skb_frag_t *fragfrom, *fragto; BUG_ON(shiftlen > skb->len); if (skb_headlen(skb)) return 0; if (skb_zcopy(tgt) || skb_zcopy(skb)) return 0; DEBUG_NET_WARN_ON_ONCE(tgt->pp_recycle != skb->pp_recycle); DEBUG_NET_WARN_ON_ONCE(skb_cmp_decrypted(tgt, skb)); todo = shiftlen; from = 0; to = skb_shinfo(tgt)->nr_frags; fragfrom = &skb_shinfo(skb)->frags[from]; /* Actual merge is delayed until the point when we know we can * commit all, so that we don't have to undo partial changes */ if (!skb_can_coalesce(tgt, to, skb_frag_page(fragfrom), skb_frag_off(fragfrom))) { merge = -1; } else { merge = to - 1; todo -= skb_frag_size(fragfrom); if (todo < 0) { if (skb_prepare_for_shift(skb) || skb_prepare_for_shift(tgt)) return 0; /* All previous frag pointers might be stale! */ fragfrom = &skb_shinfo(skb)->frags[from]; fragto = &skb_shinfo(tgt)->frags[merge]; skb_frag_size_add(fragto, shiftlen); skb_frag_size_sub(fragfrom, shiftlen); skb_frag_off_add(fragfrom, shiftlen); goto onlymerged; } from++; } /* Skip full, not-fitting skb to avoid expensive operations */ if ((shiftlen == skb->len) && (skb_shinfo(skb)->nr_frags - from) > (MAX_SKB_FRAGS - to)) return 0; if (skb_prepare_for_shift(skb) || skb_prepare_for_shift(tgt)) return 0; while ((todo > 0) && (from < skb_shinfo(skb)->nr_frags)) { if (to == MAX_SKB_FRAGS) return 0; fragfrom = &skb_shinfo(skb)->frags[from]; fragto = &skb_shinfo(tgt)->frags[to]; if (todo >= skb_frag_size(fragfrom)) { *fragto = *fragfrom; todo -= skb_frag_size(fragfrom); from++; to++; } else { __skb_frag_ref(fragfrom); skb_frag_page_copy(fragto, fragfrom); skb_frag_off_copy(fragto, fragfrom); skb_frag_size_set(fragto, todo); skb_frag_off_add(fragfrom, todo); skb_frag_size_sub(fragfrom, todo); todo = 0; to++; break; } } /* Ready to "commit" this state change to tgt */ skb_shinfo(tgt)->nr_frags = to; if (merge >= 0) { fragfrom = &skb_shinfo(skb)->frags[0]; fragto = &skb_shinfo(tgt)->frags[merge]; skb_frag_size_add(fragto, skb_frag_size(fragfrom)); __skb_frag_unref(fragfrom, skb->pp_recycle); } /* Reposition in the original skb */ to = 0; while (from < skb_shinfo(skb)->nr_frags) skb_shinfo(skb)->frags[to++] = skb_shinfo(skb)->frags[from++]; skb_shinfo(skb)->nr_frags = to; BUG_ON(todo > 0 && !skb_shinfo(skb)->nr_frags); onlymerged: /* Most likely the tgt won't ever need its checksum anymore, skb on * the other hand might need it if it needs to be resent */ tgt->ip_summed = CHECKSUM_PARTIAL; skb->ip_summed = CHECKSUM_PARTIAL; skb_len_add(skb, -shiftlen); skb_len_add(tgt, shiftlen); return shiftlen; } /** * skb_prepare_seq_read - Prepare a sequential read of skb data * @skb: the buffer to read * @from: lower offset of data to be read * @to: upper offset of data to be read * @st: state variable * * Initializes the specified state variable. Must be called before * invoking skb_seq_read() for the first time. */ void skb_prepare_seq_read(struct sk_buff *skb, unsigned int from, unsigned int to, struct skb_seq_state *st) { st->lower_offset = from; st->upper_offset = to; st->root_skb = st->cur_skb = skb; st->frag_idx = st->stepped_offset = 0; st->frag_data = NULL; st->frag_off = 0; } EXPORT_SYMBOL(skb_prepare_seq_read); /** * skb_seq_read - Sequentially read skb data * @consumed: number of bytes consumed by the caller so far * @data: destination pointer for data to be returned * @st: state variable * * Reads a block of skb data at @consumed relative to the * lower offset specified to skb_prepare_seq_read(). Assigns * the head of the data block to @data and returns the length * of the block or 0 if the end of the skb data or the upper * offset has been reached. * * The caller is not required to consume all of the data * returned, i.e. @consumed is typically set to the number * of bytes already consumed and the next call to * skb_seq_read() will return the remaining part of the block. * * Note 1: The size of each block of data returned can be arbitrary, * this limitation is the cost for zerocopy sequential * reads of potentially non linear data. * * Note 2: Fragment lists within fragments are not implemented * at the moment, state->root_skb could be replaced with * a stack for this purpose. */ unsigned int skb_seq_read(unsigned int consumed, const u8 **data, struct skb_seq_state *st) { unsigned int block_limit, abs_offset = consumed + st->lower_offset; skb_frag_t *frag; if (unlikely(abs_offset >= st->upper_offset)) { if (st->frag_data) { kunmap_atomic(st->frag_data); st->frag_data = NULL; } return 0; } next_skb: block_limit = skb_headlen(st->cur_skb) + st->stepped_offset; if (abs_offset < block_limit && !st->frag_data) { *data = st->cur_skb->data + (abs_offset - st->stepped_offset); return block_limit - abs_offset; } if (!skb_frags_readable(st->cur_skb)) return 0; if (st->frag_idx == 0 && !st->frag_data) st->stepped_offset += skb_headlen(st->cur_skb); while (st->frag_idx < skb_shinfo(st->cur_skb)->nr_frags) { unsigned int pg_idx, pg_off, pg_sz; frag = &skb_shinfo(st->cur_skb)->frags[st->frag_idx]; pg_idx = 0; pg_off = skb_frag_off(frag); pg_sz = skb_frag_size(frag); if (skb_frag_must_loop(skb_frag_page(frag))) { pg_idx = (pg_off + st->frag_off) >> PAGE_SHIFT; pg_off = offset_in_page(pg_off + st->frag_off); pg_sz = min_t(unsigned int, pg_sz - st->frag_off, PAGE_SIZE - pg_off); } block_limit = pg_sz + st->stepped_offset; if (abs_offset < block_limit) { if (!st->frag_data) st->frag_data = kmap_atomic(skb_frag_page(frag) + pg_idx); *data = (u8 *)st->frag_data + pg_off + (abs_offset - st->stepped_offset); return block_limit - abs_offset; } if (st->frag_data) { kunmap_atomic(st->frag_data); st->frag_data = NULL; } st->stepped_offset += pg_sz; st->frag_off += pg_sz; if (st->frag_off == skb_frag_size(frag)) { st->frag_off = 0; st->frag_idx++; } } if (st->frag_data) { kunmap_atomic(st->frag_data); st->frag_data = NULL; } if (st->root_skb == st->cur_skb && skb_has_frag_list(st->root_skb)) { st->cur_skb = skb_shinfo(st->root_skb)->frag_list; st->frag_idx = 0; goto next_skb; } else if (st->cur_skb->next) { st->cur_skb = st->cur_skb->next; st->frag_idx = 0; goto next_skb; } return 0; } EXPORT_SYMBOL(skb_seq_read); /** * skb_abort_seq_read - Abort a sequential read of skb data * @st: state variable * * Must be called if skb_seq_read() was not called until it * returned 0. */ void skb_abort_seq_read(struct skb_seq_state *st) { if (st->frag_data) kunmap_atomic(st->frag_data); } EXPORT_SYMBOL(skb_abort_seq_read); /** * skb_copy_seq_read() - copy from a skb_seq_state to a buffer * @st: source skb_seq_state * @offset: offset in source * @to: destination buffer * @len: number of bytes to copy * * Copy @len bytes from @offset bytes into the source @st to the destination * buffer @to. `offset` should increase (or be unchanged) with each subsequent * call to this function. If offset needs to decrease from the previous use `st` * should be reset first. * * Return: 0 on success or -EINVAL if the copy ended early */ int skb_copy_seq_read(struct skb_seq_state *st, int offset, void *to, int len) { const u8 *data; u32 sqlen; for (;;) { sqlen = skb_seq_read(offset, &data, st); if (sqlen == 0) return -EINVAL; if (sqlen >= len) { memcpy(to, data, len); return 0; } memcpy(to, data, sqlen); to += sqlen; offset += sqlen; len -= sqlen; } } EXPORT_SYMBOL(skb_copy_seq_read); #define TS_SKB_CB(state) ((struct skb_seq_state *) &((state)->cb)) static unsigned int skb_ts_get_next_block(unsigned int offset, const u8 **text, struct ts_config *conf, struct ts_state *state) { return skb_seq_read(offset, text, TS_SKB_CB(state)); } static void skb_ts_finish(struct ts_config *conf, struct ts_state *state) { skb_abort_seq_read(TS_SKB_CB(state)); } /** * skb_find_text - Find a text pattern in skb data * @skb: the buffer to look in * @from: search offset * @to: search limit * @config: textsearch configuration * * Finds a pattern in the skb data according to the specified * textsearch configuration. Use textsearch_next() to retrieve * subsequent occurrences of the pattern. Returns the offset * to the first occurrence or UINT_MAX if no match was found. */ unsigned int skb_find_text(struct sk_buff *skb, unsigned int from, unsigned int to, struct ts_config *config) { unsigned int patlen = config->ops->get_pattern_len(config); struct ts_state state; unsigned int ret; BUILD_BUG_ON(sizeof(struct skb_seq_state) > sizeof(state.cb)); config->get_next_block = skb_ts_get_next_block; config->finish = skb_ts_finish; skb_prepare_seq_read(skb, from, to, TS_SKB_CB(&state)); ret = textsearch_find(config, &state); return (ret + patlen <= to - from ? ret : UINT_MAX); } EXPORT_SYMBOL(skb_find_text); int skb_append_pagefrags(struct sk_buff *skb, struct page *page, int offset, size_t size, size_t max_frags) { int i = skb_shinfo(skb)->nr_frags; if (skb_can_coalesce(skb, i, page, offset)) { skb_frag_size_add(&skb_shinfo(skb)->frags[i - 1], size); } else if (i < max_frags) { skb_zcopy_downgrade_managed(skb); get_page(page); skb_fill_page_desc_noacc(skb, i, page, offset, size); } else { return -EMSGSIZE; } return 0; } EXPORT_SYMBOL_GPL(skb_append_pagefrags); /** * skb_pull_rcsum - pull skb and update receive checksum * @skb: buffer to update * @len: length of data pulled * * This function performs an skb_pull on the packet and updates * the CHECKSUM_COMPLETE checksum. It should be used on * receive path processing instead of skb_pull unless you know * that the checksum difference is zero (e.g., a valid IP header) * or you are setting ip_summed to CHECKSUM_NONE. */ void *skb_pull_rcsum(struct sk_buff *skb, unsigned int len) { unsigned char *data = skb->data; BUG_ON(len > skb->len); __skb_pull(skb, len); skb_postpull_rcsum(skb, data, len); return skb->data; } EXPORT_SYMBOL_GPL(skb_pull_rcsum); static inline skb_frag_t skb_head_frag_to_page_desc(struct sk_buff *frag_skb) { skb_frag_t head_frag; struct page *page; page = virt_to_head_page(frag_skb->head); skb_frag_fill_page_desc(&head_frag, page, frag_skb->data - (unsigned char *)page_address(page), skb_headlen(frag_skb)); return head_frag; } struct sk_buff *skb_segment_list(struct sk_buff *skb, netdev_features_t features, unsigned int offset) { struct sk_buff *list_skb = skb_shinfo(skb)->frag_list; unsigned int tnl_hlen = skb_tnl_header_len(skb); unsigned int delta_truesize = 0; unsigned int delta_len = 0; struct sk_buff *tail = NULL; struct sk_buff *nskb, *tmp; int len_diff, err; skb_push(skb, -skb_network_offset(skb) + offset); /* Ensure the head is writeable before touching the shared info */ err = skb_unclone(skb, GFP_ATOMIC); if (err) goto err_linearize; skb_shinfo(skb)->frag_list = NULL; while (list_skb) { nskb = list_skb; list_skb = list_skb->next; err = 0; delta_truesize += nskb->truesize; if (skb_shared(nskb)) { tmp = skb_clone(nskb, GFP_ATOMIC); if (tmp) { consume_skb(nskb); nskb = tmp; err = skb_unclone(nskb, GFP_ATOMIC); } else { err = -ENOMEM; } } if (!tail) skb->next = nskb; else tail->next = nskb; if (unlikely(err)) { nskb->next = list_skb; goto err_linearize; } tail = nskb; delta_len += nskb->len; skb_push(nskb, -skb_network_offset(nskb) + offset); skb_release_head_state(nskb); len_diff = skb_network_header_len(nskb) - skb_network_header_len(skb); __copy_skb_header(nskb, skb); skb_headers_offset_update(nskb, skb_headroom(nskb) - skb_headroom(skb)); nskb->transport_header += len_diff; skb_copy_from_linear_data_offset(skb, -tnl_hlen, nskb->data - tnl_hlen, offset + tnl_hlen); if (skb_needs_linearize(nskb, features) && __skb_linearize(nskb)) goto err_linearize; } skb->truesize = skb->truesize - delta_truesize; skb->data_len = skb->data_len - delta_len; skb->len = skb->len - delta_len; skb_gso_reset(skb); skb->prev = tail; if (skb_needs_linearize(skb, features) && __skb_linearize(skb)) goto err_linearize; skb_get(skb); return skb; err_linearize: kfree_skb_list(skb->next); skb->next = NULL; return ERR_PTR(-ENOMEM); } EXPORT_SYMBOL_GPL(skb_segment_list); /** * skb_segment - Perform protocol segmentation on skb. * @head_skb: buffer to segment * @features: features for the output path (see dev->features) * * This function performs segmentation on the given skb. It returns * a pointer to the first in a list of new skbs for the segments. * In case of error it returns ERR_PTR(err). */ struct sk_buff *skb_segment(struct sk_buff *head_skb, netdev_features_t features) { struct sk_buff *segs = NULL; struct sk_buff *tail = NULL; struct sk_buff *list_skb = skb_shinfo(head_skb)->frag_list; unsigned int mss = skb_shinfo(head_skb)->gso_size; unsigned int doffset = head_skb->data - skb_mac_header(head_skb); unsigned int offset = doffset; unsigned int tnl_hlen = skb_tnl_header_len(head_skb); unsigned int partial_segs = 0; unsigned int headroom; unsigned int len = head_skb->len; struct sk_buff *frag_skb; skb_frag_t *frag; __be16 proto; bool csum, sg; int err = -ENOMEM; int i = 0; int nfrags, pos; if ((skb_shinfo(head_skb)->gso_type & SKB_GSO_DODGY) && mss != GSO_BY_FRAGS && mss != skb_headlen(head_skb)) { struct sk_buff *check_skb; for (check_skb = list_skb; check_skb; check_skb = check_skb->next) { if (skb_headlen(check_skb) && !check_skb->head_frag) { /* gso_size is untrusted, and we have a frag_list with * a linear non head_frag item. * * If head_skb's headlen does not fit requested gso_size, * it means that the frag_list members do NOT terminate * on exact gso_size boundaries. Hence we cannot perform * skb_frag_t page sharing. Therefore we must fallback to * copying the frag_list skbs; we do so by disabling SG. */ features &= ~NETIF_F_SG; break; } } } __skb_push(head_skb, doffset); proto = skb_network_protocol(head_skb, NULL); if (unlikely(!proto)) return ERR_PTR(-EINVAL); sg = !!(features & NETIF_F_SG); csum = !!can_checksum_protocol(features, proto); if (sg && csum && (mss != GSO_BY_FRAGS)) { if (!(features & NETIF_F_GSO_PARTIAL)) { struct sk_buff *iter; unsigned int frag_len; if (!list_skb || !net_gso_ok(features, skb_shinfo(head_skb)->gso_type)) goto normal; /* If we get here then all the required * GSO features except frag_list are supported. * Try to split the SKB to multiple GSO SKBs * with no frag_list. * Currently we can do that only when the buffers don't * have a linear part and all the buffers except * the last are of the same length. */ frag_len = list_skb->len; skb_walk_frags(head_skb, iter) { if (frag_len != iter->len && iter->next) goto normal; if (skb_headlen(iter) && !iter->head_frag) goto normal; len -= iter->len; } if (len != frag_len) goto normal; } /* GSO partial only requires that we trim off any excess that * doesn't fit into an MSS sized block, so take care of that * now. * Cap len to not accidentally hit GSO_BY_FRAGS. */ partial_segs = min(len, GSO_BY_FRAGS - 1) / mss; if (partial_segs > 1) mss *= partial_segs; else partial_segs = 0; } normal: headroom = skb_headroom(head_skb); pos = skb_headlen(head_skb); if (skb_orphan_frags(head_skb, GFP_ATOMIC)) return ERR_PTR(-ENOMEM); nfrags = skb_shinfo(head_skb)->nr_frags; frag = skb_shinfo(head_skb)->frags; frag_skb = head_skb; do { struct sk_buff *nskb; skb_frag_t *nskb_frag; int hsize; int size; if (unlikely(mss == GSO_BY_FRAGS)) { len = list_skb->len; } else { len = head_skb->len - offset; if (len > mss) len = mss; } hsize = skb_headlen(head_skb) - offset; if (hsize <= 0 && i >= nfrags && skb_headlen(list_skb) && (skb_headlen(list_skb) == len || sg)) { BUG_ON(skb_headlen(list_skb) > len); nskb = skb_clone(list_skb, GFP_ATOMIC); if (unlikely(!nskb)) goto err; i = 0; nfrags = skb_shinfo(list_skb)->nr_frags; frag = skb_shinfo(list_skb)->frags; frag_skb = list_skb; pos += skb_headlen(list_skb); while (pos < offset + len) { BUG_ON(i >= nfrags); size = skb_frag_size(frag); if (pos + size > offset + len) break; i++; pos += size; frag++; } list_skb = list_skb->next; if (unlikely(pskb_trim(nskb, len))) { kfree_skb(nskb); goto err; } hsize = skb_end_offset(nskb); if (skb_cow_head(nskb, doffset + headroom)) { kfree_skb(nskb); goto err; } nskb->truesize += skb_end_offset(nskb) - hsize; skb_release_head_state(nskb); __skb_push(nskb, doffset); } else { if (hsize < 0) hsize = 0; if (hsize > len || !sg) hsize = len; nskb = __alloc_skb(hsize + doffset + headroom, GFP_ATOMIC, skb_alloc_rx_flag(head_skb), NUMA_NO_NODE); if (unlikely(!nskb)) goto err; skb_reserve(nskb, headroom); __skb_put(nskb, doffset); } if (segs) tail->next = nskb; else segs = nskb; tail = nskb; __copy_skb_header(nskb, head_skb); skb_headers_offset_update(nskb, skb_headroom(nskb) - headroom); skb_reset_mac_len(nskb); skb_copy_from_linear_data_offset(head_skb, -tnl_hlen, nskb->data - tnl_hlen, doffset + tnl_hlen); if (nskb->len == len + doffset) goto perform_csum_check; if (!sg) { if (!csum) { if (!nskb->remcsum_offload) nskb->ip_summed = CHECKSUM_NONE; SKB_GSO_CB(nskb)->csum = skb_copy_and_csum_bits(head_skb, offset, skb_put(nskb, len), len); SKB_GSO_CB(nskb)->csum_start = skb_headroom(nskb) + doffset; } else { if (skb_copy_bits(head_skb, offset, skb_put(nskb, len), len)) goto err; } continue; } nskb_frag = skb_shinfo(nskb)->frags; skb_copy_from_linear_data_offset(head_skb, offset, skb_put(nskb, hsize), hsize); skb_shinfo(nskb)->flags |= skb_shinfo(head_skb)->flags & SKBFL_SHARED_FRAG; if (skb_zerocopy_clone(nskb, frag_skb, GFP_ATOMIC)) goto err; while (pos < offset + len) { if (i >= nfrags) { if (skb_orphan_frags(list_skb, GFP_ATOMIC) || skb_zerocopy_clone(nskb, list_skb, GFP_ATOMIC)) goto err; i = 0; nfrags = skb_shinfo(list_skb)->nr_frags; frag = skb_shinfo(list_skb)->frags; frag_skb = list_skb; if (!skb_headlen(list_skb)) { BUG_ON(!nfrags); } else { BUG_ON(!list_skb->head_frag); /* to make room for head_frag. */ i--; frag--; } list_skb = list_skb->next; } if (unlikely(skb_shinfo(nskb)->nr_frags >= MAX_SKB_FRAGS)) { net_warn_ratelimited( "skb_segment: too many frags: %u %u\n", pos, mss); err = -EINVAL; goto err; } *nskb_frag = (i < 0) ? skb_head_frag_to_page_desc(frag_skb) : *frag; __skb_frag_ref(nskb_frag); size = skb_frag_size(nskb_frag); if (pos < offset) { skb_frag_off_add(nskb_frag, offset - pos); skb_frag_size_sub(nskb_frag, offset - pos); } skb_shinfo(nskb)->nr_frags++; if (pos + size <= offset + len) { i++; frag++; pos += size; } else { skb_frag_size_sub(nskb_frag, pos + size - (offset + len)); goto skip_fraglist; } nskb_frag++; } skip_fraglist: nskb->data_len = len - hsize; nskb->len += nskb->data_len; nskb->truesize += nskb->data_len; perform_csum_check: if (!csum) { if (skb_has_shared_frag(nskb) && __skb_linearize(nskb)) goto err; if (!nskb->remcsum_offload) nskb->ip_summed = CHECKSUM_NONE; SKB_GSO_CB(nskb)->csum = skb_checksum(nskb, doffset, nskb->len - doffset, 0); SKB_GSO_CB(nskb)->csum_start = skb_headroom(nskb) + doffset; } } while ((offset += len) < head_skb->len); /* Some callers want to get the end of the list. * Put it in segs->prev to avoid walking the list. * (see validate_xmit_skb_list() for example) */ segs->prev = tail; if (partial_segs) { struct sk_buff *iter; int type = skb_shinfo(head_skb)->gso_type; unsigned short gso_size = skb_shinfo(head_skb)->gso_size; /* Update type to add partial and then remove dodgy if set */ type |= (features & NETIF_F_GSO_PARTIAL) / NETIF_F_GSO_PARTIAL * SKB_GSO_PARTIAL; type &= ~SKB_GSO_DODGY; /* Update GSO info and prepare to start updating headers on * our way back down the stack of protocols. */ for (iter = segs; iter; iter = iter->next) { skb_shinfo(iter)->gso_size = gso_size; skb_shinfo(iter)->gso_segs = partial_segs; skb_shinfo(iter)->gso_type = type; SKB_GSO_CB(iter)->data_offset = skb_headroom(iter) + doffset; } if (tail->len - doffset <= gso_size) skb_shinfo(tail)->gso_size = 0; else if (tail != segs) skb_shinfo(tail)->gso_segs = DIV_ROUND_UP(tail->len - doffset, gso_size); } /* Following permits correct backpressure, for protocols * using skb_set_owner_w(). * Idea is to tranfert ownership from head_skb to last segment. */ if (head_skb->destructor == sock_wfree) { swap(tail->truesize, head_skb->truesize); swap(tail->destructor, head_skb->destructor); swap(tail->sk, head_skb->sk); } return segs; err: kfree_skb_list(segs); return ERR_PTR(err); } EXPORT_SYMBOL_GPL(skb_segment); #ifdef CONFIG_SKB_EXTENSIONS #define SKB_EXT_ALIGN_VALUE 8 #define SKB_EXT_CHUNKSIZEOF(x) (ALIGN((sizeof(x)), SKB_EXT_ALIGN_VALUE) / SKB_EXT_ALIGN_VALUE) static const u8 skb_ext_type_len[] = { #if IS_ENABLED(CONFIG_BRIDGE_NETFILTER) [SKB_EXT_BRIDGE_NF] = SKB_EXT_CHUNKSIZEOF(struct nf_bridge_info), #endif #ifdef CONFIG_XFRM [SKB_EXT_SEC_PATH] = SKB_EXT_CHUNKSIZEOF(struct sec_path), #endif #if IS_ENABLED(CONFIG_NET_TC_SKB_EXT) [TC_SKB_EXT] = SKB_EXT_CHUNKSIZEOF(struct tc_skb_ext), #endif #if IS_ENABLED(CONFIG_MPTCP) [SKB_EXT_MPTCP] = SKB_EXT_CHUNKSIZEOF(struct mptcp_ext), #endif #if IS_ENABLED(CONFIG_MCTP_FLOWS) [SKB_EXT_MCTP] = SKB_EXT_CHUNKSIZEOF(struct mctp_flow), #endif }; static __always_inline unsigned int skb_ext_total_length(void) { unsigned int l = SKB_EXT_CHUNKSIZEOF(struct skb_ext); int i; for (i = 0; i < ARRAY_SIZE(skb_ext_type_len); i++) l += skb_ext_type_len[i]; return l; } static void skb_extensions_init(void) { BUILD_BUG_ON(SKB_EXT_NUM >= 8); #if !IS_ENABLED(CONFIG_KCOV_INSTRUMENT_ALL) BUILD_BUG_ON(skb_ext_total_length() > 255); #endif skbuff_ext_cache = kmem_cache_create("skbuff_ext_cache", SKB_EXT_ALIGN_VALUE * skb_ext_total_length(), 0, SLAB_HWCACHE_ALIGN|SLAB_PANIC, NULL); } #else static void skb_extensions_init(void) {} #endif /* The SKB kmem_cache slab is critical for network performance. Never * merge/alias the slab with similar sized objects. This avoids fragmentation * that hurts performance of kmem_cache_{alloc,free}_bulk APIs. */ #ifndef CONFIG_SLUB_TINY #define FLAG_SKB_NO_MERGE SLAB_NO_MERGE #else /* CONFIG_SLUB_TINY - simple loop in kmem_cache_alloc_bulk */ #define FLAG_SKB_NO_MERGE 0 #endif void __init skb_init(void) { net_hotdata.skbuff_cache = kmem_cache_create_usercopy("skbuff_head_cache", sizeof(struct sk_buff), 0, SLAB_HWCACHE_ALIGN|SLAB_PANIC| FLAG_SKB_NO_MERGE, offsetof(struct sk_buff, cb), sizeof_field(struct sk_buff, cb), NULL); net_hotdata.skbuff_fclone_cache = kmem_cache_create("skbuff_fclone_cache", sizeof(struct sk_buff_fclones), 0, SLAB_HWCACHE_ALIGN|SLAB_PANIC, NULL); /* usercopy should only access first SKB_SMALL_HEAD_HEADROOM bytes. * struct skb_shared_info is located at the end of skb->head, * and should not be copied to/from user. */ net_hotdata.skb_small_head_cache = kmem_cache_create_usercopy("skbuff_small_head", SKB_SMALL_HEAD_CACHE_SIZE, 0, SLAB_HWCACHE_ALIGN | SLAB_PANIC, 0, SKB_SMALL_HEAD_HEADROOM, NULL); skb_extensions_init(); } static int __skb_to_sgvec(struct sk_buff *skb, struct scatterlist *sg, int offset, int len, unsigned int recursion_level) { int start = skb_headlen(skb); int i, copy = start - offset; struct sk_buff *frag_iter; int elt = 0; if (unlikely(recursion_level >= 24)) return -EMSGSIZE; if (copy > 0) { if (copy > len) copy = len; sg_set_buf(sg, skb->data + offset, copy); elt++; if ((len -= copy) == 0) return elt; offset += copy; } for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) { int end; WARN_ON(start > offset + len); end = start + skb_frag_size(&skb_shinfo(skb)->frags[i]); if ((copy = end - offset) > 0) { skb_frag_t *frag = &skb_shinfo(skb)->frags[i]; if (unlikely(elt && sg_is_last(&sg[elt - 1]))) return -EMSGSIZE; if (copy > len) copy = len; sg_set_page(&sg[elt], skb_frag_page(frag), copy, skb_frag_off(frag) + offset - start); elt++; if (!(len -= copy)) return elt; offset += copy; } start = end; } skb_walk_frags(skb, frag_iter) { int end, ret; WARN_ON(start > offset + len); end = start + frag_iter->len; if ((copy = end - offset) > 0) { if (unlikely(elt && sg_is_last(&sg[elt - 1]))) return -EMSGSIZE; if (copy > len) copy = len; ret = __skb_to_sgvec(frag_iter, sg+elt, offset - start, copy, recursion_level + 1); if (unlikely(ret < 0)) return ret; elt += ret; if ((len -= copy) == 0) return elt; offset += copy; } start = end; } BUG_ON(len); return elt; } /** * skb_to_sgvec - Fill a scatter-gather list from a socket buffer * @skb: Socket buffer containing the buffers to be mapped * @sg: The scatter-gather list to map into * @offset: The offset into the buffer's contents to start mapping * @len: Length of buffer space to be mapped * * Fill the specified scatter-gather list with mappings/pointers into a * region of the buffer space attached to a socket buffer. Returns either * the number of scatterlist items used, or -EMSGSIZE if the contents * could not fit. */ int skb_to_sgvec(struct sk_buff *skb, struct scatterlist *sg, int offset, int len) { int nsg = __skb_to_sgvec(skb, sg, offset, len, 0); if (nsg <= 0) return nsg; sg_mark_end(&sg[nsg - 1]); return nsg; } EXPORT_SYMBOL_GPL(skb_to_sgvec); /* As compared with skb_to_sgvec, skb_to_sgvec_nomark only map skb to given * sglist without mark the sg which contain last skb data as the end. * So the caller can mannipulate sg list as will when padding new data after * the first call without calling sg_unmark_end to expend sg list. * * Scenario to use skb_to_sgvec_nomark: * 1. sg_init_table * 2. skb_to_sgvec_nomark(payload1) * 3. skb_to_sgvec_nomark(payload2) * * This is equivalent to: * 1. sg_init_table * 2. skb_to_sgvec(payload1) * 3. sg_unmark_end * 4. skb_to_sgvec(payload2) * * When mapping multiple payload conditionally, skb_to_sgvec_nomark * is more preferable. */ int skb_to_sgvec_nomark(struct sk_buff *skb, struct scatterlist *sg, int offset, int len) { return __skb_to_sgvec(skb, sg, offset, len, 0); } EXPORT_SYMBOL_GPL(skb_to_sgvec_nomark); /** * skb_cow_data - Check that a socket buffer's data buffers are writable * @skb: The socket buffer to check. * @tailbits: Amount of trailing space to be added * @trailer: Returned pointer to the skb where the @tailbits space begins * * Make sure that the data buffers attached to a socket buffer are * writable. If they are not, private copies are made of the data buffers * and the socket buffer is set to use these instead. * * If @tailbits is given, make sure that there is space to write @tailbits * bytes of data beyond current end of socket buffer. @trailer will be * set to point to the skb in which this space begins. * * The number of scatterlist elements required to completely map the * COW'd and extended socket buffer will be returned. */ int skb_cow_data(struct sk_buff *skb, int tailbits, struct sk_buff **trailer) { int copyflag; int elt; struct sk_buff *skb1, **skb_p; /* If skb is cloned or its head is paged, reallocate * head pulling out all the pages (pages are considered not writable * at the moment even if they are anonymous). */ if ((skb_cloned(skb) || skb_shinfo(skb)->nr_frags) && !__pskb_pull_tail(skb, __skb_pagelen(skb))) return -ENOMEM; /* Easy case. Most of packets will go this way. */ if (!skb_has_frag_list(skb)) { /* A little of trouble, not enough of space for trailer. * This should not happen, when stack is tuned to generate * good frames. OK, on miss we reallocate and reserve even more * space, 128 bytes is fair. */ if (skb_tailroom(skb) < tailbits && pskb_expand_head(skb, 0, tailbits-skb_tailroom(skb)+128, GFP_ATOMIC)) return -ENOMEM; /* Voila! */ *trailer = skb; return 1; } /* Misery. We are in troubles, going to mincer fragments... */ elt = 1; skb_p = &skb_shinfo(skb)->frag_list; copyflag = 0; while ((skb1 = *skb_p) != NULL) { int ntail = 0; /* The fragment is partially pulled by someone, * this can happen on input. Copy it and everything * after it. */ if (skb_shared(skb1)) copyflag = 1; /* If the skb is the last, worry about trailer. */ if (skb1->next == NULL && tailbits) { if (skb_shinfo(skb1)->nr_frags || skb_has_frag_list(skb1) || skb_tailroom(skb1) < tailbits) ntail = tailbits + 128; } if (copyflag || skb_cloned(skb1) || ntail || skb_shinfo(skb1)->nr_frags || skb_has_frag_list(skb1)) { struct sk_buff *skb2; /* Fuck, we are miserable poor guys... */ if (ntail == 0) skb2 = skb_copy(skb1, GFP_ATOMIC); else skb2 = skb_copy_expand(skb1, skb_headroom(skb1), ntail, GFP_ATOMIC); if (unlikely(skb2 == NULL)) return -ENOMEM; if (skb1->sk) skb_set_owner_w(skb2, skb1->sk); /* Looking around. Are we still alive? * OK, link new skb, drop old one */ skb2->next = skb1->next; *skb_p = skb2; kfree_skb(skb1); skb1 = skb2; } elt++; *trailer = skb1; skb_p = &skb1->next; } return elt; } EXPORT_SYMBOL_GPL(skb_cow_data); static void sock_rmem_free(struct sk_buff *skb) { struct sock *sk = skb->sk; atomic_sub(skb->truesize, &sk->sk_rmem_alloc); } static void skb_set_err_queue(struct sk_buff *skb) { /* pkt_type of skbs received on local sockets is never PACKET_OUTGOING. * So, it is safe to (mis)use it to mark skbs on the error queue. */ skb->pkt_type = PACKET_OUTGOING; BUILD_BUG_ON(PACKET_OUTGOING == 0); } /* * Note: We dont mem charge error packets (no sk_forward_alloc changes) */ int sock_queue_err_skb(struct sock *sk, struct sk_buff *skb) { if (atomic_read(&sk->sk_rmem_alloc) + skb->truesize >= (unsigned int)READ_ONCE(sk->sk_rcvbuf)) return -ENOMEM; skb_orphan(skb); skb->sk = sk; skb->destructor = sock_rmem_free; atomic_add(skb->truesize, &sk->sk_rmem_alloc); skb_set_err_queue(skb); /* before exiting rcu section, make sure dst is refcounted */ skb_dst_force(skb); skb_queue_tail(&sk->sk_error_queue, skb); if (!sock_flag(sk, SOCK_DEAD)) sk_error_report(sk); return 0; } EXPORT_SYMBOL(sock_queue_err_skb); static bool is_icmp_err_skb(const struct sk_buff *skb) { return skb && (SKB_EXT_ERR(skb)->ee.ee_origin == SO_EE_ORIGIN_ICMP || SKB_EXT_ERR(skb)->ee.ee_origin == SO_EE_ORIGIN_ICMP6); } struct sk_buff *sock_dequeue_err_skb(struct sock *sk) { struct sk_buff_head *q = &sk->sk_error_queue; struct sk_buff *skb, *skb_next = NULL; bool icmp_next = false; unsigned long flags; if (skb_queue_empty_lockless(q)) return NULL; spin_lock_irqsave(&q->lock, flags); skb = __skb_dequeue(q); if (skb && (skb_next = skb_peek(q))) { icmp_next = is_icmp_err_skb(skb_next); if (icmp_next) sk->sk_err = SKB_EXT_ERR(skb_next)->ee.ee_errno; } spin_unlock_irqrestore(&q->lock, flags); if (is_icmp_err_skb(skb) && !icmp_next) sk->sk_err = 0; if (skb_next) sk_error_report(sk); return skb; } EXPORT_SYMBOL(sock_dequeue_err_skb); /** * skb_clone_sk - create clone of skb, and take reference to socket * @skb: the skb to clone * * This function creates a clone of a buffer that holds a reference on * sk_refcnt. Buffers created via this function are meant to be * returned using sock_queue_err_skb, or free via kfree_skb. * * When passing buffers allocated with this function to sock_queue_err_skb * it is necessary to wrap the call with sock_hold/sock_put in order to * prevent the socket from being released prior to being enqueued on * the sk_error_queue. */ struct sk_buff *skb_clone_sk(struct sk_buff *skb) { struct sock *sk = skb->sk; struct sk_buff *clone; if (!sk || !refcount_inc_not_zero(&sk->sk_refcnt)) return NULL; clone = skb_clone(skb, GFP_ATOMIC); if (!clone) { sock_put(sk); return NULL; } clone->sk = sk; clone->destructor = sock_efree; return clone; } EXPORT_SYMBOL(skb_clone_sk); static void __skb_complete_tx_timestamp(struct sk_buff *skb, struct sock *sk, int tstype, bool opt_stats) { struct sock_exterr_skb *serr; int err; BUILD_BUG_ON(sizeof(struct sock_exterr_skb) > sizeof(skb->cb)); serr = SKB_EXT_ERR(skb); memset(serr, 0, sizeof(*serr)); serr->ee.ee_errno = ENOMSG; serr->ee.ee_origin = SO_EE_ORIGIN_TIMESTAMPING; serr->ee.ee_info = tstype; serr->opt_stats = opt_stats; serr->header.h4.iif = skb->dev ? skb->dev->ifindex : 0; if (READ_ONCE(sk->sk_tsflags) & SOF_TIMESTAMPING_OPT_ID) { serr->ee.ee_data = skb_shinfo(skb)->tskey; if (sk_is_tcp(sk)) serr->ee.ee_data -= atomic_read(&sk->sk_tskey); } err = sock_queue_err_skb(sk, skb); if (err) kfree_skb(skb); } static bool skb_may_tx_timestamp(struct sock *sk, bool tsonly) { bool ret; if (likely(tsonly || READ_ONCE(sock_net(sk)->core.sysctl_tstamp_allow_data))) return true; read_lock_bh(&sk->sk_callback_lock); ret = sk->sk_socket && sk->sk_socket->file && file_ns_capable(sk->sk_socket->file, &init_user_ns, CAP_NET_RAW); read_unlock_bh(&sk->sk_callback_lock); return ret; } void skb_complete_tx_timestamp(struct sk_buff *skb, struct skb_shared_hwtstamps *hwtstamps) { struct sock *sk = skb->sk; if (!skb_may_tx_timestamp(sk, false)) goto err; /* Take a reference to prevent skb_orphan() from freeing the socket, * but only if the socket refcount is not zero. */ if (likely(refcount_inc_not_zero(&sk->sk_refcnt))) { *skb_hwtstamps(skb) = *hwtstamps; __skb_complete_tx_timestamp(skb, sk, SCM_TSTAMP_SND, false); sock_put(sk); return; } err: kfree_skb(skb); } EXPORT_SYMBOL_GPL(skb_complete_tx_timestamp); static bool skb_tstamp_tx_report_so_timestamping(struct sk_buff *skb, struct skb_shared_hwtstamps *hwtstamps, int tstype) { switch (tstype) { case SCM_TSTAMP_SCHED: return skb_shinfo(skb)->tx_flags & SKBTX_SCHED_TSTAMP; case SCM_TSTAMP_SND: return skb_shinfo(skb)->tx_flags & (hwtstamps ? SKBTX_HW_TSTAMP_NOBPF : SKBTX_SW_TSTAMP); case SCM_TSTAMP_ACK: return TCP_SKB_CB(skb)->txstamp_ack & TSTAMP_ACK_SK; case SCM_TSTAMP_COMPLETION: return skb_shinfo(skb)->tx_flags & SKBTX_COMPLETION_TSTAMP; } return false; } static void skb_tstamp_tx_report_bpf_timestamping(struct sk_buff *skb, struct skb_shared_hwtstamps *hwtstamps, struct sock *sk, int tstype) { int op; switch (tstype) { case SCM_TSTAMP_SCHED: op = BPF_SOCK_OPS_TSTAMP_SCHED_CB; break; case SCM_TSTAMP_SND: if (hwtstamps) { op = BPF_SOCK_OPS_TSTAMP_SND_HW_CB; *skb_hwtstamps(skb) = *hwtstamps; } else { op = BPF_SOCK_OPS_TSTAMP_SND_SW_CB; } break; case SCM_TSTAMP_ACK: op = BPF_SOCK_OPS_TSTAMP_ACK_CB; break; default: return; } bpf_skops_tx_timestamping(sk, skb, op); } void __skb_tstamp_tx(struct sk_buff *orig_skb, const struct sk_buff *ack_skb, struct skb_shared_hwtstamps *hwtstamps, struct sock *sk, int tstype) { struct sk_buff *skb; bool tsonly, opt_stats = false; u32 tsflags; if (!sk) return; if (skb_shinfo(orig_skb)->tx_flags & SKBTX_BPF) skb_tstamp_tx_report_bpf_timestamping(orig_skb, hwtstamps, sk, tstype); if (!skb_tstamp_tx_report_so_timestamping(orig_skb, hwtstamps, tstype)) return; tsflags = READ_ONCE(sk->sk_tsflags); if (!hwtstamps && !(tsflags & SOF_TIMESTAMPING_OPT_TX_SWHW) && skb_shinfo(orig_skb)->tx_flags & SKBTX_IN_PROGRESS) return; tsonly = tsflags & SOF_TIMESTAMPING_OPT_TSONLY; if (!skb_may_tx_timestamp(sk, tsonly)) return; if (tsonly) { #ifdef CONFIG_INET if ((tsflags & SOF_TIMESTAMPING_OPT_STATS) && sk_is_tcp(sk)) { skb = tcp_get_timestamping_opt_stats(sk, orig_skb, ack_skb); opt_stats = true; } else #endif skb = alloc_skb(0, GFP_ATOMIC); } else { skb = skb_clone(orig_skb, GFP_ATOMIC); if (skb_orphan_frags_rx(skb, GFP_ATOMIC)) { kfree_skb(skb); return; } } if (!skb) return; if (tsonly) { skb_shinfo(skb)->tx_flags |= skb_shinfo(orig_skb)->tx_flags & SKBTX_ANY_TSTAMP; skb_shinfo(skb)->tskey = skb_shinfo(orig_skb)->tskey; } if (hwtstamps) *skb_hwtstamps(skb) = *hwtstamps; else __net_timestamp(skb); __skb_complete_tx_timestamp(skb, sk, tstype, opt_stats); } EXPORT_SYMBOL_GPL(__skb_tstamp_tx); void skb_tstamp_tx(struct sk_buff *orig_skb, struct skb_shared_hwtstamps *hwtstamps) { return __skb_tstamp_tx(orig_skb, NULL, hwtstamps, orig_skb->sk, SCM_TSTAMP_SND); } EXPORT_SYMBOL_GPL(skb_tstamp_tx); #ifdef CONFIG_WIRELESS void skb_complete_wifi_ack(struct sk_buff *skb, bool acked) { struct sock *sk = skb->sk; struct sock_exterr_skb *serr; int err = 1; skb->wifi_acked_valid = 1; skb->wifi_acked = acked; serr = SKB_EXT_ERR(skb); memset(serr, 0, sizeof(*serr)); serr->ee.ee_errno = ENOMSG; serr->ee.ee_origin = SO_EE_ORIGIN_TXSTATUS; /* Take a reference to prevent skb_orphan() from freeing the socket, * but only if the socket refcount is not zero. */ if (likely(refcount_inc_not_zero(&sk->sk_refcnt))) { err = sock_queue_err_skb(sk, skb); sock_put(sk); } if (err) kfree_skb(skb); } EXPORT_SYMBOL_GPL(skb_complete_wifi_ack); #endif /* CONFIG_WIRELESS */ /** * skb_partial_csum_set - set up and verify partial csum values for packet * @skb: the skb to set * @start: the number of bytes after skb->data to start checksumming. * @off: the offset from start to place the checksum. * * For untrusted partially-checksummed packets, we need to make sure the values * for skb->csum_start and skb->csum_offset are valid so we don't oops. * * This function checks and sets those values and skb->ip_summed: if this * returns false you should drop the packet. */ bool skb_partial_csum_set(struct sk_buff *skb, u16 start, u16 off) { u32 csum_end = (u32)start + (u32)off + sizeof(__sum16); u32 csum_start = skb_headroom(skb) + (u32)start; if (unlikely(csum_start >= U16_MAX || csum_end > skb_headlen(skb))) { net_warn_ratelimited("bad partial csum: csum=%u/%u headroom=%u headlen=%u\n", start, off, skb_headroom(skb), skb_headlen(skb)); return false; } skb->ip_summed = CHECKSUM_PARTIAL; skb->csum_start = csum_start; skb->csum_offset = off; skb->transport_header = csum_start; return true; } EXPORT_SYMBOL_GPL(skb_partial_csum_set); static int skb_maybe_pull_tail(struct sk_buff *skb, unsigned int len, unsigned int max) { if (skb_headlen(skb) >= len) return 0; /* If we need to pullup then pullup to the max, so we * won't need to do it again. */ if (max > skb->len) max = skb->len; if (__pskb_pull_tail(skb, max - skb_headlen(skb)) == NULL) return -ENOMEM; if (skb_headlen(skb) < len) return -EPROTO; return 0; } #define MAX_TCP_HDR_LEN (15 * 4) static __sum16 *skb_checksum_setup_ip(struct sk_buff *skb, typeof(IPPROTO_IP) proto, unsigned int off) { int err; switch (proto) { case IPPROTO_TCP: err = skb_maybe_pull_tail(skb, off + sizeof(struct tcphdr), off + MAX_TCP_HDR_LEN); if (!err && !skb_partial_csum_set(skb, off, offsetof(struct tcphdr, check))) err = -EPROTO; return err ? ERR_PTR(err) : &tcp_hdr(skb)->check; case IPPROTO_UDP: err = skb_maybe_pull_tail(skb, off + sizeof(struct udphdr), off + sizeof(struct udphdr)); if (!err && !skb_partial_csum_set(skb, off, offsetof(struct udphdr, check))) err = -EPROTO; return err ? ERR_PTR(err) : &udp_hdr(skb)->check; } return ERR_PTR(-EPROTO); } /* This value should be large enough to cover a tagged ethernet header plus * maximally sized IP and TCP or UDP headers. */ #define MAX_IP_HDR_LEN 128 static int skb_checksum_setup_ipv4(struct sk_buff *skb, bool recalculate) { unsigned int off; bool fragment; __sum16 *csum; int err; fragment = false; err = skb_maybe_pull_tail(skb, sizeof(struct iphdr), MAX_IP_HDR_LEN); if (err < 0) goto out; if (ip_is_fragment(ip_hdr(skb))) fragment = true; off = ip_hdrlen(skb); err = -EPROTO; if (fragment) goto out; csum = skb_checksum_setup_ip(skb, ip_hdr(skb)->protocol, off); if (IS_ERR(csum)) return PTR_ERR(csum); if (recalculate) *csum = ~csum_tcpudp_magic(ip_hdr(skb)->saddr, ip_hdr(skb)->daddr, skb->len - off, ip_hdr(skb)->protocol, 0); err = 0; out: return err; } /* This value should be large enough to cover a tagged ethernet header plus * an IPv6 header, all options, and a maximal TCP or UDP header. */ #define MAX_IPV6_HDR_LEN 256 #define OPT_HDR(type, skb, off) \ (type *)(skb_network_header(skb) + (off)) static int skb_checksum_setup_ipv6(struct sk_buff *skb, bool recalculate) { int err; u8 nexthdr; unsigned int off; unsigned int len; bool fragment; bool done; __sum16 *csum; fragment = false; done = false; off = sizeof(struct ipv6hdr); err = skb_maybe_pull_tail(skb, off, MAX_IPV6_HDR_LEN); if (err < 0) goto out; nexthdr = ipv6_hdr(skb)->nexthdr; len = sizeof(struct ipv6hdr) + ntohs(ipv6_hdr(skb)->payload_len); while (off <= len && !done) { switch (nexthdr) { case IPPROTO_DSTOPTS: case IPPROTO_HOPOPTS: case IPPROTO_ROUTING: { struct ipv6_opt_hdr *hp; err = skb_maybe_pull_tail(skb, off + sizeof(struct ipv6_opt_hdr), MAX_IPV6_HDR_LEN); if (err < 0) goto out; hp = OPT_HDR(struct ipv6_opt_hdr, skb, off); nexthdr = hp->nexthdr; off += ipv6_optlen(hp); break; } case IPPROTO_AH: { struct ip_auth_hdr *hp; err = skb_maybe_pull_tail(skb, off + sizeof(struct ip_auth_hdr), MAX_IPV6_HDR_LEN); if (err < 0) goto out; hp = OPT_HDR(struct ip_auth_hdr, skb, off); nexthdr = hp->nexthdr; off += ipv6_authlen(hp); break; } case IPPROTO_FRAGMENT: { struct frag_hdr *hp; err = skb_maybe_pull_tail(skb, off + sizeof(struct frag_hdr), MAX_IPV6_HDR_LEN); if (err < 0) goto out; hp = OPT_HDR(struct frag_hdr, skb, off); if (hp->frag_off & htons(IP6_OFFSET | IP6_MF)) fragment = true; nexthdr = hp->nexthdr; off += sizeof(struct frag_hdr); break; } default: done = true; break; } } err = -EPROTO; if (!done || fragment) goto out; csum = skb_checksum_setup_ip(skb, nexthdr, off); if (IS_ERR(csum)) return PTR_ERR(csum); if (recalculate) *csum = ~csum_ipv6_magic(&ipv6_hdr(skb)->saddr, &ipv6_hdr(skb)->daddr, skb->len - off, nexthdr, 0); err = 0; out: return err; } /** * skb_checksum_setup - set up partial checksum offset * @skb: the skb to set up * @recalculate: if true the pseudo-header checksum will be recalculated */ int skb_checksum_setup(struct sk_buff *skb, bool recalculate) { int err; switch (skb->protocol) { case htons(ETH_P_IP): err = skb_checksum_setup_ipv4(skb, recalculate); break; case htons(ETH_P_IPV6): err = skb_checksum_setup_ipv6(skb, recalculate); break; default: err = -EPROTO; break; } return err; } EXPORT_SYMBOL(skb_checksum_setup); /** * skb_checksum_maybe_trim - maybe trims the given skb * @skb: the skb to check * @transport_len: the data length beyond the network header * * Checks whether the given skb has data beyond the given transport length. * If so, returns a cloned skb trimmed to this transport length. * Otherwise returns the provided skb. Returns NULL in error cases * (e.g. transport_len exceeds skb length or out-of-memory). * * Caller needs to set the skb transport header and free any returned skb if it * differs from the provided skb. */ static struct sk_buff *skb_checksum_maybe_trim(struct sk_buff *skb, unsigned int transport_len) { struct sk_buff *skb_chk; unsigned int len = skb_transport_offset(skb) + transport_len; int ret; if (skb->len < len) return NULL; else if (skb->len == len) return skb; skb_chk = skb_clone(skb, GFP_ATOMIC); if (!skb_chk) return NULL; ret = pskb_trim_rcsum(skb_chk, len); if (ret) { kfree_skb(skb_chk); return NULL; } return skb_chk; } /** * skb_checksum_trimmed - validate checksum of an skb * @skb: the skb to check * @transport_len: the data length beyond the network header * @skb_chkf: checksum function to use * * Applies the given checksum function skb_chkf to the provided skb. * Returns a checked and maybe trimmed skb. Returns NULL on error. * * If the skb has data beyond the given transport length, then a * trimmed & cloned skb is checked and returned. * * Caller needs to set the skb transport header and free any returned skb if it * differs from the provided skb. */ struct sk_buff *skb_checksum_trimmed(struct sk_buff *skb, unsigned int transport_len, __sum16(*skb_chkf)(struct sk_buff *skb)) { struct sk_buff *skb_chk; unsigned int offset = skb_transport_offset(skb); __sum16 ret; skb_chk = skb_checksum_maybe_trim(skb, transport_len); if (!skb_chk) goto err; if (!pskb_may_pull(skb_chk, offset)) goto err; skb_pull_rcsum(skb_chk, offset); ret = skb_chkf(skb_chk); skb_push_rcsum(skb_chk, offset); if (ret) goto err; return skb_chk; err: if (skb_chk && skb_chk != skb) kfree_skb(skb_chk); return NULL; } EXPORT_SYMBOL(skb_checksum_trimmed); void __skb_warn_lro_forwarding(const struct sk_buff *skb) { net_warn_ratelimited("%s: received packets cannot be forwarded while LRO is enabled\n", skb->dev->name); } EXPORT_SYMBOL(__skb_warn_lro_forwarding); void kfree_skb_partial(struct sk_buff *skb, bool head_stolen) { if (head_stolen) { skb_release_head_state(skb); kmem_cache_free(net_hotdata.skbuff_cache, skb); } else { __kfree_skb(skb); } } EXPORT_SYMBOL(kfree_skb_partial); /** * skb_try_coalesce - try to merge skb to prior one * @to: prior buffer * @from: buffer to add * @fragstolen: pointer to boolean * @delta_truesize: how much more was allocated than was requested */ bool skb_try_coalesce(struct sk_buff *to, struct sk_buff *from, bool *fragstolen, int *delta_truesize) { struct skb_shared_info *to_shinfo, *from_shinfo; int i, delta, len = from->len; *fragstolen = false; if (skb_cloned(to)) return false; /* In general, avoid mixing page_pool and non-page_pool allocated * pages within the same SKB. In theory we could take full * references if @from is cloned and !@to->pp_recycle but its * tricky (due to potential race with the clone disappearing) and * rare, so not worth dealing with. */ if (to->pp_recycle != from->pp_recycle) return false; if (skb_frags_readable(from) != skb_frags_readable(to)) return false; if (len <= skb_tailroom(to) && skb_frags_readable(from)) { if (len) BUG_ON(skb_copy_bits(from, 0, skb_put(to, len), len)); *delta_truesize = 0; return true; } to_shinfo = skb_shinfo(to); from_shinfo = skb_shinfo(from); if (to_shinfo->frag_list || from_shinfo->frag_list) return false; if (skb_zcopy(to) || skb_zcopy(from)) return false; if (skb_headlen(from) != 0) { struct page *page; unsigned int offset; if (to_shinfo->nr_frags + from_shinfo->nr_frags >= MAX_SKB_FRAGS) return false; if (skb_head_is_locked(from)) return false; delta = from->truesize - SKB_DATA_ALIGN(sizeof(struct sk_buff)); page = virt_to_head_page(from->head); offset = from->data - (unsigned char *)page_address(page); skb_fill_page_desc(to, to_shinfo->nr_frags, page, offset, skb_headlen(from)); *fragstolen = true; } else { if (to_shinfo->nr_frags + from_shinfo->nr_frags > MAX_SKB_FRAGS) return false; delta = from->truesize - SKB_TRUESIZE(skb_end_offset(from)); } WARN_ON_ONCE(delta < len); memcpy(to_shinfo->frags + to_shinfo->nr_frags, from_shinfo->frags, from_shinfo->nr_frags * sizeof(skb_frag_t)); to_shinfo->nr_frags += from_shinfo->nr_frags; if (!skb_cloned(from)) from_shinfo->nr_frags = 0; /* if the skb is not cloned this does nothing * since we set nr_frags to 0. */ if (skb_pp_frag_ref(from)) { for (i = 0; i < from_shinfo->nr_frags; i++) __skb_frag_ref(&from_shinfo->frags[i]); } to->truesize += delta; to->len += len; to->data_len += len; *delta_truesize = delta; return true; } EXPORT_SYMBOL(skb_try_coalesce); /** * skb_scrub_packet - scrub an skb * * @skb: buffer to clean * @xnet: packet is crossing netns * * skb_scrub_packet can be used after encapsulating or decapsulating a packet * into/from a tunnel. Some information have to be cleared during these * operations. * skb_scrub_packet can also be used to clean a skb before injecting it in * another namespace (@xnet == true). We have to clear all information in the * skb that could impact namespace isolation. */ void skb_scrub_packet(struct sk_buff *skb, bool xnet) { skb->pkt_type = PACKET_HOST; skb->skb_iif = 0; skb->ignore_df = 0; skb_dst_drop(skb); skb_ext_reset(skb); nf_reset_ct(skb); nf_reset_trace(skb); #ifdef CONFIG_NET_SWITCHDEV skb->offload_fwd_mark = 0; skb->offload_l3_fwd_mark = 0; #endif ipvs_reset(skb); if (!xnet) return; skb->mark = 0; skb_clear_tstamp(skb); } EXPORT_SYMBOL_GPL(skb_scrub_packet); static struct sk_buff *skb_reorder_vlan_header(struct sk_buff *skb) { int mac_len, meta_len; void *meta; if (skb_cow(skb, skb_headroom(skb)) < 0) { kfree_skb(skb); return NULL; } mac_len = skb->data - skb_mac_header(skb); if (likely(mac_len > VLAN_HLEN + ETH_TLEN)) { memmove(skb_mac_header(skb) + VLAN_HLEN, skb_mac_header(skb), mac_len - VLAN_HLEN - ETH_TLEN); } meta_len = skb_metadata_len(skb); if (meta_len) { meta = skb_metadata_end(skb) - meta_len; memmove(meta + VLAN_HLEN, meta, meta_len); } skb->mac_header += VLAN_HLEN; return skb; } struct sk_buff *skb_vlan_untag(struct sk_buff *skb) { struct vlan_hdr *vhdr; u16 vlan_tci; if (unlikely(skb_vlan_tag_present(skb))) { /* vlan_tci is already set-up so leave this for another time */ return skb; } skb = skb_share_check(skb, GFP_ATOMIC); if (unlikely(!skb)) goto err_free; /* We may access the two bytes after vlan_hdr in vlan_set_encap_proto(). */ if (unlikely(!pskb_may_pull(skb, VLAN_HLEN + sizeof(unsigned short)))) goto err_free; vhdr = (struct vlan_hdr *)skb->data; vlan_tci = ntohs(vhdr->h_vlan_TCI); __vlan_hwaccel_put_tag(skb, skb->protocol, vlan_tci); skb_pull_rcsum(skb, VLAN_HLEN); vlan_set_encap_proto(skb, vhdr); skb = skb_reorder_vlan_header(skb); if (unlikely(!skb)) goto err_free; skb_reset_network_header(skb); if (!skb_transport_header_was_set(skb)) skb_reset_transport_header(skb); skb_reset_mac_len(skb); return skb; err_free: kfree_skb(skb); return NULL; } EXPORT_SYMBOL(skb_vlan_untag); int skb_ensure_writable(struct sk_buff *skb, unsigned int write_len) { if (!pskb_may_pull(skb, write_len)) return -ENOMEM; if (!skb_cloned(skb) || skb_clone_writable(skb, write_len)) return 0; return pskb_expand_head(skb, 0, 0, GFP_ATOMIC); } EXPORT_SYMBOL(skb_ensure_writable); int skb_ensure_writable_head_tail(struct sk_buff *skb, struct net_device *dev) { int needed_headroom = dev->needed_headroom; int needed_tailroom = dev->needed_tailroom; /* For tail taggers, we need to pad short frames ourselves, to ensure * that the tail tag does not fail at its role of being at the end of * the packet, once the conduit interface pads the frame. Account for * that pad length here, and pad later. */ if (unlikely(needed_tailroom && skb->len < ETH_ZLEN)) needed_tailroom += ETH_ZLEN - skb->len; /* skb_headroom() returns unsigned int... */ needed_headroom = max_t(int, needed_headroom - skb_headroom(skb), 0); needed_tailroom = max_t(int, needed_tailroom - skb_tailroom(skb), 0); if (likely(!needed_headroom && !needed_tailroom && !skb_cloned(skb))) /* No reallocation needed, yay! */ return 0; return pskb_expand_head(skb, needed_headroom, needed_tailroom, GFP_ATOMIC); } EXPORT_SYMBOL(skb_ensure_writable_head_tail); /* remove VLAN header from packet and update csum accordingly. * expects a non skb_vlan_tag_present skb with a vlan tag payload */ int __skb_vlan_pop(struct sk_buff *skb, u16 *vlan_tci) { int offset = skb->data - skb_mac_header(skb); int err; if (WARN_ONCE(offset, "__skb_vlan_pop got skb with skb->data not at mac header (offset %d)\n", offset)) { return -EINVAL; } err = skb_ensure_writable(skb, VLAN_ETH_HLEN); if (unlikely(err)) return err; skb_postpull_rcsum(skb, skb->data + (2 * ETH_ALEN), VLAN_HLEN); vlan_remove_tag(skb, vlan_tci); skb->mac_header += VLAN_HLEN; if (skb_network_offset(skb) < ETH_HLEN) skb_set_network_header(skb, ETH_HLEN); skb_reset_mac_len(skb); return err; } EXPORT_SYMBOL(__skb_vlan_pop); /* Pop a vlan tag either from hwaccel or from payload. * Expects skb->data at mac header. */ int skb_vlan_pop(struct sk_buff *skb) { u16 vlan_tci; __be16 vlan_proto; int err; if (likely(skb_vlan_tag_present(skb))) { __vlan_hwaccel_clear_tag(skb); } else { if (unlikely(!eth_type_vlan(skb->protocol))) return 0; err = __skb_vlan_pop(skb, &vlan_tci); if (err) return err; } /* move next vlan tag to hw accel tag */ if (likely(!eth_type_vlan(skb->protocol))) return 0; vlan_proto = skb->protocol; err = __skb_vlan_pop(skb, &vlan_tci); if (unlikely(err)) return err; __vlan_hwaccel_put_tag(skb, vlan_proto, vlan_tci); return 0; } EXPORT_SYMBOL(skb_vlan_pop); /* Push a vlan tag either into hwaccel or into payload (if hwaccel tag present). * Expects skb->data at mac header. */ int skb_vlan_push(struct sk_buff *skb, __be16 vlan_proto, u16 vlan_tci) { if (skb_vlan_tag_present(skb)) { int offset = skb->data - skb_mac_header(skb); int err; if (WARN_ONCE(offset, "skb_vlan_push got skb with skb->data not at mac header (offset %d)\n", offset)) { return -EINVAL; } err = __vlan_insert_tag(skb, skb->vlan_proto, skb_vlan_tag_get(skb)); if (err) return err; skb->protocol = skb->vlan_proto; skb->network_header -= VLAN_HLEN; skb_postpush_rcsum(skb, skb->data + (2 * ETH_ALEN), VLAN_HLEN); } __vlan_hwaccel_put_tag(skb, vlan_proto, vlan_tci); return 0; } EXPORT_SYMBOL(skb_vlan_push); /** * skb_eth_pop() - Drop the Ethernet header at the head of a packet * * @skb: Socket buffer to modify * * Drop the Ethernet header of @skb. * * Expects that skb->data points to the mac header and that no VLAN tags are * present. * * Returns 0 on success, -errno otherwise. */ int skb_eth_pop(struct sk_buff *skb) { if (!pskb_may_pull(skb, ETH_HLEN) || skb_vlan_tagged(skb) || skb_network_offset(skb) < ETH_HLEN) return -EPROTO; skb_pull_rcsum(skb, ETH_HLEN); skb_reset_mac_header(skb); skb_reset_mac_len(skb); return 0; } EXPORT_SYMBOL(skb_eth_pop); /** * skb_eth_push() - Add a new Ethernet header at the head of a packet * * @skb: Socket buffer to modify * @dst: Destination MAC address of the new header * @src: Source MAC address of the new header * * Prepend @skb with a new Ethernet header. * * Expects that skb->data points to the mac header, which must be empty. * * Returns 0 on success, -errno otherwise. */ int skb_eth_push(struct sk_buff *skb, const unsigned char *dst, const unsigned char *src) { struct ethhdr *eth; int err; if (skb_network_offset(skb) || skb_vlan_tag_present(skb)) return -EPROTO; err = skb_cow_head(skb, sizeof(*eth)); if (err < 0) return err; skb_push(skb, sizeof(*eth)); skb_reset_mac_header(skb); skb_reset_mac_len(skb); eth = eth_hdr(skb); ether_addr_copy(eth->h_dest, dst); ether_addr_copy(eth->h_source, src); eth->h_proto = skb->protocol; skb_postpush_rcsum(skb, eth, sizeof(*eth)); return 0; } EXPORT_SYMBOL(skb_eth_push); /* Update the ethertype of hdr and the skb csum value if required. */ static void skb_mod_eth_type(struct sk_buff *skb, struct ethhdr *hdr, __be16 ethertype) { if (skb->ip_summed == CHECKSUM_COMPLETE) { __be16 diff[] = { ~hdr->h_proto, ethertype }; skb->csum = csum_partial((char *)diff, sizeof(diff), skb->csum); } hdr->h_proto = ethertype; } /** * skb_mpls_push() - push a new MPLS header after mac_len bytes from start of * the packet * * @skb: buffer * @mpls_lse: MPLS label stack entry to push * @mpls_proto: ethertype of the new MPLS header (expects 0x8847 or 0x8848) * @mac_len: length of the MAC header * @ethernet: flag to indicate if the resulting packet after skb_mpls_push is * ethernet * * Expects skb->data at mac header. * * Returns 0 on success, -errno otherwise. */ int skb_mpls_push(struct sk_buff *skb, __be32 mpls_lse, __be16 mpls_proto, int mac_len, bool ethernet) { struct mpls_shim_hdr *lse; int err; if (unlikely(!eth_p_mpls(mpls_proto))) return -EINVAL; /* Networking stack does not allow simultaneous Tunnel and MPLS GSO. */ if (skb->encapsulation) return -EINVAL; err = skb_cow_head(skb, MPLS_HLEN); if (unlikely(err)) return err; if (!skb->inner_protocol) { skb_set_inner_network_header(skb, skb_network_offset(skb)); skb_set_inner_protocol(skb, skb->protocol); } skb_push(skb, MPLS_HLEN); memmove(skb_mac_header(skb) - MPLS_HLEN, skb_mac_header(skb), mac_len); skb_reset_mac_header(skb); skb_set_network_header(skb, mac_len); skb_reset_mac_len(skb); lse = mpls_hdr(skb); lse->label_stack_entry = mpls_lse; skb_postpush_rcsum(skb, lse, MPLS_HLEN); if (ethernet && mac_len >= ETH_HLEN) skb_mod_eth_type(skb, eth_hdr(skb), mpls_proto); skb->protocol = mpls_proto; return 0; } EXPORT_SYMBOL_GPL(skb_mpls_push); /** * skb_mpls_pop() - pop the outermost MPLS header * * @skb: buffer * @next_proto: ethertype of header after popped MPLS header * @mac_len: length of the MAC header * @ethernet: flag to indicate if the packet is ethernet * * Expects skb->data at mac header. * * Returns 0 on success, -errno otherwise. */ int skb_mpls_pop(struct sk_buff *skb, __be16 next_proto, int mac_len, bool ethernet) { int err; if (unlikely(!eth_p_mpls(skb->protocol))) return 0; err = skb_ensure_writable(skb, mac_len + MPLS_HLEN); if (unlikely(err)) return err; skb_postpull_rcsum(skb, mpls_hdr(skb), MPLS_HLEN); memmove(skb_mac_header(skb) + MPLS_HLEN, skb_mac_header(skb), mac_len); __skb_pull(skb, MPLS_HLEN); skb_reset_mac_header(skb); skb_set_network_header(skb, mac_len); if (ethernet && mac_len >= ETH_HLEN) { struct ethhdr *hdr; /* use mpls_hdr() to get ethertype to account for VLANs. */ hdr = (struct ethhdr *)((void *)mpls_hdr(skb) - ETH_HLEN); skb_mod_eth_type(skb, hdr, next_proto); } skb->protocol = next_proto; return 0; } EXPORT_SYMBOL_GPL(skb_mpls_pop); /** * skb_mpls_update_lse() - modify outermost MPLS header and update csum * * @skb: buffer * @mpls_lse: new MPLS label stack entry to update to * * Expects skb->data at mac header. * * Returns 0 on success, -errno otherwise. */ int skb_mpls_update_lse(struct sk_buff *skb, __be32 mpls_lse) { int err; if (unlikely(!eth_p_mpls(skb->protocol))) return -EINVAL; err = skb_ensure_writable(skb, skb->mac_len + MPLS_HLEN); if (unlikely(err)) return err; if (skb->ip_summed == CHECKSUM_COMPLETE) { __be32 diff[] = { ~mpls_hdr(skb)->label_stack_entry, mpls_lse }; skb->csum = csum_partial((char *)diff, sizeof(diff), skb->csum); } mpls_hdr(skb)->label_stack_entry = mpls_lse; return 0; } EXPORT_SYMBOL_GPL(skb_mpls_update_lse); /** * skb_mpls_dec_ttl() - decrement the TTL of the outermost MPLS header * * @skb: buffer * * Expects skb->data at mac header. * * Returns 0 on success, -errno otherwise. */ int skb_mpls_dec_ttl(struct sk_buff *skb) { u32 lse; u8 ttl; if (unlikely(!eth_p_mpls(skb->protocol))) return -EINVAL; if (!pskb_may_pull(skb, skb_network_offset(skb) + MPLS_HLEN)) return -ENOMEM; lse = be32_to_cpu(mpls_hdr(skb)->label_stack_entry); ttl = (lse & MPLS_LS_TTL_MASK) >> MPLS_LS_TTL_SHIFT; if (!--ttl) return -EINVAL; lse &= ~MPLS_LS_TTL_MASK; lse |= ttl << MPLS_LS_TTL_SHIFT; return skb_mpls_update_lse(skb, cpu_to_be32(lse)); } EXPORT_SYMBOL_GPL(skb_mpls_dec_ttl); /** * alloc_skb_with_frags - allocate skb with page frags * * @header_len: size of linear part * @data_len: needed length in frags * @order: max page order desired. * @errcode: pointer to error code if any * @gfp_mask: allocation mask * * This can be used to allocate a paged skb, given a maximal order for frags. */ struct sk_buff *alloc_skb_with_frags(unsigned long header_len, unsigned long data_len, int order, int *errcode, gfp_t gfp_mask) { unsigned long chunk; struct sk_buff *skb; struct page *page; int nr_frags = 0; *errcode = -EMSGSIZE; if (unlikely(data_len > MAX_SKB_FRAGS * (PAGE_SIZE << order))) return NULL; *errcode = -ENOBUFS; skb = alloc_skb(header_len, gfp_mask); if (!skb) return NULL; while (data_len) { if (nr_frags == MAX_SKB_FRAGS - 1) goto failure; while (order && PAGE_ALIGN(data_len) < (PAGE_SIZE << order)) order--; if (order) { page = alloc_pages((gfp_mask & ~__GFP_DIRECT_RECLAIM) | __GFP_COMP | __GFP_NOWARN, order); if (!page) { order--; continue; } } else { page = alloc_page(gfp_mask); if (!page) goto failure; } chunk = min_t(unsigned long, data_len, PAGE_SIZE << order); skb_fill_page_desc(skb, nr_frags, page, 0, chunk); nr_frags++; skb->truesize += (PAGE_SIZE << order); data_len -= chunk; } return skb; failure: kfree_skb(skb); return NULL; } EXPORT_SYMBOL(alloc_skb_with_frags); /* carve out the first off bytes from skb when off < headlen */ static int pskb_carve_inside_header(struct sk_buff *skb, const u32 off, const int headlen, gfp_t gfp_mask) { int i; unsigned int size = skb_end_offset(skb); int new_hlen = headlen - off; u8 *data; if (skb_pfmemalloc(skb)) gfp_mask |= __GFP_MEMALLOC; data = kmalloc_reserve(&size, gfp_mask, NUMA_NO_NODE, NULL); if (!data) return -ENOMEM; size = SKB_WITH_OVERHEAD(size); /* Copy real data, and all frags */ skb_copy_from_linear_data_offset(skb, off, data, new_hlen); skb->len -= off; memcpy((struct skb_shared_info *)(data + size), skb_shinfo(skb), offsetof(struct skb_shared_info, frags[skb_shinfo(skb)->nr_frags])); if (skb_cloned(skb)) { /* drop the old head gracefully */ if (skb_orphan_frags(skb, gfp_mask)) { skb_kfree_head(data, size); return -ENOMEM; } for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) skb_frag_ref(skb, i); if (skb_has_frag_list(skb)) skb_clone_fraglist(skb); skb_release_data(skb, SKB_CONSUMED); } else { /* we can reuse existing recount- all we did was * relocate values */ skb_free_head(skb); } skb->head = data; skb->data = data; skb->head_frag = 0; skb_set_end_offset(skb, size); skb_set_tail_pointer(skb, skb_headlen(skb)); skb_headers_offset_update(skb, 0); skb->cloned = 0; skb->hdr_len = 0; skb->nohdr = 0; atomic_set(&skb_shinfo(skb)->dataref, 1); return 0; } static int pskb_carve(struct sk_buff *skb, const u32 off, gfp_t gfp); /* carve out the first eat bytes from skb's frag_list. May recurse into * pskb_carve() */ static int pskb_carve_frag_list(struct sk_buff *skb, struct skb_shared_info *shinfo, int eat, gfp_t gfp_mask) { struct sk_buff *list = shinfo->frag_list; struct sk_buff *clone = NULL; struct sk_buff *insp = NULL; do { if (!list) { pr_err("Not enough bytes to eat. Want %d\n", eat); return -EFAULT; } if (list->len <= eat) { /* Eaten as whole. */ eat -= list->len; list = list->next; insp = list; } else { /* Eaten partially. */ if (skb_shared(list)) { clone = skb_clone(list, gfp_mask); if (!clone) return -ENOMEM; insp = list->next; list = clone; } else { /* This may be pulled without problems. */ insp = list; } if (pskb_carve(list, eat, gfp_mask) < 0) { kfree_skb(clone); return -ENOMEM; } break; } } while (eat); /* Free pulled out fragments. */ while ((list = shinfo->frag_list) != insp) { shinfo->frag_list = list->next; consume_skb(list); } /* And insert new clone at head. */ if (clone) { clone->next = list; shinfo->frag_list = clone; } return 0; } /* carve off first len bytes from skb. Split line (off) is in the * non-linear part of skb */ static int pskb_carve_inside_nonlinear(struct sk_buff *skb, const u32 off, int pos, gfp_t gfp_mask) { int i, k = 0; unsigned int size = skb_end_offset(skb); u8 *data; const int nfrags = skb_shinfo(skb)->nr_frags; struct skb_shared_info *shinfo; if (skb_pfmemalloc(skb)) gfp_mask |= __GFP_MEMALLOC; data = kmalloc_reserve(&size, gfp_mask, NUMA_NO_NODE, NULL); if (!data) return -ENOMEM; size = SKB_WITH_OVERHEAD(size); memcpy((struct skb_shared_info *)(data + size), skb_shinfo(skb), offsetof(struct skb_shared_info, frags[0])); if (skb_orphan_frags(skb, gfp_mask)) { skb_kfree_head(data, size); return -ENOMEM; } shinfo = (struct skb_shared_info *)(data + size); for (i = 0; i < nfrags; i++) { int fsize = skb_frag_size(&skb_shinfo(skb)->frags[i]); if (pos + fsize > off) { shinfo->frags[k] = skb_shinfo(skb)->frags[i]; if (pos < off) { /* Split frag. * We have two variants in this case: * 1. Move all the frag to the second * part, if it is possible. F.e. * this approach is mandatory for TUX, * where splitting is expensive. * 2. Split is accurately. We make this. */ skb_frag_off_add(&shinfo->frags[0], off - pos); skb_frag_size_sub(&shinfo->frags[0], off - pos); } skb_frag_ref(skb, i); k++; } pos += fsize; } shinfo->nr_frags = k; if (skb_has_frag_list(skb)) skb_clone_fraglist(skb); /* split line is in frag list */ if (k == 0 && pskb_carve_frag_list(skb, shinfo, off - pos, gfp_mask)) { /* skb_frag_unref() is not needed here as shinfo->nr_frags = 0. */ if (skb_has_frag_list(skb)) kfree_skb_list(skb_shinfo(skb)->frag_list); skb_kfree_head(data, size); return -ENOMEM; } skb_release_data(skb, SKB_CONSUMED); skb->head = data; skb->head_frag = 0; skb->data = data; skb_set_end_offset(skb, size); skb_reset_tail_pointer(skb); skb_headers_offset_update(skb, 0); skb->cloned = 0; skb->hdr_len = 0; skb->nohdr = 0; skb->len -= off; skb->data_len = skb->len; atomic_set(&skb_shinfo(skb)->dataref, 1); return 0; } /* remove len bytes from the beginning of the skb */ static int pskb_carve(struct sk_buff *skb, const u32 len, gfp_t gfp) { int headlen = skb_headlen(skb); if (len < headlen) return pskb_carve_inside_header(skb, len, headlen, gfp); else return pskb_carve_inside_nonlinear(skb, len, headlen, gfp); } /* Extract to_copy bytes starting at off from skb, and return this in * a new skb */ struct sk_buff *pskb_extract(struct sk_buff *skb, int off, int to_copy, gfp_t gfp) { struct sk_buff *clone = skb_clone(skb, gfp); if (!clone) return NULL; if (pskb_carve(clone, off, gfp) < 0 || pskb_trim(clone, to_copy)) { kfree_skb(clone); return NULL; } return clone; } EXPORT_SYMBOL(pskb_extract); /** * skb_condense - try to get rid of fragments/frag_list if possible * @skb: buffer * * Can be used to save memory before skb is added to a busy queue. * If packet has bytes in frags and enough tail room in skb->head, * pull all of them, so that we can free the frags right now and adjust * truesize. * Notes: * We do not reallocate skb->head thus can not fail. * Caller must re-evaluate skb->truesize if needed. */ void skb_condense(struct sk_buff *skb) { if (skb->data_len) { if (skb->data_len > skb->end - skb->tail || skb_cloned(skb) || !skb_frags_readable(skb)) return; /* Nice, we can free page frag(s) right now */ __pskb_pull_tail(skb, skb->data_len); } /* At this point, skb->truesize might be over estimated, * because skb had a fragment, and fragments do not tell * their truesize. * When we pulled its content into skb->head, fragment * was freed, but __pskb_pull_tail() could not possibly * adjust skb->truesize, not knowing the frag truesize. */ skb->truesize = SKB_TRUESIZE(skb_end_offset(skb)); } EXPORT_SYMBOL(skb_condense); #ifdef CONFIG_SKB_EXTENSIONS static void *skb_ext_get_ptr(struct skb_ext *ext, enum skb_ext_id id) { return (void *)ext + (ext->offset[id] * SKB_EXT_ALIGN_VALUE); } /** * __skb_ext_alloc - allocate a new skb extensions storage * * @flags: See kmalloc(). * * Returns the newly allocated pointer. The pointer can later attached to a * skb via __skb_ext_set(). * Note: caller must handle the skb_ext as an opaque data. */ struct skb_ext *__skb_ext_alloc(gfp_t flags) { struct skb_ext *new = kmem_cache_alloc(skbuff_ext_cache, flags); if (new) { memset(new->offset, 0, sizeof(new->offset)); refcount_set(&new->refcnt, 1); } return new; } static struct skb_ext *skb_ext_maybe_cow(struct skb_ext *old, unsigned int old_active) { struct skb_ext *new; if (refcount_read(&old->refcnt) == 1) return old; new = kmem_cache_alloc(skbuff_ext_cache, GFP_ATOMIC); if (!new) return NULL; memcpy(new, old, old->chunks * SKB_EXT_ALIGN_VALUE); refcount_set(&new->refcnt, 1); #ifdef CONFIG_XFRM if (old_active & (1 << SKB_EXT_SEC_PATH)) { struct sec_path *sp = skb_ext_get_ptr(old, SKB_EXT_SEC_PATH); unsigned int i; for (i = 0; i < sp->len; i++) xfrm_state_hold(sp->xvec[i]); } #endif #ifdef CONFIG_MCTP_FLOWS if (old_active & (1 << SKB_EXT_MCTP)) { struct mctp_flow *flow = skb_ext_get_ptr(old, SKB_EXT_MCTP); if (flow->key) refcount_inc(&flow->key->refs); } #endif __skb_ext_put(old); return new; } /** * __skb_ext_set - attach the specified extension storage to this skb * @skb: buffer * @id: extension id * @ext: extension storage previously allocated via __skb_ext_alloc() * * Existing extensions, if any, are cleared. * * Returns the pointer to the extension. */ void *__skb_ext_set(struct sk_buff *skb, enum skb_ext_id id, struct skb_ext *ext) { unsigned int newlen, newoff = SKB_EXT_CHUNKSIZEOF(*ext); skb_ext_put(skb); newlen = newoff + skb_ext_type_len[id]; ext->chunks = newlen; ext->offset[id] = newoff; skb->extensions = ext; skb->active_extensions = 1 << id; return skb_ext_get_ptr(ext, id); } /** * skb_ext_add - allocate space for given extension, COW if needed * @skb: buffer * @id: extension to allocate space for * * Allocates enough space for the given extension. * If the extension is already present, a pointer to that extension * is returned. * * If the skb was cloned, COW applies and the returned memory can be * modified without changing the extension space of clones buffers. * * Returns pointer to the extension or NULL on allocation failure. */ void *skb_ext_add(struct sk_buff *skb, enum skb_ext_id id) { struct skb_ext *new, *old = NULL; unsigned int newlen, newoff; if (skb->active_extensions) { old = skb->extensions; new = skb_ext_maybe_cow(old, skb->active_extensions); if (!new) return NULL; if (__skb_ext_exist(new, id)) goto set_active; newoff = new->chunks; } else { newoff = SKB_EXT_CHUNKSIZEOF(*new); new = __skb_ext_alloc(GFP_ATOMIC); if (!new) return NULL; } newlen = newoff + skb_ext_type_len[id]; new->chunks = newlen; new->offset[id] = newoff; set_active: skb->slow_gro = 1; skb->extensions = new; skb->active_extensions |= 1 << id; return skb_ext_get_ptr(new, id); } EXPORT_SYMBOL(skb_ext_add); #ifdef CONFIG_XFRM static void skb_ext_put_sp(struct sec_path *sp) { unsigned int i; for (i = 0; i < sp->len; i++) xfrm_state_put(sp->xvec[i]); } #endif #ifdef CONFIG_MCTP_FLOWS static void skb_ext_put_mctp(struct mctp_flow *flow) { if (flow->key) mctp_key_unref(flow->key); } #endif void __skb_ext_del(struct sk_buff *skb, enum skb_ext_id id) { struct skb_ext *ext = skb->extensions; skb->active_extensions &= ~(1 << id); if (skb->active_extensions == 0) { skb->extensions = NULL; __skb_ext_put(ext); #ifdef CONFIG_XFRM } else if (id == SKB_EXT_SEC_PATH && refcount_read(&ext->refcnt) == 1) { struct sec_path *sp = skb_ext_get_ptr(ext, SKB_EXT_SEC_PATH); skb_ext_put_sp(sp); sp->len = 0; #endif } } EXPORT_SYMBOL(__skb_ext_del); void __skb_ext_put(struct skb_ext *ext) { /* If this is last clone, nothing can increment * it after check passes. Avoids one atomic op. */ if (refcount_read(&ext->refcnt) == 1) goto free_now; if (!refcount_dec_and_test(&ext->refcnt)) return; free_now: #ifdef CONFIG_XFRM if (__skb_ext_exist(ext, SKB_EXT_SEC_PATH)) skb_ext_put_sp(skb_ext_get_ptr(ext, SKB_EXT_SEC_PATH)); #endif #ifdef CONFIG_MCTP_FLOWS if (__skb_ext_exist(ext, SKB_EXT_MCTP)) skb_ext_put_mctp(skb_ext_get_ptr(ext, SKB_EXT_MCTP)); #endif kmem_cache_free(skbuff_ext_cache, ext); } EXPORT_SYMBOL(__skb_ext_put); #endif /* CONFIG_SKB_EXTENSIONS */ static void kfree_skb_napi_cache(struct sk_buff *skb) { /* if SKB is a clone, don't handle this case */ if (skb->fclone != SKB_FCLONE_UNAVAILABLE) { __kfree_skb(skb); return; } local_bh_disable(); __napi_kfree_skb(skb, SKB_CONSUMED); local_bh_enable(); } /** * skb_attempt_defer_free - queue skb for remote freeing * @skb: buffer * * Put @skb in a per-cpu list, using the cpu which * allocated the skb/pages to reduce false sharing * and memory zone spinlock contention. */ void skb_attempt_defer_free(struct sk_buff *skb) { int cpu = skb->alloc_cpu; struct softnet_data *sd; unsigned int defer_max; bool kick; if (cpu == raw_smp_processor_id() || WARN_ON_ONCE(cpu >= nr_cpu_ids) || !cpu_online(cpu)) { nodefer: kfree_skb_napi_cache(skb); return; } DEBUG_NET_WARN_ON_ONCE(skb_dst(skb)); DEBUG_NET_WARN_ON_ONCE(skb->destructor); sd = &per_cpu(softnet_data, cpu); defer_max = READ_ONCE(net_hotdata.sysctl_skb_defer_max); if (READ_ONCE(sd->defer_count) >= defer_max) goto nodefer; spin_lock_bh(&sd->defer_lock); /* Send an IPI every time queue reaches half capacity. */ kick = sd->defer_count == (defer_max >> 1); /* Paired with the READ_ONCE() few lines above */ WRITE_ONCE(sd->defer_count, sd->defer_count + 1); skb->next = sd->defer_list; /* Paired with READ_ONCE() in skb_defer_free_flush() */ WRITE_ONCE(sd->defer_list, skb); spin_unlock_bh(&sd->defer_lock); /* Make sure to trigger NET_RX_SOFTIRQ on the remote CPU * if we are unlucky enough (this seems very unlikely). */ if (unlikely(kick)) kick_defer_list_purge(sd, cpu); } static void skb_splice_csum_page(struct sk_buff *skb, struct page *page, size_t offset, size_t len) { const char *kaddr; __wsum csum; kaddr = kmap_local_page(page); csum = csum_partial(kaddr + offset, len, 0); kunmap_local(kaddr); skb->csum = csum_block_add(skb->csum, csum, skb->len); } /** * skb_splice_from_iter - Splice (or copy) pages to skbuff * @skb: The buffer to add pages to * @iter: Iterator representing the pages to be added * @maxsize: Maximum amount of pages to be added * @gfp: Allocation flags * * This is a common helper function for supporting MSG_SPLICE_PAGES. It * extracts pages from an iterator and adds them to the socket buffer if * possible, copying them to fragments if not possible (such as if they're slab * pages). * * Returns the amount of data spliced/copied or -EMSGSIZE if there's * insufficient space in the buffer to transfer anything. */ ssize_t skb_splice_from_iter(struct sk_buff *skb, struct iov_iter *iter, ssize_t maxsize, gfp_t gfp) { size_t frag_limit = READ_ONCE(net_hotdata.sysctl_max_skb_frags); struct page *pages[8], **ppages = pages; ssize_t spliced = 0, ret = 0; unsigned int i; while (iter->count > 0) { ssize_t space, nr, len; size_t off; ret = -EMSGSIZE; space = frag_limit - skb_shinfo(skb)->nr_frags; if (space < 0) break; /* We might be able to coalesce without increasing nr_frags */ nr = clamp_t(size_t, space, 1, ARRAY_SIZE(pages)); len = iov_iter_extract_pages(iter, &ppages, maxsize, nr, 0, &off); if (len <= 0) { ret = len ?: -EIO; break; } i = 0; do { struct page *page = pages[i++]; size_t part = min_t(size_t, PAGE_SIZE - off, len); ret = -EIO; if (WARN_ON_ONCE(!sendpage_ok(page))) goto out; ret = skb_append_pagefrags(skb, page, off, part, frag_limit); if (ret < 0) { iov_iter_revert(iter, len); goto out; } if (skb->ip_summed == CHECKSUM_NONE) skb_splice_csum_page(skb, page, off, part); off = 0; spliced += part; maxsize -= part; len -= part; } while (len > 0); if (maxsize <= 0) break; } out: skb_len_add(skb, spliced); return spliced ?: ret; } EXPORT_SYMBOL(skb_splice_from_iter); static __always_inline size_t memcpy_from_iter_csum(void *iter_from, size_t progress, size_t len, void *to, void *priv2) { __wsum *csum = priv2; __wsum next = csum_partial_copy_nocheck(iter_from, to + progress, len); *csum = csum_block_add(*csum, next, progress); return 0; } static __always_inline size_t copy_from_user_iter_csum(void __user *iter_from, size_t progress, size_t len, void *to, void *priv2) { __wsum next, *csum = priv2; next = csum_and_copy_from_user(iter_from, to + progress, len); *csum = csum_block_add(*csum, next, progress); return next ? 0 : len; } bool csum_and_copy_from_iter_full(void *addr, size_t bytes, __wsum *csum, struct iov_iter *i) { size_t copied; if (WARN_ON_ONCE(!i->data_source)) return false; copied = iterate_and_advance2(i, bytes, addr, csum, copy_from_user_iter_csum, memcpy_from_iter_csum); if (likely(copied == bytes)) return true; iov_iter_revert(i, copied); return false; } EXPORT_SYMBOL(csum_and_copy_from_iter_full); void get_netmem(netmem_ref netmem) { struct net_iov *niov; if (netmem_is_net_iov(netmem)) { niov = netmem_to_net_iov(netmem); if (net_is_devmem_iov(niov)) net_devmem_get_net_iov(netmem_to_net_iov(netmem)); return; } get_page(netmem_to_page(netmem)); } EXPORT_SYMBOL(get_netmem); void put_netmem(netmem_ref netmem) { struct net_iov *niov; if (netmem_is_net_iov(netmem)) { niov = netmem_to_net_iov(netmem); if (net_is_devmem_iov(niov)) net_devmem_put_net_iov(netmem_to_net_iov(netmem)); return; } put_page(netmem_to_page(netmem)); } EXPORT_SYMBOL(put_netmem);
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 /* SPDX-License-Identifier: GPL-2.0 */ /* * Copyright (C) 2020 ARM Ltd. */ #ifndef __ASM_MTE_H #define __ASM_MTE_H #include <asm/compiler.h> #include <asm/mte-def.h> #ifndef __ASSEMBLY__ #include <linux/bitfield.h> #include <linux/kasan-enabled.h> #include <linux/page-flags.h> #include <linux/sched.h> #include <linux/types.h> #include <asm/pgtable-types.h> void mte_clear_page_tags(void *addr); unsigned long mte_copy_tags_from_user(void *to, const void __user *from, unsigned long n); unsigned long mte_copy_tags_to_user(void __user *to, void *from, unsigned long n); int mte_save_tags(struct page *page); void mte_save_page_tags(const void *page_addr, void *tag_storage); void mte_restore_tags(swp_entry_t entry, struct page *page); void mte_restore_page_tags(void *page_addr, const void *tag_storage); void mte_invalidate_tags(int type, pgoff_t offset); void mte_invalidate_tags_area(int type); void *mte_allocate_tag_storage(void); void mte_free_tag_storage(char *storage); #ifdef CONFIG_ARM64_MTE /* track which pages have valid allocation tags */ #define PG_mte_tagged PG_arch_2 /* simple lock to avoid multiple threads tagging the same page */ #define PG_mte_lock PG_arch_3 static inline void set_page_mte_tagged(struct page *page) { VM_WARN_ON_ONCE(folio_test_hugetlb(page_folio(page))); /* * Ensure that the tags written prior to this function are visible * before the page flags update. */ smp_wmb(); set_bit(PG_mte_tagged, &page->flags); } static inline bool page_mte_tagged(struct page *page) { bool ret = test_bit(PG_mte_tagged, &page->flags); VM_WARN_ON_ONCE(folio_test_hugetlb(page_folio(page))); /* * If the page is tagged, ensure ordering with a likely subsequent * read of the tags. */ if (ret) smp_rmb(); return ret; } /* * Lock the page for tagging and return 'true' if the page can be tagged, * 'false' if already tagged. PG_mte_tagged is never cleared and therefore the * locking only happens once for page initialisation. * * The page MTE lock state: * * Locked: PG_mte_lock && !PG_mte_tagged * Unlocked: !PG_mte_lock || PG_mte_tagged * * Acquire semantics only if the page is tagged (returning 'false'). */ static inline bool try_page_mte_tagging(struct page *page) { VM_WARN_ON_ONCE(folio_test_hugetlb(page_folio(page))); if (!test_and_set_bit(PG_mte_lock, &page->flags)) return true; /* * The tags are either being initialised or may have been initialised * already. Check if the PG_mte_tagged flag has been set or wait * otherwise. */ smp_cond_load_acquire(&page->flags, VAL & (1UL << PG_mte_tagged)); return false; } void mte_zero_clear_page_tags(void *addr); void mte_sync_tags(pte_t pte, unsigned int nr_pages); void mte_copy_page_tags(void *kto, const void *kfrom); void mte_thread_init_user(void); void mte_thread_switch(struct task_struct *next); void mte_cpu_setup(void); void mte_suspend_enter(void); void mte_suspend_exit(void); long set_mte_ctrl(struct task_struct *task, unsigned long arg); long get_mte_ctrl(struct task_struct *task); int mte_ptrace_copy_tags(struct task_struct *child, long request, unsigned long addr, unsigned long data); size_t mte_probe_user_range(const char __user *uaddr, size_t size); #else /* CONFIG_ARM64_MTE */ /* unused if !CONFIG_ARM64_MTE, silence the compiler */ #define PG_mte_tagged 0 static inline void set_page_mte_tagged(struct page *page) { } static inline bool page_mte_tagged(struct page *page) { return false; } static inline bool try_page_mte_tagging(struct page *page) { return false; } static inline void mte_zero_clear_page_tags(void *addr) { } static inline void mte_sync_tags(pte_t pte, unsigned int nr_pages) { } static inline void mte_copy_page_tags(void *kto, const void *kfrom) { } static inline void mte_thread_init_user(void) { } static inline void mte_thread_switch(struct task_struct *next) { } static inline void mte_suspend_enter(void) { } static inline void mte_suspend_exit(void) { } static inline long set_mte_ctrl(struct task_struct *task, unsigned long arg) { return 0; } static inline long get_mte_ctrl(struct task_struct *task) { return 0; } static inline int mte_ptrace_copy_tags(struct task_struct *child, long request, unsigned long addr, unsigned long data) { return -EIO; } #endif /* CONFIG_ARM64_MTE */ #if defined(CONFIG_HUGETLB_PAGE) && defined(CONFIG_ARM64_MTE) static inline void folio_set_hugetlb_mte_tagged(struct folio *folio) { VM_WARN_ON_ONCE(!folio_test_hugetlb(folio)); /* * Ensure that the tags written prior to this function are visible * before the folio flags update. */ smp_wmb(); set_bit(PG_mte_tagged, &folio->flags); } static inline bool folio_test_hugetlb_mte_tagged(struct folio *folio) { bool ret = test_bit(PG_mte_tagged, &folio->flags); VM_WARN_ON_ONCE(!folio_test_hugetlb(folio)); /* * If the folio is tagged, ensure ordering with a likely subsequent * read of the tags. */ if (ret) smp_rmb(); return ret; } static inline bool folio_try_hugetlb_mte_tagging(struct folio *folio) { VM_WARN_ON_ONCE(!folio_test_hugetlb(folio)); if (!test_and_set_bit(PG_mte_lock, &folio->flags)) return true; /* * The tags are either being initialised or may have been initialised * already. Check if the PG_mte_tagged flag has been set or wait * otherwise. */ smp_cond_load_acquire(&folio->flags, VAL & (1UL << PG_mte_tagged)); return false; } #else static inline void folio_set_hugetlb_mte_tagged(struct folio *folio) { } static inline bool folio_test_hugetlb_mte_tagged(struct folio *folio) { return false; } static inline bool folio_try_hugetlb_mte_tagging(struct folio *folio) { return false; } #endif static inline void mte_disable_tco_entry(struct task_struct *task) { if (!system_supports_mte()) return; /* * Re-enable tag checking (TCO set on exception entry). This is only * necessary if MTE is enabled in either the kernel or the userspace * task in synchronous or asymmetric mode (SCTLR_EL1.TCF0 bit 0 is set * for both). With MTE disabled in the kernel and disabled or * asynchronous in userspace, tag check faults (including in uaccesses) * are not reported, therefore there is no need to re-enable checking. * This is beneficial on microarchitectures where re-enabling TCO is * expensive. */ if (kasan_hw_tags_enabled() || (task->thread.sctlr_user & (1UL << SCTLR_EL1_TCF0_SHIFT))) asm volatile(SET_PSTATE_TCO(0)); } #ifdef CONFIG_KASAN_HW_TAGS void mte_check_tfsr_el1(void); static inline void mte_check_tfsr_entry(void) { if (!kasan_hw_tags_enabled()) return; mte_check_tfsr_el1(); } static inline void mte_check_tfsr_exit(void) { if (!kasan_hw_tags_enabled()) return; /* * The asynchronous faults are sync'ed automatically with * TFSR_EL1 on kernel entry but for exit an explicit dsb() * is required. */ dsb(nsh); isb(); mte_check_tfsr_el1(); } #else static inline void mte_check_tfsr_el1(void) { } static inline void mte_check_tfsr_entry(void) { } static inline void mte_check_tfsr_exit(void) { } #endif /* CONFIG_KASAN_HW_TAGS */ #endif /* __ASSEMBLY__ */ #endif /* __ASM_MTE_H */
1 5 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 /* SPDX-License-Identifier: GPL-2.0 */ /* * workqueue.h --- work queue handling for Linux. */ #ifndef _LINUX_WORKQUEUE_H #define _LINUX_WORKQUEUE_H #include <linux/timer.h> #include <linux/linkage.h> #include <linux/bitops.h> #include <linux/lockdep.h> #include <linux/threads.h> #include <linux/atomic.h> #include <linux/cpumask_types.h> #include <linux/rcupdate.h> #include <linux/workqueue_types.h> /* * The first word is the work queue pointer and the flags rolled into * one */ #define work_data_bits(work) ((unsigned long *)(&(work)->data)) enum work_bits { WORK_STRUCT_PENDING_BIT = 0, /* work item is pending execution */ WORK_STRUCT_INACTIVE_BIT, /* work item is inactive */ WORK_STRUCT_PWQ_BIT, /* data points to pwq */ WORK_STRUCT_LINKED_BIT, /* next work is linked to this one */ #ifdef CONFIG_DEBUG_OBJECTS_WORK WORK_STRUCT_STATIC_BIT, /* static initializer (debugobjects) */ #endif WORK_STRUCT_FLAG_BITS, /* color for workqueue flushing */ WORK_STRUCT_COLOR_SHIFT = WORK_STRUCT_FLAG_BITS, WORK_STRUCT_COLOR_BITS = 4, /* * When WORK_STRUCT_PWQ is set, reserve 8 bits off of pwq pointer w/ * debugobjects turned off. This makes pwqs aligned to 256 bytes (512 * bytes w/ DEBUG_OBJECTS_WORK) and allows 16 workqueue flush colors. * * MSB * [ pwq pointer ] [ flush color ] [ STRUCT flags ] * 4 bits 4 or 5 bits */ WORK_STRUCT_PWQ_SHIFT = WORK_STRUCT_COLOR_SHIFT + WORK_STRUCT_COLOR_BITS, /* * data contains off-queue information when !WORK_STRUCT_PWQ. * * MSB * [ pool ID ] [ disable depth ] [ OFFQ flags ] [ STRUCT flags ] * 16 bits 1 bit 4 or 5 bits */ WORK_OFFQ_FLAG_SHIFT = WORK_STRUCT_FLAG_BITS, WORK_OFFQ_BH_BIT = WORK_OFFQ_FLAG_SHIFT, WORK_OFFQ_FLAG_END, WORK_OFFQ_FLAG_BITS = WORK_OFFQ_FLAG_END - WORK_OFFQ_FLAG_SHIFT, WORK_OFFQ_DISABLE_SHIFT = WORK_OFFQ_FLAG_SHIFT + WORK_OFFQ_FLAG_BITS, WORK_OFFQ_DISABLE_BITS = 16, /* * When a work item is off queue, the high bits encode off-queue flags * and the last pool it was on. Cap pool ID to 31 bits and use the * highest number to indicate that no pool is associated. */ WORK_OFFQ_POOL_SHIFT = WORK_OFFQ_DISABLE_SHIFT + WORK_OFFQ_DISABLE_BITS, WORK_OFFQ_LEFT = BITS_PER_LONG - WORK_OFFQ_POOL_SHIFT, WORK_OFFQ_POOL_BITS = WORK_OFFQ_LEFT <= 31 ? WORK_OFFQ_LEFT : 31, }; enum work_flags { WORK_STRUCT_PENDING = 1 << WORK_STRUCT_PENDING_BIT, WORK_STRUCT_INACTIVE = 1 << WORK_STRUCT_INACTIVE_BIT, WORK_STRUCT_PWQ = 1 << WORK_STRUCT_PWQ_BIT, WORK_STRUCT_LINKED = 1 << WORK_STRUCT_LINKED_BIT, #ifdef CONFIG_DEBUG_OBJECTS_WORK WORK_STRUCT_STATIC = 1 << WORK_STRUCT_STATIC_BIT, #else WORK_STRUCT_STATIC = 0, #endif }; enum wq_misc_consts { WORK_NR_COLORS = (1 << WORK_STRUCT_COLOR_BITS), /* not bound to any CPU, prefer the local CPU */ WORK_CPU_UNBOUND = NR_CPUS, /* bit mask for work_busy() return values */ WORK_BUSY_PENDING = 1 << 0, WORK_BUSY_RUNNING = 1 << 1, /* maximum string length for set_worker_desc() */ WORKER_DESC_LEN = 32, }; /* Convenience constants - of type 'unsigned long', not 'enum'! */ #define WORK_OFFQ_BH (1ul << WORK_OFFQ_BH_BIT) #define WORK_OFFQ_FLAG_MASK (((1ul << WORK_OFFQ_FLAG_BITS) - 1) << WORK_OFFQ_FLAG_SHIFT) #define WORK_OFFQ_DISABLE_MASK (((1ul << WORK_OFFQ_DISABLE_BITS) - 1) << WORK_OFFQ_DISABLE_SHIFT) #define WORK_OFFQ_POOL_NONE ((1ul << WORK_OFFQ_POOL_BITS) - 1) #define WORK_STRUCT_NO_POOL (WORK_OFFQ_POOL_NONE << WORK_OFFQ_POOL_SHIFT) #define WORK_STRUCT_PWQ_MASK (~((1ul << WORK_STRUCT_PWQ_SHIFT) - 1)) #define WORK_DATA_INIT() ATOMIC_LONG_INIT((unsigned long)WORK_STRUCT_NO_POOL) #define WORK_DATA_STATIC_INIT() \ ATOMIC_LONG_INIT((unsigned long)(WORK_STRUCT_NO_POOL | WORK_STRUCT_STATIC)) struct delayed_work { struct work_struct work; struct timer_list timer; /* target workqueue and CPU ->timer uses to queue ->work */ struct workqueue_struct *wq; int cpu; }; struct rcu_work { struct work_struct work; struct rcu_head rcu; /* target workqueue ->rcu uses to queue ->work */ struct workqueue_struct *wq; }; enum wq_affn_scope { WQ_AFFN_DFL, /* use system default */ WQ_AFFN_CPU, /* one pod per CPU */ WQ_AFFN_SMT, /* one pod poer SMT */ WQ_AFFN_CACHE, /* one pod per LLC */ WQ_AFFN_NUMA, /* one pod per NUMA node */ WQ_AFFN_SYSTEM, /* one pod across the whole system */ WQ_AFFN_NR_TYPES, }; /** * struct workqueue_attrs - A struct for workqueue attributes. * * This can be used to change attributes of an unbound workqueue. */ struct workqueue_attrs { /** * @nice: nice level */ int nice; /** * @cpumask: allowed CPUs * * Work items in this workqueue are affine to these CPUs and not allowed * to execute on other CPUs. A pool serving a workqueue must have the * same @cpumask. */ cpumask_var_t cpumask; /** * @__pod_cpumask: internal attribute used to create per-pod pools * * Internal use only. * * Per-pod unbound worker pools are used to improve locality. Always a * subset of ->cpumask. A workqueue can be associated with multiple * worker pools with disjoint @__pod_cpumask's. Whether the enforcement * of a pool's @__pod_cpumask is strict depends on @affn_strict. */ cpumask_var_t __pod_cpumask; /** * @affn_strict: affinity scope is strict * * If clear, workqueue will make a best-effort attempt at starting the * worker inside @__pod_cpumask but the scheduler is free to migrate it * outside. * * If set, workers are only allowed to run inside @__pod_cpumask. */ bool affn_strict; /* * Below fields aren't properties of a worker_pool. They only modify how * :c:func:`apply_workqueue_attrs` select pools and thus don't * participate in pool hash calculations or equality comparisons. * * If @affn_strict is set, @cpumask isn't a property of a worker_pool * either. */ /** * @affn_scope: unbound CPU affinity scope * * CPU pods are used to improve execution locality of unbound work * items. There are multiple pod types, one for each wq_affn_scope, and * every CPU in the system belongs to one pod in every pod type. CPUs * that belong to the same pod share the worker pool. For example, * selecting %WQ_AFFN_NUMA makes the workqueue use a separate worker * pool for each NUMA node. */ enum wq_affn_scope affn_scope; /** * @ordered: work items must be executed one by one in queueing order */ bool ordered; }; static inline struct delayed_work *to_delayed_work(struct work_struct *work) { return container_of(work, struct delayed_work, work); } static inline struct rcu_work *to_rcu_work(struct work_struct *work) { return container_of(work, struct rcu_work, work); } struct execute_work { struct work_struct work; }; #ifdef CONFIG_LOCKDEP /* * NB: because we have to copy the lockdep_map, setting _key * here is required, otherwise it could get initialised to the * copy of the lockdep_map! */ #define __WORK_INIT_LOCKDEP_MAP(n, k) \ .lockdep_map = STATIC_LOCKDEP_MAP_INIT(n, k), #else #define __WORK_INIT_LOCKDEP_MAP(n, k) #endif #define __WORK_INITIALIZER(n, f) { \ .data = WORK_DATA_STATIC_INIT(), \ .entry = { &(n).entry, &(n).entry }, \ .func = (f), \ __WORK_INIT_LOCKDEP_MAP(#n, &(n)) \ } #define __DELAYED_WORK_INITIALIZER(n, f, tflags) { \ .work = __WORK_INITIALIZER((n).work, (f)), \ .timer = __TIMER_INITIALIZER(delayed_work_timer_fn,\ (tflags) | TIMER_IRQSAFE), \ } #define DECLARE_WORK(n, f) \ struct work_struct n = __WORK_INITIALIZER(n, f) #define DECLARE_DELAYED_WORK(n, f) \ struct delayed_work n = __DELAYED_WORK_INITIALIZER(n, f, 0) #define DECLARE_DEFERRABLE_WORK(n, f) \ struct delayed_work n = __DELAYED_WORK_INITIALIZER(n, f, TIMER_DEFERRABLE) #ifdef CONFIG_DEBUG_OBJECTS_WORK extern void __init_work(struct work_struct *work, int onstack); extern void destroy_work_on_stack(struct work_struct *work); extern void destroy_delayed_work_on_stack(struct delayed_work *work); static inline unsigned int work_static(struct work_struct *work) { return *work_data_bits(work) & WORK_STRUCT_STATIC; } #else static inline void __init_work(struct work_struct *work, int onstack) { } static inline void destroy_work_on_stack(struct work_struct *work) { } static inline void destroy_delayed_work_on_stack(struct delayed_work *work) { } static inline unsigned int work_static(struct work_struct *work) { return 0; } #endif /* * initialize all of a work item in one go * * NOTE! No point in using "atomic_long_set()": using a direct * assignment of the work data initializer allows the compiler * to generate better code. */ #ifdef CONFIG_LOCKDEP #define __INIT_WORK_KEY(_work, _func, _onstack, _key) \ do { \ __init_work((_work), _onstack); \ (_work)->data = (atomic_long_t) WORK_DATA_INIT(); \ lockdep_init_map(&(_work)->lockdep_map, "(work_completion)"#_work, (_key), 0); \ INIT_LIST_HEAD(&(_work)->entry); \ (_work)->func = (_func); \ } while (0) #else #define __INIT_WORK_KEY(_work, _func, _onstack, _key) \ do { \ __init_work((_work), _onstack); \ (_work)->data = (atomic_long_t) WORK_DATA_INIT(); \ INIT_LIST_HEAD(&(_work)->entry); \ (_work)->func = (_func); \ } while (0) #endif #define __INIT_WORK(_work, _func, _onstack) \ do { \ static __maybe_unused struct lock_class_key __key; \ \ __INIT_WORK_KEY(_work, _func, _onstack, &__key); \ } while (0) #define INIT_WORK(_work, _func) \ __INIT_WORK((_work), (_func), 0) #define INIT_WORK_ONSTACK(_work, _func) \ __INIT_WORK((_work), (_func), 1) #define INIT_WORK_ONSTACK_KEY(_work, _func, _key) \ __INIT_WORK_KEY((_work), (_func), 1, _key) #define __INIT_DELAYED_WORK(_work, _func, _tflags) \ do { \ INIT_WORK(&(_work)->work, (_func)); \ __timer_init(&(_work)->timer, \ delayed_work_timer_fn, \ (_tflags) | TIMER_IRQSAFE); \ } while (0) #define __INIT_DELAYED_WORK_ONSTACK(_work, _func, _tflags) \ do { \ INIT_WORK_ONSTACK(&(_work)->work, (_func)); \ __timer_init_on_stack(&(_work)->timer, \ delayed_work_timer_fn, \ (_tflags) | TIMER_IRQSAFE); \ } while (0) #define INIT_DELAYED_WORK(_work, _func) \ __INIT_DELAYED_WORK(_work, _func, 0) #define INIT_DELAYED_WORK_ONSTACK(_work, _func) \ __INIT_DELAYED_WORK_ONSTACK(_work, _func, 0) #define INIT_DEFERRABLE_WORK(_work, _func) \ __INIT_DELAYED_WORK(_work, _func, TIMER_DEFERRABLE) #define INIT_DEFERRABLE_WORK_ONSTACK(_work, _func) \ __INIT_DELAYED_WORK_ONSTACK(_work, _func, TIMER_DEFERRABLE) #define INIT_RCU_WORK(_work, _func) \ INIT_WORK(&(_work)->work, (_func)) #define INIT_RCU_WORK_ONSTACK(_work, _func) \ INIT_WORK_ONSTACK(&(_work)->work, (_func)) /** * work_pending - Find out whether a work item is currently pending * @work: The work item in question */ #define work_pending(work) \ test_bit(WORK_STRUCT_PENDING_BIT, work_data_bits(work)) /** * delayed_work_pending - Find out whether a delayable work item is currently * pending * @w: The work item in question */ #define delayed_work_pending(w) \ work_pending(&(w)->work) /* * Workqueue flags and constants. For details, please refer to * Documentation/core-api/workqueue.rst. */ enum wq_flags { WQ_BH = 1 << 0, /* execute in bottom half (softirq) context */ WQ_UNBOUND = 1 << 1, /* not bound to any cpu */ WQ_FREEZABLE = 1 << 2, /* freeze during suspend */ WQ_MEM_RECLAIM = 1 << 3, /* may be used for memory reclaim */ WQ_HIGHPRI = 1 << 4, /* high priority */ WQ_CPU_INTENSIVE = 1 << 5, /* cpu intensive workqueue */ WQ_SYSFS = 1 << 6, /* visible in sysfs, see workqueue_sysfs_register() */ /* * Per-cpu workqueues are generally preferred because they tend to * show better performance thanks to cache locality. Per-cpu * workqueues exclude the scheduler from choosing the CPU to * execute the worker threads, which has an unfortunate side effect * of increasing power consumption. * * The scheduler considers a CPU idle if it doesn't have any task * to execute and tries to keep idle cores idle to conserve power; * however, for example, a per-cpu work item scheduled from an * interrupt handler on an idle CPU will force the scheduler to * execute the work item on that CPU breaking the idleness, which in * turn may lead to more scheduling choices which are sub-optimal * in terms of power consumption. * * Workqueues marked with WQ_POWER_EFFICIENT are per-cpu by default * but become unbound if workqueue.power_efficient kernel param is * specified. Per-cpu workqueues which are identified to * contribute significantly to power-consumption are identified and * marked with this flag and enabling the power_efficient mode * leads to noticeable power saving at the cost of small * performance disadvantage. * * http://thread.gmane.org/gmane.linux.kernel/1480396 */ WQ_POWER_EFFICIENT = 1 << 7, __WQ_DESTROYING = 1 << 15, /* internal: workqueue is destroying */ __WQ_DRAINING = 1 << 16, /* internal: workqueue is draining */ __WQ_ORDERED = 1 << 17, /* internal: workqueue is ordered */ __WQ_LEGACY = 1 << 18, /* internal: create*_workqueue() */ /* BH wq only allows the following flags */ __WQ_BH_ALLOWS = WQ_BH | WQ_HIGHPRI, }; enum wq_consts { WQ_MAX_ACTIVE = 2048, /* I like 2048, better ideas? */ WQ_UNBOUND_MAX_ACTIVE = WQ_MAX_ACTIVE, WQ_DFL_ACTIVE = WQ_MAX_ACTIVE / 2, /* * Per-node default cap on min_active. Unless explicitly set, min_active * is set to min(max_active, WQ_DFL_MIN_ACTIVE). For more details, see * workqueue_struct->min_active definition. */ WQ_DFL_MIN_ACTIVE = 8, }; /* * System-wide workqueues which are always present. * * system_wq is the one used by schedule[_delayed]_work[_on](). * Multi-CPU multi-threaded. There are users which expect relatively * short queue flush time. Don't queue works which can run for too * long. * * system_highpri_wq is similar to system_wq but for work items which * require WQ_HIGHPRI. * * system_long_wq is similar to system_wq but may host long running * works. Queue flushing might take relatively long. * * system_unbound_wq is unbound workqueue. Workers are not bound to * any specific CPU, not concurrency managed, and all queued works are * executed immediately as long as max_active limit is not reached and * resources are available. * * system_freezable_wq is equivalent to system_wq except that it's * freezable. * * *_power_efficient_wq are inclined towards saving power and converted * into WQ_UNBOUND variants if 'wq_power_efficient' is enabled; otherwise, * they are same as their non-power-efficient counterparts - e.g. * system_power_efficient_wq is identical to system_wq if * 'wq_power_efficient' is disabled. See WQ_POWER_EFFICIENT for more info. * * system_bh[_highpri]_wq are convenience interface to softirq. BH work items * are executed in the queueing CPU's BH context in the queueing order. */ extern struct workqueue_struct *system_wq; extern struct workqueue_struct *system_highpri_wq; extern struct workqueue_struct *system_long_wq; extern struct workqueue_struct *system_unbound_wq; extern struct workqueue_struct *system_freezable_wq; extern struct workqueue_struct *system_power_efficient_wq; extern struct workqueue_struct *system_freezable_power_efficient_wq; extern struct workqueue_struct *system_bh_wq; extern struct workqueue_struct *system_bh_highpri_wq; void workqueue_softirq_action(bool highpri); void workqueue_softirq_dead(unsigned int cpu); /** * alloc_workqueue - allocate a workqueue * @fmt: printf format for the name of the workqueue * @flags: WQ_* flags * @max_active: max in-flight work items, 0 for default * @...: args for @fmt * * For a per-cpu workqueue, @max_active limits the number of in-flight work * items for each CPU. e.g. @max_active of 1 indicates that each CPU can be * executing at most one work item for the workqueue. * * For unbound workqueues, @max_active limits the number of in-flight work items * for the whole system. e.g. @max_active of 16 indicates that there can be * at most 16 work items executing for the workqueue in the whole system. * * As sharing the same active counter for an unbound workqueue across multiple * NUMA nodes can be expensive, @max_active is distributed to each NUMA node * according to the proportion of the number of online CPUs and enforced * independently. * * Depending on online CPU distribution, a node may end up with per-node * max_active which is significantly lower than @max_active, which can lead to * deadlocks if the per-node concurrency limit is lower than the maximum number * of interdependent work items for the workqueue. * * To guarantee forward progress regardless of online CPU distribution, the * concurrency limit on every node is guaranteed to be equal to or greater than * min_active which is set to min(@max_active, %WQ_DFL_MIN_ACTIVE). This means * that the sum of per-node max_active's may be larger than @max_active. * * For detailed information on %WQ_* flags, please refer to * Documentation/core-api/workqueue.rst. * * RETURNS: * Pointer to the allocated workqueue on success, %NULL on failure. */ __printf(1, 4) struct workqueue_struct * alloc_workqueue(const char *fmt, unsigned int flags, int max_active, ...); #ifdef CONFIG_LOCKDEP /** * alloc_workqueue_lockdep_map - allocate a workqueue with user-defined lockdep_map * @fmt: printf format for the name of the workqueue * @flags: WQ_* flags * @max_active: max in-flight work items, 0 for default * @lockdep_map: user-defined lockdep_map * @...: args for @fmt * * Same as alloc_workqueue but with the a user-define lockdep_map. Useful for * workqueues created with the same purpose and to avoid leaking a lockdep_map * on each workqueue creation. * * RETURNS: * Pointer to the allocated workqueue on success, %NULL on failure. */ __printf(1, 5) struct workqueue_struct * alloc_workqueue_lockdep_map(const char *fmt, unsigned int flags, int max_active, struct lockdep_map *lockdep_map, ...); /** * alloc_ordered_workqueue_lockdep_map - allocate an ordered workqueue with * user-defined lockdep_map * * @fmt: printf format for the name of the workqueue * @flags: WQ_* flags (only WQ_FREEZABLE and WQ_MEM_RECLAIM are meaningful) * @lockdep_map: user-defined lockdep_map * @args: args for @fmt * * Same as alloc_ordered_workqueue but with the a user-define lockdep_map. * Useful for workqueues created with the same purpose and to avoid leaking a * lockdep_map on each workqueue creation. * * RETURNS: * Pointer to the allocated workqueue on success, %NULL on failure. */ #define alloc_ordered_workqueue_lockdep_map(fmt, flags, lockdep_map, args...) \ alloc_workqueue_lockdep_map(fmt, WQ_UNBOUND | __WQ_ORDERED | (flags), \ 1, lockdep_map, ##args) #endif /** * alloc_ordered_workqueue - allocate an ordered workqueue * @fmt: printf format for the name of the workqueue * @flags: WQ_* flags (only WQ_FREEZABLE and WQ_MEM_RECLAIM are meaningful) * @args: args for @fmt * * Allocate an ordered workqueue. An ordered workqueue executes at * most one work item at any given time in the queued order. They are * implemented as unbound workqueues with @max_active of one. * * RETURNS: * Pointer to the allocated workqueue on success, %NULL on failure. */ #define alloc_ordered_workqueue(fmt, flags, args...) \ alloc_workqueue(fmt, WQ_UNBOUND | __WQ_ORDERED | (flags), 1, ##args) #define create_workqueue(name) \ alloc_workqueue("%s", __WQ_LEGACY | WQ_MEM_RECLAIM, 1, (name)) #define create_freezable_workqueue(name) \ alloc_workqueue("%s", __WQ_LEGACY | WQ_FREEZABLE | WQ_UNBOUND | \ WQ_MEM_RECLAIM, 1, (name)) #define create_singlethread_workqueue(name) \ alloc_ordered_workqueue("%s", __WQ_LEGACY | WQ_MEM_RECLAIM, name) #define from_work(var, callback_work, work_fieldname) \ container_of(callback_work, typeof(*var), work_fieldname) extern void destroy_workqueue(struct workqueue_struct *wq); struct workqueue_attrs *alloc_workqueue_attrs(void); void free_workqueue_attrs(struct workqueue_attrs *attrs); int apply_workqueue_attrs(struct workqueue_struct *wq, const struct workqueue_attrs *attrs); extern int workqueue_unbound_exclude_cpumask(cpumask_var_t cpumask); extern bool queue_work_on(int cpu, struct workqueue_struct *wq, struct work_struct *work); extern bool queue_work_node(int node, struct workqueue_struct *wq, struct work_struct *work); extern bool queue_delayed_work_on(int cpu, struct workqueue_struct *wq, struct delayed_work *work, unsigned long delay); extern bool mod_delayed_work_on(int cpu, struct workqueue_struct *wq, struct delayed_work *dwork, unsigned long delay); extern bool queue_rcu_work(struct workqueue_struct *wq, struct rcu_work *rwork); extern void __flush_workqueue(struct workqueue_struct *wq); extern void drain_workqueue(struct workqueue_struct *wq); extern int schedule_on_each_cpu(work_func_t func); int execute_in_process_context(work_func_t fn, struct execute_work *); extern bool flush_work(struct work_struct *work); extern bool cancel_work(struct work_struct *work); extern bool cancel_work_sync(struct work_struct *work); extern bool flush_delayed_work(struct delayed_work *dwork); extern bool cancel_delayed_work(struct delayed_work *dwork); extern bool cancel_delayed_work_sync(struct delayed_work *dwork); extern bool disable_work(struct work_struct *work); extern bool disable_work_sync(struct work_struct *work); extern bool enable_work(struct work_struct *work); extern bool disable_delayed_work(struct delayed_work *dwork); extern bool disable_delayed_work_sync(struct delayed_work *dwork); extern bool enable_delayed_work(struct delayed_work *dwork); extern bool flush_rcu_work(struct rcu_work *rwork); extern void workqueue_set_max_active(struct workqueue_struct *wq, int max_active); extern void workqueue_set_min_active(struct workqueue_struct *wq, int min_active); extern struct work_struct *current_work(void); extern bool current_is_workqueue_rescuer(void); extern bool workqueue_congested(int cpu, struct workqueue_struct *wq); extern unsigned int work_busy(struct work_struct *work); extern __printf(1, 2) void set_worker_desc(const char *fmt, ...); extern void print_worker_info(const char *log_lvl, struct task_struct *task); extern void show_all_workqueues(void); extern void show_freezable_workqueues(void); extern void show_one_workqueue(struct workqueue_struct *wq); extern void wq_worker_comm(char *buf, size_t size, struct task_struct *task); /** * queue_work - queue work on a workqueue * @wq: workqueue to use * @work: work to queue * * Returns %false if @work was already on a queue, %true otherwise. * * We queue the work to the CPU on which it was submitted, but if the CPU dies * it can be processed by another CPU. * * Memory-ordering properties: If it returns %true, guarantees that all stores * preceding the call to queue_work() in the program order will be visible from * the CPU which will execute @work by the time such work executes, e.g., * * { x is initially 0 } * * CPU0 CPU1 * * WRITE_ONCE(x, 1); [ @work is being executed ] * r0 = queue_work(wq, work); r1 = READ_ONCE(x); * * Forbids: r0 == true && r1 == 0 */ static inline bool queue_work(struct workqueue_struct *wq, struct work_struct *work) { return queue_work_on(WORK_CPU_UNBOUND, wq, work); } /** * queue_delayed_work - queue work on a workqueue after delay * @wq: workqueue to use * @dwork: delayable work to queue * @delay: number of jiffies to wait before queueing * * Equivalent to queue_delayed_work_on() but tries to use the local CPU. */ static inline bool queue_delayed_work(struct workqueue_struct *wq, struct delayed_work *dwork, unsigned long delay) { return queue_delayed_work_on(WORK_CPU_UNBOUND, wq, dwork, delay); } /** * mod_delayed_work - modify delay of or queue a delayed work * @wq: workqueue to use * @dwork: work to queue * @delay: number of jiffies to wait before queueing * * mod_delayed_work_on() on local CPU. */ static inline bool mod_delayed_work(struct workqueue_struct *wq, struct delayed_work *dwork, unsigned long delay) { return mod_delayed_work_on(WORK_CPU_UNBOUND, wq, dwork, delay); } /** * schedule_work_on - put work task on a specific cpu * @cpu: cpu to put the work task on * @work: job to be done * * This puts a job on a specific cpu */ static inline bool schedule_work_on(int cpu, struct work_struct *work) { return queue_work_on(cpu, system_wq, work); } /** * schedule_work - put work task in global workqueue * @work: job to be done * * Returns %false if @work was already on the kernel-global workqueue and * %true otherwise. * * This puts a job in the kernel-global workqueue if it was not already * queued and leaves it in the same position on the kernel-global * workqueue otherwise. * * Shares the same memory-ordering properties of queue_work(), cf. the * DocBook header of queue_work(). */ static inline bool schedule_work(struct work_struct *work) { return queue_work(system_wq, work); } /** * enable_and_queue_work - Enable and queue a work item on a specific workqueue * @wq: The target workqueue * @work: The work item to be enabled and queued * * This function combines the operations of enable_work() and queue_work(), * providing a convenient way to enable and queue a work item in a single call. * It invokes enable_work() on @work and then queues it if the disable depth * reached 0. Returns %true if the disable depth reached 0 and @work is queued, * and %false otherwise. * * Note that @work is always queued when disable depth reaches zero. If the * desired behavior is queueing only if certain events took place while @work is * disabled, the user should implement the necessary state tracking and perform * explicit conditional queueing after enable_work(). */ static inline bool enable_and_queue_work(struct workqueue_struct *wq, struct work_struct *work) { if (enable_work(work)) { queue_work(wq, work); return true; } return false; } /* * Detect attempt to flush system-wide workqueues at compile time when possible. * Warn attempt to flush system-wide workqueues at runtime. * * See https://lkml.kernel.org/r/49925af7-78a8-a3dd-bce6-cfc02e1a9236@I-love.SAKURA.ne.jp * for reasons and steps for converting system-wide workqueues into local workqueues. */ extern void __warn_flushing_systemwide_wq(void) __compiletime_warning("Please avoid flushing system-wide workqueues."); /* Please stop using this function, for this function will be removed in near future. */ #define flush_scheduled_work() \ ({ \ __warn_flushing_systemwide_wq(); \ __flush_workqueue(system_wq); \ }) #define flush_workqueue(wq) \ ({ \ struct workqueue_struct *_wq = (wq); \ \ if ((__builtin_constant_p(_wq == system_wq) && \ _wq == system_wq) || \ (__builtin_constant_p(_wq == system_highpri_wq) && \ _wq == system_highpri_wq) || \ (__builtin_constant_p(_wq == system_long_wq) && \ _wq == system_long_wq) || \ (__builtin_constant_p(_wq == system_unbound_wq) && \ _wq == system_unbound_wq) || \ (__builtin_constant_p(_wq == system_freezable_wq) && \ _wq == system_freezable_wq) || \ (__builtin_constant_p(_wq == system_power_efficient_wq) && \ _wq == system_power_efficient_wq) || \ (__builtin_constant_p(_wq == system_freezable_power_efficient_wq) && \ _wq == system_freezable_power_efficient_wq)) \ __warn_flushing_systemwide_wq(); \ __flush_workqueue(_wq); \ }) /** * schedule_delayed_work_on - queue work in global workqueue on CPU after delay * @cpu: cpu to use * @dwork: job to be done * @delay: number of jiffies to wait * * After waiting for a given time this puts a job in the kernel-global * workqueue on the specified CPU. */ static inline bool schedule_delayed_work_on(int cpu, struct delayed_work *dwork, unsigned long delay) { return queue_delayed_work_on(cpu, system_wq, dwork, delay); } /** * schedule_delayed_work - put work task in global workqueue after delay * @dwork: job to be done * @delay: number of jiffies to wait or 0 for immediate execution * * After waiting for a given time this puts a job in the kernel-global * workqueue. */ static inline bool schedule_delayed_work(struct delayed_work *dwork, unsigned long delay) { return queue_delayed_work(system_wq, dwork, delay); } #ifndef CONFIG_SMP static inline long work_on_cpu(int cpu, long (*fn)(void *), void *arg) { return fn(arg); } static inline long work_on_cpu_safe(int cpu, long (*fn)(void *), void *arg) { return fn(arg); } #else long work_on_cpu_key(int cpu, long (*fn)(void *), void *arg, struct lock_class_key *key); /* * A new key is defined for each caller to make sure the work * associated with the function doesn't share its locking class. */ #define work_on_cpu(_cpu, _fn, _arg) \ ({ \ static struct lock_class_key __key; \ \ work_on_cpu_key(_cpu, _fn, _arg, &__key); \ }) long work_on_cpu_safe_key(int cpu, long (*fn)(void *), void *arg, struct lock_class_key *key); /* * A new key is defined for each caller to make sure the work * associated with the function doesn't share its locking class. */ #define work_on_cpu_safe(_cpu, _fn, _arg) \ ({ \ static struct lock_class_key __key; \ \ work_on_cpu_safe_key(_cpu, _fn, _arg, &__key); \ }) #endif /* CONFIG_SMP */ #ifdef CONFIG_FREEZER extern void freeze_workqueues_begin(void); extern bool freeze_workqueues_busy(void); extern void thaw_workqueues(void); #endif /* CONFIG_FREEZER */ #ifdef CONFIG_SYSFS int workqueue_sysfs_register(struct workqueue_struct *wq); #else /* CONFIG_SYSFS */ static inline int workqueue_sysfs_register(struct workqueue_struct *wq) { return 0; } #endif /* CONFIG_SYSFS */ #ifdef CONFIG_WQ_WATCHDOG void wq_watchdog_touch(int cpu); #else /* CONFIG_WQ_WATCHDOG */ static inline void wq_watchdog_touch(int cpu) { } #endif /* CONFIG_WQ_WATCHDOG */ #ifdef CONFIG_SMP int workqueue_prepare_cpu(unsigned int cpu); int workqueue_online_cpu(unsigned int cpu); int workqueue_offline_cpu(unsigned int cpu); #endif void __init workqueue_init_early(void); void __init workqueue_init(void); void __init workqueue_init_topology(void); #endif
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 /* SPDX-License-Identifier: GPL-2.0 */ #ifndef __LINUX_GFP_H #define __LINUX_GFP_H #include <linux/gfp_types.h> #include <linux/mmzone.h> #include <linux/topology.h> #include <linux/alloc_tag.h> #include <linux/sched.h> struct vm_area_struct; struct mempolicy; /* Convert GFP flags to their corresponding migrate type */ #define GFP_MOVABLE_MASK (__GFP_RECLAIMABLE|__GFP_MOVABLE) #define GFP_MOVABLE_SHIFT 3 static inline int gfp_migratetype(const gfp_t gfp_flags) { VM_WARN_ON((gfp_flags & GFP_MOVABLE_MASK) == GFP_MOVABLE_MASK); BUILD_BUG_ON((1UL << GFP_MOVABLE_SHIFT) != ___GFP_MOVABLE); BUILD_BUG_ON((___GFP_MOVABLE >> GFP_MOVABLE_SHIFT) != MIGRATE_MOVABLE); BUILD_BUG_ON((___GFP_RECLAIMABLE >> GFP_MOVABLE_SHIFT) != MIGRATE_RECLAIMABLE); BUILD_BUG_ON(((___GFP_MOVABLE | ___GFP_RECLAIMABLE) >> GFP_MOVABLE_SHIFT) != MIGRATE_HIGHATOMIC); if (unlikely(page_group_by_mobility_disabled)) return MIGRATE_UNMOVABLE; /* Group based on mobility */ return (__force unsigned long)(gfp_flags & GFP_MOVABLE_MASK) >> GFP_MOVABLE_SHIFT; } #undef GFP_MOVABLE_MASK #undef GFP_MOVABLE_SHIFT static inline bool gfpflags_allow_blocking(const gfp_t gfp_flags) { return !!(gfp_flags & __GFP_DIRECT_RECLAIM); } static inline bool gfpflags_allow_spinning(const gfp_t gfp_flags) { /* * !__GFP_DIRECT_RECLAIM -> direct claim is not allowed. * !__GFP_KSWAPD_RECLAIM -> it's not safe to wake up kswapd. * All GFP_* flags including GFP_NOWAIT use one or both flags. * alloc_pages_nolock() is the only API that doesn't specify either flag. * * This is stronger than GFP_NOWAIT or GFP_ATOMIC because * those are guaranteed to never block on a sleeping lock. * Here we are enforcing that the allocation doesn't ever spin * on any locks (i.e. only trylocks). There is no high level * GFP_$FOO flag for this use in alloc_pages_nolock() as the * regular page allocator doesn't fully support this * allocation mode. */ return !!(gfp_flags & __GFP_RECLAIM); } #ifdef CONFIG_HIGHMEM #define OPT_ZONE_HIGHMEM ZONE_HIGHMEM #else #define OPT_ZONE_HIGHMEM ZONE_NORMAL #endif #ifdef CONFIG_ZONE_DMA #define OPT_ZONE_DMA ZONE_DMA #else #define OPT_ZONE_DMA ZONE_NORMAL #endif #ifdef CONFIG_ZONE_DMA32 #define OPT_ZONE_DMA32 ZONE_DMA32 #else #define OPT_ZONE_DMA32 ZONE_NORMAL #endif /* * GFP_ZONE_TABLE is a word size bitstring that is used for looking up the * zone to use given the lowest 4 bits of gfp_t. Entries are GFP_ZONES_SHIFT * bits long and there are 16 of them to cover all possible combinations of * __GFP_DMA, __GFP_DMA32, __GFP_MOVABLE and __GFP_HIGHMEM. * * The zone fallback order is MOVABLE=>HIGHMEM=>NORMAL=>DMA32=>DMA. * But GFP_MOVABLE is not only a zone specifier but also an allocation * policy. Therefore __GFP_MOVABLE plus another zone selector is valid. * Only 1 bit of the lowest 3 bits (DMA,DMA32,HIGHMEM) can be set to "1". * * bit result * ================= * 0x0 => NORMAL * 0x1 => DMA or NORMAL * 0x2 => HIGHMEM or NORMAL * 0x3 => BAD (DMA+HIGHMEM) * 0x4 => DMA32 or NORMAL * 0x5 => BAD (DMA+DMA32) * 0x6 => BAD (HIGHMEM+DMA32) * 0x7 => BAD (HIGHMEM+DMA32+DMA) * 0x8 => NORMAL (MOVABLE+0) * 0x9 => DMA or NORMAL (MOVABLE+DMA) * 0xa => MOVABLE (Movable is valid only if HIGHMEM is set too) * 0xb => BAD (MOVABLE+HIGHMEM+DMA) * 0xc => DMA32 or NORMAL (MOVABLE+DMA32) * 0xd => BAD (MOVABLE+DMA32+DMA) * 0xe => BAD (MOVABLE+DMA32+HIGHMEM) * 0xf => BAD (MOVABLE+DMA32+HIGHMEM+DMA) * * GFP_ZONES_SHIFT must be <= 2 on 32 bit platforms. */ #if defined(CONFIG_ZONE_DEVICE) && (MAX_NR_ZONES-1) <= 4 /* ZONE_DEVICE is not a valid GFP zone specifier */ #define GFP_ZONES_SHIFT 2 #else #define GFP_ZONES_SHIFT ZONES_SHIFT #endif #if 16 * GFP_ZONES_SHIFT > BITS_PER_LONG #error GFP_ZONES_SHIFT too large to create GFP_ZONE_TABLE integer #endif #define GFP_ZONE_TABLE ( \ (ZONE_NORMAL << 0 * GFP_ZONES_SHIFT) \ | (OPT_ZONE_DMA << ___GFP_DMA * GFP_ZONES_SHIFT) \ | (OPT_ZONE_HIGHMEM << ___GFP_HIGHMEM * GFP_ZONES_SHIFT) \ | (OPT_ZONE_DMA32 << ___GFP_DMA32 * GFP_ZONES_SHIFT) \ | (ZONE_NORMAL << ___GFP_MOVABLE * GFP_ZONES_SHIFT) \ | (OPT_ZONE_DMA << (___GFP_MOVABLE | ___GFP_DMA) * GFP_ZONES_SHIFT) \ | (ZONE_MOVABLE << (___GFP_MOVABLE | ___GFP_HIGHMEM) * GFP_ZONES_SHIFT)\ | (OPT_ZONE_DMA32 << (___GFP_MOVABLE | ___GFP_DMA32) * GFP_ZONES_SHIFT)\ ) /* * GFP_ZONE_BAD is a bitmap for all combinations of __GFP_DMA, __GFP_DMA32 * __GFP_HIGHMEM and __GFP_MOVABLE that are not permitted. One flag per * entry starting with bit 0. Bit is set if the combination is not * allowed. */ #define GFP_ZONE_BAD ( \ 1 << (___GFP_DMA | ___GFP_HIGHMEM) \ | 1 << (___GFP_DMA | ___GFP_DMA32) \ | 1 << (___GFP_DMA32 | ___GFP_HIGHMEM) \ | 1 << (___GFP_DMA | ___GFP_DMA32 | ___GFP_HIGHMEM) \ | 1 << (___GFP_MOVABLE | ___GFP_HIGHMEM | ___GFP_DMA) \ | 1 << (___GFP_MOVABLE | ___GFP_DMA32 | ___GFP_DMA) \ | 1 << (___GFP_MOVABLE | ___GFP_DMA32 | ___GFP_HIGHMEM) \ | 1 << (___GFP_MOVABLE | ___GFP_DMA32 | ___GFP_DMA | ___GFP_HIGHMEM) \ ) static inline enum zone_type gfp_zone(gfp_t flags) { enum zone_type z; int bit = (__force int) (flags & GFP_ZONEMASK); z = (GFP_ZONE_TABLE >> (bit * GFP_ZONES_SHIFT)) & ((1 << GFP_ZONES_SHIFT) - 1); VM_BUG_ON((GFP_ZONE_BAD >> bit) & 1); return z; } /* * There is only one page-allocator function, and two main namespaces to * it. The alloc_page*() variants return 'struct page *' and as such * can allocate highmem pages, the *get*page*() variants return * virtual kernel addresses to the allocated page(s). */ static inline int gfp_zonelist(gfp_t flags) { #ifdef CONFIG_NUMA if (unlikely(flags & __GFP_THISNODE)) return ZONELIST_NOFALLBACK; #endif return ZONELIST_FALLBACK; } /* * gfp flag masking for nested internal allocations. * * For code that needs to do allocations inside the public allocation API (e.g. * memory allocation tracking code) the allocations need to obey the caller * allocation context constrains to prevent allocation context mismatches (e.g. * GFP_KERNEL allocations in GFP_NOFS contexts) from potential deadlock * situations. * * It is also assumed that these nested allocations are for internal kernel * object storage purposes only and are not going to be used for DMA, etc. Hence * we strip out all the zone information and leave just the context information * intact. * * Further, internal allocations must fail before the higher level allocation * can fail, so we must make them fail faster and fail silently. We also don't * want them to deplete emergency reserves. Hence nested allocations must be * prepared for these allocations to fail. */ static inline gfp_t gfp_nested_mask(gfp_t flags) { return ((flags & (GFP_KERNEL | GFP_ATOMIC | __GFP_NOLOCKDEP)) | (__GFP_NORETRY | __GFP_NOMEMALLOC | __GFP_NOWARN)); } /* * We get the zone list from the current node and the gfp_mask. * This zone list contains a maximum of MAX_NUMNODES*MAX_NR_ZONES zones. * There are two zonelists per node, one for all zones with memory and * one containing just zones from the node the zonelist belongs to. * * For the case of non-NUMA systems the NODE_DATA() gets optimized to * &contig_page_data at compile-time. */ static inline struct zonelist *node_zonelist(int nid, gfp_t flags) { return NODE_DATA(nid)->node_zonelists + gfp_zonelist(flags); } #ifndef HAVE_ARCH_FREE_PAGE static inline void arch_free_page(struct page *page, int order) { } #endif #ifndef HAVE_ARCH_ALLOC_PAGE static inline void arch_alloc_page(struct page *page, int order) { } #endif struct page *__alloc_pages_noprof(gfp_t gfp, unsigned int order, int preferred_nid, nodemask_t *nodemask); #define __alloc_pages(...) alloc_hooks(__alloc_pages_noprof(__VA_ARGS__)) struct folio *__folio_alloc_noprof(gfp_t gfp, unsigned int order, int preferred_nid, nodemask_t *nodemask); #define __folio_alloc(...) alloc_hooks(__folio_alloc_noprof(__VA_ARGS__)) unsigned long alloc_pages_bulk_noprof(gfp_t gfp, int preferred_nid, nodemask_t *nodemask, int nr_pages, struct page **page_array); #define __alloc_pages_bulk(...) alloc_hooks(alloc_pages_bulk_noprof(__VA_ARGS__)) unsigned long alloc_pages_bulk_mempolicy_noprof(gfp_t gfp, unsigned long nr_pages, struct page **page_array); #define alloc_pages_bulk_mempolicy(...) \ alloc_hooks(alloc_pages_bulk_mempolicy_noprof(__VA_ARGS__)) /* Bulk allocate order-0 pages */ #define alloc_pages_bulk(_gfp, _nr_pages, _page_array) \ __alloc_pages_bulk(_gfp, numa_mem_id(), NULL, _nr_pages, _page_array) static inline unsigned long alloc_pages_bulk_node_noprof(gfp_t gfp, int nid, unsigned long nr_pages, struct page **page_array) { if (nid == NUMA_NO_NODE) nid = numa_mem_id(); return alloc_pages_bulk_noprof(gfp, nid, NULL, nr_pages, page_array); } #define alloc_pages_bulk_node(...) \ alloc_hooks(alloc_pages_bulk_node_noprof(__VA_ARGS__)) static inline void warn_if_node_offline(int this_node, gfp_t gfp_mask) { gfp_t warn_gfp = gfp_mask & (__GFP_THISNODE|__GFP_NOWARN); if (warn_gfp != (__GFP_THISNODE|__GFP_NOWARN)) return; if (node_online(this_node)) return; pr_warn("%pGg allocation from offline node %d\n", &gfp_mask, this_node); dump_stack(); } /* * Allocate pages, preferring the node given as nid. The node must be valid and * online. For more general interface, see alloc_pages_node(). */ static inline struct page * __alloc_pages_node_noprof(int nid, gfp_t gfp_mask, unsigned int order) { VM_BUG_ON(nid < 0 || nid >= MAX_NUMNODES); warn_if_node_offline(nid, gfp_mask); return __alloc_pages_noprof(gfp_mask, order, nid, NULL); } #define __alloc_pages_node(...) alloc_hooks(__alloc_pages_node_noprof(__VA_ARGS__)) static inline struct folio *__folio_alloc_node_noprof(gfp_t gfp, unsigned int order, int nid) { VM_BUG_ON(nid < 0 || nid >= MAX_NUMNODES); warn_if_node_offline(nid, gfp); return __folio_alloc_noprof(gfp, order, nid, NULL); } #define __folio_alloc_node(...) alloc_hooks(__folio_alloc_node_noprof(__VA_ARGS__)) /* * Allocate pages, preferring the node given as nid. When nid == NUMA_NO_NODE, * prefer the current CPU's closest node. Otherwise node must be valid and * online. */ static inline struct page *alloc_pages_node_noprof(int nid, gfp_t gfp_mask, unsigned int order) { if (nid == NUMA_NO_NODE) nid = numa_mem_id(); return __alloc_pages_node_noprof(nid, gfp_mask, order); } #define alloc_pages_node(...) alloc_hooks(alloc_pages_node_noprof(__VA_ARGS__)) #ifdef CONFIG_NUMA struct page *alloc_pages_noprof(gfp_t gfp, unsigned int order); struct folio *folio_alloc_noprof(gfp_t gfp, unsigned int order); struct folio *folio_alloc_mpol_noprof(gfp_t gfp, unsigned int order, struct mempolicy *mpol, pgoff_t ilx, int nid); struct folio *vma_alloc_folio_noprof(gfp_t gfp, int order, struct vm_area_struct *vma, unsigned long addr); #else static inline struct page *alloc_pages_noprof(gfp_t gfp_mask, unsigned int order) { return alloc_pages_node_noprof(numa_node_id(), gfp_mask, order); } static inline struct folio *folio_alloc_noprof(gfp_t gfp, unsigned int order) { return __folio_alloc_node_noprof(gfp, order, numa_node_id()); } static inline struct folio *folio_alloc_mpol_noprof(gfp_t gfp, unsigned int order, struct mempolicy *mpol, pgoff_t ilx, int nid) { return folio_alloc_noprof(gfp, order); } #define vma_alloc_folio_noprof(gfp, order, vma, addr) \ folio_alloc_noprof(gfp, order) #endif #define alloc_pages(...) alloc_hooks(alloc_pages_noprof(__VA_ARGS__)) #define folio_alloc(...) alloc_hooks(folio_alloc_noprof(__VA_ARGS__)) #define folio_alloc_mpol(...) alloc_hooks(folio_alloc_mpol_noprof(__VA_ARGS__)) #define vma_alloc_folio(...) alloc_hooks(vma_alloc_folio_noprof(__VA_ARGS__)) #define alloc_page(gfp_mask) alloc_pages(gfp_mask, 0) static inline struct page *alloc_page_vma_noprof(gfp_t gfp, struct vm_area_struct *vma, unsigned long addr) { struct folio *folio = vma_alloc_folio_noprof(gfp, 0, vma, addr); return &folio->page; } #define alloc_page_vma(...) alloc_hooks(alloc_page_vma_noprof(__VA_ARGS__)) struct page *alloc_pages_nolock_noprof(int nid, unsigned int order); #define alloc_pages_nolock(...) alloc_hooks(alloc_pages_nolock_noprof(__VA_ARGS__)) extern unsigned long get_free_pages_noprof(gfp_t gfp_mask, unsigned int order); #define __get_free_pages(...) alloc_hooks(get_free_pages_noprof(__VA_ARGS__)) extern unsigned long get_zeroed_page_noprof(gfp_t gfp_mask); #define get_zeroed_page(...) alloc_hooks(get_zeroed_page_noprof(__VA_ARGS__)) void *alloc_pages_exact_noprof(size_t size, gfp_t gfp_mask) __alloc_size(1); #define alloc_pages_exact(...) alloc_hooks(alloc_pages_exact_noprof(__VA_ARGS__)) void free_pages_exact(void *virt, size_t size); __meminit void *alloc_pages_exact_nid_noprof(int nid, size_t size, gfp_t gfp_mask) __alloc_size(2); #define alloc_pages_exact_nid(...) \ alloc_hooks(alloc_pages_exact_nid_noprof(__VA_ARGS__)) #define __get_free_page(gfp_mask) \ __get_free_pages((gfp_mask), 0) #define __get_dma_pages(gfp_mask, order) \ __get_free_pages((gfp_mask) | GFP_DMA, (order)) extern void __free_pages(struct page *page, unsigned int order); extern void free_pages_nolock(struct page *page, unsigned int order); extern void free_pages(unsigned long addr, unsigned int order); #define __free_page(page) __free_pages((page), 0) #define free_page(addr) free_pages((addr), 0) void page_alloc_init_cpuhp(void); int decay_pcp_high(struct zone *zone, struct per_cpu_pages *pcp); void drain_zone_pages(struct zone *zone, struct per_cpu_pages *pcp); void drain_all_pages(struct zone *zone); void drain_local_pages(struct zone *zone); void page_alloc_init_late(void); void setup_pcp_cacheinfo(unsigned int cpu); /* * gfp_allowed_mask is set to GFP_BOOT_MASK during early boot to restrict what * GFP flags are used before interrupts are enabled. Once interrupts are * enabled, it is set to __GFP_BITS_MASK while the system is running. During * hibernation, it is used by PM to avoid I/O during memory allocation while * devices are suspended. */ extern gfp_t gfp_allowed_mask; /* Returns true if the gfp_mask allows use of ALLOC_NO_WATERMARK */ bool gfp_pfmemalloc_allowed(gfp_t gfp_mask); static inline bool gfp_has_io_fs(gfp_t gfp) { return (gfp & (__GFP_IO | __GFP_FS)) == (__GFP_IO | __GFP_FS); } /* * Check if the gfp flags allow compaction - GFP_NOIO is a really * tricky context because the migration might require IO. */ static inline bool gfp_compaction_allowed(gfp_t gfp_mask) { return IS_ENABLED(CONFIG_COMPACTION) && (gfp_mask & __GFP_IO); } extern gfp_t vma_thp_gfp_mask(struct vm_area_struct *vma); #ifdef CONFIG_CONTIG_ALLOC /* The below functions must be run on a range from a single zone. */ extern int alloc_contig_range_noprof(unsigned long start, unsigned long end, unsigned migratetype, gfp_t gfp_mask); #define alloc_contig_range(...) alloc_hooks(alloc_contig_range_noprof(__VA_ARGS__)) extern struct page *alloc_contig_pages_noprof(unsigned long nr_pages, gfp_t gfp_mask, int nid, nodemask_t *nodemask); #define alloc_contig_pages(...) alloc_hooks(alloc_contig_pages_noprof(__VA_ARGS__)) #endif void free_contig_range(unsigned long pfn, unsigned long nr_pages); #ifdef CONFIG_CONTIG_ALLOC static inline struct folio *folio_alloc_gigantic_noprof(int order, gfp_t gfp, int nid, nodemask_t *node) { struct page *page; if (WARN_ON(!order || !(gfp & __GFP_COMP))) return NULL; page = alloc_contig_pages_noprof(1 << order, gfp, nid, node); return page ? page_folio(page) : NULL; } #else static inline struct folio *folio_alloc_gigantic_noprof(int order, gfp_t gfp, int nid, nodemask_t *node) { return NULL; } #endif /* This should be paired with folio_put() rather than free_contig_range(). */ #define folio_alloc_gigantic(...) alloc_hooks(folio_alloc_gigantic_noprof(__VA_ARGS__)) #endif /* __LINUX_GFP_H */
6 6 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 // SPDX-License-Identifier: (GPL-2.0 OR BSD-3-Clause) /* gw.c - CAN frame Gateway/Router/Bridge with netlink interface * * Copyright (c) 2019 Volkswagen Group Electronic Research * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. Neither the name of Volkswagen nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * Alternatively, provided that this notice is retained in full, this * software may be distributed under the terms of the GNU General * Public License ("GPL") version 2, in which case the provisions of the * GPL apply INSTEAD OF those given above. * * The provided data structures and external interfaces from this code * are not restricted to be used by modules with a GPL compatible license. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH * DAMAGE. * */ #include <linux/module.h> #include <linux/init.h> #include <linux/types.h> #include <linux/kernel.h> #include <linux/list.h> #include <linux/spinlock.h> #include <linux/rcupdate.h> #include <linux/rculist.h> #include <linux/net.h> #include <linux/netdevice.h> #include <linux/if_arp.h> #include <linux/skbuff.h> #include <linux/can.h> #include <linux/can/core.h> #include <linux/can/skb.h> #include <linux/can/gw.h> #include <net/rtnetlink.h> #include <net/net_namespace.h> #include <net/sock.h> #define CAN_GW_NAME "can-gw" MODULE_DESCRIPTION("PF_CAN netlink gateway"); MODULE_LICENSE("Dual BSD/GPL"); MODULE_AUTHOR("Oliver Hartkopp <oliver.hartkopp@volkswagen.de>"); MODULE_ALIAS(CAN_GW_NAME); #define CGW_MIN_HOPS 1 #define CGW_MAX_HOPS 6 #define CGW_DEFAULT_HOPS 1 static unsigned int max_hops __read_mostly = CGW_DEFAULT_HOPS; module_param(max_hops, uint, 0444); MODULE_PARM_DESC(max_hops, "maximum " CAN_GW_NAME " routing hops for CAN frames " "(valid values: " __stringify(CGW_MIN_HOPS) "-" __stringify(CGW_MAX_HOPS) " hops, " "default: " __stringify(CGW_DEFAULT_HOPS) ")"); static struct notifier_block notifier; static struct kmem_cache *cgw_cache __read_mostly; /* structure that contains the (on-the-fly) CAN frame modifications */ struct cf_mod { struct { struct canfd_frame and; struct canfd_frame or; struct canfd_frame xor; struct canfd_frame set; } modframe; struct { u8 and; u8 or; u8 xor; u8 set; } modtype; void (*modfunc[MAX_MODFUNCTIONS])(struct canfd_frame *cf, struct cf_mod *mod); /* CAN frame checksum calculation after CAN frame modifications */ struct { struct cgw_csum_xor xor; struct cgw_csum_crc8 crc8; } csum; struct { void (*xor)(struct canfd_frame *cf, struct cgw_csum_xor *xor); void (*crc8)(struct canfd_frame *cf, struct cgw_csum_crc8 *crc8); } csumfunc; u32 uid; }; /* So far we just support CAN -> CAN routing and frame modifications. * * The internal can_can_gw structure contains data and attributes for * a CAN -> CAN gateway job. */ struct can_can_gw { struct can_filter filter; int src_idx; int dst_idx; }; /* list entry for CAN gateways jobs */ struct cgw_job { struct hlist_node list; struct rcu_head rcu; u32 handled_frames; u32 dropped_frames; u32 deleted_frames; struct cf_mod __rcu *cf_mod; union { /* CAN frame data source */ struct net_device *dev; } src; union { /* CAN frame data destination */ struct net_device *dev; } dst; union { struct can_can_gw ccgw; /* tbc */ }; u8 gwtype; u8 limit_hops; u16 flags; }; /* modification functions that are invoked in the hot path in can_can_gw_rcv */ #define MODFUNC(func, op) static void func(struct canfd_frame *cf, \ struct cf_mod *mod) { op ; } MODFUNC(mod_and_id, cf->can_id &= mod->modframe.and.can_id) MODFUNC(mod_and_len, cf->len &= mod->modframe.and.len) MODFUNC(mod_and_flags, cf->flags &= mod->modframe.and.flags) MODFUNC(mod_and_data, *(u64 *)cf->data &= *(u64 *)mod->modframe.and.data) MODFUNC(mod_or_id, cf->can_id |= mod->modframe.or.can_id) MODFUNC(mod_or_len, cf->len |= mod->modframe.or.len) MODFUNC(mod_or_flags, cf->flags |= mod->modframe.or.flags) MODFUNC(mod_or_data, *(u64 *)cf->data |= *(u64 *)mod->modframe.or.data) MODFUNC(mod_xor_id, cf->can_id ^= mod->modframe.xor.can_id) MODFUNC(mod_xor_len, cf->len ^= mod->modframe.xor.len) MODFUNC(mod_xor_flags, cf->flags ^= mod->modframe.xor.flags) MODFUNC(mod_xor_data, *(u64 *)cf->data ^= *(u64 *)mod->modframe.xor.data) MODFUNC(mod_set_id, cf->can_id = mod->modframe.set.can_id) MODFUNC(mod_set_len, cf->len = mod->modframe.set.len) MODFUNC(mod_set_flags, cf->flags = mod->modframe.set.flags) MODFUNC(mod_set_data, *(u64 *)cf->data = *(u64 *)mod->modframe.set.data) static void mod_and_fddata(struct canfd_frame *cf, struct cf_mod *mod) { int i; for (i = 0; i < CANFD_MAX_DLEN; i += 8) *(u64 *)(cf->data + i) &= *(u64 *)(mod->modframe.and.data + i); } static void mod_or_fddata(struct canfd_frame *cf, struct cf_mod *mod) { int i; for (i = 0; i < CANFD_MAX_DLEN; i += 8) *(u64 *)(cf->data + i) |= *(u64 *)(mod->modframe.or.data + i); } static void mod_xor_fddata(struct canfd_frame *cf, struct cf_mod *mod) { int i; for (i = 0; i < CANFD_MAX_DLEN; i += 8) *(u64 *)(cf->data + i) ^= *(u64 *)(mod->modframe.xor.data + i); } static void mod_set_fddata(struct canfd_frame *cf, struct cf_mod *mod) { memcpy(cf->data, mod->modframe.set.data, CANFD_MAX_DLEN); } /* retrieve valid CC DLC value and store it into 'len' */ static void mod_retrieve_ccdlc(struct canfd_frame *cf) { struct can_frame *ccf = (struct can_frame *)cf; /* len8_dlc is only valid if len == CAN_MAX_DLEN */ if (ccf->len != CAN_MAX_DLEN) return; /* do we have a valid len8_dlc value from 9 .. 15 ? */ if (ccf->len8_dlc > CAN_MAX_DLEN && ccf->len8_dlc <= CAN_MAX_RAW_DLC) ccf->len = ccf->len8_dlc; } /* convert valid CC DLC value in 'len' into struct can_frame elements */ static void mod_store_ccdlc(struct canfd_frame *cf) { struct can_frame *ccf = (struct can_frame *)cf; /* clear potential leftovers */ ccf->len8_dlc = 0; /* plain data length 0 .. 8 - that was easy */ if (ccf->len <= CAN_MAX_DLEN) return; /* potentially broken values are caught in can_can_gw_rcv() */ if (ccf->len > CAN_MAX_RAW_DLC) return; /* we have a valid dlc value from 9 .. 15 in ccf->len */ ccf->len8_dlc = ccf->len; ccf->len = CAN_MAX_DLEN; } static void mod_and_ccdlc(struct canfd_frame *cf, struct cf_mod *mod) { mod_retrieve_ccdlc(cf); mod_and_len(cf, mod); mod_store_ccdlc(cf); } static void mod_or_ccdlc(struct canfd_frame *cf, struct cf_mod *mod) { mod_retrieve_ccdlc(cf); mod_or_len(cf, mod); mod_store_ccdlc(cf); } static void mod_xor_ccdlc(struct canfd_frame *cf, struct cf_mod *mod) { mod_retrieve_ccdlc(cf); mod_xor_len(cf, mod); mod_store_ccdlc(cf); } static void mod_set_ccdlc(struct canfd_frame *cf, struct cf_mod *mod) { mod_set_len(cf, mod); mod_store_ccdlc(cf); } static void canframecpy(struct canfd_frame *dst, struct can_frame *src) { /* Copy the struct members separately to ensure that no uninitialized * data are copied in the 3 bytes hole of the struct. This is needed * to make easy compares of the data in the struct cf_mod. */ dst->can_id = src->can_id; dst->len = src->len; *(u64 *)dst->data = *(u64 *)src->data; } static void canfdframecpy(struct canfd_frame *dst, struct canfd_frame *src) { /* Copy the struct members separately to ensure that no uninitialized * data are copied in the 2 bytes hole of the struct. This is needed * to make easy compares of the data in the struct cf_mod. */ dst->can_id = src->can_id; dst->flags = src->flags; dst->len = src->len; memcpy(dst->data, src->data, CANFD_MAX_DLEN); } static int cgw_chk_csum_parms(s8 fr, s8 to, s8 re, struct rtcanmsg *r) { s8 dlen = CAN_MAX_DLEN; if (r->flags & CGW_FLAGS_CAN_FD) dlen = CANFD_MAX_DLEN; /* absolute dlc values 0 .. 7 => 0 .. 7, e.g. data [0] * relative to received dlc -1 .. -8 : * e.g. for received dlc = 8 * -1 => index = 7 (data[7]) * -3 => index = 5 (data[5]) * -8 => index = 0 (data[0]) */ if (fr >= -dlen && fr < dlen && to >= -dlen && to < dlen && re >= -dlen && re < dlen) return 0; else return -EINVAL; } static inline int calc_idx(int idx, int rx_len) { if (idx < 0) return rx_len + idx; else return idx; } static void cgw_csum_xor_rel(struct canfd_frame *cf, struct cgw_csum_xor *xor) { int from = calc_idx(xor->from_idx, cf->len); int to = calc_idx(xor->to_idx, cf->len); int res = calc_idx(xor->result_idx, cf->len); u8 val = xor->init_xor_val; int i; if (from < 0 || to < 0 || res < 0) return; if (from <= to) { for (i = from; i <= to; i++) val ^= cf->data[i]; } else { for (i = from; i >= to; i--) val ^= cf->data[i]; } cf->data[res] = val; } static void cgw_csum_xor_pos(struct canfd_frame *cf, struct cgw_csum_xor *xor) { u8 val = xor->init_xor_val; int i; for (i = xor->from_idx; i <= xor->to_idx; i++) val ^= cf->data[i]; cf->data[xor->result_idx] = val; } static void cgw_csum_xor_neg(struct canfd_frame *cf, struct cgw_csum_xor *xor) { u8 val = xor->init_xor_val; int i; for (i = xor->from_idx; i >= xor->to_idx; i--) val ^= cf->data[i]; cf->data[xor->result_idx] = val; } static void cgw_csum_crc8_rel(struct canfd_frame *cf, struct cgw_csum_crc8 *crc8) { int from = calc_idx(crc8->from_idx, cf->len); int to = calc_idx(crc8->to_idx, cf->len); int res = calc_idx(crc8->result_idx, cf->len); u8 crc = crc8->init_crc_val; int i; if (from < 0 || to < 0 || res < 0) return; if (from <= to) { for (i = crc8->from_idx; i <= crc8->to_idx; i++) crc = crc8->crctab[crc ^ cf->data[i]]; } else { for (i = crc8->from_idx; i >= crc8->to_idx; i--) crc = crc8->crctab[crc ^ cf->data[i]]; } switch (crc8->profile) { case CGW_CRC8PRF_1U8: crc = crc8->crctab[crc ^ crc8->profile_data[0]]; break; case CGW_CRC8PRF_16U8: crc = crc8->crctab[crc ^ crc8->profile_data[cf->data[1] & 0xF]]; break; case CGW_CRC8PRF_SFFID_XOR: crc = crc8->crctab[crc ^ (cf->can_id & 0xFF) ^ (cf->can_id >> 8 & 0xFF)]; break; } cf->data[crc8->result_idx] = crc ^ crc8->final_xor_val; } static void cgw_csum_crc8_pos(struct canfd_frame *cf, struct cgw_csum_crc8 *crc8) { u8 crc = crc8->init_crc_val; int i; for (i = crc8->from_idx; i <= crc8->to_idx; i++) crc = crc8->crctab[crc ^ cf->data[i]]; switch (crc8->profile) { case CGW_CRC8PRF_1U8: crc = crc8->crctab[crc ^ crc8->profile_data[0]]; break; case CGW_CRC8PRF_16U8: crc = crc8->crctab[crc ^ crc8->profile_data[cf->data[1] & 0xF]]; break; case CGW_CRC8PRF_SFFID_XOR: crc = crc8->crctab[crc ^ (cf->can_id & 0xFF) ^ (cf->can_id >> 8 & 0xFF)]; break; } cf->data[crc8->result_idx] = crc ^ crc8->final_xor_val; } static void cgw_csum_crc8_neg(struct canfd_frame *cf, struct cgw_csum_crc8 *crc8) { u8 crc = crc8->init_crc_val; int i; for (i = crc8->from_idx; i >= crc8->to_idx; i--) crc = crc8->crctab[crc ^ cf->data[i]]; switch (crc8->profile) { case CGW_CRC8PRF_1U8: crc = crc8->crctab[crc ^ crc8->profile_data[0]]; break; case CGW_CRC8PRF_16U8: crc = crc8->crctab[crc ^ crc8->profile_data[cf->data[1] & 0xF]]; break; case CGW_CRC8PRF_SFFID_XOR: crc = crc8->crctab[crc ^ (cf->can_id & 0xFF) ^ (cf->can_id >> 8 & 0xFF)]; break; } cf->data[crc8->result_idx] = crc ^ crc8->final_xor_val; } /* the receive & process & send function */ static void can_can_gw_rcv(struct sk_buff *skb, void *data) { struct cgw_job *gwj = (struct cgw_job *)data; struct canfd_frame *cf; struct sk_buff *nskb; struct cf_mod *mod; int modidx = 0; /* process strictly Classic CAN or CAN FD frames */ if (gwj->flags & CGW_FLAGS_CAN_FD) { if (!can_is_canfd_skb(skb)) return; } else { if (!can_is_can_skb(skb)) return; } /* Do not handle CAN frames routed more than 'max_hops' times. * In general we should never catch this delimiter which is intended * to cover a misconfiguration protection (e.g. circular CAN routes). * * The Controller Area Network controllers only accept CAN frames with * correct CRCs - which are not visible in the controller registers. * According to skbuff.h documentation the csum_start element for IP * checksums is undefined/unused when ip_summed == CHECKSUM_UNNECESSARY. * Only CAN skbs can be processed here which already have this property. */ #define cgw_hops(skb) ((skb)->csum_start) BUG_ON(skb->ip_summed != CHECKSUM_UNNECESSARY); if (cgw_hops(skb) >= max_hops) { /* indicate deleted frames due to misconfiguration */ gwj->deleted_frames++; return; } if (!(gwj->dst.dev->flags & IFF_UP)) { gwj->dropped_frames++; return; } /* is sending the skb back to the incoming interface not allowed? */ if (!(gwj->flags & CGW_FLAGS_CAN_IIF_TX_OK) && can_skb_prv(skb)->ifindex == gwj->dst.dev->ifindex) return; /* clone the given skb, which has not been done in can_rcv() * * When there is at least one modification function activated, * we need to copy the skb as we want to modify skb->data. */ mod = rcu_dereference(gwj->cf_mod); if (mod->modfunc[0]) nskb = skb_copy(skb, GFP_ATOMIC); else nskb = skb_clone(skb, GFP_ATOMIC); if (!nskb) { gwj->dropped_frames++; return; } /* put the incremented hop counter in the cloned skb */ cgw_hops(nskb) = cgw_hops(skb) + 1; /* first processing of this CAN frame -> adjust to private hop limit */ if (gwj->limit_hops && cgw_hops(nskb) == 1) cgw_hops(nskb) = max_hops - gwj->limit_hops + 1; nskb->dev = gwj->dst.dev; /* pointer to modifiable CAN frame */ cf = (struct canfd_frame *)nskb->data; /* perform preprocessed modification functions if there are any */ while (modidx < MAX_MODFUNCTIONS && mod->modfunc[modidx]) (*mod->modfunc[modidx++])(cf, mod); /* Has the CAN frame been modified? */ if (modidx) { /* get available space for the processed CAN frame type */ int max_len = nskb->len - offsetof(struct canfd_frame, data); /* dlc may have changed, make sure it fits to the CAN frame */ if (cf->len > max_len) { /* delete frame due to misconfiguration */ gwj->deleted_frames++; kfree_skb(nskb); return; } /* check for checksum updates */ if (mod->csumfunc.crc8) (*mod->csumfunc.crc8)(cf, &mod->csum.crc8); if (mod->csumfunc.xor) (*mod->csumfunc.xor)(cf, &mod->csum.xor); } /* clear the skb timestamp if not configured the other way */ if (!(gwj->flags & CGW_FLAGS_CAN_SRC_TSTAMP)) nskb->tstamp = 0; /* send to netdevice */ if (can_send(nskb, gwj->flags & CGW_FLAGS_CAN_ECHO)) gwj->dropped_frames++; else gwj->handled_frames++; } static inline int cgw_register_filter(struct net *net, struct cgw_job *gwj) { return can_rx_register(net, gwj->src.dev, gwj->ccgw.filter.can_id, gwj->ccgw.filter.can_mask, can_can_gw_rcv, gwj, "gw", NULL); } static inline void cgw_unregister_filter(struct net *net, struct cgw_job *gwj) { can_rx_unregister(net, gwj->src.dev, gwj->ccgw.filter.can_id, gwj->ccgw.filter.can_mask, can_can_gw_rcv, gwj); } static void cgw_job_free_rcu(struct rcu_head *rcu_head) { struct cgw_job *gwj = container_of(rcu_head, struct cgw_job, rcu); /* cgw_job::cf_mod is always accessed from the same cgw_job object within * the same RCU read section. Once cgw_job is scheduled for removal, * cf_mod can also be removed without mandating an additional grace period. */ kfree(rcu_access_pointer(gwj->cf_mod)); kmem_cache_free(cgw_cache, gwj); } /* Return cgw_job::cf_mod with RTNL protected section */ static struct cf_mod *cgw_job_cf_mod(struct cgw_job *gwj) { return rcu_dereference_protected(gwj->cf_mod, rtnl_is_locked()); } static int cgw_notifier(struct notifier_block *nb, unsigned long msg, void *ptr) { struct net_device *dev = netdev_notifier_info_to_dev(ptr); struct net *net = dev_net(dev); if (dev->type != ARPHRD_CAN) return NOTIFY_DONE; if (msg == NETDEV_UNREGISTER) { struct cgw_job *gwj = NULL; struct hlist_node *nx; ASSERT_RTNL(); hlist_for_each_entry_safe(gwj, nx, &net->can.cgw_list, list) { if (gwj->src.dev == dev || gwj->dst.dev == dev) { hlist_del(&gwj->list); cgw_unregister_filter(net, gwj); call_rcu(&gwj->rcu, cgw_job_free_rcu); } } } return NOTIFY_DONE; } static int cgw_put_job(struct sk_buff *skb, struct cgw_job *gwj, int type, u32 pid, u32 seq, int flags) { struct rtcanmsg *rtcan; struct nlmsghdr *nlh; struct cf_mod *mod; nlh = nlmsg_put(skb, pid, seq, type, sizeof(*rtcan), flags); if (!nlh) return -EMSGSIZE; rtcan = nlmsg_data(nlh); rtcan->can_family = AF_CAN; rtcan->gwtype = gwj->gwtype; rtcan->flags = gwj->flags; /* add statistics if available */ if (gwj->handled_frames) { if (nla_put_u32(skb, CGW_HANDLED, gwj->handled_frames) < 0) goto cancel; } if (gwj->dropped_frames) { if (nla_put_u32(skb, CGW_DROPPED, gwj->dropped_frames) < 0) goto cancel; } if (gwj->deleted_frames) { if (nla_put_u32(skb, CGW_DELETED, gwj->deleted_frames) < 0) goto cancel; } /* check non default settings of attributes */ if (gwj->limit_hops) { if (nla_put_u8(skb, CGW_LIM_HOPS, gwj->limit_hops) < 0) goto cancel; } mod = cgw_job_cf_mod(gwj); if (gwj->flags & CGW_FLAGS_CAN_FD) { struct cgw_fdframe_mod mb; if (mod->modtype.and) { memcpy(&mb.cf, &mod->modframe.and, sizeof(mb.cf)); mb.modtype = mod->modtype.and; if (nla_put(skb, CGW_FDMOD_AND, sizeof(mb), &mb) < 0) goto cancel; } if (mod->modtype.or) { memcpy(&mb.cf, &mod->modframe.or, sizeof(mb.cf)); mb.modtype = mod->modtype.or; if (nla_put(skb, CGW_FDMOD_OR, sizeof(mb), &mb) < 0) goto cancel; } if (mod->modtype.xor) { memcpy(&mb.cf, &mod->modframe.xor, sizeof(mb.cf)); mb.modtype = mod->modtype.xor; if (nla_put(skb, CGW_FDMOD_XOR, sizeof(mb), &mb) < 0) goto cancel; } if (mod->modtype.set) { memcpy(&mb.cf, &mod->modframe.set, sizeof(mb.cf)); mb.modtype = mod->modtype.set; if (nla_put(skb, CGW_FDMOD_SET, sizeof(mb), &mb) < 0) goto cancel; } } else { struct cgw_frame_mod mb; if (mod->modtype.and) { memcpy(&mb.cf, &mod->modframe.and, sizeof(mb.cf)); mb.modtype = mod->modtype.and; if (nla_put(skb, CGW_MOD_AND, sizeof(mb), &mb) < 0) goto cancel; } if (mod->modtype.or) { memcpy(&mb.cf, &mod->modframe.or, sizeof(mb.cf)); mb.modtype = mod->modtype.or; if (nla_put(skb, CGW_MOD_OR, sizeof(mb), &mb) < 0) goto cancel; } if (mod->modtype.xor) { memcpy(&mb.cf, &mod->modframe.xor, sizeof(mb.cf)); mb.modtype = mod->modtype.xor; if (nla_put(skb, CGW_MOD_XOR, sizeof(mb), &mb) < 0) goto cancel; } if (mod->modtype.set) { memcpy(&mb.cf, &mod->modframe.set, sizeof(mb.cf)); mb.modtype = mod->modtype.set; if (nla_put(skb, CGW_MOD_SET, sizeof(mb), &mb) < 0) goto cancel; } } if (mod->uid) { if (nla_put_u32(skb, CGW_MOD_UID, mod->uid) < 0) goto cancel; } if (mod->csumfunc.crc8) { if (nla_put(skb, CGW_CS_CRC8, CGW_CS_CRC8_LEN, &mod->csum.crc8) < 0) goto cancel; } if (mod->csumfunc.xor) { if (nla_put(skb, CGW_CS_XOR, CGW_CS_XOR_LEN, &mod->csum.xor) < 0) goto cancel; } if (gwj->gwtype == CGW_TYPE_CAN_CAN) { if (gwj->ccgw.filter.can_id || gwj->ccgw.filter.can_mask) { if (nla_put(skb, CGW_FILTER, sizeof(struct can_filter), &gwj->ccgw.filter) < 0) goto cancel; } if (nla_put_u32(skb, CGW_SRC_IF, gwj->ccgw.src_idx) < 0) goto cancel; if (nla_put_u32(skb, CGW_DST_IF, gwj->ccgw.dst_idx) < 0) goto cancel; } nlmsg_end(skb, nlh); return 0; cancel: nlmsg_cancel(skb, nlh); return -EMSGSIZE; } /* Dump information about all CAN gateway jobs, in response to RTM_GETROUTE */ static int cgw_dump_jobs(struct sk_buff *skb, struct netlink_callback *cb) { struct net *net = sock_net(skb->sk); struct cgw_job *gwj = NULL; int idx = 0; int s_idx = cb->args[0]; rcu_read_lock(); hlist_for_each_entry_rcu(gwj, &net->can.cgw_list, list) { if (idx < s_idx) goto cont; if (cgw_put_job(skb, gwj, RTM_NEWROUTE, NETLINK_CB(cb->skb).portid, cb->nlh->nlmsg_seq, NLM_F_MULTI) < 0) break; cont: idx++; } rcu_read_unlock(); cb->args[0] = idx; return skb->len; } static const struct nla_policy cgw_policy[CGW_MAX + 1] = { [CGW_MOD_AND] = { .len = sizeof(struct cgw_frame_mod) }, [CGW_MOD_OR] = { .len = sizeof(struct cgw_frame_mod) }, [CGW_MOD_XOR] = { .len = sizeof(struct cgw_frame_mod) }, [CGW_MOD_SET] = { .len = sizeof(struct cgw_frame_mod) }, [CGW_CS_XOR] = { .len = sizeof(struct cgw_csum_xor) }, [CGW_CS_CRC8] = { .len = sizeof(struct cgw_csum_crc8) }, [CGW_SRC_IF] = { .type = NLA_U32 }, [CGW_DST_IF] = { .type = NLA_U32 }, [CGW_FILTER] = { .len = sizeof(struct can_filter) }, [CGW_LIM_HOPS] = { .type = NLA_U8 }, [CGW_MOD_UID] = { .type = NLA_U32 }, [CGW_FDMOD_AND] = { .len = sizeof(struct cgw_fdframe_mod) }, [CGW_FDMOD_OR] = { .len = sizeof(struct cgw_fdframe_mod) }, [CGW_FDMOD_XOR] = { .len = sizeof(struct cgw_fdframe_mod) }, [CGW_FDMOD_SET] = { .len = sizeof(struct cgw_fdframe_mod) }, }; /* check for common and gwtype specific attributes */ static int cgw_parse_attr(struct nlmsghdr *nlh, struct cf_mod *mod, u8 gwtype, void *gwtypeattr, u8 *limhops) { struct nlattr *tb[CGW_MAX + 1]; struct rtcanmsg *r = nlmsg_data(nlh); int modidx = 0; int err = 0; /* initialize modification & checksum data space */ memset(mod, 0, sizeof(*mod)); err = nlmsg_parse_deprecated(nlh, sizeof(struct rtcanmsg), tb, CGW_MAX, cgw_policy, NULL); if (err < 0) return err; if (tb[CGW_LIM_HOPS]) { *limhops = nla_get_u8(tb[CGW_LIM_HOPS]); if (*limhops < 1 || *limhops > max_hops) return -EINVAL; } /* check for AND/OR/XOR/SET modifications */ if (r->flags & CGW_FLAGS_CAN_FD) { struct cgw_fdframe_mod mb; if (tb[CGW_FDMOD_AND]) { nla_memcpy(&mb, tb[CGW_FDMOD_AND], CGW_FDMODATTR_LEN); canfdframecpy(&mod->modframe.and, &mb.cf); mod->modtype.and = mb.modtype; if (mb.modtype & CGW_MOD_ID) mod->modfunc[modidx++] = mod_and_id; if (mb.modtype & CGW_MOD_LEN) mod->modfunc[modidx++] = mod_and_len; if (mb.modtype & CGW_MOD_FLAGS) mod->modfunc[modidx++] = mod_and_flags; if (mb.modtype & CGW_MOD_DATA) mod->modfunc[modidx++] = mod_and_fddata; } if (tb[CGW_FDMOD_OR]) { nla_memcpy(&mb, tb[CGW_FDMOD_OR], CGW_FDMODATTR_LEN); canfdframecpy(&mod->modframe.or, &mb.cf); mod->modtype.or = mb.modtype; if (mb.modtype & CGW_MOD_ID) mod->modfunc[modidx++] = mod_or_id; if (mb.modtype & CGW_MOD_LEN) mod->modfunc[modidx++] = mod_or_len; if (mb.modtype & CGW_MOD_FLAGS) mod->modfunc[modidx++] = mod_or_flags; if (mb.modtype & CGW_MOD_DATA) mod->modfunc[modidx++] = mod_or_fddata; } if (tb[CGW_FDMOD_XOR]) { nla_memcpy(&mb, tb[CGW_FDMOD_XOR], CGW_FDMODATTR_LEN); canfdframecpy(&mod->modframe.xor, &mb.cf); mod->modtype.xor = mb.modtype; if (mb.modtype & CGW_MOD_ID) mod->modfunc[modidx++] = mod_xor_id; if (mb.modtype & CGW_MOD_LEN) mod->modfunc[modidx++] = mod_xor_len; if (mb.modtype & CGW_MOD_FLAGS) mod->modfunc[modidx++] = mod_xor_flags; if (mb.modtype & CGW_MOD_DATA) mod->modfunc[modidx++] = mod_xor_fddata; } if (tb[CGW_FDMOD_SET]) { nla_memcpy(&mb, tb[CGW_FDMOD_SET], CGW_FDMODATTR_LEN); canfdframecpy(&mod->modframe.set, &mb.cf); mod->modtype.set = mb.modtype; if (mb.modtype & CGW_MOD_ID) mod->modfunc[modidx++] = mod_set_id; if (mb.modtype & CGW_MOD_LEN) mod->modfunc[modidx++] = mod_set_len; if (mb.modtype & CGW_MOD_FLAGS) mod->modfunc[modidx++] = mod_set_flags; if (mb.modtype & CGW_MOD_DATA) mod->modfunc[modidx++] = mod_set_fddata; } } else { struct cgw_frame_mod mb; if (tb[CGW_MOD_AND]) { nla_memcpy(&mb, tb[CGW_MOD_AND], CGW_MODATTR_LEN); canframecpy(&mod->modframe.and, &mb.cf); mod->modtype.and = mb.modtype; if (mb.modtype & CGW_MOD_ID) mod->modfunc[modidx++] = mod_and_id; if (mb.modtype & CGW_MOD_DLC) mod->modfunc[modidx++] = mod_and_ccdlc; if (mb.modtype & CGW_MOD_DATA) mod->modfunc[modidx++] = mod_and_data; } if (tb[CGW_MOD_OR]) { nla_memcpy(&mb, tb[CGW_MOD_OR], CGW_MODATTR_LEN); canframecpy(&mod->modframe.or, &mb.cf); mod->modtype.or = mb.modtype; if (mb.modtype & CGW_MOD_ID) mod->modfunc[modidx++] = mod_or_id; if (mb.modtype & CGW_MOD_DLC) mod->modfunc[modidx++] = mod_or_ccdlc; if (mb.modtype & CGW_MOD_DATA) mod->modfunc[modidx++] = mod_or_data; } if (tb[CGW_MOD_XOR]) { nla_memcpy(&mb, tb[CGW_MOD_XOR], CGW_MODATTR_LEN); canframecpy(&mod->modframe.xor, &mb.cf); mod->modtype.xor = mb.modtype; if (mb.modtype & CGW_MOD_ID) mod->modfunc[modidx++] = mod_xor_id; if (mb.modtype & CGW_MOD_DLC) mod->modfunc[modidx++] = mod_xor_ccdlc; if (mb.modtype & CGW_MOD_DATA) mod->modfunc[modidx++] = mod_xor_data; } if (tb[CGW_MOD_SET]) { nla_memcpy(&mb, tb[CGW_MOD_SET], CGW_MODATTR_LEN); canframecpy(&mod->modframe.set, &mb.cf); mod->modtype.set = mb.modtype; if (mb.modtype & CGW_MOD_ID) mod->modfunc[modidx++] = mod_set_id; if (mb.modtype & CGW_MOD_DLC) mod->modfunc[modidx++] = mod_set_ccdlc; if (mb.modtype & CGW_MOD_DATA) mod->modfunc[modidx++] = mod_set_data; } } /* check for checksum operations after CAN frame modifications */ if (modidx) { if (tb[CGW_CS_CRC8]) { struct cgw_csum_crc8 *c = nla_data(tb[CGW_CS_CRC8]); err = cgw_chk_csum_parms(c->from_idx, c->to_idx, c->result_idx, r); if (err) return err; nla_memcpy(&mod->csum.crc8, tb[CGW_CS_CRC8], CGW_CS_CRC8_LEN); /* select dedicated processing function to reduce * runtime operations in receive hot path. */ if (c->from_idx < 0 || c->to_idx < 0 || c->result_idx < 0) mod->csumfunc.crc8 = cgw_csum_crc8_rel; else if (c->from_idx <= c->to_idx) mod->csumfunc.crc8 = cgw_csum_crc8_pos; else mod->csumfunc.crc8 = cgw_csum_crc8_neg; } if (tb[CGW_CS_XOR]) { struct cgw_csum_xor *c = nla_data(tb[CGW_CS_XOR]); err = cgw_chk_csum_parms(c->from_idx, c->to_idx, c->result_idx, r); if (err) return err; nla_memcpy(&mod->csum.xor, tb[CGW_CS_XOR], CGW_CS_XOR_LEN); /* select dedicated processing function to reduce * runtime operations in receive hot path. */ if (c->from_idx < 0 || c->to_idx < 0 || c->result_idx < 0) mod->csumfunc.xor = cgw_csum_xor_rel; else if (c->from_idx <= c->to_idx) mod->csumfunc.xor = cgw_csum_xor_pos; else mod->csumfunc.xor = cgw_csum_xor_neg; } if (tb[CGW_MOD_UID]) nla_memcpy(&mod->uid, tb[CGW_MOD_UID], sizeof(u32)); } if (gwtype == CGW_TYPE_CAN_CAN) { /* check CGW_TYPE_CAN_CAN specific attributes */ struct can_can_gw *ccgw = (struct can_can_gw *)gwtypeattr; memset(ccgw, 0, sizeof(*ccgw)); /* check for can_filter in attributes */ if (tb[CGW_FILTER]) nla_memcpy(&ccgw->filter, tb[CGW_FILTER], sizeof(struct can_filter)); err = -ENODEV; /* specifying two interfaces is mandatory */ if (!tb[CGW_SRC_IF] || !tb[CGW_DST_IF]) return err; ccgw->src_idx = nla_get_u32(tb[CGW_SRC_IF]); ccgw->dst_idx = nla_get_u32(tb[CGW_DST_IF]); /* both indices set to 0 for flushing all routing entries */ if (!ccgw->src_idx && !ccgw->dst_idx) return 0; /* only one index set to 0 is an error */ if (!ccgw->src_idx || !ccgw->dst_idx) return err; } /* add the checks for other gwtypes here */ return 0; } static int cgw_create_job(struct sk_buff *skb, struct nlmsghdr *nlh, struct netlink_ext_ack *extack) { struct net *net = sock_net(skb->sk); struct rtcanmsg *r; struct cgw_job *gwj; struct cf_mod *mod; struct can_can_gw ccgw; u8 limhops = 0; int err = 0; if (!netlink_capable(skb, CAP_NET_ADMIN)) return -EPERM; if (nlmsg_len(nlh) < sizeof(*r)) return -EINVAL; r = nlmsg_data(nlh); if (r->can_family != AF_CAN) return -EPFNOSUPPORT; /* so far we only support CAN -> CAN routings */ if (r->gwtype != CGW_TYPE_CAN_CAN) return -EINVAL; mod = kmalloc(sizeof(*mod), GFP_KERNEL); if (!mod) return -ENOMEM; err = cgw_parse_attr(nlh, mod, CGW_TYPE_CAN_CAN, &ccgw, &limhops); if (err < 0) goto out_free_cf; if (mod->uid) { ASSERT_RTNL(); /* check for updating an existing job with identical uid */ hlist_for_each_entry(gwj, &net->can.cgw_list, list) { struct cf_mod *old_cf; old_cf = cgw_job_cf_mod(gwj); if (old_cf->uid != mod->uid) continue; /* interfaces & filters must be identical */ if (memcmp(&gwj->ccgw, &ccgw, sizeof(ccgw))) { err = -EINVAL; goto out_free_cf; } rcu_assign_pointer(gwj->cf_mod, mod); kfree_rcu_mightsleep(old_cf); return 0; } } /* ifindex == 0 is not allowed for job creation */ if (!ccgw.src_idx || !ccgw.dst_idx) { err = -ENODEV; goto out_free_cf; } gwj = kmem_cache_alloc(cgw_cache, GFP_KERNEL); if (!gwj) { err = -ENOMEM; goto out_free_cf; } gwj->handled_frames = 0; gwj->dropped_frames = 0; gwj->deleted_frames = 0; gwj->flags = r->flags; gwj->gwtype = r->gwtype; gwj->limit_hops = limhops; /* insert already parsed information */ RCU_INIT_POINTER(gwj->cf_mod, mod); memcpy(&gwj->ccgw, &ccgw, sizeof(ccgw)); err = -ENODEV; gwj->src.dev = __dev_get_by_index(net, gwj->ccgw.src_idx); if (!gwj->src.dev) goto out; if (gwj->src.dev->type != ARPHRD_CAN) goto out; gwj->dst.dev = __dev_get_by_index(net, gwj->ccgw.dst_idx); if (!gwj->dst.dev) goto out; if (gwj->dst.dev->type != ARPHRD_CAN) goto out; /* is sending the skb back to the incoming interface intended? */ if (gwj->src.dev == gwj->dst.dev && !(gwj->flags & CGW_FLAGS_CAN_IIF_TX_OK)) { err = -EINVAL; goto out; } ASSERT_RTNL(); err = cgw_register_filter(net, gwj); if (!err) hlist_add_head_rcu(&gwj->list, &net->can.cgw_list); out: if (err) { kmem_cache_free(cgw_cache, gwj); out_free_cf: kfree(mod); } return err; } static void cgw_remove_all_jobs(struct net *net) { struct cgw_job *gwj = NULL; struct hlist_node *nx; ASSERT_RTNL(); hlist_for_each_entry_safe(gwj, nx, &net->can.cgw_list, list) { hlist_del(&gwj->list); cgw_unregister_filter(net, gwj); call_rcu(&gwj->rcu, cgw_job_free_rcu); } } static int cgw_remove_job(struct sk_buff *skb, struct nlmsghdr *nlh, struct netlink_ext_ack *extack) { struct net *net = sock_net(skb->sk); struct cgw_job *gwj = NULL; struct hlist_node *nx; struct rtcanmsg *r; struct cf_mod mod; struct can_can_gw ccgw; u8 limhops = 0; int err = 0; if (!netlink_capable(skb, CAP_NET_ADMIN)) return -EPERM; if (nlmsg_len(nlh) < sizeof(*r)) return -EINVAL; r = nlmsg_data(nlh); if (r->can_family != AF_CAN) return -EPFNOSUPPORT; /* so far we only support CAN -> CAN routings */ if (r->gwtype != CGW_TYPE_CAN_CAN) return -EINVAL; err = cgw_parse_attr(nlh, &mod, CGW_TYPE_CAN_CAN, &ccgw, &limhops); if (err < 0) return err; /* two interface indices both set to 0 => remove all entries */ if (!ccgw.src_idx && !ccgw.dst_idx) { cgw_remove_all_jobs(net); return 0; } err = -EINVAL; ASSERT_RTNL(); /* remove only the first matching entry */ hlist_for_each_entry_safe(gwj, nx, &net->can.cgw_list, list) { struct cf_mod *cf_mod; if (gwj->flags != r->flags) continue; if (gwj->limit_hops != limhops) continue; cf_mod = cgw_job_cf_mod(gwj); /* we have a match when uid is enabled and identical */ if (cf_mod->uid || mod.uid) { if (cf_mod->uid != mod.uid) continue; } else { /* no uid => check for identical modifications */ if (memcmp(cf_mod, &mod, sizeof(mod))) continue; } /* if (r->gwtype == CGW_TYPE_CAN_CAN) - is made sure here */ if (memcmp(&gwj->ccgw, &ccgw, sizeof(ccgw))) continue; hlist_del(&gwj->list); cgw_unregister_filter(net, gwj); call_rcu(&gwj->rcu, cgw_job_free_rcu); err = 0; break; } return err; } static int __net_init cangw_pernet_init(struct net *net) { INIT_HLIST_HEAD(&net->can.cgw_list); return 0; } static void __net_exit cangw_pernet_exit_batch(struct list_head *net_list) { struct net *net; rtnl_lock(); list_for_each_entry(net, net_list, exit_list) cgw_remove_all_jobs(net); rtnl_unlock(); } static struct pernet_operations cangw_pernet_ops = { .init = cangw_pernet_init, .exit_batch = cangw_pernet_exit_batch, }; static const struct rtnl_msg_handler cgw_rtnl_msg_handlers[] __initconst_or_module = { {.owner = THIS_MODULE, .protocol = PF_CAN, .msgtype = RTM_NEWROUTE, .doit = cgw_create_job}, {.owner = THIS_MODULE, .protocol = PF_CAN, .msgtype = RTM_DELROUTE, .doit = cgw_remove_job}, {.owner = THIS_MODULE, .protocol = PF_CAN, .msgtype = RTM_GETROUTE, .dumpit = cgw_dump_jobs}, }; static __init int cgw_module_init(void) { int ret; /* sanitize given module parameter */ max_hops = clamp_t(unsigned int, max_hops, CGW_MIN_HOPS, CGW_MAX_HOPS); pr_info("can: netlink gateway - max_hops=%d\n", max_hops); ret = register_pernet_subsys(&cangw_pernet_ops); if (ret) return ret; ret = -ENOMEM; cgw_cache = kmem_cache_create("can_gw", sizeof(struct cgw_job), 0, 0, NULL); if (!cgw_cache) goto out_cache_create; /* set notifier */ notifier.notifier_call = cgw_notifier; ret = register_netdevice_notifier(&notifier); if (ret) goto out_register_notifier; ret = rtnl_register_many(cgw_rtnl_msg_handlers); if (ret) goto out_rtnl_register; return 0; out_rtnl_register: unregister_netdevice_notifier(&notifier); out_register_notifier: kmem_cache_destroy(cgw_cache); out_cache_create: unregister_pernet_subsys(&cangw_pernet_ops); return ret; } static __exit void cgw_module_exit(void) { rtnl_unregister_all(PF_CAN); unregister_netdevice_notifier(&notifier); unregister_pernet_subsys(&cangw_pernet_ops); rcu_barrier(); /* Wait for completion of call_rcu()'s */ kmem_cache_destroy(cgw_cache); } module_init(cgw_module_init); module_exit(cgw_module_exit);
166 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 /* SPDX-License-Identifier: GPL-2.0-or-later */ /* * include/linux/eventpoll.h ( Efficient event polling implementation ) * Copyright (C) 2001,...,2006 Davide Libenzi * * Davide Libenzi <davidel@xmailserver.org> */ #ifndef _LINUX_EVENTPOLL_H #define _LINUX_EVENTPOLL_H #include <uapi/linux/eventpoll.h> #include <uapi/linux/kcmp.h> /* Forward declarations to avoid compiler errors */ struct file; #ifdef CONFIG_EPOLL #ifdef CONFIG_KCMP struct file *get_epoll_tfile_raw_ptr(struct file *file, int tfd, unsigned long toff); #endif /* Used to release the epoll bits inside the "struct file" */ void eventpoll_release_file(struct file *file); /* Copy ready events to userspace */ int epoll_sendevents(struct file *file, struct epoll_event __user *events, int maxevents); /* * This is called from inside fs/file_table.c:__fput() to unlink files * from the eventpoll interface. We need to have this facility to cleanup * correctly files that are closed without being removed from the eventpoll * interface. */ static inline void eventpoll_release(struct file *file) { /* * Fast check to avoid the get/release of the semaphore. Since * we're doing this outside the semaphore lock, it might return * false negatives, but we don't care. It'll help in 99.99% of cases * to avoid the semaphore lock. False positives simply cannot happen * because the file in on the way to be removed and nobody ( but * eventpoll ) has still a reference to this file. */ if (likely(!READ_ONCE(file->f_ep))) return; /* * The file is being closed while it is still linked to an epoll * descriptor. We need to handle this by correctly unlinking it * from its containers. */ eventpoll_release_file(file); } int do_epoll_ctl(int epfd, int op, int fd, struct epoll_event *epds, bool nonblock); /* Tells if the epoll_ctl(2) operation needs an event copy from userspace */ static inline int ep_op_has_event(int op) { return op != EPOLL_CTL_DEL; } #else static inline void eventpoll_release(struct file *file) {} #endif #if defined(CONFIG_ARM) && defined(CONFIG_OABI_COMPAT) /* ARM OABI has an incompatible struct layout and needs a special handler */ extern struct epoll_event __user * epoll_put_uevent(__poll_t revents, __u64 data, struct epoll_event __user *uevent); #else static inline struct epoll_event __user * epoll_put_uevent(__poll_t revents, __u64 data, struct epoll_event __user *uevent) { if (__put_user(revents, &uevent->events) || __put_user(data, &uevent->data)) return NULL; return uevent+1; } #endif #endif /* #ifndef _LINUX_EVENTPOLL_H */
6 6 2 3 1 6 1 3 1 1 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 // SPDX-License-Identifier: GPL-2.0-only #include <linux/slab.h> #include <linux/stat.h> #include <linux/sched/xacct.h> #include <linux/fcntl.h> #include <linux/file.h> #include <linux/uio.h> #include <linux/fsnotify.h> #include <linux/security.h> #include <linux/export.h> #include <linux/syscalls.h> #include <linux/pagemap.h> #include <linux/splice.h> #include <linux/compat.h> #include <linux/mount.h> #include <linux/fs.h> #include <linux/dax.h> #include <linux/overflow.h> #include "internal.h" #include <linux/uaccess.h> #include <asm/unistd.h> /* * Performs necessary checks before doing a clone. * * Can adjust amount of bytes to clone via @req_count argument. * Returns appropriate error code that caller should return or * zero in case the clone should be allowed. */ static int generic_remap_checks(struct file *file_in, loff_t pos_in, struct file *file_out, loff_t pos_out, loff_t *req_count, unsigned int remap_flags) { struct inode *inode_in = file_in->f_mapping->host; struct inode *inode_out = file_out->f_mapping->host; uint64_t count = *req_count; uint64_t bcount; loff_t size_in, size_out; loff_t bs = inode_out->i_sb->s_blocksize; int ret; /* The start of both ranges must be aligned to an fs block. */ if (!IS_ALIGNED(pos_in, bs) || !IS_ALIGNED(pos_out, bs)) return -EINVAL; /* Ensure offsets don't wrap. */ if (pos_in + count < pos_in || pos_out + count < pos_out) return -EINVAL; size_in = i_size_read(inode_in); size_out = i_size_read(inode_out); /* Dedupe requires both ranges to be within EOF. */ if ((remap_flags & REMAP_FILE_DEDUP) && (pos_in >= size_in || pos_in + count > size_in || pos_out >= size_out || pos_out + count > size_out)) return -EINVAL; /* Ensure the infile range is within the infile. */ if (pos_in >= size_in) return -EINVAL; count = min(count, size_in - (uint64_t)pos_in); ret = generic_write_check_limits(file_out, pos_out, &count); if (ret) return ret; /* * If the user wanted us to link to the infile's EOF, round up to the * next block boundary for this check. * * Otherwise, make sure the count is also block-aligned, having * already confirmed the starting offsets' block alignment. */ if (pos_in + count == size_in && (!(remap_flags & REMAP_FILE_DEDUP) || pos_out + count == size_out)) { bcount = ALIGN(size_in, bs) - pos_in; } else { if (!IS_ALIGNED(count, bs)) count = ALIGN_DOWN(count, bs); bcount = count; } /* Don't allow overlapped cloning within the same file. */ if (inode_in == inode_out && pos_out + bcount > pos_in && pos_out < pos_in + bcount) return -EINVAL; /* * We shortened the request but the caller can't deal with that, so * bounce the request back to userspace. */ if (*req_count != count && !(remap_flags & REMAP_FILE_CAN_SHORTEN)) return -EINVAL; *req_count = count; return 0; } int remap_verify_area(struct file *file, loff_t pos, loff_t len, bool write) { int mask = write ? MAY_WRITE : MAY_READ; loff_t tmp; int ret; if (unlikely(pos < 0 || len < 0)) return -EINVAL; if (unlikely(check_add_overflow(pos, len, &tmp))) return -EINVAL; ret = security_file_permission(file, mask); if (ret) return ret; return fsnotify_file_area_perm(file, mask, &pos, len); } EXPORT_SYMBOL_GPL(remap_verify_area); /* * Ensure that we don't remap a partial EOF block in the middle of something * else. Assume that the offsets have already been checked for block * alignment. * * For clone we only link a partial EOF block above or at the destination file's * EOF. For deduplication we accept a partial EOF block only if it ends at the * destination file's EOF (can not link it into the middle of a file). * * Shorten the request if possible. */ static int generic_remap_check_len(struct inode *inode_in, struct inode *inode_out, loff_t pos_out, loff_t *len, unsigned int remap_flags) { u64 blkmask = i_blocksize(inode_in) - 1; loff_t new_len = *len; if ((*len & blkmask) == 0) return 0; if (pos_out + *len < i_size_read(inode_out)) new_len &= ~blkmask; if (new_len == *len) return 0; if (remap_flags & REMAP_FILE_CAN_SHORTEN) { *len = new_len; return 0; } return (remap_flags & REMAP_FILE_DEDUP) ? -EBADE : -EINVAL; } /* Read a page's worth of file data into the page cache. */ static struct folio *vfs_dedupe_get_folio(struct file *file, loff_t pos) { return read_mapping_folio(file->f_mapping, pos >> PAGE_SHIFT, file); } /* * Lock two folios, ensuring that we lock in offset order if the folios * are from the same file. */ static void vfs_lock_two_folios(struct folio *folio1, struct folio *folio2) { /* Always lock in order of increasing index. */ if (folio1->index > folio2->index) swap(folio1, folio2); folio_lock(folio1); if (folio1 != folio2) folio_lock(folio2); } /* Unlock two folios, being careful not to unlock the same folio twice. */ static void vfs_unlock_two_folios(struct folio *folio1, struct folio *folio2) { folio_unlock(folio1); if (folio1 != folio2) folio_unlock(folio2); } /* * Compare extents of two files to see if they are the same. * Caller must have locked both inodes to prevent write races. */ static int vfs_dedupe_file_range_compare(struct file *src, loff_t srcoff, struct file *dest, loff_t dstoff, loff_t len, bool *is_same) { bool same = true; int error = -EINVAL; while (len) { struct folio *src_folio, *dst_folio; void *src_addr, *dst_addr; loff_t cmp_len = min(PAGE_SIZE - offset_in_page(srcoff), PAGE_SIZE - offset_in_page(dstoff)); cmp_len = min(cmp_len, len); if (cmp_len <= 0) goto out_error; src_folio = vfs_dedupe_get_folio(src, srcoff); if (IS_ERR(src_folio)) { error = PTR_ERR(src_folio); goto out_error; } dst_folio = vfs_dedupe_get_folio(dest, dstoff); if (IS_ERR(dst_folio)) { error = PTR_ERR(dst_folio); folio_put(src_folio); goto out_error; } vfs_lock_two_folios(src_folio, dst_folio); /* * Now that we've locked both folios, make sure they're still * mapped to the file data we're interested in. If not, * someone is invalidating pages on us and we lose. */ if (!folio_test_uptodate(src_folio) || !folio_test_uptodate(dst_folio) || src_folio->mapping != src->f_mapping || dst_folio->mapping != dest->f_mapping) { same = false; goto unlock; } src_addr = kmap_local_folio(src_folio, offset_in_folio(src_folio, srcoff)); dst_addr = kmap_local_folio(dst_folio, offset_in_folio(dst_folio, dstoff)); flush_dcache_folio(src_folio); flush_dcache_folio(dst_folio); if (memcmp(src_addr, dst_addr, cmp_len)) same = false; kunmap_local(dst_addr); kunmap_local(src_addr); unlock: vfs_unlock_two_folios(src_folio, dst_folio); folio_put(dst_folio); folio_put(src_folio); if (!same) break; srcoff += cmp_len; dstoff += cmp_len; len -= cmp_len; } *is_same = same; return 0; out_error: return error; } /* * Check that the two inodes are eligible for cloning, the ranges make * sense, and then flush all dirty data. Caller must ensure that the * inodes have been locked against any other modifications. * * If there's an error, then the usual negative error code is returned. * Otherwise returns 0 with *len set to the request length. */ int __generic_remap_file_range_prep(struct file *file_in, loff_t pos_in, struct file *file_out, loff_t pos_out, loff_t *len, unsigned int remap_flags, const struct iomap_ops *dax_read_ops) { struct inode *inode_in = file_inode(file_in); struct inode *inode_out = file_inode(file_out); bool same_inode = (inode_in == inode_out); int ret; /* Don't touch certain kinds of inodes */ if (IS_IMMUTABLE(inode_out)) return -EPERM; if (IS_SWAPFILE(inode_in) || IS_SWAPFILE(inode_out)) return -ETXTBSY; /* Don't reflink dirs, pipes, sockets... */ if (S_ISDIR(inode_in->i_mode) || S_ISDIR(inode_out->i_mode)) return -EISDIR; if (!S_ISREG(inode_in->i_mode) || !S_ISREG(inode_out->i_mode)) return -EINVAL; /* Zero length dedupe exits immediately; reflink goes to EOF. */ if (*len == 0) { loff_t isize = i_size_read(inode_in); if ((remap_flags & REMAP_FILE_DEDUP) || pos_in == isize) return 0; if (pos_in > isize) return -EINVAL; *len = isize - pos_in; if (*len == 0) return 0; } /* Check that we don't violate system file offset limits. */ ret = generic_remap_checks(file_in, pos_in, file_out, pos_out, len, remap_flags); if (ret || *len == 0) return ret; /* Wait for the completion of any pending IOs on both files */ inode_dio_wait(inode_in); if (!same_inode) inode_dio_wait(inode_out); ret = filemap_write_and_wait_range(inode_in->i_mapping, pos_in, pos_in + *len - 1); if (ret) return ret; ret = filemap_write_and_wait_range(inode_out->i_mapping, pos_out, pos_out + *len - 1); if (ret) return ret; /* * Check that the extents are the same. */ if (remap_flags & REMAP_FILE_DEDUP) { bool is_same = false; if (!IS_DAX(inode_in)) ret = vfs_dedupe_file_range_compare(file_in, pos_in, file_out, pos_out, *len, &is_same); else if (dax_read_ops) ret = dax_dedupe_file_range_compare(inode_in, pos_in, inode_out, pos_out, *len, &is_same, dax_read_ops); else return -EINVAL; if (ret) return ret; if (!is_same) return -EBADE; } ret = generic_remap_check_len(inode_in, inode_out, pos_out, len, remap_flags); if (ret || *len == 0) return ret; /* If can't alter the file contents, we're done. */ if (!(remap_flags & REMAP_FILE_DEDUP)) ret = file_modified(file_out); return ret; } int generic_remap_file_range_prep(struct file *file_in, loff_t pos_in, struct file *file_out, loff_t pos_out, loff_t *len, unsigned int remap_flags) { return __generic_remap_file_range_prep(file_in, pos_in, file_out, pos_out, len, remap_flags, NULL); } EXPORT_SYMBOL(generic_remap_file_range_prep); loff_t vfs_clone_file_range(struct file *file_in, loff_t pos_in, struct file *file_out, loff_t pos_out, loff_t len, unsigned int remap_flags) { loff_t ret; WARN_ON_ONCE(remap_flags & REMAP_FILE_DEDUP); if (file_inode(file_in)->i_sb != file_inode(file_out)->i_sb) return -EXDEV; ret = generic_file_rw_checks(file_in, file_out); if (ret < 0) return ret; if (!file_in->f_op->remap_file_range) return -EOPNOTSUPP; ret = remap_verify_area(file_in, pos_in, len, false); if (ret) return ret; ret = remap_verify_area(file_out, pos_out, len, true); if (ret) return ret; file_start_write(file_out); ret = file_in->f_op->remap_file_range(file_in, pos_in, file_out, pos_out, len, remap_flags); file_end_write(file_out); if (ret < 0) return ret; fsnotify_access(file_in); fsnotify_modify(file_out); return ret; } EXPORT_SYMBOL(vfs_clone_file_range); /* Check whether we are allowed to dedupe the destination file */ static bool may_dedupe_file(struct file *file) { struct mnt_idmap *idmap = file_mnt_idmap(file); struct inode *inode = file_inode(file); if (capable(CAP_SYS_ADMIN)) return true; if (file->f_mode & FMODE_WRITE) return true; if (vfsuid_eq_kuid(i_uid_into_vfsuid(idmap, inode), current_fsuid())) return true; if (!inode_permission(idmap, inode, MAY_WRITE)) return true; return false; } loff_t vfs_dedupe_file_range_one(struct file *src_file, loff_t src_pos, struct file *dst_file, loff_t dst_pos, loff_t len, unsigned int remap_flags) { loff_t ret; WARN_ON_ONCE(remap_flags & ~(REMAP_FILE_DEDUP | REMAP_FILE_CAN_SHORTEN)); /* * This is redundant if called from vfs_dedupe_file_range(), but other * callers need it and it's not performance sesitive... */ ret = remap_verify_area(src_file, src_pos, len, false); if (ret) return ret; ret = remap_verify_area(dst_file, dst_pos, len, true); if (ret) return ret; /* * This needs to be called after remap_verify_area() because of * sb_start_write() and before may_dedupe_file() because the mount's * MAY_WRITE need to be checked with mnt_get_write_access_file() held. */ ret = mnt_want_write_file(dst_file); if (ret) return ret; ret = -EPERM; if (!may_dedupe_file(dst_file)) goto out_drop_write; ret = -EXDEV; if (file_inode(src_file)->i_sb != file_inode(dst_file)->i_sb) goto out_drop_write; ret = -EISDIR; if (S_ISDIR(file_inode(dst_file)->i_mode)) goto out_drop_write; ret = -EINVAL; if (!dst_file->f_op->remap_file_range) goto out_drop_write; if (len == 0) { ret = 0; goto out_drop_write; } ret = dst_file->f_op->remap_file_range(src_file, src_pos, dst_file, dst_pos, len, remap_flags | REMAP_FILE_DEDUP); out_drop_write: mnt_drop_write_file(dst_file); return ret; } EXPORT_SYMBOL(vfs_dedupe_file_range_one); int vfs_dedupe_file_range(struct file *file, struct file_dedupe_range *same) { struct file_dedupe_range_info *info; struct inode *src = file_inode(file); u64 off; u64 len; int i; int ret; u16 count = same->dest_count; loff_t deduped; if (!(file->f_mode & FMODE_READ)) return -EINVAL; if (same->reserved1 || same->reserved2) return -EINVAL; off = same->src_offset; len = same->src_length; if (S_ISDIR(src->i_mode)) return -EISDIR; if (!S_ISREG(src->i_mode)) return -EINVAL; if (!file->f_op->remap_file_range) return -EOPNOTSUPP; ret = remap_verify_area(file, off, len, false); if (ret < 0) return ret; ret = 0; if (off + len > i_size_read(src)) return -EINVAL; /* Arbitrary 1G limit on a single dedupe request, can be raised. */ len = min_t(u64, len, 1 << 30); /* pre-format output fields to sane values */ for (i = 0; i < count; i++) { same->info[i].bytes_deduped = 0ULL; same->info[i].status = FILE_DEDUPE_RANGE_SAME; } for (i = 0, info = same->info; i < count; i++, info++) { CLASS(fd, dst_fd)(info->dest_fd); if (fd_empty(dst_fd)) { info->status = -EBADF; goto next_loop; } if (info->reserved) { info->status = -EINVAL; goto next_loop; } deduped = vfs_dedupe_file_range_one(file, off, fd_file(dst_fd), info->dest_offset, len, REMAP_FILE_CAN_SHORTEN); if (deduped == -EBADE) info->status = FILE_DEDUPE_RANGE_DIFFERS; else if (deduped < 0) info->status = deduped; else info->bytes_deduped = len; next_loop: if (fatal_signal_pending(current)) break; } return ret; } EXPORT_SYMBOL(vfs_dedupe_file_range);
5 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 // SPDX-License-Identifier: GPL-2.0 /* Copyright 2011-2014 Autronica Fire and Security AS * * Author(s): * 2011-2014 Arvid Brodin, arvid.brodin@alten.se * * Frame handler other utility functions for HSR and PRP. */ #include "hsr_slave.h" #include <linux/etherdevice.h> #include <linux/if_arp.h> #include <linux/if_vlan.h> #include "hsr_main.h" #include "hsr_device.h" #include "hsr_forward.h" #include "hsr_framereg.h" bool hsr_invalid_dan_ingress_frame(__be16 protocol) { return (protocol != htons(ETH_P_PRP) && protocol != htons(ETH_P_HSR)); } static rx_handler_result_t hsr_handle_frame(struct sk_buff **pskb) { struct sk_buff *skb = *pskb; struct hsr_port *port; struct hsr_priv *hsr; __be16 protocol; /* Packets from dev_loopback_xmit() do not have L2 header, bail out */ if (unlikely(skb->pkt_type == PACKET_LOOPBACK)) return RX_HANDLER_PASS; if (!skb_mac_header_was_set(skb)) { WARN_ONCE(1, "%s: skb invalid", __func__); return RX_HANDLER_PASS; } port = hsr_port_get_rcu(skb->dev); if (!port) goto finish_pass; hsr = port->hsr; if (hsr_addr_is_self(port->hsr, eth_hdr(skb)->h_source)) { /* Directly kill frames sent by ourselves */ kfree_skb(skb); goto finish_consume; } /* For HSR, only tagged frames are expected (unless the device offloads * HSR tag removal), but for PRP there could be non tagged frames as * well from Single attached nodes (SANs). */ protocol = eth_hdr(skb)->h_proto; if (!(port->dev->features & NETIF_F_HW_HSR_TAG_RM) && port->type != HSR_PT_INTERLINK && hsr->proto_ops->invalid_dan_ingress_frame && hsr->proto_ops->invalid_dan_ingress_frame(protocol)) goto finish_pass; skb_push(skb, ETH_HLEN); skb_reset_mac_header(skb); if ((!hsr->prot_version && protocol == htons(ETH_P_PRP)) || protocol == htons(ETH_P_HSR)) skb_set_network_header(skb, ETH_HLEN + HSR_HLEN); skb_reset_mac_len(skb); /* Only the frames received over the interlink port will assign a * sequence number and require synchronisation vs other sender. */ if (port->type == HSR_PT_INTERLINK) { spin_lock_bh(&hsr->seqnr_lock); hsr_forward_skb(skb, port); spin_unlock_bh(&hsr->seqnr_lock); } else { hsr_forward_skb(skb, port); } finish_consume: return RX_HANDLER_CONSUMED; finish_pass: return RX_HANDLER_PASS; } bool hsr_port_exists(const struct net_device *dev) { return rcu_access_pointer(dev->rx_handler) == hsr_handle_frame; } static int hsr_check_dev_ok(struct net_device *dev, struct netlink_ext_ack *extack) { /* Don't allow HSR on non-ethernet like devices */ if ((dev->flags & IFF_LOOPBACK) || dev->type != ARPHRD_ETHER || dev->addr_len != ETH_ALEN) { NL_SET_ERR_MSG_MOD(extack, "Cannot use loopback or non-ethernet device as HSR slave."); return -EINVAL; } /* Don't allow enslaving hsr devices */ if (is_hsr_master(dev)) { NL_SET_ERR_MSG_MOD(extack, "Cannot create trees of HSR devices."); return -EINVAL; } if (hsr_port_exists(dev)) { NL_SET_ERR_MSG_MOD(extack, "This device is already a HSR slave."); return -EINVAL; } if (is_vlan_dev(dev)) { NL_SET_ERR_MSG_MOD(extack, "HSR on top of VLAN is not yet supported in this driver."); return -EINVAL; } if (dev->priv_flags & IFF_DONT_BRIDGE) { NL_SET_ERR_MSG_MOD(extack, "This device does not support bridging."); return -EOPNOTSUPP; } /* HSR over bonded devices has not been tested, but I'm not sure it * won't work... */ return 0; } /* Setup device to be added to the HSR bridge. */ static int hsr_portdev_setup(struct hsr_priv *hsr, struct net_device *dev, struct hsr_port *port, struct netlink_ext_ack *extack) { struct net_device *hsr_dev; struct hsr_port *master; int res; /* Don't use promiscuous mode for offload since L2 frame forward * happens at the offloaded hardware. */ if (!port->hsr->fwd_offloaded) { res = dev_set_promiscuity(dev, 1); if (res) return res; } master = hsr_port_get_hsr(hsr, HSR_PT_MASTER); hsr_dev = master->dev; res = netdev_upper_dev_link(dev, hsr_dev, extack); if (res) goto fail_upper_dev_link; res = netdev_rx_handler_register(dev, hsr_handle_frame, port); if (res) goto fail_rx_handler; dev_disable_lro(dev); return 0; fail_rx_handler: netdev_upper_dev_unlink(dev, hsr_dev); fail_upper_dev_link: if (!port->hsr->fwd_offloaded) dev_set_promiscuity(dev, -1); return res; } int hsr_add_port(struct hsr_priv *hsr, struct net_device *dev, enum hsr_port_type type, struct netlink_ext_ack *extack) { struct hsr_port *port, *master; int res; if (type != HSR_PT_MASTER) { res = hsr_check_dev_ok(dev, extack); if (res) return res; } port = hsr_port_get_hsr(hsr, type); if (port) return -EBUSY; /* This port already exists */ port = kzalloc(sizeof(*port), GFP_KERNEL); if (!port) return -ENOMEM; port->hsr = hsr; port->dev = dev; port->type = type; ether_addr_copy(port->original_macaddress, dev->dev_addr); if (type != HSR_PT_MASTER) { res = hsr_portdev_setup(hsr, dev, port, extack); if (res) goto fail_dev_setup; } list_add_tail_rcu(&port->port_list, &hsr->ports); master = hsr_port_get_hsr(hsr, HSR_PT_MASTER); netdev_update_features(master->dev); dev_set_mtu(master->dev, hsr_get_max_mtu(hsr)); return 0; fail_dev_setup: kfree(port); return res; } void hsr_del_port(struct hsr_port *port) { struct hsr_priv *hsr; struct hsr_port *master; hsr = port->hsr; master = hsr_port_get_hsr(hsr, HSR_PT_MASTER); list_del_rcu(&port->port_list); if (port != master) { netdev_update_features(master->dev); dev_set_mtu(master->dev, hsr_get_max_mtu(hsr)); netdev_rx_handler_unregister(port->dev); if (!port->hsr->fwd_offloaded) dev_set_promiscuity(port->dev, -1); netdev_upper_dev_unlink(port->dev, master->dev); eth_hw_addr_set(port->dev, port->original_macaddress); } kfree_rcu(port, rcu); }
6 6 6 5 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 // SPDX-License-Identifier: GPL-2.0-or-later /* * net/sched/sch_cbs.c Credit Based Shaper * * Authors: Vinicius Costa Gomes <vinicius.gomes@intel.com> */ /* Credit Based Shaper (CBS) * ========================= * * This is a simple rate-limiting shaper aimed at TSN applications on * systems with known traffic workloads. * * Its algorithm is defined by the IEEE 802.1Q-2014 Specification, * Section 8.6.8.2, and explained in more detail in the Annex L of the * same specification. * * There are four tunables to be considered: * * 'idleslope': Idleslope is the rate of credits that is * accumulated (in kilobits per second) when there is at least * one packet waiting for transmission. Packets are transmitted * when the current value of credits is equal or greater than * zero. When there is no packet to be transmitted the amount of * credits is set to zero. This is the main tunable of the CBS * algorithm. * * 'sendslope': * Sendslope is the rate of credits that is depleted (it should be a * negative number of kilobits per second) when a transmission is * ocurring. It can be calculated as follows, (IEEE 802.1Q-2014 Section * 8.6.8.2 item g): * * sendslope = idleslope - port_transmit_rate * * 'hicredit': Hicredit defines the maximum amount of credits (in * bytes) that can be accumulated. Hicredit depends on the * characteristics of interfering traffic, * 'max_interference_size' is the maximum size of any burst of * traffic that can delay the transmission of a frame that is * available for transmission for this traffic class, (IEEE * 802.1Q-2014 Annex L, Equation L-3): * * hicredit = max_interference_size * (idleslope / port_transmit_rate) * * 'locredit': Locredit is the minimum amount of credits that can * be reached. It is a function of the traffic flowing through * this qdisc (IEEE 802.1Q-2014 Annex L, Equation L-2): * * locredit = max_frame_size * (sendslope / port_transmit_rate) */ #include <linux/ethtool.h> #include <linux/module.h> #include <linux/types.h> #include <linux/kernel.h> #include <linux/string.h> #include <linux/errno.h> #include <linux/skbuff.h> #include <linux/units.h> #include <net/netevent.h> #include <net/netlink.h> #include <net/sch_generic.h> #include <net/pkt_sched.h> static LIST_HEAD(cbs_list); static DEFINE_SPINLOCK(cbs_list_lock); struct cbs_sched_data { bool offload; int queue; atomic64_t port_rate; /* in bytes/s */ s64 last; /* timestamp in ns */ s64 credits; /* in bytes */ s32 locredit; /* in bytes */ s32 hicredit; /* in bytes */ s64 sendslope; /* in bytes/s */ s64 idleslope; /* in bytes/s */ struct qdisc_watchdog watchdog; int (*enqueue)(struct sk_buff *skb, struct Qdisc *sch, struct sk_buff **to_free); struct sk_buff *(*dequeue)(struct Qdisc *sch); struct Qdisc *qdisc; struct list_head cbs_list; }; static int cbs_child_enqueue(struct sk_buff *skb, struct Qdisc *sch, struct Qdisc *child, struct sk_buff **to_free) { unsigned int len = qdisc_pkt_len(skb); int err; err = child->ops->enqueue(skb, child, to_free); if (err != NET_XMIT_SUCCESS) return err; sch->qstats.backlog += len; sch->q.qlen++; return NET_XMIT_SUCCESS; } static int cbs_enqueue_offload(struct sk_buff *skb, struct Qdisc *sch, struct sk_buff **to_free) { struct cbs_sched_data *q = qdisc_priv(sch); struct Qdisc *qdisc = q->qdisc; return cbs_child_enqueue(skb, sch, qdisc, to_free); } static int cbs_enqueue_soft(struct sk_buff *skb, struct Qdisc *sch, struct sk_buff **to_free) { struct cbs_sched_data *q = qdisc_priv(sch); struct Qdisc *qdisc = q->qdisc; if (sch->q.qlen == 0 && q->credits > 0) { /* We need to stop accumulating credits when there's * no enqueued packets and q->credits is positive. */ q->credits = 0; q->last = ktime_get_ns(); } return cbs_child_enqueue(skb, sch, qdisc, to_free); } static int cbs_enqueue(struct sk_buff *skb, struct Qdisc *sch, struct sk_buff **to_free) { struct cbs_sched_data *q = qdisc_priv(sch); return q->enqueue(skb, sch, to_free); } /* timediff is in ns, slope is in bytes/s */ static s64 timediff_to_credits(s64 timediff, s64 slope) { return div64_s64(timediff * slope, NSEC_PER_SEC); } static s64 delay_from_credits(s64 credits, s64 slope) { if (unlikely(slope == 0)) return S64_MAX; return div64_s64(-credits * NSEC_PER_SEC, slope); } static s64 credits_from_len(unsigned int len, s64 slope, s64 port_rate) { if (unlikely(port_rate == 0)) return S64_MAX; return div64_s64(len * slope, port_rate); } static struct sk_buff *cbs_child_dequeue(struct Qdisc *sch, struct Qdisc *child) { struct sk_buff *skb; skb = child->ops->dequeue(child); if (!skb) return NULL; qdisc_qstats_backlog_dec(sch, skb); qdisc_bstats_update(sch, skb); sch->q.qlen--; return skb; } static struct sk_buff *cbs_dequeue_soft(struct Qdisc *sch) { struct cbs_sched_data *q = qdisc_priv(sch); struct Qdisc *qdisc = q->qdisc; s64 now = ktime_get_ns(); struct sk_buff *skb; s64 credits; int len; /* The previous packet is still being sent */ if (now < q->last) { qdisc_watchdog_schedule_ns(&q->watchdog, q->last); return NULL; } if (q->credits < 0) { credits = timediff_to_credits(now - q->last, q->idleslope); credits = q->credits + credits; q->credits = min_t(s64, credits, q->hicredit); if (q->credits < 0) { s64 delay; delay = delay_from_credits(q->credits, q->idleslope); qdisc_watchdog_schedule_ns(&q->watchdog, now + delay); q->last = now; return NULL; } } skb = cbs_child_dequeue(sch, qdisc); if (!skb) return NULL; len = qdisc_pkt_len(skb); /* As sendslope is a negative number, this will decrease the * amount of q->credits. */ credits = credits_from_len(len, q->sendslope, atomic64_read(&q->port_rate)); credits += q->credits; q->credits = max_t(s64, credits, q->locredit); /* Estimate of the transmission of the last byte of the packet in ns */ if (unlikely(atomic64_read(&q->port_rate) == 0)) q->last = now; else q->last = now + div64_s64(len * NSEC_PER_SEC, atomic64_read(&q->port_rate)); return skb; } static struct sk_buff *cbs_dequeue_offload(struct Qdisc *sch) { struct cbs_sched_data *q = qdisc_priv(sch); struct Qdisc *qdisc = q->qdisc; return cbs_child_dequeue(sch, qdisc); } static struct sk_buff *cbs_dequeue(struct Qdisc *sch) { struct cbs_sched_data *q = qdisc_priv(sch); return q->dequeue(sch); } static const struct nla_policy cbs_policy[TCA_CBS_MAX + 1] = { [TCA_CBS_PARMS] = { .len = sizeof(struct tc_cbs_qopt) }, }; static void cbs_disable_offload(struct net_device *dev, struct cbs_sched_data *q) { struct tc_cbs_qopt_offload cbs = { }; const struct net_device_ops *ops; int err; if (!q->offload) return; q->enqueue = cbs_enqueue_soft; q->dequeue = cbs_dequeue_soft; ops = dev->netdev_ops; if (!ops->ndo_setup_tc) return; cbs.queue = q->queue; cbs.enable = 0; err = ops->ndo_setup_tc(dev, TC_SETUP_QDISC_CBS, &cbs); if (err < 0) pr_warn("Couldn't disable CBS offload for queue %d\n", cbs.queue); } static int cbs_enable_offload(struct net_device *dev, struct cbs_sched_data *q, const struct tc_cbs_qopt *opt, struct netlink_ext_ack *extack) { const struct net_device_ops *ops = dev->netdev_ops; struct tc_cbs_qopt_offload cbs = { }; int err; if (!ops->ndo_setup_tc) { NL_SET_ERR_MSG(extack, "Specified device does not support cbs offload"); return -EOPNOTSUPP; } cbs.queue = q->queue; cbs.enable = 1; cbs.hicredit = opt->hicredit; cbs.locredit = opt->locredit; cbs.idleslope = opt->idleslope; cbs.sendslope = opt->sendslope; err = ops->ndo_setup_tc(dev, TC_SETUP_QDISC_CBS, &cbs); if (err < 0) { NL_SET_ERR_MSG(extack, "Specified device failed to setup cbs hardware offload"); return err; } q->enqueue = cbs_enqueue_offload; q->dequeue = cbs_dequeue_offload; return 0; } static void cbs_set_port_rate(struct net_device *dev, struct cbs_sched_data *q) { struct ethtool_link_ksettings ecmd; int speed = SPEED_10; s64 port_rate; int err; err = __ethtool_get_link_ksettings(dev, &ecmd); if (err < 0) goto skip; if (ecmd.base.speed && ecmd.base.speed != SPEED_UNKNOWN) speed = ecmd.base.speed; skip: port_rate = speed * 1000 * BYTES_PER_KBIT; atomic64_set(&q->port_rate, port_rate); netdev_dbg(dev, "cbs: set %s's port_rate to: %lld, linkspeed: %d\n", dev->name, (long long)atomic64_read(&q->port_rate), ecmd.base.speed); } static int cbs_dev_notifier(struct notifier_block *nb, unsigned long event, void *ptr) { struct net_device *dev = netdev_notifier_info_to_dev(ptr); struct cbs_sched_data *q; struct net_device *qdev; bool found = false; ASSERT_RTNL(); if (event != NETDEV_UP && event != NETDEV_CHANGE) return NOTIFY_DONE; spin_lock(&cbs_list_lock); list_for_each_entry(q, &cbs_list, cbs_list) { qdev = qdisc_dev(q->qdisc); if (qdev == dev) { found = true; break; } } spin_unlock(&cbs_list_lock); if (found) cbs_set_port_rate(dev, q); return NOTIFY_DONE; } static int cbs_change(struct Qdisc *sch, struct nlattr *opt, struct netlink_ext_ack *extack) { struct cbs_sched_data *q = qdisc_priv(sch); struct net_device *dev = qdisc_dev(sch); struct nlattr *tb[TCA_CBS_MAX + 1]; struct tc_cbs_qopt *qopt; int err; err = nla_parse_nested_deprecated(tb, TCA_CBS_MAX, opt, cbs_policy, extack); if (err < 0) return err; if (!tb[TCA_CBS_PARMS]) { NL_SET_ERR_MSG(extack, "Missing CBS parameter which are mandatory"); return -EINVAL; } qopt = nla_data(tb[TCA_CBS_PARMS]); if (!qopt->offload) { cbs_set_port_rate(dev, q); cbs_disable_offload(dev, q); } else { err = cbs_enable_offload(dev, q, qopt, extack); if (err < 0) return err; } /* Everything went OK, save the parameters used. */ WRITE_ONCE(q->hicredit, qopt->hicredit); WRITE_ONCE(q->locredit, qopt->locredit); WRITE_ONCE(q->idleslope, qopt->idleslope * BYTES_PER_KBIT); WRITE_ONCE(q->sendslope, qopt->sendslope * BYTES_PER_KBIT); WRITE_ONCE(q->offload, qopt->offload); return 0; } static int cbs_init(struct Qdisc *sch, struct nlattr *opt, struct netlink_ext_ack *extack) { struct cbs_sched_data *q = qdisc_priv(sch); struct net_device *dev = qdisc_dev(sch); if (!opt) { NL_SET_ERR_MSG(extack, "Missing CBS qdisc options which are mandatory"); return -EINVAL; } q->qdisc = qdisc_create_dflt(sch->dev_queue, &pfifo_qdisc_ops, sch->handle, extack); if (!q->qdisc) return -ENOMEM; spin_lock(&cbs_list_lock); list_add(&q->cbs_list, &cbs_list); spin_unlock(&cbs_list_lock); qdisc_hash_add(q->qdisc, false); q->queue = sch->dev_queue - netdev_get_tx_queue(dev, 0); q->enqueue = cbs_enqueue_soft; q->dequeue = cbs_dequeue_soft; qdisc_watchdog_init(&q->watchdog, sch); return cbs_change(sch, opt, extack); } static void cbs_destroy(struct Qdisc *sch) { struct cbs_sched_data *q = qdisc_priv(sch); struct net_device *dev = qdisc_dev(sch); /* Nothing to do if we couldn't create the underlying qdisc */ if (!q->qdisc) return; qdisc_watchdog_cancel(&q->watchdog); cbs_disable_offload(dev, q); spin_lock(&cbs_list_lock); list_del(&q->cbs_list); spin_unlock(&cbs_list_lock); qdisc_put(q->qdisc); } static int cbs_dump(struct Qdisc *sch, struct sk_buff *skb) { struct cbs_sched_data *q = qdisc_priv(sch); struct tc_cbs_qopt opt = { }; struct nlattr *nest; nest = nla_nest_start_noflag(skb, TCA_OPTIONS); if (!nest) goto nla_put_failure; opt.hicredit = READ_ONCE(q->hicredit); opt.locredit = READ_ONCE(q->locredit); opt.sendslope = div64_s64(READ_ONCE(q->sendslope), BYTES_PER_KBIT); opt.idleslope = div64_s64(READ_ONCE(q->idleslope), BYTES_PER_KBIT); opt.offload = READ_ONCE(q->offload); if (nla_put(skb, TCA_CBS_PARMS, sizeof(opt), &opt)) goto nla_put_failure; return nla_nest_end(skb, nest); nla_put_failure: nla_nest_cancel(skb, nest); return -1; } static int cbs_dump_class(struct Qdisc *sch, unsigned long cl, struct sk_buff *skb, struct tcmsg *tcm) { struct cbs_sched_data *q = qdisc_priv(sch); if (cl != 1 || !q->qdisc) /* only one class */ return -ENOENT; tcm->tcm_handle |= TC_H_MIN(1); tcm->tcm_info = q->qdisc->handle; return 0; } static int cbs_graft(struct Qdisc *sch, unsigned long arg, struct Qdisc *new, struct Qdisc **old, struct netlink_ext_ack *extack) { struct cbs_sched_data *q = qdisc_priv(sch); if (!new) { new = qdisc_create_dflt(sch->dev_queue, &pfifo_qdisc_ops, sch->handle, NULL); if (!new) new = &noop_qdisc; } *old = qdisc_replace(sch, new, &q->qdisc); return 0; } static struct Qdisc *cbs_leaf(struct Qdisc *sch, unsigned long arg) { struct cbs_sched_data *q = qdisc_priv(sch); return q->qdisc; } static unsigned long cbs_find(struct Qdisc *sch, u32 classid) { return 1; } static void cbs_walk(struct Qdisc *sch, struct qdisc_walker *walker) { if (!walker->stop) { tc_qdisc_stats_dump(sch, 1, walker); } } static const struct Qdisc_class_ops cbs_class_ops = { .graft = cbs_graft, .leaf = cbs_leaf, .find = cbs_find, .walk = cbs_walk, .dump = cbs_dump_class, }; static struct Qdisc_ops cbs_qdisc_ops __read_mostly = { .id = "cbs", .cl_ops = &cbs_class_ops, .priv_size = sizeof(struct cbs_sched_data), .enqueue = cbs_enqueue, .dequeue = cbs_dequeue, .peek = qdisc_peek_dequeued, .init = cbs_init, .reset = qdisc_reset_queue, .destroy = cbs_destroy, .change = cbs_change, .dump = cbs_dump, .owner = THIS_MODULE, }; MODULE_ALIAS_NET_SCH("cbs"); static struct notifier_block cbs_device_notifier = { .notifier_call = cbs_dev_notifier, }; static int __init cbs_module_init(void) { int err; err = register_netdevice_notifier(&cbs_device_notifier); if (err) return err; err = register_qdisc(&cbs_qdisc_ops); if (err) unregister_netdevice_notifier(&cbs_device_notifier); return err; } static void __exit cbs_module_exit(void) { unregister_qdisc(&cbs_qdisc_ops); unregister_netdevice_notifier(&cbs_device_notifier); } module_init(cbs_module_init) module_exit(cbs_module_exit) MODULE_LICENSE("GPL"); MODULE_DESCRIPTION("Credit Based shaper");
233 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 /* SPDX-License-Identifier: GPL-2.0 */ #include <linux/fs.h> #define DEVCG_ACC_MKNOD 1 #define DEVCG_ACC_READ 2 #define DEVCG_ACC_WRITE 4 #define DEVCG_ACC_MASK (DEVCG_ACC_MKNOD | DEVCG_ACC_READ | DEVCG_ACC_WRITE) #define DEVCG_DEV_BLOCK 1 #define DEVCG_DEV_CHAR 2 #define DEVCG_DEV_ALL 4 /* this represents all devices */ #if defined(CONFIG_CGROUP_DEVICE) || defined(CONFIG_CGROUP_BPF) int devcgroup_check_permission(short type, u32 major, u32 minor, short access); static inline int devcgroup_inode_permission(struct inode *inode, int mask) { short type, access = 0; if (likely(!S_ISBLK(inode->i_mode) && !S_ISCHR(inode->i_mode))) return 0; if (likely(!inode->i_rdev)) return 0; if (S_ISBLK(inode->i_mode)) type = DEVCG_DEV_BLOCK; else /* S_ISCHR by the test above */ type = DEVCG_DEV_CHAR; if (mask & MAY_WRITE) access |= DEVCG_ACC_WRITE; if (mask & MAY_READ) access |= DEVCG_ACC_READ; return devcgroup_check_permission(type, imajor(inode), iminor(inode), access); } static inline int devcgroup_inode_mknod(int mode, dev_t dev) { short type; if (!S_ISBLK(mode) && !S_ISCHR(mode)) return 0; if (S_ISCHR(mode) && dev == WHITEOUT_DEV) return 0; if (S_ISBLK(mode)) type = DEVCG_DEV_BLOCK; else type = DEVCG_DEV_CHAR; return devcgroup_check_permission(type, MAJOR(dev), MINOR(dev), DEVCG_ACC_MKNOD); } #else static inline int devcgroup_check_permission(short type, u32 major, u32 minor, short access) { return 0; } static inline int devcgroup_inode_permission(struct inode *inode, int mask) { return 0; } static inline int devcgroup_inode_mknod(int mode, dev_t dev) { return 0; } #endif
6 6 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 1517 1518 1519 1520 1521 1522 1523 1524 1525 1526 1527 1528 1529 1530 1531 1532 1533 1534 1535 1536 1537 1538 1539 1540 1541 1542 1543 1544 1545 1546 1547 1548 1549 // SPDX-License-Identifier: GPL-2.0-or-later /* * NetLabel Unlabeled Support * * This file defines functions for dealing with unlabeled packets for the * NetLabel system. The NetLabel system manages static and dynamic label * mappings for network protocols such as CIPSO and RIPSO. * * Author: Paul Moore <paul@paul-moore.com> */ /* * (c) Copyright Hewlett-Packard Development Company, L.P., 2006 - 2008 */ #include <linux/types.h> #include <linux/rcupdate.h> #include <linux/list.h> #include <linux/spinlock.h> #include <linux/socket.h> #include <linux/string.h> #include <linux/skbuff.h> #include <linux/audit.h> #include <linux/in.h> #include <linux/in6.h> #include <linux/ip.h> #include <linux/ipv6.h> #include <linux/notifier.h> #include <linux/netdevice.h> #include <linux/security.h> #include <linux/slab.h> #include <net/sock.h> #include <net/netlink.h> #include <net/genetlink.h> #include <net/ip.h> #include <net/ipv6.h> #include <net/net_namespace.h> #include <net/netlabel.h> #include <asm/bug.h> #include <linux/atomic.h> #include "netlabel_user.h" #include "netlabel_addrlist.h" #include "netlabel_domainhash.h" #include "netlabel_unlabeled.h" #include "netlabel_mgmt.h" /* NOTE: at present we always use init's network namespace since we don't * presently support different namespaces even though the majority of * the functions in this file are "namespace safe" */ /* The unlabeled connection hash table which we use to map network interfaces * and addresses of unlabeled packets to a user specified secid value for the * LSM. The hash table is used to lookup the network interface entry * (struct netlbl_unlhsh_iface) and then the interface entry is used to * lookup an IP address match from an ordered list. If a network interface * match can not be found in the hash table then the default entry * (netlbl_unlhsh_def) is used. The IP address entry list * (struct netlbl_unlhsh_addr) is ordered such that the entries with a * larger netmask come first. */ struct netlbl_unlhsh_tbl { struct list_head *tbl; u32 size; }; #define netlbl_unlhsh_addr4_entry(iter) \ container_of(iter, struct netlbl_unlhsh_addr4, list) struct netlbl_unlhsh_addr4 { u32 secid; struct netlbl_af4list list; struct rcu_head rcu; }; #define netlbl_unlhsh_addr6_entry(iter) \ container_of(iter, struct netlbl_unlhsh_addr6, list) struct netlbl_unlhsh_addr6 { u32 secid; struct netlbl_af6list list; struct rcu_head rcu; }; struct netlbl_unlhsh_iface { int ifindex; struct list_head addr4_list; struct list_head addr6_list; u32 valid; struct list_head list; struct rcu_head rcu; }; /* Argument struct for netlbl_unlhsh_walk() */ struct netlbl_unlhsh_walk_arg { struct netlink_callback *nl_cb; struct sk_buff *skb; u32 seq; }; /* Unlabeled connection hash table */ /* updates should be so rare that having one spinlock for the entire * hash table should be okay */ static DEFINE_SPINLOCK(netlbl_unlhsh_lock); #define netlbl_unlhsh_rcu_deref(p) \ rcu_dereference_check(p, lockdep_is_held(&netlbl_unlhsh_lock)) static struct netlbl_unlhsh_tbl __rcu *netlbl_unlhsh; static struct netlbl_unlhsh_iface __rcu *netlbl_unlhsh_def; /* Accept unlabeled packets flag */ static u8 netlabel_unlabel_acceptflg; /* NetLabel Generic NETLINK unlabeled family */ static struct genl_family netlbl_unlabel_gnl_family; /* NetLabel Netlink attribute policy */ static const struct nla_policy netlbl_unlabel_genl_policy[NLBL_UNLABEL_A_MAX + 1] = { [NLBL_UNLABEL_A_ACPTFLG] = { .type = NLA_U8 }, [NLBL_UNLABEL_A_IPV6ADDR] = { .type = NLA_BINARY, .len = sizeof(struct in6_addr) }, [NLBL_UNLABEL_A_IPV6MASK] = { .type = NLA_BINARY, .len = sizeof(struct in6_addr) }, [NLBL_UNLABEL_A_IPV4ADDR] = { .type = NLA_BINARY, .len = sizeof(struct in_addr) }, [NLBL_UNLABEL_A_IPV4MASK] = { .type = NLA_BINARY, .len = sizeof(struct in_addr) }, [NLBL_UNLABEL_A_IFACE] = { .type = NLA_NUL_STRING, .len = IFNAMSIZ - 1 }, [NLBL_UNLABEL_A_SECCTX] = { .type = NLA_BINARY } }; /* * Unlabeled Connection Hash Table Functions */ /** * netlbl_unlhsh_free_iface - Frees an interface entry from the hash table * @entry: the entry's RCU field * * Description: * This function is designed to be used as a callback to the call_rcu() * function so that memory allocated to a hash table interface entry can be * released safely. It is important to note that this function does not free * the IPv4 and IPv6 address lists contained as part of an interface entry. It * is up to the rest of the code to make sure an interface entry is only freed * once it's address lists are empty. * */ static void netlbl_unlhsh_free_iface(struct rcu_head *entry) { struct netlbl_unlhsh_iface *iface; struct netlbl_af4list *iter4; struct netlbl_af4list *tmp4; #if IS_ENABLED(CONFIG_IPV6) struct netlbl_af6list *iter6; struct netlbl_af6list *tmp6; #endif /* IPv6 */ iface = container_of(entry, struct netlbl_unlhsh_iface, rcu); /* no need for locks here since we are the only one with access to this * structure */ netlbl_af4list_foreach_safe(iter4, tmp4, &iface->addr4_list) { netlbl_af4list_remove_entry(iter4); kfree(netlbl_unlhsh_addr4_entry(iter4)); } #if IS_ENABLED(CONFIG_IPV6) netlbl_af6list_foreach_safe(iter6, tmp6, &iface->addr6_list) { netlbl_af6list_remove_entry(iter6); kfree(netlbl_unlhsh_addr6_entry(iter6)); } #endif /* IPv6 */ kfree(iface); } /** * netlbl_unlhsh_hash - Hashing function for the hash table * @ifindex: the network interface/device to hash * * Description: * This is the hashing function for the unlabeled hash table, it returns the * bucket number for the given device/interface. The caller is responsible for * ensuring that the hash table is protected with either a RCU read lock or * the hash table lock. * */ static u32 netlbl_unlhsh_hash(int ifindex) { return ifindex & (netlbl_unlhsh_rcu_deref(netlbl_unlhsh)->size - 1); } /** * netlbl_unlhsh_search_iface - Search for a matching interface entry * @ifindex: the network interface * * Description: * Searches the unlabeled connection hash table and returns a pointer to the * interface entry which matches @ifindex, otherwise NULL is returned. The * caller is responsible for ensuring that the hash table is protected with * either a RCU read lock or the hash table lock. * */ static struct netlbl_unlhsh_iface *netlbl_unlhsh_search_iface(int ifindex) { u32 bkt; struct list_head *bkt_list; struct netlbl_unlhsh_iface *iter; bkt = netlbl_unlhsh_hash(ifindex); bkt_list = &netlbl_unlhsh_rcu_deref(netlbl_unlhsh)->tbl[bkt]; list_for_each_entry_rcu(iter, bkt_list, list, lockdep_is_held(&netlbl_unlhsh_lock)) if (iter->valid && iter->ifindex == ifindex) return iter; return NULL; } /** * netlbl_unlhsh_add_addr4 - Add a new IPv4 address entry to the hash table * @iface: the associated interface entry * @addr: IPv4 address in network byte order * @mask: IPv4 address mask in network byte order * @secid: LSM secid value for entry * * Description: * Add a new address entry into the unlabeled connection hash table using the * interface entry specified by @iface. On success zero is returned, otherwise * a negative value is returned. * */ static int netlbl_unlhsh_add_addr4(struct netlbl_unlhsh_iface *iface, const struct in_addr *addr, const struct in_addr *mask, u32 secid) { int ret_val; struct netlbl_unlhsh_addr4 *entry; entry = kzalloc(sizeof(*entry), GFP_ATOMIC); if (entry == NULL) return -ENOMEM; entry->list.addr = addr->s_addr & mask->s_addr; entry->list.mask = mask->s_addr; entry->list.valid = 1; entry->secid = secid; spin_lock(&netlbl_unlhsh_lock); ret_val = netlbl_af4list_add(&entry->list, &iface->addr4_list); spin_unlock(&netlbl_unlhsh_lock); if (ret_val != 0) kfree(entry); return ret_val; } #if IS_ENABLED(CONFIG_IPV6) /** * netlbl_unlhsh_add_addr6 - Add a new IPv6 address entry to the hash table * @iface: the associated interface entry * @addr: IPv6 address in network byte order * @mask: IPv6 address mask in network byte order * @secid: LSM secid value for entry * * Description: * Add a new address entry into the unlabeled connection hash table using the * interface entry specified by @iface. On success zero is returned, otherwise * a negative value is returned. * */ static int netlbl_unlhsh_add_addr6(struct netlbl_unlhsh_iface *iface, const struct in6_addr *addr, const struct in6_addr *mask, u32 secid) { int ret_val; struct netlbl_unlhsh_addr6 *entry; entry = kzalloc(sizeof(*entry), GFP_ATOMIC); if (entry == NULL) return -ENOMEM; entry->list.addr = *addr; entry->list.addr.s6_addr32[0] &= mask->s6_addr32[0]; entry->list.addr.s6_addr32[1] &= mask->s6_addr32[1]; entry->list.addr.s6_addr32[2] &= mask->s6_addr32[2]; entry->list.addr.s6_addr32[3] &= mask->s6_addr32[3]; entry->list.mask = *mask; entry->list.valid = 1; entry->secid = secid; spin_lock(&netlbl_unlhsh_lock); ret_val = netlbl_af6list_add(&entry->list, &iface->addr6_list); spin_unlock(&netlbl_unlhsh_lock); if (ret_val != 0) kfree(entry); return 0; } #endif /* IPv6 */ /** * netlbl_unlhsh_add_iface - Adds a new interface entry to the hash table * @ifindex: network interface * * Description: * Add a new, empty, interface entry into the unlabeled connection hash table. * On success a pointer to the new interface entry is returned, on failure NULL * is returned. * */ static struct netlbl_unlhsh_iface *netlbl_unlhsh_add_iface(int ifindex) { u32 bkt; struct netlbl_unlhsh_iface *iface; iface = kzalloc(sizeof(*iface), GFP_ATOMIC); if (iface == NULL) return NULL; iface->ifindex = ifindex; INIT_LIST_HEAD(&iface->addr4_list); INIT_LIST_HEAD(&iface->addr6_list); iface->valid = 1; spin_lock(&netlbl_unlhsh_lock); if (ifindex > 0) { bkt = netlbl_unlhsh_hash(ifindex); if (netlbl_unlhsh_search_iface(ifindex) != NULL) goto add_iface_failure; list_add_tail_rcu(&iface->list, &netlbl_unlhsh_rcu_deref(netlbl_unlhsh)->tbl[bkt]); } else { INIT_LIST_HEAD(&iface->list); if (netlbl_unlhsh_rcu_deref(netlbl_unlhsh_def) != NULL) goto add_iface_failure; rcu_assign_pointer(netlbl_unlhsh_def, iface); } spin_unlock(&netlbl_unlhsh_lock); return iface; add_iface_failure: spin_unlock(&netlbl_unlhsh_lock); kfree(iface); return NULL; } /** * netlbl_unlhsh_add - Adds a new entry to the unlabeled connection hash table * @net: network namespace * @dev_name: interface name * @addr: IP address in network byte order * @mask: address mask in network byte order * @addr_len: length of address/mask (4 for IPv4, 16 for IPv6) * @secid: LSM secid value for the entry * @audit_info: NetLabel audit information * * Description: * Adds a new entry to the unlabeled connection hash table. Returns zero on * success, negative values on failure. * */ int netlbl_unlhsh_add(struct net *net, const char *dev_name, const void *addr, const void *mask, u32 addr_len, u32 secid, struct netlbl_audit *audit_info) { int ret_val; int ifindex; struct net_device *dev; struct netlbl_unlhsh_iface *iface; struct audit_buffer *audit_buf = NULL; struct lsm_context ctx; if (addr_len != sizeof(struct in_addr) && addr_len != sizeof(struct in6_addr)) return -EINVAL; rcu_read_lock(); if (dev_name != NULL) { dev = dev_get_by_name_rcu(net, dev_name); if (dev == NULL) { ret_val = -ENODEV; goto unlhsh_add_return; } ifindex = dev->ifindex; iface = netlbl_unlhsh_search_iface(ifindex); } else { ifindex = 0; iface = rcu_dereference(netlbl_unlhsh_def); } if (iface == NULL) { iface = netlbl_unlhsh_add_iface(ifindex); if (iface == NULL) { ret_val = -ENOMEM; goto unlhsh_add_return; } } audit_buf = netlbl_audit_start_common(AUDIT_MAC_UNLBL_STCADD, audit_info); switch (addr_len) { case sizeof(struct in_addr): { const struct in_addr *addr4 = addr; const struct in_addr *mask4 = mask; ret_val = netlbl_unlhsh_add_addr4(iface, addr4, mask4, secid); if (audit_buf != NULL) netlbl_af4list_audit_addr(audit_buf, 1, dev_name, addr4->s_addr, mask4->s_addr); break; } #if IS_ENABLED(CONFIG_IPV6) case sizeof(struct in6_addr): { const struct in6_addr *addr6 = addr; const struct in6_addr *mask6 = mask; ret_val = netlbl_unlhsh_add_addr6(iface, addr6, mask6, secid); if (audit_buf != NULL) netlbl_af6list_audit_addr(audit_buf, 1, dev_name, addr6, mask6); break; } #endif /* IPv6 */ default: ret_val = -EINVAL; } if (ret_val == 0) atomic_inc(&netlabel_mgmt_protocount); unlhsh_add_return: rcu_read_unlock(); if (audit_buf != NULL) { if (security_secid_to_secctx(secid, &ctx) >= 0) { audit_log_format(audit_buf, " sec_obj=%s", ctx.context); security_release_secctx(&ctx); } audit_log_format(audit_buf, " res=%u", ret_val == 0 ? 1 : 0); audit_log_end(audit_buf); } return ret_val; } /** * netlbl_unlhsh_remove_addr4 - Remove an IPv4 address entry * @net: network namespace * @iface: interface entry * @addr: IP address * @mask: IP address mask * @audit_info: NetLabel audit information * * Description: * Remove an IP address entry from the unlabeled connection hash table. * Returns zero on success, negative values on failure. * */ static int netlbl_unlhsh_remove_addr4(struct net *net, struct netlbl_unlhsh_iface *iface, const struct in_addr *addr, const struct in_addr *mask, struct netlbl_audit *audit_info) { struct netlbl_af4list *list_entry; struct netlbl_unlhsh_addr4 *entry; struct audit_buffer *audit_buf; struct net_device *dev; struct lsm_context ctx; spin_lock(&netlbl_unlhsh_lock); list_entry = netlbl_af4list_remove(addr->s_addr, mask->s_addr, &iface->addr4_list); spin_unlock(&netlbl_unlhsh_lock); if (list_entry != NULL) entry = netlbl_unlhsh_addr4_entry(list_entry); else entry = NULL; audit_buf = netlbl_audit_start_common(AUDIT_MAC_UNLBL_STCDEL, audit_info); if (audit_buf != NULL) { dev = dev_get_by_index(net, iface->ifindex); netlbl_af4list_audit_addr(audit_buf, 1, (dev != NULL ? dev->name : NULL), addr->s_addr, mask->s_addr); dev_put(dev); if (entry != NULL && security_secid_to_secctx(entry->secid, &ctx) >= 0) { audit_log_format(audit_buf, " sec_obj=%s", ctx.context); security_release_secctx(&ctx); } audit_log_format(audit_buf, " res=%u", entry != NULL ? 1 : 0); audit_log_end(audit_buf); } if (entry == NULL) return -ENOENT; kfree_rcu(entry, rcu); return 0; } #if IS_ENABLED(CONFIG_IPV6) /** * netlbl_unlhsh_remove_addr6 - Remove an IPv6 address entry * @net: network namespace * @iface: interface entry * @addr: IP address * @mask: IP address mask * @audit_info: NetLabel audit information * * Description: * Remove an IP address entry from the unlabeled connection hash table. * Returns zero on success, negative values on failure. * */ static int netlbl_unlhsh_remove_addr6(struct net *net, struct netlbl_unlhsh_iface *iface, const struct in6_addr *addr, const struct in6_addr *mask, struct netlbl_audit *audit_info) { struct netlbl_af6list *list_entry; struct netlbl_unlhsh_addr6 *entry; struct audit_buffer *audit_buf; struct net_device *dev; struct lsm_context ctx; spin_lock(&netlbl_unlhsh_lock); list_entry = netlbl_af6list_remove(addr, mask, &iface->addr6_list); spin_unlock(&netlbl_unlhsh_lock); if (list_entry != NULL) entry = netlbl_unlhsh_addr6_entry(list_entry); else entry = NULL; audit_buf = netlbl_audit_start_common(AUDIT_MAC_UNLBL_STCDEL, audit_info); if (audit_buf != NULL) { dev = dev_get_by_index(net, iface->ifindex); netlbl_af6list_audit_addr(audit_buf, 1, (dev != NULL ? dev->name : NULL), addr, mask); dev_put(dev); if (entry != NULL && security_secid_to_secctx(entry->secid, &ctx) >= 0) { audit_log_format(audit_buf, " sec_obj=%s", ctx.context); security_release_secctx(&ctx); } audit_log_format(audit_buf, " res=%u", entry != NULL ? 1 : 0); audit_log_end(audit_buf); } if (entry == NULL) return -ENOENT; kfree_rcu(entry, rcu); return 0; } #endif /* IPv6 */ /** * netlbl_unlhsh_condremove_iface - Remove an interface entry * @iface: the interface entry * * Description: * Remove an interface entry from the unlabeled connection hash table if it is * empty. An interface entry is considered to be empty if there are no * address entries assigned to it. * */ static void netlbl_unlhsh_condremove_iface(struct netlbl_unlhsh_iface *iface) { struct netlbl_af4list *iter4; #if IS_ENABLED(CONFIG_IPV6) struct netlbl_af6list *iter6; #endif /* IPv6 */ spin_lock(&netlbl_unlhsh_lock); netlbl_af4list_foreach_rcu(iter4, &iface->addr4_list) goto unlhsh_condremove_failure; #if IS_ENABLED(CONFIG_IPV6) netlbl_af6list_foreach_rcu(iter6, &iface->addr6_list) goto unlhsh_condremove_failure; #endif /* IPv6 */ iface->valid = 0; if (iface->ifindex > 0) list_del_rcu(&iface->list); else RCU_INIT_POINTER(netlbl_unlhsh_def, NULL); spin_unlock(&netlbl_unlhsh_lock); call_rcu(&iface->rcu, netlbl_unlhsh_free_iface); return; unlhsh_condremove_failure: spin_unlock(&netlbl_unlhsh_lock); } /** * netlbl_unlhsh_remove - Remove an entry from the unlabeled hash table * @net: network namespace * @dev_name: interface name * @addr: IP address in network byte order * @mask: address mask in network byte order * @addr_len: length of address/mask (4 for IPv4, 16 for IPv6) * @audit_info: NetLabel audit information * * Description: * Removes and existing entry from the unlabeled connection hash table. * Returns zero on success, negative values on failure. * */ int netlbl_unlhsh_remove(struct net *net, const char *dev_name, const void *addr, const void *mask, u32 addr_len, struct netlbl_audit *audit_info) { int ret_val; struct net_device *dev; struct netlbl_unlhsh_iface *iface; if (addr_len != sizeof(struct in_addr) && addr_len != sizeof(struct in6_addr)) return -EINVAL; rcu_read_lock(); if (dev_name != NULL) { dev = dev_get_by_name_rcu(net, dev_name); if (dev == NULL) { ret_val = -ENODEV; goto unlhsh_remove_return; } iface = netlbl_unlhsh_search_iface(dev->ifindex); } else iface = rcu_dereference(netlbl_unlhsh_def); if (iface == NULL) { ret_val = -ENOENT; goto unlhsh_remove_return; } switch (addr_len) { case sizeof(struct in_addr): ret_val = netlbl_unlhsh_remove_addr4(net, iface, addr, mask, audit_info); break; #if IS_ENABLED(CONFIG_IPV6) case sizeof(struct in6_addr): ret_val = netlbl_unlhsh_remove_addr6(net, iface, addr, mask, audit_info); break; #endif /* IPv6 */ default: ret_val = -EINVAL; } if (ret_val == 0) { netlbl_unlhsh_condremove_iface(iface); atomic_dec(&netlabel_mgmt_protocount); } unlhsh_remove_return: rcu_read_unlock(); return ret_val; } /* * General Helper Functions */ /** * netlbl_unlhsh_netdev_handler - Network device notification handler * @this: notifier block * @event: the event * @ptr: the netdevice notifier info (cast to void) * * Description: * Handle network device events, although at present all we care about is a * network device going away. In the case of a device going away we clear any * related entries from the unlabeled connection hash table. * */ static int netlbl_unlhsh_netdev_handler(struct notifier_block *this, unsigned long event, void *ptr) { struct net_device *dev = netdev_notifier_info_to_dev(ptr); struct netlbl_unlhsh_iface *iface = NULL; if (!net_eq(dev_net(dev), &init_net)) return NOTIFY_DONE; /* XXX - should this be a check for NETDEV_DOWN or _UNREGISTER? */ if (event == NETDEV_DOWN) { spin_lock(&netlbl_unlhsh_lock); iface = netlbl_unlhsh_search_iface(dev->ifindex); if (iface != NULL && iface->valid) { iface->valid = 0; list_del_rcu(&iface->list); } else iface = NULL; spin_unlock(&netlbl_unlhsh_lock); } if (iface != NULL) call_rcu(&iface->rcu, netlbl_unlhsh_free_iface); return NOTIFY_DONE; } /** * netlbl_unlabel_acceptflg_set - Set the unlabeled accept flag * @value: desired value * @audit_info: NetLabel audit information * * Description: * Set the value of the unlabeled accept flag to @value. * */ static void netlbl_unlabel_acceptflg_set(u8 value, struct netlbl_audit *audit_info) { struct audit_buffer *audit_buf; u8 old_val; old_val = netlabel_unlabel_acceptflg; netlabel_unlabel_acceptflg = value; audit_buf = netlbl_audit_start_common(AUDIT_MAC_UNLBL_ALLOW, audit_info); if (audit_buf != NULL) { audit_log_format(audit_buf, " unlbl_accept=%u old=%u", value, old_val); audit_log_end(audit_buf); } } /** * netlbl_unlabel_addrinfo_get - Get the IPv4/6 address information * @info: the Generic NETLINK info block * @addr: the IP address * @mask: the IP address mask * @len: the address length * * Description: * Examine the Generic NETLINK message and extract the IP address information. * Returns zero on success, negative values on failure. * */ static int netlbl_unlabel_addrinfo_get(struct genl_info *info, void **addr, void **mask, u32 *len) { u32 addr_len; if (info->attrs[NLBL_UNLABEL_A_IPV4ADDR] && info->attrs[NLBL_UNLABEL_A_IPV4MASK]) { addr_len = nla_len(info->attrs[NLBL_UNLABEL_A_IPV4ADDR]); if (addr_len != sizeof(struct in_addr) && addr_len != nla_len(info->attrs[NLBL_UNLABEL_A_IPV4MASK])) return -EINVAL; *len = addr_len; *addr = nla_data(info->attrs[NLBL_UNLABEL_A_IPV4ADDR]); *mask = nla_data(info->attrs[NLBL_UNLABEL_A_IPV4MASK]); return 0; } else if (info->attrs[NLBL_UNLABEL_A_IPV6ADDR]) { addr_len = nla_len(info->attrs[NLBL_UNLABEL_A_IPV6ADDR]); if (addr_len != sizeof(struct in6_addr) && addr_len != nla_len(info->attrs[NLBL_UNLABEL_A_IPV6MASK])) return -EINVAL; *len = addr_len; *addr = nla_data(info->attrs[NLBL_UNLABEL_A_IPV6ADDR]); *mask = nla_data(info->attrs[NLBL_UNLABEL_A_IPV6MASK]); return 0; } return -EINVAL; } /* * NetLabel Command Handlers */ /** * netlbl_unlabel_accept - Handle an ACCEPT message * @skb: the NETLINK buffer * @info: the Generic NETLINK info block * * Description: * Process a user generated ACCEPT message and set the accept flag accordingly. * Returns zero on success, negative values on failure. * */ static int netlbl_unlabel_accept(struct sk_buff *skb, struct genl_info *info) { u8 value; struct netlbl_audit audit_info; if (info->attrs[NLBL_UNLABEL_A_ACPTFLG]) { value = nla_get_u8(info->attrs[NLBL_UNLABEL_A_ACPTFLG]); if (value == 1 || value == 0) { netlbl_netlink_auditinfo(&audit_info); netlbl_unlabel_acceptflg_set(value, &audit_info); return 0; } } return -EINVAL; } /** * netlbl_unlabel_list - Handle a LIST message * @skb: the NETLINK buffer * @info: the Generic NETLINK info block * * Description: * Process a user generated LIST message and respond with the current status. * Returns zero on success, negative values on failure. * */ static int netlbl_unlabel_list(struct sk_buff *skb, struct genl_info *info) { int ret_val = -EINVAL; struct sk_buff *ans_skb; void *data; ans_skb = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL); if (ans_skb == NULL) goto list_failure; data = genlmsg_put_reply(ans_skb, info, &netlbl_unlabel_gnl_family, 0, NLBL_UNLABEL_C_LIST); if (data == NULL) { ret_val = -ENOMEM; goto list_failure; } ret_val = nla_put_u8(ans_skb, NLBL_UNLABEL_A_ACPTFLG, netlabel_unlabel_acceptflg); if (ret_val != 0) goto list_failure; genlmsg_end(ans_skb, data); return genlmsg_reply(ans_skb, info); list_failure: kfree_skb(ans_skb); return ret_val; } /** * netlbl_unlabel_staticadd - Handle a STATICADD message * @skb: the NETLINK buffer * @info: the Generic NETLINK info block * * Description: * Process a user generated STATICADD message and add a new unlabeled * connection entry to the hash table. Returns zero on success, negative * values on failure. * */ static int netlbl_unlabel_staticadd(struct sk_buff *skb, struct genl_info *info) { int ret_val; char *dev_name; void *addr; void *mask; u32 addr_len; u32 secid; struct netlbl_audit audit_info; /* Don't allow users to add both IPv4 and IPv6 addresses for a * single entry. However, allow users to create two entries, one each * for IPv4 and IPv6, with the same LSM security context which should * achieve the same result. */ if (!info->attrs[NLBL_UNLABEL_A_SECCTX] || !info->attrs[NLBL_UNLABEL_A_IFACE] || !((!info->attrs[NLBL_UNLABEL_A_IPV4ADDR] || !info->attrs[NLBL_UNLABEL_A_IPV4MASK]) ^ (!info->attrs[NLBL_UNLABEL_A_IPV6ADDR] || !info->attrs[NLBL_UNLABEL_A_IPV6MASK]))) return -EINVAL; netlbl_netlink_auditinfo(&audit_info); ret_val = netlbl_unlabel_addrinfo_get(info, &addr, &mask, &addr_len); if (ret_val != 0) return ret_val; dev_name = nla_data(info->attrs[NLBL_UNLABEL_A_IFACE]); ret_val = security_secctx_to_secid( nla_data(info->attrs[NLBL_UNLABEL_A_SECCTX]), nla_len(info->attrs[NLBL_UNLABEL_A_SECCTX]), &secid); if (ret_val != 0) return ret_val; return netlbl_unlhsh_add(&init_net, dev_name, addr, mask, addr_len, secid, &audit_info); } /** * netlbl_unlabel_staticadddef - Handle a STATICADDDEF message * @skb: the NETLINK buffer * @info: the Generic NETLINK info block * * Description: * Process a user generated STATICADDDEF message and add a new default * unlabeled connection entry. Returns zero on success, negative values on * failure. * */ static int netlbl_unlabel_staticadddef(struct sk_buff *skb, struct genl_info *info) { int ret_val; void *addr; void *mask; u32 addr_len; u32 secid; struct netlbl_audit audit_info; /* Don't allow users to add both IPv4 and IPv6 addresses for a * single entry. However, allow users to create two entries, one each * for IPv4 and IPv6, with the same LSM security context which should * achieve the same result. */ if (!info->attrs[NLBL_UNLABEL_A_SECCTX] || !((!info->attrs[NLBL_UNLABEL_A_IPV4ADDR] || !info->attrs[NLBL_UNLABEL_A_IPV4MASK]) ^ (!info->attrs[NLBL_UNLABEL_A_IPV6ADDR] || !info->attrs[NLBL_UNLABEL_A_IPV6MASK]))) return -EINVAL; netlbl_netlink_auditinfo(&audit_info); ret_val = netlbl_unlabel_addrinfo_get(info, &addr, &mask, &addr_len); if (ret_val != 0) return ret_val; ret_val = security_secctx_to_secid( nla_data(info->attrs[NLBL_UNLABEL_A_SECCTX]), nla_len(info->attrs[NLBL_UNLABEL_A_SECCTX]), &secid); if (ret_val != 0) return ret_val; return netlbl_unlhsh_add(&init_net, NULL, addr, mask, addr_len, secid, &audit_info); } /** * netlbl_unlabel_staticremove - Handle a STATICREMOVE message * @skb: the NETLINK buffer * @info: the Generic NETLINK info block * * Description: * Process a user generated STATICREMOVE message and remove the specified * unlabeled connection entry. Returns zero on success, negative values on * failure. * */ static int netlbl_unlabel_staticremove(struct sk_buff *skb, struct genl_info *info) { int ret_val; char *dev_name; void *addr; void *mask; u32 addr_len; struct netlbl_audit audit_info; /* See the note in netlbl_unlabel_staticadd() about not allowing both * IPv4 and IPv6 in the same entry. */ if (!info->attrs[NLBL_UNLABEL_A_IFACE] || !((!info->attrs[NLBL_UNLABEL_A_IPV4ADDR] || !info->attrs[NLBL_UNLABEL_A_IPV4MASK]) ^ (!info->attrs[NLBL_UNLABEL_A_IPV6ADDR] || !info->attrs[NLBL_UNLABEL_A_IPV6MASK]))) return -EINVAL; netlbl_netlink_auditinfo(&audit_info); ret_val = netlbl_unlabel_addrinfo_get(info, &addr, &mask, &addr_len); if (ret_val != 0) return ret_val; dev_name = nla_data(info->attrs[NLBL_UNLABEL_A_IFACE]); return netlbl_unlhsh_remove(&init_net, dev_name, addr, mask, addr_len, &audit_info); } /** * netlbl_unlabel_staticremovedef - Handle a STATICREMOVEDEF message * @skb: the NETLINK buffer * @info: the Generic NETLINK info block * * Description: * Process a user generated STATICREMOVEDEF message and remove the default * unlabeled connection entry. Returns zero on success, negative values on * failure. * */ static int netlbl_unlabel_staticremovedef(struct sk_buff *skb, struct genl_info *info) { int ret_val; void *addr; void *mask; u32 addr_len; struct netlbl_audit audit_info; /* See the note in netlbl_unlabel_staticadd() about not allowing both * IPv4 and IPv6 in the same entry. */ if (!((!info->attrs[NLBL_UNLABEL_A_IPV4ADDR] || !info->attrs[NLBL_UNLABEL_A_IPV4MASK]) ^ (!info->attrs[NLBL_UNLABEL_A_IPV6ADDR] || !info->attrs[NLBL_UNLABEL_A_IPV6MASK]))) return -EINVAL; netlbl_netlink_auditinfo(&audit_info); ret_val = netlbl_unlabel_addrinfo_get(info, &addr, &mask, &addr_len); if (ret_val != 0) return ret_val; return netlbl_unlhsh_remove(&init_net, NULL, addr, mask, addr_len, &audit_info); } /** * netlbl_unlabel_staticlist_gen - Generate messages for STATICLIST[DEF] * @cmd: command/message * @iface: the interface entry * @addr4: the IPv4 address entry * @addr6: the IPv6 address entry * @arg: the netlbl_unlhsh_walk_arg structure * * Description: * This function is designed to be used to generate a response for a * STATICLIST or STATICLISTDEF message. When called either @addr4 or @addr6 * can be specified, not both, the other unspecified entry should be set to * NULL by the caller. Returns the size of the message on success, negative * values on failure. * */ static int netlbl_unlabel_staticlist_gen(u32 cmd, const struct netlbl_unlhsh_iface *iface, const struct netlbl_unlhsh_addr4 *addr4, const struct netlbl_unlhsh_addr6 *addr6, void *arg) { int ret_val = -ENOMEM; struct netlbl_unlhsh_walk_arg *cb_arg = arg; struct net_device *dev; struct lsm_context ctx; void *data; u32 secid; data = genlmsg_put(cb_arg->skb, NETLINK_CB(cb_arg->nl_cb->skb).portid, cb_arg->seq, &netlbl_unlabel_gnl_family, NLM_F_MULTI, cmd); if (data == NULL) goto list_cb_failure; if (iface->ifindex > 0) { dev = dev_get_by_index(&init_net, iface->ifindex); if (!dev) { ret_val = -ENODEV; goto list_cb_failure; } ret_val = nla_put_string(cb_arg->skb, NLBL_UNLABEL_A_IFACE, dev->name); dev_put(dev); if (ret_val != 0) goto list_cb_failure; } if (addr4) { struct in_addr addr_struct; addr_struct.s_addr = addr4->list.addr; ret_val = nla_put_in_addr(cb_arg->skb, NLBL_UNLABEL_A_IPV4ADDR, addr_struct.s_addr); if (ret_val != 0) goto list_cb_failure; addr_struct.s_addr = addr4->list.mask; ret_val = nla_put_in_addr(cb_arg->skb, NLBL_UNLABEL_A_IPV4MASK, addr_struct.s_addr); if (ret_val != 0) goto list_cb_failure; secid = addr4->secid; } else { ret_val = nla_put_in6_addr(cb_arg->skb, NLBL_UNLABEL_A_IPV6ADDR, &addr6->list.addr); if (ret_val != 0) goto list_cb_failure; ret_val = nla_put_in6_addr(cb_arg->skb, NLBL_UNLABEL_A_IPV6MASK, &addr6->list.mask); if (ret_val != 0) goto list_cb_failure; secid = addr6->secid; } ret_val = security_secid_to_secctx(secid, &ctx); if (ret_val < 0) goto list_cb_failure; ret_val = nla_put(cb_arg->skb, NLBL_UNLABEL_A_SECCTX, ctx.len, ctx.context); security_release_secctx(&ctx); if (ret_val != 0) goto list_cb_failure; cb_arg->seq++; genlmsg_end(cb_arg->skb, data); return 0; list_cb_failure: genlmsg_cancel(cb_arg->skb, data); return ret_val; } /** * netlbl_unlabel_staticlist - Handle a STATICLIST message * @skb: the NETLINK buffer * @cb: the NETLINK callback * * Description: * Process a user generated STATICLIST message and dump the unlabeled * connection hash table in a form suitable for use in a kernel generated * STATICLIST message. Returns the length of @skb. * */ static int netlbl_unlabel_staticlist(struct sk_buff *skb, struct netlink_callback *cb) { struct netlbl_unlhsh_walk_arg cb_arg; u32 skip_bkt = cb->args[0]; u32 skip_chain = cb->args[1]; u32 skip_addr4 = cb->args[2]; u32 iter_bkt, iter_chain = 0, iter_addr4 = 0, iter_addr6 = 0; struct netlbl_unlhsh_iface *iface; struct list_head *iter_list; struct netlbl_af4list *addr4; #if IS_ENABLED(CONFIG_IPV6) u32 skip_addr6 = cb->args[3]; struct netlbl_af6list *addr6; #endif cb_arg.nl_cb = cb; cb_arg.skb = skb; cb_arg.seq = cb->nlh->nlmsg_seq; rcu_read_lock(); for (iter_bkt = skip_bkt; iter_bkt < rcu_dereference(netlbl_unlhsh)->size; iter_bkt++) { iter_list = &rcu_dereference(netlbl_unlhsh)->tbl[iter_bkt]; list_for_each_entry_rcu(iface, iter_list, list) { if (!iface->valid || iter_chain++ < skip_chain) continue; netlbl_af4list_foreach_rcu(addr4, &iface->addr4_list) { if (iter_addr4++ < skip_addr4) continue; if (netlbl_unlabel_staticlist_gen( NLBL_UNLABEL_C_STATICLIST, iface, netlbl_unlhsh_addr4_entry(addr4), NULL, &cb_arg) < 0) { iter_addr4--; iter_chain--; goto unlabel_staticlist_return; } } iter_addr4 = 0; skip_addr4 = 0; #if IS_ENABLED(CONFIG_IPV6) netlbl_af6list_foreach_rcu(addr6, &iface->addr6_list) { if (iter_addr6++ < skip_addr6) continue; if (netlbl_unlabel_staticlist_gen( NLBL_UNLABEL_C_STATICLIST, iface, NULL, netlbl_unlhsh_addr6_entry(addr6), &cb_arg) < 0) { iter_addr6--; iter_chain--; goto unlabel_staticlist_return; } } iter_addr6 = 0; skip_addr6 = 0; #endif /* IPv6 */ } iter_chain = 0; skip_chain = 0; } unlabel_staticlist_return: rcu_read_unlock(); cb->args[0] = iter_bkt; cb->args[1] = iter_chain; cb->args[2] = iter_addr4; cb->args[3] = iter_addr6; return skb->len; } /** * netlbl_unlabel_staticlistdef - Handle a STATICLISTDEF message * @skb: the NETLINK buffer * @cb: the NETLINK callback * * Description: * Process a user generated STATICLISTDEF message and dump the default * unlabeled connection entry in a form suitable for use in a kernel generated * STATICLISTDEF message. Returns the length of @skb. * */ static int netlbl_unlabel_staticlistdef(struct sk_buff *skb, struct netlink_callback *cb) { struct netlbl_unlhsh_walk_arg cb_arg; struct netlbl_unlhsh_iface *iface; u32 iter_addr4 = 0, iter_addr6 = 0; struct netlbl_af4list *addr4; #if IS_ENABLED(CONFIG_IPV6) struct netlbl_af6list *addr6; #endif cb_arg.nl_cb = cb; cb_arg.skb = skb; cb_arg.seq = cb->nlh->nlmsg_seq; rcu_read_lock(); iface = rcu_dereference(netlbl_unlhsh_def); if (iface == NULL || !iface->valid) goto unlabel_staticlistdef_return; netlbl_af4list_foreach_rcu(addr4, &iface->addr4_list) { if (iter_addr4++ < cb->args[0]) continue; if (netlbl_unlabel_staticlist_gen(NLBL_UNLABEL_C_STATICLISTDEF, iface, netlbl_unlhsh_addr4_entry(addr4), NULL, &cb_arg) < 0) { iter_addr4--; goto unlabel_staticlistdef_return; } } #if IS_ENABLED(CONFIG_IPV6) netlbl_af6list_foreach_rcu(addr6, &iface->addr6_list) { if (iter_addr6++ < cb->args[1]) continue; if (netlbl_unlabel_staticlist_gen(NLBL_UNLABEL_C_STATICLISTDEF, iface, NULL, netlbl_unlhsh_addr6_entry(addr6), &cb_arg) < 0) { iter_addr6--; goto unlabel_staticlistdef_return; } } #endif /* IPv6 */ unlabel_staticlistdef_return: rcu_read_unlock(); cb->args[0] = iter_addr4; cb->args[1] = iter_addr6; return skb->len; } /* * NetLabel Generic NETLINK Command Definitions */ static const struct genl_small_ops netlbl_unlabel_genl_ops[] = { { .cmd = NLBL_UNLABEL_C_STATICADD, .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, .flags = GENL_ADMIN_PERM, .doit = netlbl_unlabel_staticadd, .dumpit = NULL, }, { .cmd = NLBL_UNLABEL_C_STATICREMOVE, .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, .flags = GENL_ADMIN_PERM, .doit = netlbl_unlabel_staticremove, .dumpit = NULL, }, { .cmd = NLBL_UNLABEL_C_STATICLIST, .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, .flags = 0, .doit = NULL, .dumpit = netlbl_unlabel_staticlist, }, { .cmd = NLBL_UNLABEL_C_STATICADDDEF, .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, .flags = GENL_ADMIN_PERM, .doit = netlbl_unlabel_staticadddef, .dumpit = NULL, }, { .cmd = NLBL_UNLABEL_C_STATICREMOVEDEF, .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, .flags = GENL_ADMIN_PERM, .doit = netlbl_unlabel_staticremovedef, .dumpit = NULL, }, { .cmd = NLBL_UNLABEL_C_STATICLISTDEF, .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, .flags = 0, .doit = NULL, .dumpit = netlbl_unlabel_staticlistdef, }, { .cmd = NLBL_UNLABEL_C_ACCEPT, .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, .flags = GENL_ADMIN_PERM, .doit = netlbl_unlabel_accept, .dumpit = NULL, }, { .cmd = NLBL_UNLABEL_C_LIST, .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, .flags = 0, .doit = netlbl_unlabel_list, .dumpit = NULL, }, }; static struct genl_family netlbl_unlabel_gnl_family __ro_after_init = { .hdrsize = 0, .name = NETLBL_NLTYPE_UNLABELED_NAME, .version = NETLBL_PROTO_VERSION, .maxattr = NLBL_UNLABEL_A_MAX, .policy = netlbl_unlabel_genl_policy, .module = THIS_MODULE, .small_ops = netlbl_unlabel_genl_ops, .n_small_ops = ARRAY_SIZE(netlbl_unlabel_genl_ops), .resv_start_op = NLBL_UNLABEL_C_STATICLISTDEF + 1, }; /* * NetLabel Generic NETLINK Protocol Functions */ /** * netlbl_unlabel_genl_init - Register the Unlabeled NetLabel component * * Description: * Register the unlabeled packet NetLabel component with the Generic NETLINK * mechanism. Returns zero on success, negative values on failure. * */ int __init netlbl_unlabel_genl_init(void) { return genl_register_family(&netlbl_unlabel_gnl_family); } /* * NetLabel KAPI Hooks */ static struct notifier_block netlbl_unlhsh_netdev_notifier = { .notifier_call = netlbl_unlhsh_netdev_handler, }; /** * netlbl_unlabel_init - Initialize the unlabeled connection hash table * @size: the number of bits to use for the hash buckets * * Description: * Initializes the unlabeled connection hash table and registers a network * device notification handler. This function should only be called by the * NetLabel subsystem itself during initialization. Returns zero on success, * non-zero values on error. * */ int __init netlbl_unlabel_init(u32 size) { u32 iter; struct netlbl_unlhsh_tbl *hsh_tbl; if (size == 0) return -EINVAL; hsh_tbl = kmalloc(sizeof(*hsh_tbl), GFP_KERNEL); if (hsh_tbl == NULL) return -ENOMEM; hsh_tbl->size = 1 << size; hsh_tbl->tbl = kcalloc(hsh_tbl->size, sizeof(struct list_head), GFP_KERNEL); if (hsh_tbl->tbl == NULL) { kfree(hsh_tbl); return -ENOMEM; } for (iter = 0; iter < hsh_tbl->size; iter++) INIT_LIST_HEAD(&hsh_tbl->tbl[iter]); spin_lock(&netlbl_unlhsh_lock); rcu_assign_pointer(netlbl_unlhsh, hsh_tbl); spin_unlock(&netlbl_unlhsh_lock); register_netdevice_notifier(&netlbl_unlhsh_netdev_notifier); return 0; } /** * netlbl_unlabel_getattr - Get the security attributes for an unlabled packet * @skb: the packet * @family: protocol family * @secattr: the security attributes * * Description: * Determine the security attributes, if any, for an unlabled packet and return * them in @secattr. Returns zero on success and negative values on failure. * */ int netlbl_unlabel_getattr(const struct sk_buff *skb, u16 family, struct netlbl_lsm_secattr *secattr) { struct netlbl_unlhsh_iface *iface; rcu_read_lock(); iface = netlbl_unlhsh_search_iface(skb->skb_iif); if (iface == NULL) iface = rcu_dereference(netlbl_unlhsh_def); if (iface == NULL || !iface->valid) goto unlabel_getattr_nolabel; #if IS_ENABLED(CONFIG_IPV6) /* When resolving a fallback label, check the sk_buff version as * it is possible (e.g. SCTP) to have family = PF_INET6 while * receiving ip_hdr(skb)->version = 4. */ if (family == PF_INET6 && ip_hdr(skb)->version == 4) family = PF_INET; #endif /* IPv6 */ switch (family) { case PF_INET: { struct iphdr *hdr4; struct netlbl_af4list *addr4; hdr4 = ip_hdr(skb); addr4 = netlbl_af4list_search(hdr4->saddr, &iface->addr4_list); if (addr4 == NULL) goto unlabel_getattr_nolabel; secattr->attr.secid = netlbl_unlhsh_addr4_entry(addr4)->secid; break; } #if IS_ENABLED(CONFIG_IPV6) case PF_INET6: { struct ipv6hdr *hdr6; struct netlbl_af6list *addr6; hdr6 = ipv6_hdr(skb); addr6 = netlbl_af6list_search(&hdr6->saddr, &iface->addr6_list); if (addr6 == NULL) goto unlabel_getattr_nolabel; secattr->attr.secid = netlbl_unlhsh_addr6_entry(addr6)->secid; break; } #endif /* IPv6 */ default: goto unlabel_getattr_nolabel; } rcu_read_unlock(); secattr->flags |= NETLBL_SECATTR_SECID; secattr->type = NETLBL_NLTYPE_UNLABELED; return 0; unlabel_getattr_nolabel: rcu_read_unlock(); if (netlabel_unlabel_acceptflg == 0) return -ENOMSG; secattr->type = NETLBL_NLTYPE_UNLABELED; return 0; } /** * netlbl_unlabel_defconf - Set the default config to allow unlabeled packets * * Description: * Set the default NetLabel configuration to allow incoming unlabeled packets * and to send unlabeled network traffic by default. * */ int __init netlbl_unlabel_defconf(void) { int ret_val; struct netlbl_dom_map *entry; struct netlbl_audit audit_info; /* Only the kernel is allowed to call this function and the only time * it is called is at bootup before the audit subsystem is reporting * messages so don't worry to much about these values. */ security_current_getlsmprop_subj(&audit_info.prop); audit_info.loginuid = GLOBAL_ROOT_UID; audit_info.sessionid = 0; entry = kzalloc(sizeof(*entry), GFP_KERNEL); if (entry == NULL) return -ENOMEM; entry->family = AF_UNSPEC; entry->def.type = NETLBL_NLTYPE_UNLABELED; ret_val = netlbl_domhsh_add_default(entry, &audit_info); if (ret_val != 0) return ret_val; netlbl_unlabel_acceptflg_set(1, &audit_info); return 0; }
6 6 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 1517 1518 1519 1520 1521 1522 1523 1524 1525 1526 1527 1528 1529 1530 1531 1532 1533 1534 1535 1536 1537 1538 1539 1540 1541 1542 1543 1544 1545 1546 1547 1548 1549 1550 1551 1552 1553 1554 1555 1556 1557 1558 1559 1560 1561 1562 1563 1564 1565 1566 1567 1568 1569 1570 1571 1572 1573 1574 1575 1576 1577 1578 1579 1580 1581 1582 1583 1584 1585 1586 1587 1588 1589 1590 1591 1592 1593 1594 1595 1596 1597 1598 1599 1600 1601 1602 1603 1604 1605 1606 1607 1608 1609 1610 1611 1612 1613 1614 1615 1616 1617 1618 1619 1620 1621 1622 1623 1624 1625 1626 1627 1628 1629 1630 1631 1632 1633 1634 1635 1636 1637 1638 1639 1640 1641 1642 1643 1644 1645 1646 1647 1648 1649 1650 1651 1652 1653 1654 1655 1656 1657 1658 1659 1660 1661 1662 1663 1664 1665 1666 1667 1668 1669 1670 1671 1672 1673 1674 1675 1676 1677 1678 1679 1680 1681 1682 1683 1684 1685 1686 1687 1688 1689 1690 1691 1692 1693 1694 1695 1696 1697 1698 1699 1700 1701 1702 1703 1704 1705 1706 1707 1708 1709 1710 1711 1712 1713 1714 1715 1716 1717 1718 1719 1720 1721 1722 1723 1724 1725 1726 1727 1728 1729 1730 1731 1732 1733 1734 1735 1736 1737 1738 1739 1740 1741 1742 1743 1744 1745 1746 1747 1748 1749 1750 1751 1752 1753 1754 1755 1756 1757 1758 1759 1760 1761 1762 1763 1764 1765 1766 1767 1768 1769 1770 1771 1772 1773 1774 1775 1776 1777 1778 1779 1780 1781 1782 1783 1784 1785 1786 1787 1788 1789 1790 1791 1792 1793 1794 1795 1796 1797 1798 1799 1800 1801 1802 1803 1804 1805 1806 1807 1808 1809 1810 1811 1812 1813 1814 1815 1816 1817 1818 1819 1820 1821 1822 1823 1824 1825 1826 1827 1828 1829 1830 1831 1832 1833 1834 1835 1836 1837 1838 1839 1840 1841 1842 1843 1844 1845 1846 1847 1848 1849 1850 1851 1852 1853 1854 1855 1856 1857 1858 1859 1860 1861 1862 1863 1864 1865 1866 1867 1868 1869 1870 1871 1872 1873 1874 1875 1876 1877 1878 1879 1880 1881 1882 1883 1884 1885 1886 1887 1888 1889 1890 1891 1892 1893 1894 1895 1896 1897 1898 1899 1900 1901 1902 1903 1904 1905 1906 1907 1908 1909 1910 1911 1912 1913 1914 1915 1916 1917 1918 1919 1920 1921 1922 1923 1924 1925 1926 1927 1928 1929 1930 1931 1932 1933 1934 1935 1936 1937 1938 1939 1940 1941 1942 1943 1944 1945 1946 1947 1948 1949 1950 1951 1952 1953 1954 1955 1956 1957 1958 1959 1960 1961 1962 1963 1964 1965 1966 1967 1968 1969 1970 1971 1972 1973 1974 1975 1976 1977 1978 1979 1980 1981 1982 1983 1984 1985 1986 1987 1988 1989 1990 1991 1992 1993 1994 1995 1996 1997 1998 1999 2000 2001 2002 2003 2004 2005 2006 2007 2008 2009 2010 2011 2012 2013 2014 2015 2016 2017 2018 2019 2020 2021 2022 2023 2024 2025 2026 2027 2028 2029 2030 2031 2032 2033 2034 2035 2036 2037 2038 2039 2040 2041 2042 2043 2044 2045 2046 2047 2048 2049 2050 2051 2052 2053 2054 2055 2056 2057 2058 2059 2060 2061 2062 2063 2064 2065 2066 2067 2068 2069 2070 2071 2072 2073 2074 2075 2076 2077 2078 2079 2080 2081 2082 2083 2084 2085 2086 2087 2088 2089 2090 2091 2092 2093 2094 2095 2096 2097 2098 2099 2100 2101 2102 2103 2104 2105 2106 2107 2108 2109 2110 2111 2112 2113 2114 2115 2116 2117 2118 2119 2120 2121 2122 2123 2124 2125 2126 2127 2128 2129 2130 2131 2132 2133 2134 2135 2136 2137 2138 2139 2140 2141 2142 2143 2144 2145 2146 2147 2148 2149 2150 2151 2152 2153 2154 2155 2156 2157 2158 2159 2160 2161 2162 2163 2164 2165 2166 2167 2168 2169 2170 2171 2172 2173 2174 2175 2176 2177 2178 2179 2180 2181 2182 2183 2184 2185 2186 2187 2188 2189 2190 2191 2192 2193 2194 2195 2196 2197 2198 2199 2200 2201 2202 2203 2204 2205 2206 2207 2208 2209 2210 2211 2212 2213 2214 2215 2216 2217 2218 2219 2220 2221 2222 2223 2224 2225 2226 2227 2228 2229 2230 2231 2232 2233 2234 2235 2236 2237 2238 2239 2240 2241 2242 2243 2244 2245 2246 2247 2248 2249 2250 2251 2252 2253 2254 2255 2256 2257 2258 2259 2260 2261 2262 2263 2264 2265 2266 2267 2268 2269 2270 2271 2272 2273 2274 2275 2276 2277 2278 2279 2280 2281 2282 2283 2284 2285 2286 2287 2288 2289 2290 2291 2292 2293 2294 2295 2296 2297 2298 2299 2300 2301 2302 2303 2304 2305 2306 2307 2308 2309 2310 2311 2312 2313 2314 2315 2316 2317 2318 2319 2320 2321 2322 2323 2324 2325 2326 2327 2328 2329 2330 2331 2332 2333 2334 2335 2336 2337 2338 2339 2340 2341 2342 2343 2344 2345 2346 2347 2348 2349 2350 2351 2352 2353 2354 2355 2356 2357 2358 2359 2360 2361 2362 2363 2364 2365 2366 2367 2368 2369 2370 2371 2372 2373 2374 2375 2376 2377 2378 2379 2380 2381 2382 2383 2384 2385 2386 2387 2388 2389 2390 2391 2392 2393 2394 2395 2396 2397 2398 2399 2400 2401 2402 2403 2404 2405 2406 2407 2408 2409 2410 2411 2412 2413 2414 2415 2416 2417 2418 2419 2420 2421 2422 2423 2424 2425 2426 2427 2428 2429 2430 2431 2432 2433 2434 2435 2436 2437 2438 2439 2440 2441 2442 2443 2444 2445 2446 2447 2448 2449 2450 2451 2452 2453 2454 2455 2456 2457 2458 2459 2460 2461 2462 2463 2464 2465 2466 2467 2468 2469 2470 2471 2472 2473 2474 2475 2476 2477 2478 2479 2480 2481 2482 2483 2484 2485 2486 2487 2488 2489 2490 2491 2492 2493 2494 2495 2496 2497 2498 2499 2500 2501 2502 2503 2504 2505 2506 2507 2508 2509 2510 2511 2512 2513 2514 2515 2516 2517 2518 2519 2520 2521 2522 2523 2524 2525 2526 2527 2528 2529 2530 2531 2532 2533 2534 2535 2536 2537 2538 2539 2540 2541 2542 2543 2544 2545 2546 2547 2548 2549 2550 2551 2552 2553 2554 2555 2556 2557 2558 2559 2560 2561 2562 2563 2564 2565 2566 2567 2568 2569 2570 2571 2572 2573 2574 2575 2576 2577 2578 2579 2580 2581 2582 2583 2584 2585 2586 2587 2588 2589 2590 2591 2592 2593 2594 2595 2596 2597 2598 2599 2600 2601 2602 2603 2604 2605 2606 2607 2608 2609 2610 2611 2612 2613 2614 2615 2616 2617 2618 2619 2620 2621 2622 2623 2624 2625 2626 2627 2628 2629 2630 2631 2632 2633 2634 2635 2636 2637 2638 2639 2640 2641 2642 2643 2644 2645 2646 2647 2648 2649 2650 2651 2652 2653 2654 2655 2656 2657 2658 2659 2660 2661 2662 2663 2664 2665 2666 2667 2668 2669 2670 2671 2672 2673 2674 2675 2676 2677 2678 2679 2680 2681 2682 2683 2684 2685 2686 2687 2688 2689 2690 2691 2692 2693 2694 2695 2696 2697 2698 2699 2700 2701 2702 2703 2704 2705 2706 2707 2708 2709 2710 2711 2712 2713 2714 2715 2716 2717 2718 2719 2720 2721 2722 2723 2724 2725 2726 2727 2728 2729 2730 2731 2732 2733 2734 2735 2736 2737 2738 2739 2740 2741 2742 2743 2744 2745 2746 2747 2748 2749 2750 2751 2752 2753 2754 2755 2756 2757 2758 2759 2760 2761 2762 2763 2764 2765 2766 2767 2768 2769 2770 2771 2772 2773 2774 2775 2776 2777 2778 2779 2780 2781 2782 2783 2784 2785 2786 2787 2788 2789 2790 2791 2792 2793 2794 2795 2796 2797 2798 2799 2800 2801 2802 2803 2804 2805 2806 2807 2808 2809 2810 2811 2812 2813 2814 2815 2816 2817 2818 2819 2820 2821 2822 2823 2824 2825 2826 2827 2828 2829 2830 2831 2832 2833 2834 2835 2836 2837 2838 2839 2840 2841 2842 2843 2844 2845 2846 2847 2848 2849 2850 2851 2852 2853 2854 2855 2856 2857 2858 2859 2860 2861 2862 2863 2864 2865 2866 2867 2868 2869 2870 2871 2872 2873 2874 2875 2876 2877 2878 2879 2880 2881 2882 2883 2884 2885 2886 2887 2888 2889 2890 2891 2892 2893 2894 2895 2896 2897 2898 2899 2900 2901 2902 2903 2904 2905 2906 2907 2908 2909 2910 2911 2912 2913 2914 2915 2916 2917 2918 2919 2920 2921 2922 2923 2924 2925 2926 2927 2928 2929 2930 2931 2932 2933 2934 2935 2936 2937 2938 2939 2940 2941 2942 2943 2944 2945 2946 2947 2948 2949 2950 2951 2952 2953 2954 2955 2956 2957 2958 2959 2960 2961 2962 2963 2964 2965 2966 2967 2968 2969 2970 2971 2972 2973 2974 2975 2976 2977 2978 2979 2980 2981 2982 2983 2984 2985 2986 2987 2988 2989 2990 2991 2992 2993 2994 2995 2996 2997 2998 2999 3000 3001 3002 3003 3004 3005 3006 3007 3008 3009 3010 3011 3012 3013 3014 3015 3016 3017 3018 3019 3020 3021 3022 3023 3024 3025 3026 3027 3028 3029 3030 3031 3032 3033 3034 3035 3036 3037 3038 3039 3040 3041 3042 3043 3044 3045 3046 3047 3048 3049 3050 3051 3052 3053 3054 3055 3056 3057 3058 3059 3060 3061 3062 3063 3064 3065 3066 3067 3068 3069 3070 3071 3072 3073 3074 3075 3076 3077 3078 3079 3080 3081 3082 3083 3084 3085 3086 3087 3088 3089 3090 3091 3092 3093 3094 3095 3096 3097 3098 3099 3100 3101 3102 3103 3104 3105 3106 3107 3108 3109 3110 3111 3112 3113 3114 3115 3116 3117 3118 3119 3120 3121 3122 3123 3124 3125 3126 3127 3128 3129 3130 3131 3132 3133 3134 3135 3136 3137 3138 3139 3140 3141 3142 3143 3144 3145 3146 3147 3148 3149 3150 3151 3152 3153 3154 3155 3156 3157 3158 3159 3160 3161 3162 3163 3164 3165 3166 3167 3168 3169 3170 3171 3172 3173 3174 3175 3176 3177 3178 3179 3180 3181 3182 3183 3184 3185 3186 3187 3188 3189 3190 3191 3192 3193 3194 3195 3196 3197 3198 3199 3200 3201 3202 3203 3204 // SPDX-License-Identifier: GPL-2.0-or-later /* * IP multicast routing support for mrouted 3.6/3.8 * * (c) 1995 Alan Cox, <alan@lxorguk.ukuu.org.uk> * Linux Consultancy and Custom Driver Development * * Fixes: * Michael Chastain : Incorrect size of copying. * Alan Cox : Added the cache manager code * Alan Cox : Fixed the clone/copy bug and device race. * Mike McLagan : Routing by source * Malcolm Beattie : Buffer handling fixes. * Alexey Kuznetsov : Double buffer free and other fixes. * SVR Anand : Fixed several multicast bugs and problems. * Alexey Kuznetsov : Status, optimisations and more. * Brad Parker : Better behaviour on mrouted upcall * overflow. * Carlos Picoto : PIMv1 Support * Pavlin Ivanov Radoslavov: PIMv2 Registers must checksum only PIM header * Relax this requirement to work with older peers. */ #include <linux/uaccess.h> #include <linux/types.h> #include <linux/cache.h> #include <linux/capability.h> #include <linux/errno.h> #include <linux/mm.h> #include <linux/kernel.h> #include <linux/fcntl.h> #include <linux/stat.h> #include <linux/socket.h> #include <linux/in.h> #include <linux/inet.h> #include <linux/netdevice.h> #include <linux/inetdevice.h> #include <linux/igmp.h> #include <linux/proc_fs.h> #include <linux/seq_file.h> #include <linux/mroute.h> #include <linux/init.h> #include <linux/if_ether.h> #include <linux/slab.h> #include <net/net_namespace.h> #include <net/ip.h> #include <net/protocol.h> #include <linux/skbuff.h> #include <net/route.h> #include <net/icmp.h> #include <net/udp.h> #include <net/raw.h> #include <linux/notifier.h> #include <linux/if_arp.h> #include <linux/netfilter_ipv4.h> #include <linux/compat.h> #include <linux/export.h> #include <linux/rhashtable.h> #include <net/ip_tunnels.h> #include <net/checksum.h> #include <net/netlink.h> #include <net/fib_rules.h> #include <linux/netconf.h> #include <net/rtnh.h> #include <net/inet_dscp.h> #include <linux/nospec.h> struct ipmr_rule { struct fib_rule common; }; struct ipmr_result { struct mr_table *mrt; }; /* Big lock, protecting vif table, mrt cache and mroute socket state. * Note that the changes are semaphored via rtnl_lock. */ static DEFINE_SPINLOCK(mrt_lock); static struct net_device *vif_dev_read(const struct vif_device *vif) { return rcu_dereference(vif->dev); } /* Multicast router control variables */ /* Special spinlock for queue of unresolved entries */ static DEFINE_SPINLOCK(mfc_unres_lock); /* We return to original Alan's scheme. Hash table of resolved * entries is changed only in process context and protected * with weak lock mrt_lock. Queue of unresolved entries is protected * with strong spinlock mfc_unres_lock. * * In this case data path is free of exclusive locks at all. */ static struct kmem_cache *mrt_cachep __ro_after_init; static struct mr_table *ipmr_new_table(struct net *net, u32 id); static void ipmr_free_table(struct mr_table *mrt); static void ip_mr_forward(struct net *net, struct mr_table *mrt, struct net_device *dev, struct sk_buff *skb, struct mfc_cache *cache, int local); static int ipmr_cache_report(const struct mr_table *mrt, struct sk_buff *pkt, vifi_t vifi, int assert); static void mroute_netlink_event(struct mr_table *mrt, struct mfc_cache *mfc, int cmd); static void igmpmsg_netlink_event(const struct mr_table *mrt, struct sk_buff *pkt); static void mroute_clean_tables(struct mr_table *mrt, int flags); static void ipmr_expire_process(struct timer_list *t); #ifdef CONFIG_IP_MROUTE_MULTIPLE_TABLES #define ipmr_for_each_table(mrt, net) \ list_for_each_entry_rcu(mrt, &net->ipv4.mr_tables, list, \ lockdep_rtnl_is_held() || \ list_empty(&net->ipv4.mr_tables)) static struct mr_table *ipmr_mr_table_iter(struct net *net, struct mr_table *mrt) { struct mr_table *ret; if (!mrt) ret = list_entry_rcu(net->ipv4.mr_tables.next, struct mr_table, list); else ret = list_entry_rcu(mrt->list.next, struct mr_table, list); if (&ret->list == &net->ipv4.mr_tables) return NULL; return ret; } static struct mr_table *__ipmr_get_table(struct net *net, u32 id) { struct mr_table *mrt; ipmr_for_each_table(mrt, net) { if (mrt->id == id) return mrt; } return NULL; } static struct mr_table *ipmr_get_table(struct net *net, u32 id) { struct mr_table *mrt; rcu_read_lock(); mrt = __ipmr_get_table(net, id); rcu_read_unlock(); return mrt; } static int ipmr_fib_lookup(struct net *net, struct flowi4 *flp4, struct mr_table **mrt) { int err; struct ipmr_result res; struct fib_lookup_arg arg = { .result = &res, .flags = FIB_LOOKUP_NOREF, }; /* update flow if oif or iif point to device enslaved to l3mdev */ l3mdev_update_flow(net, flowi4_to_flowi(flp4)); err = fib_rules_lookup(net->ipv4.mr_rules_ops, flowi4_to_flowi(flp4), 0, &arg); if (err < 0) return err; *mrt = res.mrt; return 0; } static int ipmr_rule_action(struct fib_rule *rule, struct flowi *flp, int flags, struct fib_lookup_arg *arg) { struct ipmr_result *res = arg->result; struct mr_table *mrt; switch (rule->action) { case FR_ACT_TO_TBL: break; case FR_ACT_UNREACHABLE: return -ENETUNREACH; case FR_ACT_PROHIBIT: return -EACCES; case FR_ACT_BLACKHOLE: default: return -EINVAL; } arg->table = fib_rule_get_table(rule, arg); mrt = __ipmr_get_table(rule->fr_net, arg->table); if (!mrt) return -EAGAIN; res->mrt = mrt; return 0; } static int ipmr_rule_match(struct fib_rule *rule, struct flowi *fl, int flags) { return 1; } static int ipmr_rule_configure(struct fib_rule *rule, struct sk_buff *skb, struct fib_rule_hdr *frh, struct nlattr **tb, struct netlink_ext_ack *extack) { return 0; } static int ipmr_rule_compare(struct fib_rule *rule, struct fib_rule_hdr *frh, struct nlattr **tb) { return 1; } static int ipmr_rule_fill(struct fib_rule *rule, struct sk_buff *skb, struct fib_rule_hdr *frh) { frh->dst_len = 0; frh->src_len = 0; frh->tos = 0; return 0; } static const struct fib_rules_ops __net_initconst ipmr_rules_ops_template = { .family = RTNL_FAMILY_IPMR, .rule_size = sizeof(struct ipmr_rule), .addr_size = sizeof(u32), .action = ipmr_rule_action, .match = ipmr_rule_match, .configure = ipmr_rule_configure, .compare = ipmr_rule_compare, .fill = ipmr_rule_fill, .nlgroup = RTNLGRP_IPV4_RULE, .owner = THIS_MODULE, }; static int __net_init ipmr_rules_init(struct net *net) { struct fib_rules_ops *ops; struct mr_table *mrt; int err; ops = fib_rules_register(&ipmr_rules_ops_template, net); if (IS_ERR(ops)) return PTR_ERR(ops); INIT_LIST_HEAD(&net->ipv4.mr_tables); mrt = ipmr_new_table(net, RT_TABLE_DEFAULT); if (IS_ERR(mrt)) { err = PTR_ERR(mrt); goto err1; } err = fib_default_rule_add(ops, 0x7fff, RT_TABLE_DEFAULT); if (err < 0) goto err2; net->ipv4.mr_rules_ops = ops; return 0; err2: rtnl_lock(); ipmr_free_table(mrt); rtnl_unlock(); err1: fib_rules_unregister(ops); return err; } static void __net_exit ipmr_rules_exit(struct net *net) { struct mr_table *mrt, *next; ASSERT_RTNL(); list_for_each_entry_safe(mrt, next, &net->ipv4.mr_tables, list) { list_del(&mrt->list); ipmr_free_table(mrt); } fib_rules_unregister(net->ipv4.mr_rules_ops); } static int ipmr_rules_dump(struct net *net, struct notifier_block *nb, struct netlink_ext_ack *extack) { return fib_rules_dump(net, nb, RTNL_FAMILY_IPMR, extack); } static unsigned int ipmr_rules_seq_read(const struct net *net) { return fib_rules_seq_read(net, RTNL_FAMILY_IPMR); } bool ipmr_rule_default(const struct fib_rule *rule) { return fib_rule_matchall(rule) && rule->table == RT_TABLE_DEFAULT; } EXPORT_SYMBOL(ipmr_rule_default); #else #define ipmr_for_each_table(mrt, net) \ for (mrt = net->ipv4.mrt; mrt; mrt = NULL) static struct mr_table *ipmr_mr_table_iter(struct net *net, struct mr_table *mrt) { if (!mrt) return net->ipv4.mrt; return NULL; } static struct mr_table *ipmr_get_table(struct net *net, u32 id) { return net->ipv4.mrt; } #define __ipmr_get_table ipmr_get_table static int ipmr_fib_lookup(struct net *net, struct flowi4 *flp4, struct mr_table **mrt) { *mrt = net->ipv4.mrt; return 0; } static int __net_init ipmr_rules_init(struct net *net) { struct mr_table *mrt; mrt = ipmr_new_table(net, RT_TABLE_DEFAULT); if (IS_ERR(mrt)) return PTR_ERR(mrt); net->ipv4.mrt = mrt; return 0; } static void __net_exit ipmr_rules_exit(struct net *net) { ASSERT_RTNL(); ipmr_free_table(net->ipv4.mrt); net->ipv4.mrt = NULL; } static int ipmr_rules_dump(struct net *net, struct notifier_block *nb, struct netlink_ext_ack *extack) { return 0; } static unsigned int ipmr_rules_seq_read(const struct net *net) { return 0; } bool ipmr_rule_default(const struct fib_rule *rule) { return true; } EXPORT_SYMBOL(ipmr_rule_default); #endif static inline int ipmr_hash_cmp(struct rhashtable_compare_arg *arg, const void *ptr) { const struct mfc_cache_cmp_arg *cmparg = arg->key; const struct mfc_cache *c = ptr; return cmparg->mfc_mcastgrp != c->mfc_mcastgrp || cmparg->mfc_origin != c->mfc_origin; } static const struct rhashtable_params ipmr_rht_params = { .head_offset = offsetof(struct mr_mfc, mnode), .key_offset = offsetof(struct mfc_cache, cmparg), .key_len = sizeof(struct mfc_cache_cmp_arg), .nelem_hint = 3, .obj_cmpfn = ipmr_hash_cmp, .automatic_shrinking = true, }; static void ipmr_new_table_set(struct mr_table *mrt, struct net *net) { #ifdef CONFIG_IP_MROUTE_MULTIPLE_TABLES list_add_tail_rcu(&mrt->list, &net->ipv4.mr_tables); #endif } static struct mfc_cache_cmp_arg ipmr_mr_table_ops_cmparg_any = { .mfc_mcastgrp = htonl(INADDR_ANY), .mfc_origin = htonl(INADDR_ANY), }; static struct mr_table_ops ipmr_mr_table_ops = { .rht_params = &ipmr_rht_params, .cmparg_any = &ipmr_mr_table_ops_cmparg_any, }; static struct mr_table *ipmr_new_table(struct net *net, u32 id) { struct mr_table *mrt; /* "pimreg%u" should not exceed 16 bytes (IFNAMSIZ) */ if (id != RT_TABLE_DEFAULT && id >= 1000000000) return ERR_PTR(-EINVAL); mrt = __ipmr_get_table(net, id); if (mrt) return mrt; return mr_table_alloc(net, id, &ipmr_mr_table_ops, ipmr_expire_process, ipmr_new_table_set); } static void ipmr_free_table(struct mr_table *mrt) { struct net *net = read_pnet(&mrt->net); WARN_ON_ONCE(!mr_can_free_table(net)); timer_shutdown_sync(&mrt->ipmr_expire_timer); mroute_clean_tables(mrt, MRT_FLUSH_VIFS | MRT_FLUSH_VIFS_STATIC | MRT_FLUSH_MFC | MRT_FLUSH_MFC_STATIC); rhltable_destroy(&mrt->mfc_hash); kfree(mrt); } /* Service routines creating virtual interfaces: DVMRP tunnels and PIMREG */ /* Initialize ipmr pimreg/tunnel in_device */ static bool ipmr_init_vif_indev(const struct net_device *dev) { struct in_device *in_dev; ASSERT_RTNL(); in_dev = __in_dev_get_rtnl(dev); if (!in_dev) return false; ipv4_devconf_setall(in_dev); neigh_parms_data_state_setall(in_dev->arp_parms); IPV4_DEVCONF(in_dev->cnf, RP_FILTER) = 0; return true; } static struct net_device *ipmr_new_tunnel(struct net *net, struct vifctl *v) { struct net_device *tunnel_dev, *new_dev; struct ip_tunnel_parm_kern p = { }; int err; tunnel_dev = __dev_get_by_name(net, "tunl0"); if (!tunnel_dev) goto out; p.iph.daddr = v->vifc_rmt_addr.s_addr; p.iph.saddr = v->vifc_lcl_addr.s_addr; p.iph.version = 4; p.iph.ihl = 5; p.iph.protocol = IPPROTO_IPIP; sprintf(p.name, "dvmrp%d", v->vifc_vifi); if (!tunnel_dev->netdev_ops->ndo_tunnel_ctl) goto out; err = tunnel_dev->netdev_ops->ndo_tunnel_ctl(tunnel_dev, &p, SIOCADDTUNNEL); if (err) goto out; new_dev = __dev_get_by_name(net, p.name); if (!new_dev) goto out; new_dev->flags |= IFF_MULTICAST; if (!ipmr_init_vif_indev(new_dev)) goto out_unregister; if (dev_open(new_dev, NULL)) goto out_unregister; dev_hold(new_dev); err = dev_set_allmulti(new_dev, 1); if (err) { dev_close(new_dev); tunnel_dev->netdev_ops->ndo_tunnel_ctl(tunnel_dev, &p, SIOCDELTUNNEL); dev_put(new_dev); new_dev = ERR_PTR(err); } return new_dev; out_unregister: unregister_netdevice(new_dev); out: return ERR_PTR(-ENOBUFS); } #if defined(CONFIG_IP_PIMSM_V1) || defined(CONFIG_IP_PIMSM_V2) static netdev_tx_t reg_vif_xmit(struct sk_buff *skb, struct net_device *dev) { struct net *net = dev_net(dev); struct mr_table *mrt; struct flowi4 fl4 = { .flowi4_oif = dev->ifindex, .flowi4_iif = skb->skb_iif ? : LOOPBACK_IFINDEX, .flowi4_mark = skb->mark, }; int err; err = ipmr_fib_lookup(net, &fl4, &mrt); if (err < 0) { kfree_skb(skb); return err; } DEV_STATS_ADD(dev, tx_bytes, skb->len); DEV_STATS_INC(dev, tx_packets); rcu_read_lock(); /* Pairs with WRITE_ONCE() in vif_add() and vif_delete() */ ipmr_cache_report(mrt, skb, READ_ONCE(mrt->mroute_reg_vif_num), IGMPMSG_WHOLEPKT); rcu_read_unlock(); kfree_skb(skb); return NETDEV_TX_OK; } static int reg_vif_get_iflink(const struct net_device *dev) { return 0; } static const struct net_device_ops reg_vif_netdev_ops = { .ndo_start_xmit = reg_vif_xmit, .ndo_get_iflink = reg_vif_get_iflink, }; static void reg_vif_setup(struct net_device *dev) { dev->type = ARPHRD_PIMREG; dev->mtu = ETH_DATA_LEN - sizeof(struct iphdr) - 8; dev->flags = IFF_NOARP; dev->netdev_ops = &reg_vif_netdev_ops; dev->needs_free_netdev = true; dev->netns_immutable = true; } static struct net_device *ipmr_reg_vif(struct net *net, struct mr_table *mrt) { struct net_device *dev; char name[IFNAMSIZ]; if (mrt->id == RT_TABLE_DEFAULT) sprintf(name, "pimreg"); else sprintf(name, "pimreg%u", mrt->id); dev = alloc_netdev(0, name, NET_NAME_UNKNOWN, reg_vif_setup); if (!dev) return NULL; dev_net_set(dev, net); if (register_netdevice(dev)) { free_netdev(dev); return NULL; } if (!ipmr_init_vif_indev(dev)) goto failure; if (dev_open(dev, NULL)) goto failure; dev_hold(dev); return dev; failure: unregister_netdevice(dev); return NULL; } /* called with rcu_read_lock() */ static int __pim_rcv(struct mr_table *mrt, struct sk_buff *skb, unsigned int pimlen) { struct net_device *reg_dev = NULL; struct iphdr *encap; int vif_num; encap = (struct iphdr *)(skb_transport_header(skb) + pimlen); /* Check that: * a. packet is really sent to a multicast group * b. packet is not a NULL-REGISTER * c. packet is not truncated */ if (!ipv4_is_multicast(encap->daddr) || encap->tot_len == 0 || ntohs(encap->tot_len) + pimlen > skb->len) return 1; /* Pairs with WRITE_ONCE() in vif_add()/vid_delete() */ vif_num = READ_ONCE(mrt->mroute_reg_vif_num); if (vif_num >= 0) reg_dev = vif_dev_read(&mrt->vif_table[vif_num]); if (!reg_dev) return 1; skb->mac_header = skb->network_header; skb_pull(skb, (u8 *)encap - skb->data); skb_reset_network_header(skb); skb->protocol = htons(ETH_P_IP); skb->ip_summed = CHECKSUM_NONE; skb_tunnel_rx(skb, reg_dev, dev_net(reg_dev)); netif_rx(skb); return NET_RX_SUCCESS; } #else static struct net_device *ipmr_reg_vif(struct net *net, struct mr_table *mrt) { return NULL; } #endif static int call_ipmr_vif_entry_notifiers(struct net *net, enum fib_event_type event_type, struct vif_device *vif, struct net_device *vif_dev, vifi_t vif_index, u32 tb_id) { return mr_call_vif_notifiers(net, RTNL_FAMILY_IPMR, event_type, vif, vif_dev, vif_index, tb_id, &net->ipv4.ipmr_seq); } static int call_ipmr_mfc_entry_notifiers(struct net *net, enum fib_event_type event_type, struct mfc_cache *mfc, u32 tb_id) { return mr_call_mfc_notifiers(net, RTNL_FAMILY_IPMR, event_type, &mfc->_c, tb_id, &net->ipv4.ipmr_seq); } /** * vif_delete - Delete a VIF entry * @mrt: Table to delete from * @vifi: VIF identifier to delete * @notify: Set to 1, if the caller is a notifier_call * @head: if unregistering the VIF, place it on this queue */ static int vif_delete(struct mr_table *mrt, int vifi, int notify, struct list_head *head) { struct net *net = read_pnet(&mrt->net); struct vif_device *v; struct net_device *dev; struct in_device *in_dev; if (vifi < 0 || vifi >= mrt->maxvif) return -EADDRNOTAVAIL; v = &mrt->vif_table[vifi]; dev = rtnl_dereference(v->dev); if (!dev) return -EADDRNOTAVAIL; spin_lock(&mrt_lock); call_ipmr_vif_entry_notifiers(net, FIB_EVENT_VIF_DEL, v, dev, vifi, mrt->id); RCU_INIT_POINTER(v->dev, NULL); if (vifi == mrt->mroute_reg_vif_num) { /* Pairs with READ_ONCE() in ipmr_cache_report() and reg_vif_xmit() */ WRITE_ONCE(mrt->mroute_reg_vif_num, -1); } if (vifi + 1 == mrt->maxvif) { int tmp; for (tmp = vifi - 1; tmp >= 0; tmp--) { if (VIF_EXISTS(mrt, tmp)) break; } WRITE_ONCE(mrt->maxvif, tmp + 1); } spin_unlock(&mrt_lock); dev_set_allmulti(dev, -1); in_dev = __in_dev_get_rtnl(dev); if (in_dev) { IPV4_DEVCONF(in_dev->cnf, MC_FORWARDING)--; inet_netconf_notify_devconf(dev_net(dev), RTM_NEWNETCONF, NETCONFA_MC_FORWARDING, dev->ifindex, &in_dev->cnf); ip_rt_multicast_event(in_dev); } if (v->flags & (VIFF_TUNNEL | VIFF_REGISTER) && !notify) unregister_netdevice_queue(dev, head); netdev_put(dev, &v->dev_tracker); return 0; } static void ipmr_cache_free_rcu(struct rcu_head *head) { struct mr_mfc *c = container_of(head, struct mr_mfc, rcu); kmem_cache_free(mrt_cachep, (struct mfc_cache *)c); } static void ipmr_cache_free(struct mfc_cache *c) { call_rcu(&c->_c.rcu, ipmr_cache_free_rcu); } /* Destroy an unresolved cache entry, killing queued skbs * and reporting error to netlink readers. */ static void ipmr_destroy_unres(struct mr_table *mrt, struct mfc_cache *c) { struct net *net = read_pnet(&mrt->net); struct sk_buff *skb; struct nlmsgerr *e; atomic_dec(&mrt->cache_resolve_queue_len); while ((skb = skb_dequeue(&c->_c.mfc_un.unres.unresolved))) { if (ip_hdr(skb)->version == 0) { struct nlmsghdr *nlh = skb_pull(skb, sizeof(struct iphdr)); nlh->nlmsg_type = NLMSG_ERROR; nlh->nlmsg_len = nlmsg_msg_size(sizeof(struct nlmsgerr)); skb_trim(skb, nlh->nlmsg_len); e = nlmsg_data(nlh); e->error = -ETIMEDOUT; memset(&e->msg, 0, sizeof(e->msg)); rtnl_unicast(skb, net, NETLINK_CB(skb).portid); } else { kfree_skb(skb); } } ipmr_cache_free(c); } /* Timer process for the unresolved queue. */ static void ipmr_expire_process(struct timer_list *t) { struct mr_table *mrt = timer_container_of(mrt, t, ipmr_expire_timer); struct mr_mfc *c, *next; unsigned long expires; unsigned long now; if (!spin_trylock(&mfc_unres_lock)) { mod_timer(&mrt->ipmr_expire_timer, jiffies+HZ/10); return; } if (list_empty(&mrt->mfc_unres_queue)) goto out; now = jiffies; expires = 10*HZ; list_for_each_entry_safe(c, next, &mrt->mfc_unres_queue, list) { if (time_after(c->mfc_un.unres.expires, now)) { unsigned long interval = c->mfc_un.unres.expires - now; if (interval < expires) expires = interval; continue; } list_del(&c->list); mroute_netlink_event(mrt, (struct mfc_cache *)c, RTM_DELROUTE); ipmr_destroy_unres(mrt, (struct mfc_cache *)c); } if (!list_empty(&mrt->mfc_unres_queue)) mod_timer(&mrt->ipmr_expire_timer, jiffies + expires); out: spin_unlock(&mfc_unres_lock); } /* Fill oifs list. It is called under locked mrt_lock. */ static void ipmr_update_thresholds(struct mr_table *mrt, struct mr_mfc *cache, unsigned char *ttls) { int vifi; cache->mfc_un.res.minvif = MAXVIFS; cache->mfc_un.res.maxvif = 0; memset(cache->mfc_un.res.ttls, 255, MAXVIFS); for (vifi = 0; vifi < mrt->maxvif; vifi++) { if (VIF_EXISTS(mrt, vifi) && ttls[vifi] && ttls[vifi] < 255) { cache->mfc_un.res.ttls[vifi] = ttls[vifi]; if (cache->mfc_un.res.minvif > vifi) cache->mfc_un.res.minvif = vifi; if (cache->mfc_un.res.maxvif <= vifi) cache->mfc_un.res.maxvif = vifi + 1; } } WRITE_ONCE(cache->mfc_un.res.lastuse, jiffies); } static int vif_add(struct net *net, struct mr_table *mrt, struct vifctl *vifc, int mrtsock) { struct netdev_phys_item_id ppid = { }; int vifi = vifc->vifc_vifi; struct vif_device *v = &mrt->vif_table[vifi]; struct net_device *dev; struct in_device *in_dev; int err; /* Is vif busy ? */ if (VIF_EXISTS(mrt, vifi)) return -EADDRINUSE; switch (vifc->vifc_flags) { case VIFF_REGISTER: if (!ipmr_pimsm_enabled()) return -EINVAL; /* Special Purpose VIF in PIM * All the packets will be sent to the daemon */ if (mrt->mroute_reg_vif_num >= 0) return -EADDRINUSE; dev = ipmr_reg_vif(net, mrt); if (!dev) return -ENOBUFS; err = dev_set_allmulti(dev, 1); if (err) { unregister_netdevice(dev); dev_put(dev); return err; } break; case VIFF_TUNNEL: dev = ipmr_new_tunnel(net, vifc); if (IS_ERR(dev)) return PTR_ERR(dev); break; case VIFF_USE_IFINDEX: case 0: if (vifc->vifc_flags == VIFF_USE_IFINDEX) { dev = dev_get_by_index(net, vifc->vifc_lcl_ifindex); if (dev && !__in_dev_get_rtnl(dev)) { dev_put(dev); return -EADDRNOTAVAIL; } } else { dev = ip_dev_find(net, vifc->vifc_lcl_addr.s_addr); } if (!dev) return -EADDRNOTAVAIL; err = dev_set_allmulti(dev, 1); if (err) { dev_put(dev); return err; } break; default: return -EINVAL; } in_dev = __in_dev_get_rtnl(dev); if (!in_dev) { dev_put(dev); return -EADDRNOTAVAIL; } IPV4_DEVCONF(in_dev->cnf, MC_FORWARDING)++; inet_netconf_notify_devconf(net, RTM_NEWNETCONF, NETCONFA_MC_FORWARDING, dev->ifindex, &in_dev->cnf); ip_rt_multicast_event(in_dev); /* Fill in the VIF structures */ vif_device_init(v, dev, vifc->vifc_rate_limit, vifc->vifc_threshold, vifc->vifc_flags | (!mrtsock ? VIFF_STATIC : 0), (VIFF_TUNNEL | VIFF_REGISTER)); err = dev_get_port_parent_id(dev, &ppid, true); if (err == 0) { memcpy(v->dev_parent_id.id, ppid.id, ppid.id_len); v->dev_parent_id.id_len = ppid.id_len; } else { v->dev_parent_id.id_len = 0; } v->local = vifc->vifc_lcl_addr.s_addr; v->remote = vifc->vifc_rmt_addr.s_addr; /* And finish update writing critical data */ spin_lock(&mrt_lock); rcu_assign_pointer(v->dev, dev); netdev_tracker_alloc(dev, &v->dev_tracker, GFP_ATOMIC); if (v->flags & VIFF_REGISTER) { /* Pairs with READ_ONCE() in ipmr_cache_report() and reg_vif_xmit() */ WRITE_ONCE(mrt->mroute_reg_vif_num, vifi); } if (vifi+1 > mrt->maxvif) WRITE_ONCE(mrt->maxvif, vifi + 1); spin_unlock(&mrt_lock); call_ipmr_vif_entry_notifiers(net, FIB_EVENT_VIF_ADD, v, dev, vifi, mrt->id); return 0; } /* called with rcu_read_lock() */ static struct mfc_cache *ipmr_cache_find(struct mr_table *mrt, __be32 origin, __be32 mcastgrp) { struct mfc_cache_cmp_arg arg = { .mfc_mcastgrp = mcastgrp, .mfc_origin = origin }; return mr_mfc_find(mrt, &arg); } /* Look for a (*,G) entry */ static struct mfc_cache *ipmr_cache_find_any(struct mr_table *mrt, __be32 mcastgrp, int vifi) { struct mfc_cache_cmp_arg arg = { .mfc_mcastgrp = mcastgrp, .mfc_origin = htonl(INADDR_ANY) }; if (mcastgrp == htonl(INADDR_ANY)) return mr_mfc_find_any_parent(mrt, vifi); return mr_mfc_find_any(mrt, vifi, &arg); } /* Look for a (S,G,iif) entry if parent != -1 */ static struct mfc_cache *ipmr_cache_find_parent(struct mr_table *mrt, __be32 origin, __be32 mcastgrp, int parent) { struct mfc_cache_cmp_arg arg = { .mfc_mcastgrp = mcastgrp, .mfc_origin = origin, }; return mr_mfc_find_parent(mrt, &arg, parent); } /* Allocate a multicast cache entry */ static struct mfc_cache *ipmr_cache_alloc(void) { struct mfc_cache *c = kmem_cache_zalloc(mrt_cachep, GFP_KERNEL); if (c) { c->_c.mfc_un.res.last_assert = jiffies - MFC_ASSERT_THRESH - 1; c->_c.mfc_un.res.minvif = MAXVIFS; c->_c.free = ipmr_cache_free_rcu; refcount_set(&c->_c.mfc_un.res.refcount, 1); } return c; } static struct mfc_cache *ipmr_cache_alloc_unres(void) { struct mfc_cache *c = kmem_cache_zalloc(mrt_cachep, GFP_ATOMIC); if (c) { skb_queue_head_init(&c->_c.mfc_un.unres.unresolved); c->_c.mfc_un.unres.expires = jiffies + 10 * HZ; } return c; } /* A cache entry has gone into a resolved state from queued */ static void ipmr_cache_resolve(struct net *net, struct mr_table *mrt, struct mfc_cache *uc, struct mfc_cache *c) { struct sk_buff *skb; struct nlmsgerr *e; /* Play the pending entries through our router */ while ((skb = __skb_dequeue(&uc->_c.mfc_un.unres.unresolved))) { if (ip_hdr(skb)->version == 0) { struct nlmsghdr *nlh = skb_pull(skb, sizeof(struct iphdr)); if (mr_fill_mroute(mrt, skb, &c->_c, nlmsg_data(nlh)) > 0) { nlh->nlmsg_len = skb_tail_pointer(skb) - (u8 *)nlh; } else { nlh->nlmsg_type = NLMSG_ERROR; nlh->nlmsg_len = nlmsg_msg_size(sizeof(struct nlmsgerr)); skb_trim(skb, nlh->nlmsg_len); e = nlmsg_data(nlh); e->error = -EMSGSIZE; memset(&e->msg, 0, sizeof(e->msg)); } rtnl_unicast(skb, net, NETLINK_CB(skb).portid); } else { rcu_read_lock(); ip_mr_forward(net, mrt, skb->dev, skb, c, 0); rcu_read_unlock(); } } } /* Bounce a cache query up to mrouted and netlink. * * Called under rcu_read_lock(). */ static int ipmr_cache_report(const struct mr_table *mrt, struct sk_buff *pkt, vifi_t vifi, int assert) { const int ihl = ip_hdrlen(pkt); struct sock *mroute_sk; struct igmphdr *igmp; struct igmpmsg *msg; struct sk_buff *skb; int ret; mroute_sk = rcu_dereference(mrt->mroute_sk); if (!mroute_sk) return -EINVAL; if (assert == IGMPMSG_WHOLEPKT || assert == IGMPMSG_WRVIFWHOLE) skb = skb_realloc_headroom(pkt, sizeof(struct iphdr)); else skb = alloc_skb(128, GFP_ATOMIC); if (!skb) return -ENOBUFS; if (assert == IGMPMSG_WHOLEPKT || assert == IGMPMSG_WRVIFWHOLE) { /* Ugly, but we have no choice with this interface. * Duplicate old header, fix ihl, length etc. * And all this only to mangle msg->im_msgtype and * to set msg->im_mbz to "mbz" :-) */ skb_push(skb, sizeof(struct iphdr)); skb_reset_network_header(skb); skb_reset_transport_header(skb); msg = (struct igmpmsg *)skb_network_header(skb); memcpy(msg, skb_network_header(pkt), sizeof(struct iphdr)); msg->im_msgtype = assert; msg->im_mbz = 0; if (assert == IGMPMSG_WRVIFWHOLE) { msg->im_vif = vifi; msg->im_vif_hi = vifi >> 8; } else { /* Pairs with WRITE_ONCE() in vif_add() and vif_delete() */ int vif_num = READ_ONCE(mrt->mroute_reg_vif_num); msg->im_vif = vif_num; msg->im_vif_hi = vif_num >> 8; } ip_hdr(skb)->ihl = sizeof(struct iphdr) >> 2; ip_hdr(skb)->tot_len = htons(ntohs(ip_hdr(pkt)->tot_len) + sizeof(struct iphdr)); } else { /* Copy the IP header */ skb_set_network_header(skb, skb->len); skb_put(skb, ihl); skb_copy_to_linear_data(skb, pkt->data, ihl); /* Flag to the kernel this is a route add */ ip_hdr(skb)->protocol = 0; msg = (struct igmpmsg *)skb_network_header(skb); msg->im_vif = vifi; msg->im_vif_hi = vifi >> 8; ipv4_pktinfo_prepare(mroute_sk, pkt, false); memcpy(skb->cb, pkt->cb, sizeof(skb->cb)); /* Add our header */ igmp = skb_put(skb, sizeof(struct igmphdr)); igmp->type = assert; msg->im_msgtype = assert; igmp->code = 0; ip_hdr(skb)->tot_len = htons(skb->len); /* Fix the length */ skb->transport_header = skb->network_header; } igmpmsg_netlink_event(mrt, skb); /* Deliver to mrouted */ ret = sock_queue_rcv_skb(mroute_sk, skb); if (ret < 0) { net_warn_ratelimited("mroute: pending queue full, dropping entries\n"); kfree_skb(skb); } return ret; } /* Queue a packet for resolution. It gets locked cache entry! */ /* Called under rcu_read_lock() */ static int ipmr_cache_unresolved(struct mr_table *mrt, vifi_t vifi, struct sk_buff *skb, struct net_device *dev) { const struct iphdr *iph = ip_hdr(skb); struct mfc_cache *c; bool found = false; int err; spin_lock_bh(&mfc_unres_lock); list_for_each_entry(c, &mrt->mfc_unres_queue, _c.list) { if (c->mfc_mcastgrp == iph->daddr && c->mfc_origin == iph->saddr) { found = true; break; } } if (!found) { /* Create a new entry if allowable */ c = ipmr_cache_alloc_unres(); if (!c) { spin_unlock_bh(&mfc_unres_lock); kfree_skb(skb); return -ENOBUFS; } /* Fill in the new cache entry */ c->_c.mfc_parent = -1; c->mfc_origin = iph->saddr; c->mfc_mcastgrp = iph->daddr; /* Reflect first query at mrouted. */ err = ipmr_cache_report(mrt, skb, vifi, IGMPMSG_NOCACHE); if (err < 0) { /* If the report failed throw the cache entry out - Brad Parker */ spin_unlock_bh(&mfc_unres_lock); ipmr_cache_free(c); kfree_skb(skb); return err; } atomic_inc(&mrt->cache_resolve_queue_len); list_add(&c->_c.list, &mrt->mfc_unres_queue); mroute_netlink_event(mrt, c, RTM_NEWROUTE); if (atomic_read(&mrt->cache_resolve_queue_len) == 1) mod_timer(&mrt->ipmr_expire_timer, c->_c.mfc_un.unres.expires); } /* See if we can append the packet */ if (c->_c.mfc_un.unres.unresolved.qlen > 3) { kfree_skb(skb); err = -ENOBUFS; } else { if (dev) { skb->dev = dev; skb->skb_iif = dev->ifindex; } skb_queue_tail(&c->_c.mfc_un.unres.unresolved, skb); err = 0; } spin_unlock_bh(&mfc_unres_lock); return err; } /* MFC cache manipulation by user space mroute daemon */ static int ipmr_mfc_delete(struct mr_table *mrt, struct mfcctl *mfc, int parent) { struct net *net = read_pnet(&mrt->net); struct mfc_cache *c; /* The entries are added/deleted only under RTNL */ rcu_read_lock(); c = ipmr_cache_find_parent(mrt, mfc->mfcc_origin.s_addr, mfc->mfcc_mcastgrp.s_addr, parent); rcu_read_unlock(); if (!c) return -ENOENT; rhltable_remove(&mrt->mfc_hash, &c->_c.mnode, ipmr_rht_params); list_del_rcu(&c->_c.list); call_ipmr_mfc_entry_notifiers(net, FIB_EVENT_ENTRY_DEL, c, mrt->id); mroute_netlink_event(mrt, c, RTM_DELROUTE); mr_cache_put(&c->_c); return 0; } static int ipmr_mfc_add(struct net *net, struct mr_table *mrt, struct mfcctl *mfc, int mrtsock, int parent) { struct mfc_cache *uc, *c; struct mr_mfc *_uc; bool found; int ret; if (mfc->mfcc_parent >= MAXVIFS) return -ENFILE; /* The entries are added/deleted only under RTNL */ rcu_read_lock(); c = ipmr_cache_find_parent(mrt, mfc->mfcc_origin.s_addr, mfc->mfcc_mcastgrp.s_addr, parent); rcu_read_unlock(); if (c) { spin_lock(&mrt_lock); c->_c.mfc_parent = mfc->mfcc_parent; ipmr_update_thresholds(mrt, &c->_c, mfc->mfcc_ttls); if (!mrtsock) c->_c.mfc_flags |= MFC_STATIC; spin_unlock(&mrt_lock); call_ipmr_mfc_entry_notifiers(net, FIB_EVENT_ENTRY_REPLACE, c, mrt->id); mroute_netlink_event(mrt, c, RTM_NEWROUTE); return 0; } if (mfc->mfcc_mcastgrp.s_addr != htonl(INADDR_ANY) && !ipv4_is_multicast(mfc->mfcc_mcastgrp.s_addr)) return -EINVAL; c = ipmr_cache_alloc(); if (!c) return -ENOMEM; c->mfc_origin = mfc->mfcc_origin.s_addr; c->mfc_mcastgrp = mfc->mfcc_mcastgrp.s_addr; c->_c.mfc_parent = mfc->mfcc_parent; ipmr_update_thresholds(mrt, &c->_c, mfc->mfcc_ttls); if (!mrtsock) c->_c.mfc_flags |= MFC_STATIC; ret = rhltable_insert_key(&mrt->mfc_hash, &c->cmparg, &c->_c.mnode, ipmr_rht_params); if (ret) { pr_err("ipmr: rhtable insert error %d\n", ret); ipmr_cache_free(c); return ret; } list_add_tail_rcu(&c->_c.list, &mrt->mfc_cache_list); /* Check to see if we resolved a queued list. If so we * need to send on the frames and tidy up. */ found = false; spin_lock_bh(&mfc_unres_lock); list_for_each_entry(_uc, &mrt->mfc_unres_queue, list) { uc = (struct mfc_cache *)_uc; if (uc->mfc_origin == c->mfc_origin && uc->mfc_mcastgrp == c->mfc_mcastgrp) { list_del(&_uc->list); atomic_dec(&mrt->cache_resolve_queue_len); found = true; break; } } if (list_empty(&mrt->mfc_unres_queue)) timer_delete(&mrt->ipmr_expire_timer); spin_unlock_bh(&mfc_unres_lock); if (found) { ipmr_cache_resolve(net, mrt, uc, c); ipmr_cache_free(uc); } call_ipmr_mfc_entry_notifiers(net, FIB_EVENT_ENTRY_ADD, c, mrt->id); mroute_netlink_event(mrt, c, RTM_NEWROUTE); return 0; } /* Close the multicast socket, and clear the vif tables etc */ static void mroute_clean_tables(struct mr_table *mrt, int flags) { struct net *net = read_pnet(&mrt->net); struct mr_mfc *c, *tmp; struct mfc_cache *cache; LIST_HEAD(list); int i; /* Shut down all active vif entries */ if (flags & (MRT_FLUSH_VIFS | MRT_FLUSH_VIFS_STATIC)) { for (i = 0; i < mrt->maxvif; i++) { if (((mrt->vif_table[i].flags & VIFF_STATIC) && !(flags & MRT_FLUSH_VIFS_STATIC)) || (!(mrt->vif_table[i].flags & VIFF_STATIC) && !(flags & MRT_FLUSH_VIFS))) continue; vif_delete(mrt, i, 0, &list); } unregister_netdevice_many(&list); } /* Wipe the cache */ if (flags & (MRT_FLUSH_MFC | MRT_FLUSH_MFC_STATIC)) { list_for_each_entry_safe(c, tmp, &mrt->mfc_cache_list, list) { if (((c->mfc_flags & MFC_STATIC) && !(flags & MRT_FLUSH_MFC_STATIC)) || (!(c->mfc_flags & MFC_STATIC) && !(flags & MRT_FLUSH_MFC))) continue; rhltable_remove(&mrt->mfc_hash, &c->mnode, ipmr_rht_params); list_del_rcu(&c->list); cache = (struct mfc_cache *)c; call_ipmr_mfc_entry_notifiers(net, FIB_EVENT_ENTRY_DEL, cache, mrt->id); mroute_netlink_event(mrt, cache, RTM_DELROUTE); mr_cache_put(c); } } if (flags & MRT_FLUSH_MFC) { if (atomic_read(&mrt->cache_resolve_queue_len) != 0) { spin_lock_bh(&mfc_unres_lock); list_for_each_entry_safe(c, tmp, &mrt->mfc_unres_queue, list) { list_del(&c->list); cache = (struct mfc_cache *)c; mroute_netlink_event(mrt, cache, RTM_DELROUTE); ipmr_destroy_unres(mrt, cache); } spin_unlock_bh(&mfc_unres_lock); } } } /* called from ip_ra_control(), before an RCU grace period, * we don't need to call synchronize_rcu() here */ static void mrtsock_destruct(struct sock *sk) { struct net *net = sock_net(sk); struct mr_table *mrt; rtnl_lock(); ipmr_for_each_table(mrt, net) { if (sk == rtnl_dereference(mrt->mroute_sk)) { IPV4_DEVCONF_ALL(net, MC_FORWARDING)--; inet_netconf_notify_devconf(net, RTM_NEWNETCONF, NETCONFA_MC_FORWARDING, NETCONFA_IFINDEX_ALL, net->ipv4.devconf_all); RCU_INIT_POINTER(mrt->mroute_sk, NULL); mroute_clean_tables(mrt, MRT_FLUSH_VIFS | MRT_FLUSH_MFC); } } rtnl_unlock(); } /* Socket options and virtual interface manipulation. The whole * virtual interface system is a complete heap, but unfortunately * that's how BSD mrouted happens to think. Maybe one day with a proper * MOSPF/PIM router set up we can clean this up. */ int ip_mroute_setsockopt(struct sock *sk, int optname, sockptr_t optval, unsigned int optlen) { struct net *net = sock_net(sk); int val, ret = 0, parent = 0; struct mr_table *mrt; struct vifctl vif; struct mfcctl mfc; bool do_wrvifwhole; u32 uval; /* There's one exception to the lock - MRT_DONE which needs to unlock */ rtnl_lock(); if (sk->sk_type != SOCK_RAW || inet_sk(sk)->inet_num != IPPROTO_IGMP) { ret = -EOPNOTSUPP; goto out_unlock; } mrt = __ipmr_get_table(net, raw_sk(sk)->ipmr_table ? : RT_TABLE_DEFAULT); if (!mrt) { ret = -ENOENT; goto out_unlock; } if (optname != MRT_INIT) { if (sk != rcu_access_pointer(mrt->mroute_sk) && !ns_capable(net->user_ns, CAP_NET_ADMIN)) { ret = -EACCES; goto out_unlock; } } switch (optname) { case MRT_INIT: if (optlen != sizeof(int)) { ret = -EINVAL; break; } if (rtnl_dereference(mrt->mroute_sk)) { ret = -EADDRINUSE; break; } ret = ip_ra_control(sk, 1, mrtsock_destruct); if (ret == 0) { rcu_assign_pointer(mrt->mroute_sk, sk); IPV4_DEVCONF_ALL(net, MC_FORWARDING)++; inet_netconf_notify_devconf(net, RTM_NEWNETCONF, NETCONFA_MC_FORWARDING, NETCONFA_IFINDEX_ALL, net->ipv4.devconf_all); } break; case MRT_DONE: if (sk != rcu_access_pointer(mrt->mroute_sk)) { ret = -EACCES; } else { /* We need to unlock here because mrtsock_destruct takes * care of rtnl itself and we can't change that due to * the IP_ROUTER_ALERT setsockopt which runs without it. */ rtnl_unlock(); ret = ip_ra_control(sk, 0, NULL); goto out; } break; case MRT_ADD_VIF: case MRT_DEL_VIF: if (optlen != sizeof(vif)) { ret = -EINVAL; break; } if (copy_from_sockptr(&vif, optval, sizeof(vif))) { ret = -EFAULT; break; } if (vif.vifc_vifi >= MAXVIFS) { ret = -ENFILE; break; } if (optname == MRT_ADD_VIF) { ret = vif_add(net, mrt, &vif, sk == rtnl_dereference(mrt->mroute_sk)); } else { ret = vif_delete(mrt, vif.vifc_vifi, 0, NULL); } break; /* Manipulate the forwarding caches. These live * in a sort of kernel/user symbiosis. */ case MRT_ADD_MFC: case MRT_DEL_MFC: parent = -1; fallthrough; case MRT_ADD_MFC_PROXY: case MRT_DEL_MFC_PROXY: if (optlen != sizeof(mfc)) { ret = -EINVAL; break; } if (copy_from_sockptr(&mfc, optval, sizeof(mfc))) { ret = -EFAULT; break; } if (parent == 0) parent = mfc.mfcc_parent; if (optname == MRT_DEL_MFC || optname == MRT_DEL_MFC_PROXY) ret = ipmr_mfc_delete(mrt, &mfc, parent); else ret = ipmr_mfc_add(net, mrt, &mfc, sk == rtnl_dereference(mrt->mroute_sk), parent); break; case MRT_FLUSH: if (optlen != sizeof(val)) { ret = -EINVAL; break; } if (copy_from_sockptr(&val, optval, sizeof(val))) { ret = -EFAULT; break; } mroute_clean_tables(mrt, val); break; /* Control PIM assert. */ case MRT_ASSERT: if (optlen != sizeof(val)) { ret = -EINVAL; break; } if (copy_from_sockptr(&val, optval, sizeof(val))) { ret = -EFAULT; break; } mrt->mroute_do_assert = val; break; case MRT_PIM: if (!ipmr_pimsm_enabled()) { ret = -ENOPROTOOPT; break; } if (optlen != sizeof(val)) { ret = -EINVAL; break; } if (copy_from_sockptr(&val, optval, sizeof(val))) { ret = -EFAULT; break; } do_wrvifwhole = (val == IGMPMSG_WRVIFWHOLE); val = !!val; if (val != mrt->mroute_do_pim) { mrt->mroute_do_pim = val; mrt->mroute_do_assert = val; mrt->mroute_do_wrvifwhole = do_wrvifwhole; } break; case MRT_TABLE: if (!IS_BUILTIN(CONFIG_IP_MROUTE_MULTIPLE_TABLES)) { ret = -ENOPROTOOPT; break; } if (optlen != sizeof(uval)) { ret = -EINVAL; break; } if (copy_from_sockptr(&uval, optval, sizeof(uval))) { ret = -EFAULT; break; } if (sk == rtnl_dereference(mrt->mroute_sk)) { ret = -EBUSY; } else { mrt = ipmr_new_table(net, uval); if (IS_ERR(mrt)) ret = PTR_ERR(mrt); else raw_sk(sk)->ipmr_table = uval; } break; /* Spurious command, or MRT_VERSION which you cannot set. */ default: ret = -ENOPROTOOPT; } out_unlock: rtnl_unlock(); out: return ret; } /* Execute if this ioctl is a special mroute ioctl */ int ipmr_sk_ioctl(struct sock *sk, unsigned int cmd, void __user *arg) { switch (cmd) { /* These userspace buffers will be consumed by ipmr_ioctl() */ case SIOCGETVIFCNT: { struct sioc_vif_req buffer; return sock_ioctl_inout(sk, cmd, arg, &buffer, sizeof(buffer)); } case SIOCGETSGCNT: { struct sioc_sg_req buffer; return sock_ioctl_inout(sk, cmd, arg, &buffer, sizeof(buffer)); } } /* return code > 0 means that the ioctl was not executed */ return 1; } /* Getsock opt support for the multicast routing system. */ int ip_mroute_getsockopt(struct sock *sk, int optname, sockptr_t optval, sockptr_t optlen) { int olr; int val; struct net *net = sock_net(sk); struct mr_table *mrt; if (sk->sk_type != SOCK_RAW || inet_sk(sk)->inet_num != IPPROTO_IGMP) return -EOPNOTSUPP; mrt = ipmr_get_table(net, raw_sk(sk)->ipmr_table ? : RT_TABLE_DEFAULT); if (!mrt) return -ENOENT; switch (optname) { case MRT_VERSION: val = 0x0305; break; case MRT_PIM: if (!ipmr_pimsm_enabled()) return -ENOPROTOOPT; val = mrt->mroute_do_pim; break; case MRT_ASSERT: val = mrt->mroute_do_assert; break; default: return -ENOPROTOOPT; } if (copy_from_sockptr(&olr, optlen, sizeof(int))) return -EFAULT; if (olr < 0) return -EINVAL; olr = min_t(unsigned int, olr, sizeof(int)); if (copy_to_sockptr(optlen, &olr, sizeof(int))) return -EFAULT; if (copy_to_sockptr(optval, &val, olr)) return -EFAULT; return 0; } /* The IP multicast ioctl support routines. */ int ipmr_ioctl(struct sock *sk, int cmd, void *arg) { struct vif_device *vif; struct mfc_cache *c; struct net *net = sock_net(sk); struct sioc_vif_req *vr; struct sioc_sg_req *sr; struct mr_table *mrt; mrt = ipmr_get_table(net, raw_sk(sk)->ipmr_table ? : RT_TABLE_DEFAULT); if (!mrt) return -ENOENT; switch (cmd) { case SIOCGETVIFCNT: vr = (struct sioc_vif_req *)arg; if (vr->vifi >= mrt->maxvif) return -EINVAL; vr->vifi = array_index_nospec(vr->vifi, mrt->maxvif); rcu_read_lock(); vif = &mrt->vif_table[vr->vifi]; if (VIF_EXISTS(mrt, vr->vifi)) { vr->icount = READ_ONCE(vif->pkt_in); vr->ocount = READ_ONCE(vif->pkt_out); vr->ibytes = READ_ONCE(vif->bytes_in); vr->obytes = READ_ONCE(vif->bytes_out); rcu_read_unlock(); return 0; } rcu_read_unlock(); return -EADDRNOTAVAIL; case SIOCGETSGCNT: sr = (struct sioc_sg_req *)arg; rcu_read_lock(); c = ipmr_cache_find(mrt, sr->src.s_addr, sr->grp.s_addr); if (c) { sr->pktcnt = atomic_long_read(&c->_c.mfc_un.res.pkt); sr->bytecnt = atomic_long_read(&c->_c.mfc_un.res.bytes); sr->wrong_if = atomic_long_read(&c->_c.mfc_un.res.wrong_if); rcu_read_unlock(); return 0; } rcu_read_unlock(); return -EADDRNOTAVAIL; default: return -ENOIOCTLCMD; } } #ifdef CONFIG_COMPAT struct compat_sioc_sg_req { struct in_addr src; struct in_addr grp; compat_ulong_t pktcnt; compat_ulong_t bytecnt; compat_ulong_t wrong_if; }; struct compat_sioc_vif_req { vifi_t vifi; /* Which iface */ compat_ulong_t icount; compat_ulong_t ocount; compat_ulong_t ibytes; compat_ulong_t obytes; }; int ipmr_compat_ioctl(struct sock *sk, unsigned int cmd, void __user *arg) { struct compat_sioc_sg_req sr; struct compat_sioc_vif_req vr; struct vif_device *vif; struct mfc_cache *c; struct net *net = sock_net(sk); struct mr_table *mrt; mrt = ipmr_get_table(net, raw_sk(sk)->ipmr_table ? : RT_TABLE_DEFAULT); if (!mrt) return -ENOENT; switch (cmd) { case SIOCGETVIFCNT: if (copy_from_user(&vr, arg, sizeof(vr))) return -EFAULT; if (vr.vifi >= mrt->maxvif) return -EINVAL; vr.vifi = array_index_nospec(vr.vifi, mrt->maxvif); rcu_read_lock(); vif = &mrt->vif_table[vr.vifi]; if (VIF_EXISTS(mrt, vr.vifi)) { vr.icount = READ_ONCE(vif->pkt_in); vr.ocount = READ_ONCE(vif->pkt_out); vr.ibytes = READ_ONCE(vif->bytes_in); vr.obytes = READ_ONCE(vif->bytes_out); rcu_read_unlock(); if (copy_to_user(arg, &vr, sizeof(vr))) return -EFAULT; return 0; } rcu_read_unlock(); return -EADDRNOTAVAIL; case SIOCGETSGCNT: if (copy_from_user(&sr, arg, sizeof(sr))) return -EFAULT; rcu_read_lock(); c = ipmr_cache_find(mrt, sr.src.s_addr, sr.grp.s_addr); if (c) { sr.pktcnt = atomic_long_read(&c->_c.mfc_un.res.pkt); sr.bytecnt = atomic_long_read(&c->_c.mfc_un.res.bytes); sr.wrong_if = atomic_long_read(&c->_c.mfc_un.res.wrong_if); rcu_read_unlock(); if (copy_to_user(arg, &sr, sizeof(sr))) return -EFAULT; return 0; } rcu_read_unlock(); return -EADDRNOTAVAIL; default: return -ENOIOCTLCMD; } } #endif static int ipmr_device_event(struct notifier_block *this, unsigned long event, void *ptr) { struct net_device *dev = netdev_notifier_info_to_dev(ptr); struct net *net = dev_net(dev); struct mr_table *mrt; struct vif_device *v; int ct; if (event != NETDEV_UNREGISTER) return NOTIFY_DONE; ipmr_for_each_table(mrt, net) { v = &mrt->vif_table[0]; for (ct = 0; ct < mrt->maxvif; ct++, v++) { if (rcu_access_pointer(v->dev) == dev) vif_delete(mrt, ct, 1, NULL); } } return NOTIFY_DONE; } static struct notifier_block ip_mr_notifier = { .notifier_call = ipmr_device_event, }; /* Encapsulate a packet by attaching a valid IPIP header to it. * This avoids tunnel drivers and other mess and gives us the speed so * important for multicast video. */ static void ip_encap(struct net *net, struct sk_buff *skb, __be32 saddr, __be32 daddr) { struct iphdr *iph; const struct iphdr *old_iph = ip_hdr(skb); skb_push(skb, sizeof(struct iphdr)); skb->transport_header = skb->network_header; skb_reset_network_header(skb); iph = ip_hdr(skb); iph->version = 4; iph->tos = old_iph->tos; iph->ttl = old_iph->ttl; iph->frag_off = 0; iph->daddr = daddr; iph->saddr = saddr; iph->protocol = IPPROTO_IPIP; iph->ihl = 5; iph->tot_len = htons(skb->len); ip_select_ident(net, skb, NULL); ip_send_check(iph); memset(&(IPCB(skb)->opt), 0, sizeof(IPCB(skb)->opt)); nf_reset_ct(skb); } static inline int ipmr_forward_finish(struct net *net, struct sock *sk, struct sk_buff *skb) { struct ip_options *opt = &(IPCB(skb)->opt); IP_INC_STATS(net, IPSTATS_MIB_OUTFORWDATAGRAMS); if (unlikely(opt->optlen)) ip_forward_options(skb); return dst_output(net, sk, skb); } #ifdef CONFIG_NET_SWITCHDEV static bool ipmr_forward_offloaded(struct sk_buff *skb, struct mr_table *mrt, int in_vifi, int out_vifi) { struct vif_device *out_vif = &mrt->vif_table[out_vifi]; struct vif_device *in_vif = &mrt->vif_table[in_vifi]; if (!skb->offload_l3_fwd_mark) return false; if (!out_vif->dev_parent_id.id_len || !in_vif->dev_parent_id.id_len) return false; return netdev_phys_item_id_same(&out_vif->dev_parent_id, &in_vif->dev_parent_id); } #else static bool ipmr_forward_offloaded(struct sk_buff *skb, struct mr_table *mrt, int in_vifi, int out_vifi) { return false; } #endif /* Processing handlers for ipmr_forward, under rcu_read_lock() */ static void ipmr_queue_xmit(struct net *net, struct mr_table *mrt, int in_vifi, struct sk_buff *skb, int vifi) { const struct iphdr *iph = ip_hdr(skb); struct vif_device *vif = &mrt->vif_table[vifi]; struct net_device *vif_dev; struct net_device *dev; struct rtable *rt; struct flowi4 fl4; int encap = 0; vif_dev = vif_dev_read(vif); if (!vif_dev) goto out_free; if (vif->flags & VIFF_REGISTER) { WRITE_ONCE(vif->pkt_out, vif->pkt_out + 1); WRITE_ONCE(vif->bytes_out, vif->bytes_out + skb->len); DEV_STATS_ADD(vif_dev, tx_bytes, skb->len); DEV_STATS_INC(vif_dev, tx_packets); ipmr_cache_report(mrt, skb, vifi, IGMPMSG_WHOLEPKT); goto out_free; } if (ipmr_forward_offloaded(skb, mrt, in_vifi, vifi)) goto out_free; if (vif->flags & VIFF_TUNNEL) { rt = ip_route_output_ports(net, &fl4, NULL, vif->remote, vif->local, 0, 0, IPPROTO_IPIP, iph->tos & INET_DSCP_MASK, vif->link); if (IS_ERR(rt)) goto out_free; encap = sizeof(struct iphdr); } else { rt = ip_route_output_ports(net, &fl4, NULL, iph->daddr, 0, 0, 0, IPPROTO_IPIP, iph->tos & INET_DSCP_MASK, vif->link); if (IS_ERR(rt)) goto out_free; } dev = rt->dst.dev; if (skb->len+encap > dst_mtu(&rt->dst) && (ntohs(iph->frag_off) & IP_DF)) { /* Do not fragment multicasts. Alas, IPv4 does not * allow to send ICMP, so that packets will disappear * to blackhole. */ IP_INC_STATS(net, IPSTATS_MIB_FRAGFAILS); ip_rt_put(rt); goto out_free; } encap += LL_RESERVED_SPACE(dev) + rt->dst.header_len; if (skb_cow(skb, encap)) { ip_rt_put(rt); goto out_free; } WRITE_ONCE(vif->pkt_out, vif->pkt_out + 1); WRITE_ONCE(vif->bytes_out, vif->bytes_out + skb->len); skb_dst_drop(skb); skb_dst_set(skb, &rt->dst); ip_decrease_ttl(ip_hdr(skb)); /* FIXME: forward and output firewalls used to be called here. * What do we do with netfilter? -- RR */ if (vif->flags & VIFF_TUNNEL) { ip_encap(net, skb, vif->local, vif->remote); /* FIXME: extra output firewall step used to be here. --RR */ DEV_STATS_INC(vif_dev, tx_packets); DEV_STATS_ADD(vif_dev, tx_bytes, skb->len); } IPCB(skb)->flags |= IPSKB_FORWARDED; /* RFC1584 teaches, that DVMRP/PIM router must deliver packets locally * not only before forwarding, but after forwarding on all output * interfaces. It is clear, if mrouter runs a multicasting * program, it should receive packets not depending to what interface * program is joined. * If we will not make it, the program will have to join on all * interfaces. On the other hand, multihoming host (or router, but * not mrouter) cannot join to more than one interface - it will * result in receiving multiple packets. */ NF_HOOK(NFPROTO_IPV4, NF_INET_FORWARD, net, NULL, skb, skb->dev, dev, ipmr_forward_finish); return; out_free: kfree_skb(skb); } /* Called with mrt_lock or rcu_read_lock() */ static int ipmr_find_vif(const struct mr_table *mrt, struct net_device *dev) { int ct; /* Pairs with WRITE_ONCE() in vif_delete()/vif_add() */ for (ct = READ_ONCE(mrt->maxvif) - 1; ct >= 0; ct--) { if (rcu_access_pointer(mrt->vif_table[ct].dev) == dev) break; } return ct; } /* "local" means that we should preserve one skb (for local delivery) */ /* Called uner rcu_read_lock() */ static void ip_mr_forward(struct net *net, struct mr_table *mrt, struct net_device *dev, struct sk_buff *skb, struct mfc_cache *c, int local) { int true_vifi = ipmr_find_vif(mrt, dev); int psend = -1; int vif, ct; vif = c->_c.mfc_parent; atomic_long_inc(&c->_c.mfc_un.res.pkt); atomic_long_add(skb->len, &c->_c.mfc_un.res.bytes); WRITE_ONCE(c->_c.mfc_un.res.lastuse, jiffies); if (c->mfc_origin == htonl(INADDR_ANY) && true_vifi >= 0) { struct mfc_cache *cache_proxy; /* For an (*,G) entry, we only check that the incoming * interface is part of the static tree. */ cache_proxy = mr_mfc_find_any_parent(mrt, vif); if (cache_proxy && cache_proxy->_c.mfc_un.res.ttls[true_vifi] < 255) goto forward; } /* Wrong interface: drop packet and (maybe) send PIM assert. */ if (rcu_access_pointer(mrt->vif_table[vif].dev) != dev) { if (rt_is_output_route(skb_rtable(skb))) { /* It is our own packet, looped back. * Very complicated situation... * * The best workaround until routing daemons will be * fixed is not to redistribute packet, if it was * send through wrong interface. It means, that * multicast applications WILL NOT work for * (S,G), which have default multicast route pointing * to wrong oif. In any case, it is not a good * idea to use multicasting applications on router. */ goto dont_forward; } atomic_long_inc(&c->_c.mfc_un.res.wrong_if); if (true_vifi >= 0 && mrt->mroute_do_assert && /* pimsm uses asserts, when switching from RPT to SPT, * so that we cannot check that packet arrived on an oif. * It is bad, but otherwise we would need to move pretty * large chunk of pimd to kernel. Ough... --ANK */ (mrt->mroute_do_pim || c->_c.mfc_un.res.ttls[true_vifi] < 255) && time_after(jiffies, c->_c.mfc_un.res.last_assert + MFC_ASSERT_THRESH)) { c->_c.mfc_un.res.last_assert = jiffies; ipmr_cache_report(mrt, skb, true_vifi, IGMPMSG_WRONGVIF); if (mrt->mroute_do_wrvifwhole) ipmr_cache_report(mrt, skb, true_vifi, IGMPMSG_WRVIFWHOLE); } goto dont_forward; } forward: WRITE_ONCE(mrt->vif_table[vif].pkt_in, mrt->vif_table[vif].pkt_in + 1); WRITE_ONCE(mrt->vif_table[vif].bytes_in, mrt->vif_table[vif].bytes_in + skb->len); /* Forward the frame */ if (c->mfc_origin == htonl(INADDR_ANY) && c->mfc_mcastgrp == htonl(INADDR_ANY)) { if (true_vifi >= 0 && true_vifi != c->_c.mfc_parent && ip_hdr(skb)->ttl > c->_c.mfc_un.res.ttls[c->_c.mfc_parent]) { /* It's an (*,*) entry and the packet is not coming from * the upstream: forward the packet to the upstream * only. */ psend = c->_c.mfc_parent; goto last_forward; } goto dont_forward; } for (ct = c->_c.mfc_un.res.maxvif - 1; ct >= c->_c.mfc_un.res.minvif; ct--) { /* For (*,G) entry, don't forward to the incoming interface */ if ((c->mfc_origin != htonl(INADDR_ANY) || ct != true_vifi) && ip_hdr(skb)->ttl > c->_c.mfc_un.res.ttls[ct]) { if (psend != -1) { struct sk_buff *skb2 = skb_clone(skb, GFP_ATOMIC); if (skb2) ipmr_queue_xmit(net, mrt, true_vifi, skb2, psend); } psend = ct; } } last_forward: if (psend != -1) { if (local) { struct sk_buff *skb2 = skb_clone(skb, GFP_ATOMIC); if (skb2) ipmr_queue_xmit(net, mrt, true_vifi, skb2, psend); } else { ipmr_queue_xmit(net, mrt, true_vifi, skb, psend); return; } } dont_forward: if (!local) kfree_skb(skb); } static struct mr_table *ipmr_rt_fib_lookup(struct net *net, struct sk_buff *skb) { struct rtable *rt = skb_rtable(skb); struct iphdr *iph = ip_hdr(skb); struct flowi4 fl4 = { .daddr = iph->daddr, .saddr = iph->saddr, .flowi4_tos = inet_dscp_to_dsfield(ip4h_dscp(iph)), .flowi4_oif = (rt_is_output_route(rt) ? skb->dev->ifindex : 0), .flowi4_iif = (rt_is_output_route(rt) ? LOOPBACK_IFINDEX : skb->dev->ifindex), .flowi4_mark = skb->mark, }; struct mr_table *mrt; int err; err = ipmr_fib_lookup(net, &fl4, &mrt); if (err) return ERR_PTR(err); return mrt; } /* Multicast packets for forwarding arrive here * Called with rcu_read_lock(); */ int ip_mr_input(struct sk_buff *skb) { struct mfc_cache *cache; struct net *net = dev_net(skb->dev); int local = skb_rtable(skb)->rt_flags & RTCF_LOCAL; struct mr_table *mrt; struct net_device *dev; /* skb->dev passed in is the loX master dev for vrfs. * As there are no vifs associated with loopback devices, * get the proper interface that does have a vif associated with it. */ dev = skb->dev; if (netif_is_l3_master(skb->dev)) { dev = dev_get_by_index_rcu(net, IPCB(skb)->iif); if (!dev) { kfree_skb(skb); return -ENODEV; } } /* Packet is looped back after forward, it should not be * forwarded second time, but still can be delivered locally. */ if (IPCB(skb)->flags & IPSKB_FORWARDED) goto dont_forward; mrt = ipmr_rt_fib_lookup(net, skb); if (IS_ERR(mrt)) { kfree_skb(skb); return PTR_ERR(mrt); } if (!local) { if (IPCB(skb)->opt.router_alert) { if (ip_call_ra_chain(skb)) return 0; } else if (ip_hdr(skb)->protocol == IPPROTO_IGMP) { /* IGMPv1 (and broken IGMPv2 implementations sort of * Cisco IOS <= 11.2(8)) do not put router alert * option to IGMP packets destined to routable * groups. It is very bad, because it means * that we can forward NO IGMP messages. */ struct sock *mroute_sk; mroute_sk = rcu_dereference(mrt->mroute_sk); if (mroute_sk) { nf_reset_ct(skb); raw_rcv(mroute_sk, skb); return 0; } } } /* already under rcu_read_lock() */ cache = ipmr_cache_find(mrt, ip_hdr(skb)->saddr, ip_hdr(skb)->daddr); if (!cache) { int vif = ipmr_find_vif(mrt, dev); if (vif >= 0) cache = ipmr_cache_find_any(mrt, ip_hdr(skb)->daddr, vif); } /* No usable cache entry */ if (!cache) { int vif; if (local) { struct sk_buff *skb2 = skb_clone(skb, GFP_ATOMIC); ip_local_deliver(skb); if (!skb2) return -ENOBUFS; skb = skb2; } vif = ipmr_find_vif(mrt, dev); if (vif >= 0) return ipmr_cache_unresolved(mrt, vif, skb, dev); kfree_skb(skb); return -ENODEV; } ip_mr_forward(net, mrt, dev, skb, cache, local); if (local) return ip_local_deliver(skb); return 0; dont_forward: if (local) return ip_local_deliver(skb); kfree_skb(skb); return 0; } #ifdef CONFIG_IP_PIMSM_V1 /* Handle IGMP messages of PIMv1 */ int pim_rcv_v1(struct sk_buff *skb) { struct igmphdr *pim; struct net *net = dev_net(skb->dev); struct mr_table *mrt; if (!pskb_may_pull(skb, sizeof(*pim) + sizeof(struct iphdr))) goto drop; pim = igmp_hdr(skb); mrt = ipmr_rt_fib_lookup(net, skb); if (IS_ERR(mrt)) goto drop; if (!mrt->mroute_do_pim || pim->group != PIM_V1_VERSION || pim->code != PIM_V1_REGISTER) goto drop; if (__pim_rcv(mrt, skb, sizeof(*pim))) { drop: kfree_skb(skb); } return 0; } #endif #ifdef CONFIG_IP_PIMSM_V2 static int pim_rcv(struct sk_buff *skb) { struct pimreghdr *pim; struct net *net = dev_net(skb->dev); struct mr_table *mrt; if (!pskb_may_pull(skb, sizeof(*pim) + sizeof(struct iphdr))) goto drop; pim = (struct pimreghdr *)skb_transport_header(skb); if (pim->type != ((PIM_VERSION << 4) | (PIM_TYPE_REGISTER)) || (pim->flags & PIM_NULL_REGISTER) || (ip_compute_csum((void *)pim, sizeof(*pim)) != 0 && csum_fold(skb_checksum(skb, 0, skb->len, 0)))) goto drop; mrt = ipmr_rt_fib_lookup(net, skb); if (IS_ERR(mrt)) goto drop; if (__pim_rcv(mrt, skb, sizeof(*pim))) { drop: kfree_skb(skb); } return 0; } #endif int ipmr_get_route(struct net *net, struct sk_buff *skb, __be32 saddr, __be32 daddr, struct rtmsg *rtm, u32 portid) { struct mfc_cache *cache; struct mr_table *mrt; int err; rcu_read_lock(); mrt = __ipmr_get_table(net, RT_TABLE_DEFAULT); if (!mrt) { rcu_read_unlock(); return -ENOENT; } cache = ipmr_cache_find(mrt, saddr, daddr); if (!cache && skb->dev) { int vif = ipmr_find_vif(mrt, skb->dev); if (vif >= 0) cache = ipmr_cache_find_any(mrt, daddr, vif); } if (!cache) { struct sk_buff *skb2; struct iphdr *iph; struct net_device *dev; int vif = -1; dev = skb->dev; if (dev) vif = ipmr_find_vif(mrt, dev); if (vif < 0) { rcu_read_unlock(); return -ENODEV; } skb2 = skb_realloc_headroom(skb, sizeof(struct iphdr)); if (!skb2) { rcu_read_unlock(); return -ENOMEM; } NETLINK_CB(skb2).portid = portid; skb_push(skb2, sizeof(struct iphdr)); skb_reset_network_header(skb2); iph = ip_hdr(skb2); iph->ihl = sizeof(struct iphdr) >> 2; iph->saddr = saddr; iph->daddr = daddr; iph->version = 0; err = ipmr_cache_unresolved(mrt, vif, skb2, dev); rcu_read_unlock(); return err; } err = mr_fill_mroute(mrt, skb, &cache->_c, rtm); rcu_read_unlock(); return err; } static int ipmr_fill_mroute(struct mr_table *mrt, struct sk_buff *skb, u32 portid, u32 seq, struct mfc_cache *c, int cmd, int flags) { struct nlmsghdr *nlh; struct rtmsg *rtm; int err; nlh = nlmsg_put(skb, portid, seq, cmd, sizeof(*rtm), flags); if (!nlh) return -EMSGSIZE; rtm = nlmsg_data(nlh); rtm->rtm_family = RTNL_FAMILY_IPMR; rtm->rtm_dst_len = 32; rtm->rtm_src_len = 32; rtm->rtm_tos = 0; rtm->rtm_table = mrt->id; if (nla_put_u32(skb, RTA_TABLE, mrt->id)) goto nla_put_failure; rtm->rtm_type = RTN_MULTICAST; rtm->rtm_scope = RT_SCOPE_UNIVERSE; if (c->_c.mfc_flags & MFC_STATIC) rtm->rtm_protocol = RTPROT_STATIC; else rtm->rtm_protocol = RTPROT_MROUTED; rtm->rtm_flags = 0; if (nla_put_in_addr(skb, RTA_SRC, c->mfc_origin) || nla_put_in_addr(skb, RTA_DST, c->mfc_mcastgrp)) goto nla_put_failure; err = mr_fill_mroute(mrt, skb, &c->_c, rtm); /* do not break the dump if cache is unresolved */ if (err < 0 && err != -ENOENT) goto nla_put_failure; nlmsg_end(skb, nlh); return 0; nla_put_failure: nlmsg_cancel(skb, nlh); return -EMSGSIZE; } static int _ipmr_fill_mroute(struct mr_table *mrt, struct sk_buff *skb, u32 portid, u32 seq, struct mr_mfc *c, int cmd, int flags) { return ipmr_fill_mroute(mrt, skb, portid, seq, (struct mfc_cache *)c, cmd, flags); } static size_t mroute_msgsize(bool unresolved, int maxvif) { size_t len = NLMSG_ALIGN(sizeof(struct rtmsg)) + nla_total_size(4) /* RTA_TABLE */ + nla_total_size(4) /* RTA_SRC */ + nla_total_size(4) /* RTA_DST */ ; if (!unresolved) len = len + nla_total_size(4) /* RTA_IIF */ + nla_total_size(0) /* RTA_MULTIPATH */ + maxvif * NLA_ALIGN(sizeof(struct rtnexthop)) /* RTA_MFC_STATS */ + nla_total_size_64bit(sizeof(struct rta_mfc_stats)) ; return len; } static void mroute_netlink_event(struct mr_table *mrt, struct mfc_cache *mfc, int cmd) { struct net *net = read_pnet(&mrt->net); struct sk_buff *skb; int err = -ENOBUFS; skb = nlmsg_new(mroute_msgsize(mfc->_c.mfc_parent >= MAXVIFS, mrt->maxvif), GFP_ATOMIC); if (!skb) goto errout; err = ipmr_fill_mroute(mrt, skb, 0, 0, mfc, cmd, 0); if (err < 0) goto errout; rtnl_notify(skb, net, 0, RTNLGRP_IPV4_MROUTE, NULL, GFP_ATOMIC); return; errout: kfree_skb(skb); rtnl_set_sk_err(net, RTNLGRP_IPV4_MROUTE, err); } static size_t igmpmsg_netlink_msgsize(size_t payloadlen) { size_t len = NLMSG_ALIGN(sizeof(struct rtgenmsg)) + nla_total_size(1) /* IPMRA_CREPORT_MSGTYPE */ + nla_total_size(4) /* IPMRA_CREPORT_VIF_ID */ + nla_total_size(4) /* IPMRA_CREPORT_SRC_ADDR */ + nla_total_size(4) /* IPMRA_CREPORT_DST_ADDR */ + nla_total_size(4) /* IPMRA_CREPORT_TABLE */ /* IPMRA_CREPORT_PKT */ + nla_total_size(payloadlen) ; return len; } static void igmpmsg_netlink_event(const struct mr_table *mrt, struct sk_buff *pkt) { struct net *net = read_pnet(&mrt->net); struct nlmsghdr *nlh; struct rtgenmsg *rtgenm; struct igmpmsg *msg; struct sk_buff *skb; struct nlattr *nla; int payloadlen; payloadlen = pkt->len - sizeof(struct igmpmsg); msg = (struct igmpmsg *)skb_network_header(pkt); skb = nlmsg_new(igmpmsg_netlink_msgsize(payloadlen), GFP_ATOMIC); if (!skb) goto errout; nlh = nlmsg_put(skb, 0, 0, RTM_NEWCACHEREPORT, sizeof(struct rtgenmsg), 0); if (!nlh) goto errout; rtgenm = nlmsg_data(nlh); rtgenm->rtgen_family = RTNL_FAMILY_IPMR; if (nla_put_u8(skb, IPMRA_CREPORT_MSGTYPE, msg->im_msgtype) || nla_put_u32(skb, IPMRA_CREPORT_VIF_ID, msg->im_vif | (msg->im_vif_hi << 8)) || nla_put_in_addr(skb, IPMRA_CREPORT_SRC_ADDR, msg->im_src.s_addr) || nla_put_in_addr(skb, IPMRA_CREPORT_DST_ADDR, msg->im_dst.s_addr) || nla_put_u32(skb, IPMRA_CREPORT_TABLE, mrt->id)) goto nla_put_failure; nla = nla_reserve(skb, IPMRA_CREPORT_PKT, payloadlen); if (!nla || skb_copy_bits(pkt, sizeof(struct igmpmsg), nla_data(nla), payloadlen)) goto nla_put_failure; nlmsg_end(skb, nlh); rtnl_notify(skb, net, 0, RTNLGRP_IPV4_MROUTE_R, NULL, GFP_ATOMIC); return; nla_put_failure: nlmsg_cancel(skb, nlh); errout: kfree_skb(skb); rtnl_set_sk_err(net, RTNLGRP_IPV4_MROUTE_R, -ENOBUFS); } static int ipmr_rtm_valid_getroute_req(struct sk_buff *skb, const struct nlmsghdr *nlh, struct nlattr **tb, struct netlink_ext_ack *extack) { struct rtmsg *rtm; int i, err; rtm = nlmsg_payload(nlh, sizeof(*rtm)); if (!rtm) { NL_SET_ERR_MSG(extack, "ipv4: Invalid header for multicast route get request"); return -EINVAL; } if (!netlink_strict_get_check(skb)) return nlmsg_parse_deprecated(nlh, sizeof(*rtm), tb, RTA_MAX, rtm_ipv4_policy, extack); if ((rtm->rtm_src_len && rtm->rtm_src_len != 32) || (rtm->rtm_dst_len && rtm->rtm_dst_len != 32) || rtm->rtm_tos || rtm->rtm_table || rtm->rtm_protocol || rtm->rtm_scope || rtm->rtm_type || rtm->rtm_flags) { NL_SET_ERR_MSG(extack, "ipv4: Invalid values in header for multicast route get request"); return -EINVAL; } err = nlmsg_parse_deprecated_strict(nlh, sizeof(*rtm), tb, RTA_MAX, rtm_ipv4_policy, extack); if (err) return err; if ((tb[RTA_SRC] && !rtm->rtm_src_len) || (tb[RTA_DST] && !rtm->rtm_dst_len)) { NL_SET_ERR_MSG(extack, "ipv4: rtm_src_len and rtm_dst_len must be 32 for IPv4"); return -EINVAL; } for (i = 0; i <= RTA_MAX; i++) { if (!tb[i]) continue; switch (i) { case RTA_SRC: case RTA_DST: case RTA_TABLE: break; default: NL_SET_ERR_MSG(extack, "ipv4: Unsupported attribute in multicast route get request"); return -EINVAL; } } return 0; } static int ipmr_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr *nlh, struct netlink_ext_ack *extack) { struct net *net = sock_net(in_skb->sk); struct nlattr *tb[RTA_MAX + 1]; struct sk_buff *skb = NULL; struct mfc_cache *cache; struct mr_table *mrt; __be32 src, grp; u32 tableid; int err; err = ipmr_rtm_valid_getroute_req(in_skb, nlh, tb, extack); if (err < 0) goto errout; src = nla_get_in_addr_default(tb[RTA_SRC], 0); grp = nla_get_in_addr_default(tb[RTA_DST], 0); tableid = nla_get_u32_default(tb[RTA_TABLE], 0); mrt = __ipmr_get_table(net, tableid ? tableid : RT_TABLE_DEFAULT); if (!mrt) { err = -ENOENT; goto errout_free; } /* entries are added/deleted only under RTNL */ rcu_read_lock(); cache = ipmr_cache_find(mrt, src, grp); rcu_read_unlock(); if (!cache) { err = -ENOENT; goto errout_free; } skb = nlmsg_new(mroute_msgsize(false, mrt->maxvif), GFP_KERNEL); if (!skb) { err = -ENOBUFS; goto errout_free; } err = ipmr_fill_mroute(mrt, skb, NETLINK_CB(in_skb).portid, nlh->nlmsg_seq, cache, RTM_NEWROUTE, 0); if (err < 0) goto errout_free; err = rtnl_unicast(skb, net, NETLINK_CB(in_skb).portid); errout: return err; errout_free: kfree_skb(skb); goto errout; } static int ipmr_rtm_dumproute(struct sk_buff *skb, struct netlink_callback *cb) { struct fib_dump_filter filter = { .rtnl_held = true, }; int err; if (cb->strict_check) { err = ip_valid_fib_dump_req(sock_net(skb->sk), cb->nlh, &filter, cb); if (err < 0) return err; } if (filter.table_id) { struct mr_table *mrt; mrt = __ipmr_get_table(sock_net(skb->sk), filter.table_id); if (!mrt) { if (rtnl_msg_family(cb->nlh) != RTNL_FAMILY_IPMR) return skb->len; NL_SET_ERR_MSG(cb->extack, "ipv4: MR table does not exist"); return -ENOENT; } err = mr_table_dump(mrt, skb, cb, _ipmr_fill_mroute, &mfc_unres_lock, &filter); return skb->len ? : err; } return mr_rtm_dumproute(skb, cb, ipmr_mr_table_iter, _ipmr_fill_mroute, &mfc_unres_lock, &filter); } static const struct nla_policy rtm_ipmr_policy[RTA_MAX + 1] = { [RTA_SRC] = { .type = NLA_U32 }, [RTA_DST] = { .type = NLA_U32 }, [RTA_IIF] = { .type = NLA_U32 }, [RTA_TABLE] = { .type = NLA_U32 }, [RTA_MULTIPATH] = { .len = sizeof(struct rtnexthop) }, }; static bool ipmr_rtm_validate_proto(unsigned char rtm_protocol) { switch (rtm_protocol) { case RTPROT_STATIC: case RTPROT_MROUTED: return true; } return false; } static int ipmr_nla_get_ttls(const struct nlattr *nla, struct mfcctl *mfcc) { struct rtnexthop *rtnh = nla_data(nla); int remaining = nla_len(nla), vifi = 0; while (rtnh_ok(rtnh, remaining)) { mfcc->mfcc_ttls[vifi] = rtnh->rtnh_hops; if (++vifi == MAXVIFS) break; rtnh = rtnh_next(rtnh, &remaining); } return remaining > 0 ? -EINVAL : vifi; } /* returns < 0 on error, 0 for ADD_MFC and 1 for ADD_MFC_PROXY */ static int rtm_to_ipmr_mfcc(struct net *net, struct nlmsghdr *nlh, struct mfcctl *mfcc, int *mrtsock, struct mr_table **mrtret, struct netlink_ext_ack *extack) { struct net_device *dev = NULL; u32 tblid = RT_TABLE_DEFAULT; struct mr_table *mrt; struct nlattr *attr; struct rtmsg *rtm; int ret, rem; ret = nlmsg_validate_deprecated(nlh, sizeof(*rtm), RTA_MAX, rtm_ipmr_policy, extack); if (ret < 0) goto out; rtm = nlmsg_data(nlh); ret = -EINVAL; if (rtm->rtm_family != RTNL_FAMILY_IPMR || rtm->rtm_dst_len != 32 || rtm->rtm_type != RTN_MULTICAST || rtm->rtm_scope != RT_SCOPE_UNIVERSE || !ipmr_rtm_validate_proto(rtm->rtm_protocol)) goto out; memset(mfcc, 0, sizeof(*mfcc)); mfcc->mfcc_parent = -1; ret = 0; nlmsg_for_each_attr(attr, nlh, sizeof(struct rtmsg), rem) { switch (nla_type(attr)) { case RTA_SRC: mfcc->mfcc_origin.s_addr = nla_get_be32(attr); break; case RTA_DST: mfcc->mfcc_mcastgrp.s_addr = nla_get_be32(attr); break; case RTA_IIF: dev = __dev_get_by_index(net, nla_get_u32(attr)); if (!dev) { ret = -ENODEV; goto out; } break; case RTA_MULTIPATH: if (ipmr_nla_get_ttls(attr, mfcc) < 0) { ret = -EINVAL; goto out; } break; case RTA_PREFSRC: ret = 1; break; case RTA_TABLE: tblid = nla_get_u32(attr); break; } } mrt = __ipmr_get_table(net, tblid); if (!mrt) { ret = -ENOENT; goto out; } *mrtret = mrt; *mrtsock = rtm->rtm_protocol == RTPROT_MROUTED ? 1 : 0; if (dev) mfcc->mfcc_parent = ipmr_find_vif(mrt, dev); out: return ret; } /* takes care of both newroute and delroute */ static int ipmr_rtm_route(struct sk_buff *skb, struct nlmsghdr *nlh, struct netlink_ext_ack *extack) { struct net *net = sock_net(skb->sk); int ret, mrtsock, parent; struct mr_table *tbl; struct mfcctl mfcc; mrtsock = 0; tbl = NULL; ret = rtm_to_ipmr_mfcc(net, nlh, &mfcc, &mrtsock, &tbl, extack); if (ret < 0) return ret; parent = ret ? mfcc.mfcc_parent : -1; if (nlh->nlmsg_type == RTM_NEWROUTE) return ipmr_mfc_add(net, tbl, &mfcc, mrtsock, parent); else return ipmr_mfc_delete(tbl, &mfcc, parent); } static bool ipmr_fill_table(struct mr_table *mrt, struct sk_buff *skb) { u32 queue_len = atomic_read(&mrt->cache_resolve_queue_len); if (nla_put_u32(skb, IPMRA_TABLE_ID, mrt->id) || nla_put_u32(skb, IPMRA_TABLE_CACHE_RES_QUEUE_LEN, queue_len) || nla_put_s32(skb, IPMRA_TABLE_MROUTE_REG_VIF_NUM, mrt->mroute_reg_vif_num) || nla_put_u8(skb, IPMRA_TABLE_MROUTE_DO_ASSERT, mrt->mroute_do_assert) || nla_put_u8(skb, IPMRA_TABLE_MROUTE_DO_PIM, mrt->mroute_do_pim) || nla_put_u8(skb, IPMRA_TABLE_MROUTE_DO_WRVIFWHOLE, mrt->mroute_do_wrvifwhole)) return false; return true; } static bool ipmr_fill_vif(struct mr_table *mrt, u32 vifid, struct sk_buff *skb) { struct net_device *vif_dev; struct nlattr *vif_nest; struct vif_device *vif; vif = &mrt->vif_table[vifid]; vif_dev = rtnl_dereference(vif->dev); /* if the VIF doesn't exist just continue */ if (!vif_dev) return true; vif_nest = nla_nest_start_noflag(skb, IPMRA_VIF); if (!vif_nest) return false; if (nla_put_u32(skb, IPMRA_VIFA_IFINDEX, vif_dev->ifindex) || nla_put_u32(skb, IPMRA_VIFA_VIF_ID, vifid) || nla_put_u16(skb, IPMRA_VIFA_FLAGS, vif->flags) || nla_put_u64_64bit(skb, IPMRA_VIFA_BYTES_IN, vif->bytes_in, IPMRA_VIFA_PAD) || nla_put_u64_64bit(skb, IPMRA_VIFA_BYTES_OUT, vif->bytes_out, IPMRA_VIFA_PAD) || nla_put_u64_64bit(skb, IPMRA_VIFA_PACKETS_IN, vif->pkt_in, IPMRA_VIFA_PAD) || nla_put_u64_64bit(skb, IPMRA_VIFA_PACKETS_OUT, vif->pkt_out, IPMRA_VIFA_PAD) || nla_put_be32(skb, IPMRA_VIFA_LOCAL_ADDR, vif->local) || nla_put_be32(skb, IPMRA_VIFA_REMOTE_ADDR, vif->remote)) { nla_nest_cancel(skb, vif_nest); return false; } nla_nest_end(skb, vif_nest); return true; } static int ipmr_valid_dumplink(const struct nlmsghdr *nlh, struct netlink_ext_ack *extack) { struct ifinfomsg *ifm; ifm = nlmsg_payload(nlh, sizeof(*ifm)); if (!ifm) { NL_SET_ERR_MSG(extack, "ipv4: Invalid header for ipmr link dump"); return -EINVAL; } if (nlmsg_attrlen(nlh, sizeof(*ifm))) { NL_SET_ERR_MSG(extack, "Invalid data after header in ipmr link dump"); return -EINVAL; } if (ifm->__ifi_pad || ifm->ifi_type || ifm->ifi_flags || ifm->ifi_change || ifm->ifi_index) { NL_SET_ERR_MSG(extack, "Invalid values in header for ipmr link dump request"); return -EINVAL; } return 0; } static int ipmr_rtm_dumplink(struct sk_buff *skb, struct netlink_callback *cb) { struct net *net = sock_net(skb->sk); struct nlmsghdr *nlh = NULL; unsigned int t = 0, s_t; unsigned int e = 0, s_e; struct mr_table *mrt; if (cb->strict_check) { int err = ipmr_valid_dumplink(cb->nlh, cb->extack); if (err < 0) return err; } s_t = cb->args[0]; s_e = cb->args[1]; ipmr_for_each_table(mrt, net) { struct nlattr *vifs, *af; struct ifinfomsg *hdr; u32 i; if (t < s_t) goto skip_table; nlh = nlmsg_put(skb, NETLINK_CB(cb->skb).portid, cb->nlh->nlmsg_seq, RTM_NEWLINK, sizeof(*hdr), NLM_F_MULTI); if (!nlh) break; hdr = nlmsg_data(nlh); memset(hdr, 0, sizeof(*hdr)); hdr->ifi_family = RTNL_FAMILY_IPMR; af = nla_nest_start_noflag(skb, IFLA_AF_SPEC); if (!af) { nlmsg_cancel(skb, nlh); goto out; } if (!ipmr_fill_table(mrt, skb)) { nlmsg_cancel(skb, nlh); goto out; } vifs = nla_nest_start_noflag(skb, IPMRA_TABLE_VIFS); if (!vifs) { nla_nest_end(skb, af); nlmsg_end(skb, nlh); goto out; } for (i = 0; i < mrt->maxvif; i++) { if (e < s_e) goto skip_entry; if (!ipmr_fill_vif(mrt, i, skb)) { nla_nest_end(skb, vifs); nla_nest_end(skb, af); nlmsg_end(skb, nlh); goto out; } skip_entry: e++; } s_e = 0; e = 0; nla_nest_end(skb, vifs); nla_nest_end(skb, af); nlmsg_end(skb, nlh); skip_table: t++; } out: cb->args[1] = e; cb->args[0] = t; return skb->len; } #ifdef CONFIG_PROC_FS /* The /proc interfaces to multicast routing : * /proc/net/ip_mr_cache & /proc/net/ip_mr_vif */ static void *ipmr_vif_seq_start(struct seq_file *seq, loff_t *pos) __acquires(RCU) { struct mr_vif_iter *iter = seq->private; struct net *net = seq_file_net(seq); struct mr_table *mrt; rcu_read_lock(); mrt = __ipmr_get_table(net, RT_TABLE_DEFAULT); if (!mrt) { rcu_read_unlock(); return ERR_PTR(-ENOENT); } iter->mrt = mrt; return mr_vif_seq_start(seq, pos); } static void ipmr_vif_seq_stop(struct seq_file *seq, void *v) __releases(RCU) { rcu_read_unlock(); } static int ipmr_vif_seq_show(struct seq_file *seq, void *v) { struct mr_vif_iter *iter = seq->private; struct mr_table *mrt = iter->mrt; if (v == SEQ_START_TOKEN) { seq_puts(seq, "Interface BytesIn PktsIn BytesOut PktsOut Flags Local Remote\n"); } else { const struct vif_device *vif = v; const struct net_device *vif_dev; const char *name; vif_dev = vif_dev_read(vif); name = vif_dev ? vif_dev->name : "none"; seq_printf(seq, "%2td %-10s %8ld %7ld %8ld %7ld %05X %08X %08X\n", vif - mrt->vif_table, name, vif->bytes_in, vif->pkt_in, vif->bytes_out, vif->pkt_out, vif->flags, vif->local, vif->remote); } return 0; } static const struct seq_operations ipmr_vif_seq_ops = { .start = ipmr_vif_seq_start, .next = mr_vif_seq_next, .stop = ipmr_vif_seq_stop, .show = ipmr_vif_seq_show, }; static void *ipmr_mfc_seq_start(struct seq_file *seq, loff_t *pos) { struct net *net = seq_file_net(seq); struct mr_table *mrt; mrt = ipmr_get_table(net, RT_TABLE_DEFAULT); if (!mrt) return ERR_PTR(-ENOENT); return mr_mfc_seq_start(seq, pos, mrt, &mfc_unres_lock); } static int ipmr_mfc_seq_show(struct seq_file *seq, void *v) { int n; if (v == SEQ_START_TOKEN) { seq_puts(seq, "Group Origin Iif Pkts Bytes Wrong Oifs\n"); } else { const struct mfc_cache *mfc = v; const struct mr_mfc_iter *it = seq->private; const struct mr_table *mrt = it->mrt; seq_printf(seq, "%08X %08X %-3hd", (__force u32) mfc->mfc_mcastgrp, (__force u32) mfc->mfc_origin, mfc->_c.mfc_parent); if (it->cache != &mrt->mfc_unres_queue) { seq_printf(seq, " %8lu %8lu %8lu", atomic_long_read(&mfc->_c.mfc_un.res.pkt), atomic_long_read(&mfc->_c.mfc_un.res.bytes), atomic_long_read(&mfc->_c.mfc_un.res.wrong_if)); for (n = mfc->_c.mfc_un.res.minvif; n < mfc->_c.mfc_un.res.maxvif; n++) { if (VIF_EXISTS(mrt, n) && mfc->_c.mfc_un.res.ttls[n] < 255) seq_printf(seq, " %2d:%-3d", n, mfc->_c.mfc_un.res.ttls[n]); } } else { /* unresolved mfc_caches don't contain * pkt, bytes and wrong_if values */ seq_printf(seq, " %8lu %8lu %8lu", 0ul, 0ul, 0ul); } seq_putc(seq, '\n'); } return 0; } static const struct seq_operations ipmr_mfc_seq_ops = { .start = ipmr_mfc_seq_start, .next = mr_mfc_seq_next, .stop = mr_mfc_seq_stop, .show = ipmr_mfc_seq_show, }; #endif #ifdef CONFIG_IP_PIMSM_V2 static const struct net_protocol pim_protocol = { .handler = pim_rcv, }; #endif static unsigned int ipmr_seq_read(const struct net *net) { return READ_ONCE(net->ipv4.ipmr_seq) + ipmr_rules_seq_read(net); } static int ipmr_dump(struct net *net, struct notifier_block *nb, struct netlink_ext_ack *extack) { return mr_dump(net, nb, RTNL_FAMILY_IPMR, ipmr_rules_dump, ipmr_mr_table_iter, extack); } static const struct fib_notifier_ops ipmr_notifier_ops_template = { .family = RTNL_FAMILY_IPMR, .fib_seq_read = ipmr_seq_read, .fib_dump = ipmr_dump, .owner = THIS_MODULE, }; static int __net_init ipmr_notifier_init(struct net *net) { struct fib_notifier_ops *ops; net->ipv4.ipmr_seq = 0; ops = fib_notifier_ops_register(&ipmr_notifier_ops_template, net); if (IS_ERR(ops)) return PTR_ERR(ops); net->ipv4.ipmr_notifier_ops = ops; return 0; } static void __net_exit ipmr_notifier_exit(struct net *net) { fib_notifier_ops_unregister(net->ipv4.ipmr_notifier_ops); net->ipv4.ipmr_notifier_ops = NULL; } /* Setup for IP multicast routing */ static int __net_init ipmr_net_init(struct net *net) { int err; err = ipmr_notifier_init(net); if (err) goto ipmr_notifier_fail; err = ipmr_rules_init(net); if (err < 0) goto ipmr_rules_fail; #ifdef CONFIG_PROC_FS err = -ENOMEM; if (!proc_create_net("ip_mr_vif", 0, net->proc_net, &ipmr_vif_seq_ops, sizeof(struct mr_vif_iter))) goto proc_vif_fail; if (!proc_create_net("ip_mr_cache", 0, net->proc_net, &ipmr_mfc_seq_ops, sizeof(struct mr_mfc_iter))) goto proc_cache_fail; #endif return 0; #ifdef CONFIG_PROC_FS proc_cache_fail: remove_proc_entry("ip_mr_vif", net->proc_net); proc_vif_fail: rtnl_lock(); ipmr_rules_exit(net); rtnl_unlock(); #endif ipmr_rules_fail: ipmr_notifier_exit(net); ipmr_notifier_fail: return err; } static void __net_exit ipmr_net_exit(struct net *net) { #ifdef CONFIG_PROC_FS remove_proc_entry("ip_mr_cache", net->proc_net); remove_proc_entry("ip_mr_vif", net->proc_net); #endif ipmr_notifier_exit(net); } static void __net_exit ipmr_net_exit_batch(struct list_head *net_list) { struct net *net; rtnl_lock(); list_for_each_entry(net, net_list, exit_list) ipmr_rules_exit(net); rtnl_unlock(); } static struct pernet_operations ipmr_net_ops = { .init = ipmr_net_init, .exit = ipmr_net_exit, .exit_batch = ipmr_net_exit_batch, }; static const struct rtnl_msg_handler ipmr_rtnl_msg_handlers[] __initconst = { {.protocol = RTNL_FAMILY_IPMR, .msgtype = RTM_GETLINK, .dumpit = ipmr_rtm_dumplink}, {.protocol = RTNL_FAMILY_IPMR, .msgtype = RTM_NEWROUTE, .doit = ipmr_rtm_route}, {.protocol = RTNL_FAMILY_IPMR, .msgtype = RTM_DELROUTE, .doit = ipmr_rtm_route}, {.protocol = RTNL_FAMILY_IPMR, .msgtype = RTM_GETROUTE, .doit = ipmr_rtm_getroute, .dumpit = ipmr_rtm_dumproute}, }; int __init ip_mr_init(void) { int err; mrt_cachep = KMEM_CACHE(mfc_cache, SLAB_HWCACHE_ALIGN | SLAB_PANIC); err = register_pernet_subsys(&ipmr_net_ops); if (err) goto reg_pernet_fail; err = register_netdevice_notifier(&ip_mr_notifier); if (err) goto reg_notif_fail; #ifdef CONFIG_IP_PIMSM_V2 if (inet_add_protocol(&pim_protocol, IPPROTO_PIM) < 0) { pr_err("%s: can't add PIM protocol\n", __func__); err = -EAGAIN; goto add_proto_fail; } #endif rtnl_register_many(ipmr_rtnl_msg_handlers); return 0; #ifdef CONFIG_IP_PIMSM_V2 add_proto_fail: unregister_netdevice_notifier(&ip_mr_notifier); #endif reg_notif_fail: unregister_pernet_subsys(&ipmr_net_ops); reg_pernet_fail: kmem_cache_destroy(mrt_cachep); return err; }
6 6 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 // SPDX-License-Identifier: (GPL-2.0 OR BSD-3-Clause) /* raw.c - Raw sockets for protocol family CAN * * Copyright (c) 2002-2007 Volkswagen Group Electronic Research * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. Neither the name of Volkswagen nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * Alternatively, provided that this notice is retained in full, this * software may be distributed under the terms of the GNU General * Public License ("GPL") version 2, in which case the provisions of the * GPL apply INSTEAD OF those given above. * * The provided data structures and external interfaces from this code * are not restricted to be used by modules with a GPL compatible license. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH * DAMAGE. * */ #include <linux/module.h> #include <linux/init.h> #include <linux/uio.h> #include <linux/net.h> #include <linux/slab.h> #include <linux/netdevice.h> #include <linux/socket.h> #include <linux/if_arp.h> #include <linux/skbuff.h> #include <linux/can.h> #include <linux/can/core.h> #include <linux/can/dev.h> /* for can_is_canxl_dev_mtu() */ #include <linux/can/skb.h> #include <linux/can/raw.h> #include <net/sock.h> #include <net/net_namespace.h> MODULE_DESCRIPTION("PF_CAN raw protocol"); MODULE_LICENSE("Dual BSD/GPL"); MODULE_AUTHOR("Urs Thuermann <urs.thuermann@volkswagen.de>"); MODULE_ALIAS("can-proto-1"); #define RAW_MIN_NAMELEN CAN_REQUIRED_SIZE(struct sockaddr_can, can_ifindex) #define MASK_ALL 0 /* A raw socket has a list of can_filters attached to it, each receiving * the CAN frames matching that filter. If the filter list is empty, * no CAN frames will be received by the socket. The default after * opening the socket, is to have one filter which receives all frames. * The filter list is allocated dynamically with the exception of the * list containing only one item. This common case is optimized by * storing the single filter in dfilter, to avoid using dynamic memory. */ struct uniqframe { int skbcnt; const struct sk_buff *skb; unsigned int join_rx_count; }; struct raw_sock { struct sock sk; int bound; int ifindex; struct net_device *dev; netdevice_tracker dev_tracker; struct list_head notifier; int loopback; int recv_own_msgs; int fd_frames; int xl_frames; struct can_raw_vcid_options raw_vcid_opts; canid_t tx_vcid_shifted; canid_t rx_vcid_shifted; canid_t rx_vcid_mask_shifted; int join_filters; int count; /* number of active filters */ struct can_filter dfilter; /* default/single filter */ struct can_filter *filter; /* pointer to filter(s) */ can_err_mask_t err_mask; struct uniqframe __percpu *uniq; }; static LIST_HEAD(raw_notifier_list); static DEFINE_SPINLOCK(raw_notifier_lock); static struct raw_sock *raw_busy_notifier; /* Return pointer to store the extra msg flags for raw_recvmsg(). * We use the space of one unsigned int beyond the 'struct sockaddr_can' * in skb->cb. */ static inline unsigned int *raw_flags(struct sk_buff *skb) { sock_skb_cb_check_size(sizeof(struct sockaddr_can) + sizeof(unsigned int)); /* return pointer after struct sockaddr_can */ return (unsigned int *)(&((struct sockaddr_can *)skb->cb)[1]); } static inline struct raw_sock *raw_sk(const struct sock *sk) { return (struct raw_sock *)sk; } static void raw_rcv(struct sk_buff *oskb, void *data) { struct sock *sk = (struct sock *)data; struct raw_sock *ro = raw_sk(sk); struct sockaddr_can *addr; struct sk_buff *skb; unsigned int *pflags; /* check the received tx sock reference */ if (!ro->recv_own_msgs && oskb->sk == sk) return; /* make sure to not pass oversized frames to the socket */ if (!ro->fd_frames && can_is_canfd_skb(oskb)) return; if (can_is_canxl_skb(oskb)) { struct canxl_frame *cxl = (struct canxl_frame *)oskb->data; /* make sure to not pass oversized frames to the socket */ if (!ro->xl_frames) return; /* filter CAN XL VCID content */ if (ro->raw_vcid_opts.flags & CAN_RAW_XL_VCID_RX_FILTER) { /* apply VCID filter if user enabled the filter */ if ((cxl->prio & ro->rx_vcid_mask_shifted) != (ro->rx_vcid_shifted & ro->rx_vcid_mask_shifted)) return; } else { /* no filter => do not forward VCID tagged frames */ if (cxl->prio & CANXL_VCID_MASK) return; } } /* eliminate multiple filter matches for the same skb */ if (this_cpu_ptr(ro->uniq)->skb == oskb && this_cpu_ptr(ro->uniq)->skbcnt == can_skb_prv(oskb)->skbcnt) { if (!ro->join_filters) return; this_cpu_inc(ro->uniq->join_rx_count); /* drop frame until all enabled filters matched */ if (this_cpu_ptr(ro->uniq)->join_rx_count < ro->count) return; } else { this_cpu_ptr(ro->uniq)->skb = oskb; this_cpu_ptr(ro->uniq)->skbcnt = can_skb_prv(oskb)->skbcnt; this_cpu_ptr(ro->uniq)->join_rx_count = 1; /* drop first frame to check all enabled filters? */ if (ro->join_filters && ro->count > 1) return; } /* clone the given skb to be able to enqueue it into the rcv queue */ skb = skb_clone(oskb, GFP_ATOMIC); if (!skb) return; /* Put the datagram to the queue so that raw_recvmsg() can get * it from there. We need to pass the interface index to * raw_recvmsg(). We pass a whole struct sockaddr_can in * skb->cb containing the interface index. */ sock_skb_cb_check_size(sizeof(struct sockaddr_can)); addr = (struct sockaddr_can *)skb->cb; memset(addr, 0, sizeof(*addr)); addr->can_family = AF_CAN; addr->can_ifindex = skb->dev->ifindex; /* add CAN specific message flags for raw_recvmsg() */ pflags = raw_flags(skb); *pflags = 0; if (oskb->sk) *pflags |= MSG_DONTROUTE; if (oskb->sk == sk) *pflags |= MSG_CONFIRM; if (sock_queue_rcv_skb(sk, skb) < 0) kfree_skb(skb); } static int raw_enable_filters(struct net *net, struct net_device *dev, struct sock *sk, struct can_filter *filter, int count) { int err = 0; int i; for (i = 0; i < count; i++) { err = can_rx_register(net, dev, filter[i].can_id, filter[i].can_mask, raw_rcv, sk, "raw", sk); if (err) { /* clean up successfully registered filters */ while (--i >= 0) can_rx_unregister(net, dev, filter[i].can_id, filter[i].can_mask, raw_rcv, sk); break; } } return err; } static int raw_enable_errfilter(struct net *net, struct net_device *dev, struct sock *sk, can_err_mask_t err_mask) { int err = 0; if (err_mask) err = can_rx_register(net, dev, 0, err_mask | CAN_ERR_FLAG, raw_rcv, sk, "raw", sk); return err; } static void raw_disable_filters(struct net *net, struct net_device *dev, struct sock *sk, struct can_filter *filter, int count) { int i; for (i = 0; i < count; i++) can_rx_unregister(net, dev, filter[i].can_id, filter[i].can_mask, raw_rcv, sk); } static inline void raw_disable_errfilter(struct net *net, struct net_device *dev, struct sock *sk, can_err_mask_t err_mask) { if (err_mask) can_rx_unregister(net, dev, 0, err_mask | CAN_ERR_FLAG, raw_rcv, sk); } static inline void raw_disable_allfilters(struct net *net, struct net_device *dev, struct sock *sk) { struct raw_sock *ro = raw_sk(sk); raw_disable_filters(net, dev, sk, ro->filter, ro->count); raw_disable_errfilter(net, dev, sk, ro->err_mask); } static int raw_enable_allfilters(struct net *net, struct net_device *dev, struct sock *sk) { struct raw_sock *ro = raw_sk(sk); int err; err = raw_enable_filters(net, dev, sk, ro->filter, ro->count); if (!err) { err = raw_enable_errfilter(net, dev, sk, ro->err_mask); if (err) raw_disable_filters(net, dev, sk, ro->filter, ro->count); } return err; } static void raw_notify(struct raw_sock *ro, unsigned long msg, struct net_device *dev) { struct sock *sk = &ro->sk; if (!net_eq(dev_net(dev), sock_net(sk))) return; if (ro->dev != dev) return; switch (msg) { case NETDEV_UNREGISTER: lock_sock(sk); /* remove current filters & unregister */ if (ro->bound) { raw_disable_allfilters(dev_net(dev), dev, sk); netdev_put(dev, &ro->dev_tracker); } if (ro->count > 1) kfree(ro->filter); ro->ifindex = 0; ro->bound = 0; ro->dev = NULL; ro->count = 0; release_sock(sk); sk->sk_err = ENODEV; if (!sock_flag(sk, SOCK_DEAD)) sk_error_report(sk); break; case NETDEV_DOWN: sk->sk_err = ENETDOWN; if (!sock_flag(sk, SOCK_DEAD)) sk_error_report(sk); break; } } static int raw_notifier(struct notifier_block *nb, unsigned long msg, void *ptr) { struct net_device *dev = netdev_notifier_info_to_dev(ptr); if (dev->type != ARPHRD_CAN) return NOTIFY_DONE; if (msg != NETDEV_UNREGISTER && msg != NETDEV_DOWN) return NOTIFY_DONE; if (unlikely(raw_busy_notifier)) /* Check for reentrant bug. */ return NOTIFY_DONE; spin_lock(&raw_notifier_lock); list_for_each_entry(raw_busy_notifier, &raw_notifier_list, notifier) { spin_unlock(&raw_notifier_lock); raw_notify(raw_busy_notifier, msg, dev); spin_lock(&raw_notifier_lock); } raw_busy_notifier = NULL; spin_unlock(&raw_notifier_lock); return NOTIFY_DONE; } static int raw_init(struct sock *sk) { struct raw_sock *ro = raw_sk(sk); ro->bound = 0; ro->ifindex = 0; ro->dev = NULL; /* set default filter to single entry dfilter */ ro->dfilter.can_id = 0; ro->dfilter.can_mask = MASK_ALL; ro->filter = &ro->dfilter; ro->count = 1; /* set default loopback behaviour */ ro->loopback = 1; ro->recv_own_msgs = 0; ro->fd_frames = 0; ro->xl_frames = 0; ro->join_filters = 0; /* alloc_percpu provides zero'ed memory */ ro->uniq = alloc_percpu(struct uniqframe); if (unlikely(!ro->uniq)) return -ENOMEM; /* set notifier */ spin_lock(&raw_notifier_lock); list_add_tail(&ro->notifier, &raw_notifier_list); spin_unlock(&raw_notifier_lock); return 0; } static int raw_release(struct socket *sock) { struct sock *sk = sock->sk; struct raw_sock *ro; struct net *net; if (!sk) return 0; ro = raw_sk(sk); net = sock_net(sk); spin_lock(&raw_notifier_lock); while (raw_busy_notifier == ro) { spin_unlock(&raw_notifier_lock); schedule_timeout_uninterruptible(1); spin_lock(&raw_notifier_lock); } list_del(&ro->notifier); spin_unlock(&raw_notifier_lock); rtnl_lock(); lock_sock(sk); /* remove current filters & unregister */ if (ro->bound) { if (ro->dev) { raw_disable_allfilters(dev_net(ro->dev), ro->dev, sk); netdev_put(ro->dev, &ro->dev_tracker); } else { raw_disable_allfilters(net, NULL, sk); } } if (ro->count > 1) kfree(ro->filter); ro->ifindex = 0; ro->bound = 0; ro->dev = NULL; ro->count = 0; free_percpu(ro->uniq); sock_orphan(sk); sock->sk = NULL; release_sock(sk); rtnl_unlock(); sock_prot_inuse_add(net, sk->sk_prot, -1); sock_put(sk); return 0; } static int raw_bind(struct socket *sock, struct sockaddr *uaddr, int len) { struct sockaddr_can *addr = (struct sockaddr_can *)uaddr; struct sock *sk = sock->sk; struct raw_sock *ro = raw_sk(sk); struct net_device *dev = NULL; int ifindex; int err = 0; int notify_enetdown = 0; if (len < RAW_MIN_NAMELEN) return -EINVAL; if (addr->can_family != AF_CAN) return -EINVAL; rtnl_lock(); lock_sock(sk); if (ro->bound && addr->can_ifindex == ro->ifindex) goto out; if (addr->can_ifindex) { dev = dev_get_by_index(sock_net(sk), addr->can_ifindex); if (!dev) { err = -ENODEV; goto out; } if (dev->type != ARPHRD_CAN) { err = -ENODEV; goto out_put_dev; } if (!(dev->flags & IFF_UP)) notify_enetdown = 1; ifindex = dev->ifindex; /* filters set by default/setsockopt */ err = raw_enable_allfilters(sock_net(sk), dev, sk); if (err) goto out_put_dev; } else { ifindex = 0; /* filters set by default/setsockopt */ err = raw_enable_allfilters(sock_net(sk), NULL, sk); } if (!err) { if (ro->bound) { /* unregister old filters */ if (ro->dev) { raw_disable_allfilters(dev_net(ro->dev), ro->dev, sk); /* drop reference to old ro->dev */ netdev_put(ro->dev, &ro->dev_tracker); } else { raw_disable_allfilters(sock_net(sk), NULL, sk); } } ro->ifindex = ifindex; ro->bound = 1; /* bind() ok -> hold a reference for new ro->dev */ ro->dev = dev; if (ro->dev) netdev_hold(ro->dev, &ro->dev_tracker, GFP_KERNEL); } out_put_dev: /* remove potential reference from dev_get_by_index() */ dev_put(dev); out: release_sock(sk); rtnl_unlock(); if (notify_enetdown) { sk->sk_err = ENETDOWN; if (!sock_flag(sk, SOCK_DEAD)) sk_error_report(sk); } return err; } static int raw_getname(struct socket *sock, struct sockaddr *uaddr, int peer) { struct sockaddr_can *addr = (struct sockaddr_can *)uaddr; struct sock *sk = sock->sk; struct raw_sock *ro = raw_sk(sk); if (peer) return -EOPNOTSUPP; memset(addr, 0, RAW_MIN_NAMELEN); addr->can_family = AF_CAN; addr->can_ifindex = ro->ifindex; return RAW_MIN_NAMELEN; } static int raw_setsockopt(struct socket *sock, int level, int optname, sockptr_t optval, unsigned int optlen) { struct sock *sk = sock->sk; struct raw_sock *ro = raw_sk(sk); struct can_filter *filter = NULL; /* dyn. alloc'ed filters */ struct can_filter sfilter; /* single filter */ struct net_device *dev = NULL; can_err_mask_t err_mask = 0; int fd_frames; int count = 0; int err = 0; if (level != SOL_CAN_RAW) return -EINVAL; switch (optname) { case CAN_RAW_FILTER: if (optlen % sizeof(struct can_filter) != 0) return -EINVAL; if (optlen > CAN_RAW_FILTER_MAX * sizeof(struct can_filter)) return -EINVAL; count = optlen / sizeof(struct can_filter); if (count > 1) { /* filter does not fit into dfilter => alloc space */ filter = memdup_sockptr(optval, optlen); if (IS_ERR(filter)) return PTR_ERR(filter); } else if (count == 1) { if (copy_from_sockptr(&sfilter, optval, sizeof(sfilter))) return -EFAULT; } rtnl_lock(); lock_sock(sk); dev = ro->dev; if (ro->bound && dev) { if (dev->reg_state != NETREG_REGISTERED) { if (count > 1) kfree(filter); err = -ENODEV; goto out_fil; } } if (ro->bound) { /* (try to) register the new filters */ if (count == 1) err = raw_enable_filters(sock_net(sk), dev, sk, &sfilter, 1); else err = raw_enable_filters(sock_net(sk), dev, sk, filter, count); if (err) { if (count > 1) kfree(filter); goto out_fil; } /* remove old filter registrations */ raw_disable_filters(sock_net(sk), dev, sk, ro->filter, ro->count); } /* remove old filter space */ if (ro->count > 1) kfree(ro->filter); /* link new filters to the socket */ if (count == 1) { /* copy filter data for single filter */ ro->dfilter = sfilter; filter = &ro->dfilter; } ro->filter = filter; ro->count = count; out_fil: release_sock(sk); rtnl_unlock(); break; case CAN_RAW_ERR_FILTER: if (optlen != sizeof(err_mask)) return -EINVAL; if (copy_from_sockptr(&err_mask, optval, optlen)) return -EFAULT; err_mask &= CAN_ERR_MASK; rtnl_lock(); lock_sock(sk); dev = ro->dev; if (ro->bound && dev) { if (dev->reg_state != NETREG_REGISTERED) { err = -ENODEV; goto out_err; } } /* remove current error mask */ if (ro->bound) { /* (try to) register the new err_mask */ err = raw_enable_errfilter(sock_net(sk), dev, sk, err_mask); if (err) goto out_err; /* remove old err_mask registration */ raw_disable_errfilter(sock_net(sk), dev, sk, ro->err_mask); } /* link new err_mask to the socket */ ro->err_mask = err_mask; out_err: release_sock(sk); rtnl_unlock(); break; case CAN_RAW_LOOPBACK: if (optlen != sizeof(ro->loopback)) return -EINVAL; if (copy_from_sockptr(&ro->loopback, optval, optlen)) return -EFAULT; break; case CAN_RAW_RECV_OWN_MSGS: if (optlen != sizeof(ro->recv_own_msgs)) return -EINVAL; if (copy_from_sockptr(&ro->recv_own_msgs, optval, optlen)) return -EFAULT; break; case CAN_RAW_FD_FRAMES: if (optlen != sizeof(fd_frames)) return -EINVAL; if (copy_from_sockptr(&fd_frames, optval, optlen)) return -EFAULT; /* Enabling CAN XL includes CAN FD */ if (ro->xl_frames && !fd_frames) return -EINVAL; ro->fd_frames = fd_frames; break; case CAN_RAW_XL_FRAMES: if (optlen != sizeof(ro->xl_frames)) return -EINVAL; if (copy_from_sockptr(&ro->xl_frames, optval, optlen)) return -EFAULT; /* Enabling CAN XL includes CAN FD */ if (ro->xl_frames) ro->fd_frames = ro->xl_frames; break; case CAN_RAW_XL_VCID_OPTS: if (optlen != sizeof(ro->raw_vcid_opts)) return -EINVAL; if (copy_from_sockptr(&ro->raw_vcid_opts, optval, optlen)) return -EFAULT; /* prepare 32 bit values for handling in hot path */ ro->tx_vcid_shifted = ro->raw_vcid_opts.tx_vcid << CANXL_VCID_OFFSET; ro->rx_vcid_shifted = ro->raw_vcid_opts.rx_vcid << CANXL_VCID_OFFSET; ro->rx_vcid_mask_shifted = ro->raw_vcid_opts.rx_vcid_mask << CANXL_VCID_OFFSET; break; case CAN_RAW_JOIN_FILTERS: if (optlen != sizeof(ro->join_filters)) return -EINVAL; if (copy_from_sockptr(&ro->join_filters, optval, optlen)) return -EFAULT; break; default: return -ENOPROTOOPT; } return err; } static int raw_getsockopt(struct socket *sock, int level, int optname, char __user *optval, int __user *optlen) { struct sock *sk = sock->sk; struct raw_sock *ro = raw_sk(sk); int len; void *val; if (level != SOL_CAN_RAW) return -EINVAL; if (get_user(len, optlen)) return -EFAULT; if (len < 0) return -EINVAL; switch (optname) { case CAN_RAW_FILTER: { int err = 0; lock_sock(sk); if (ro->count > 0) { int fsize = ro->count * sizeof(struct can_filter); /* user space buffer to small for filter list? */ if (len < fsize) { /* return -ERANGE and needed space in optlen */ err = -ERANGE; if (put_user(fsize, optlen)) err = -EFAULT; } else { if (len > fsize) len = fsize; if (copy_to_user(optval, ro->filter, len)) err = -EFAULT; } } else { len = 0; } release_sock(sk); if (!err) err = put_user(len, optlen); return err; } case CAN_RAW_ERR_FILTER: if (len > sizeof(can_err_mask_t)) len = sizeof(can_err_mask_t); val = &ro->err_mask; break; case CAN_RAW_LOOPBACK: if (len > sizeof(int)) len = sizeof(int); val = &ro->loopback; break; case CAN_RAW_RECV_OWN_MSGS: if (len > sizeof(int)) len = sizeof(int); val = &ro->recv_own_msgs; break; case CAN_RAW_FD_FRAMES: if (len > sizeof(int)) len = sizeof(int); val = &ro->fd_frames; break; case CAN_RAW_XL_FRAMES: if (len > sizeof(int)) len = sizeof(int); val = &ro->xl_frames; break; case CAN_RAW_XL_VCID_OPTS: { int err = 0; /* user space buffer to small for VCID opts? */ if (len < sizeof(ro->raw_vcid_opts)) { /* return -ERANGE and needed space in optlen */ err = -ERANGE; if (put_user(sizeof(ro->raw_vcid_opts), optlen)) err = -EFAULT; } else { if (len > sizeof(ro->raw_vcid_opts)) len = sizeof(ro->raw_vcid_opts); if (copy_to_user(optval, &ro->raw_vcid_opts, len)) err = -EFAULT; } if (!err) err = put_user(len, optlen); return err; } case CAN_RAW_JOIN_FILTERS: if (len > sizeof(int)) len = sizeof(int); val = &ro->join_filters; break; default: return -ENOPROTOOPT; } if (put_user(len, optlen)) return -EFAULT; if (copy_to_user(optval, val, len)) return -EFAULT; return 0; } static void raw_put_canxl_vcid(struct raw_sock *ro, struct sk_buff *skb) { struct canxl_frame *cxl = (struct canxl_frame *)skb->data; /* sanitize non CAN XL bits */ cxl->prio &= (CANXL_PRIO_MASK | CANXL_VCID_MASK); /* clear VCID in CAN XL frame if pass through is disabled */ if (!(ro->raw_vcid_opts.flags & CAN_RAW_XL_VCID_TX_PASS)) cxl->prio &= CANXL_PRIO_MASK; /* set VCID in CAN XL frame if enabled */ if (ro->raw_vcid_opts.flags & CAN_RAW_XL_VCID_TX_SET) { cxl->prio &= CANXL_PRIO_MASK; cxl->prio |= ro->tx_vcid_shifted; } } static unsigned int raw_check_txframe(struct raw_sock *ro, struct sk_buff *skb, int mtu) { /* Classical CAN -> no checks for flags and device capabilities */ if (can_is_can_skb(skb)) return CAN_MTU; /* CAN FD -> needs to be enabled and a CAN FD or CAN XL device */ if (ro->fd_frames && can_is_canfd_skb(skb) && (mtu == CANFD_MTU || can_is_canxl_dev_mtu(mtu))) return CANFD_MTU; /* CAN XL -> needs to be enabled and a CAN XL device */ if (ro->xl_frames && can_is_canxl_skb(skb) && can_is_canxl_dev_mtu(mtu)) return CANXL_MTU; return 0; } static int raw_sendmsg(struct socket *sock, struct msghdr *msg, size_t size) { struct sock *sk = sock->sk; struct raw_sock *ro = raw_sk(sk); struct sockcm_cookie sockc; struct sk_buff *skb; struct net_device *dev; unsigned int txmtu; int ifindex; int err = -EINVAL; /* check for valid CAN frame sizes */ if (size < CANXL_HDR_SIZE + CANXL_MIN_DLEN || size > CANXL_MTU) return -EINVAL; if (msg->msg_name) { DECLARE_SOCKADDR(struct sockaddr_can *, addr, msg->msg_name); if (msg->msg_namelen < RAW_MIN_NAMELEN) return -EINVAL; if (addr->can_family != AF_CAN) return -EINVAL; ifindex = addr->can_ifindex; } else { ifindex = ro->ifindex; } dev = dev_get_by_index(sock_net(sk), ifindex); if (!dev) return -ENXIO; skb = sock_alloc_send_skb(sk, size + sizeof(struct can_skb_priv), msg->msg_flags & MSG_DONTWAIT, &err); if (!skb) goto put_dev; can_skb_reserve(skb); can_skb_prv(skb)->ifindex = dev->ifindex; can_skb_prv(skb)->skbcnt = 0; /* fill the skb before testing for valid CAN frames */ err = memcpy_from_msg(skb_put(skb, size), msg, size); if (err < 0) goto free_skb; err = -EINVAL; /* check for valid CAN (CC/FD/XL) frame content */ txmtu = raw_check_txframe(ro, skb, dev->mtu); if (!txmtu) goto free_skb; /* only CANXL: clear/forward/set VCID value */ if (txmtu == CANXL_MTU) raw_put_canxl_vcid(ro, skb); sockcm_init(&sockc, sk); if (msg->msg_controllen) { err = sock_cmsg_send(sk, msg, &sockc); if (unlikely(err)) goto free_skb; } skb->dev = dev; skb->priority = sockc.priority; skb->mark = sockc.mark; skb->tstamp = sockc.transmit_time; skb_setup_tx_timestamp(skb, &sockc); err = can_send(skb, ro->loopback); dev_put(dev); if (err) goto send_failed; return size; free_skb: kfree_skb(skb); put_dev: dev_put(dev); send_failed: return err; } static int raw_recvmsg(struct socket *sock, struct msghdr *msg, size_t size, int flags) { struct sock *sk = sock->sk; struct sk_buff *skb; int err = 0; if (flags & MSG_ERRQUEUE) return sock_recv_errqueue(sk, msg, size, SOL_CAN_RAW, SCM_CAN_RAW_ERRQUEUE); skb = skb_recv_datagram(sk, flags, &err); if (!skb) return err; if (size < skb->len) msg->msg_flags |= MSG_TRUNC; else size = skb->len; err = memcpy_to_msg(msg, skb->data, size); if (err < 0) { skb_free_datagram(sk, skb); return err; } sock_recv_cmsgs(msg, sk, skb); if (msg->msg_name) { __sockaddr_check_size(RAW_MIN_NAMELEN); msg->msg_namelen = RAW_MIN_NAMELEN; memcpy(msg->msg_name, skb->cb, msg->msg_namelen); } /* assign the flags that have been recorded in raw_rcv() */ msg->msg_flags |= *(raw_flags(skb)); skb_free_datagram(sk, skb); return size; } static int raw_sock_no_ioctlcmd(struct socket *sock, unsigned int cmd, unsigned long arg) { /* no ioctls for socket layer -> hand it down to NIC layer */ return -ENOIOCTLCMD; } static const struct proto_ops raw_ops = { .family = PF_CAN, .release = raw_release, .bind = raw_bind, .connect = sock_no_connect, .socketpair = sock_no_socketpair, .accept = sock_no_accept, .getname = raw_getname, .poll = datagram_poll, .ioctl = raw_sock_no_ioctlcmd, .gettstamp = sock_gettstamp, .listen = sock_no_listen, .shutdown = sock_no_shutdown, .setsockopt = raw_setsockopt, .getsockopt = raw_getsockopt, .sendmsg = raw_sendmsg, .recvmsg = raw_recvmsg, .mmap = sock_no_mmap, }; static struct proto raw_proto __read_mostly = { .name = "CAN_RAW", .owner = THIS_MODULE, .obj_size = sizeof(struct raw_sock), .init = raw_init, }; static const struct can_proto raw_can_proto = { .type = SOCK_RAW, .protocol = CAN_RAW, .ops = &raw_ops, .prot = &raw_proto, }; static struct notifier_block canraw_notifier = { .notifier_call = raw_notifier }; static __init int raw_module_init(void) { int err; pr_info("can: raw protocol\n"); err = register_netdevice_notifier(&canraw_notifier); if (err) return err; err = can_proto_register(&raw_can_proto); if (err < 0) { pr_err("can: registration of raw protocol failed\n"); goto register_proto_failed; } return 0; register_proto_failed: unregister_netdevice_notifier(&canraw_notifier); return err; } static __exit void raw_module_exit(void) { can_proto_unregister(&raw_can_proto); unregister_netdevice_notifier(&canraw_notifier); } module_init(raw_module_init); module_exit(raw_module_exit);
250 3 4 209 3 2 1 195 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 /* SPDX-License-Identifier: GPL-2.0 */ #if !defined(_TRACE_KVM_MAIN_H) || defined(TRACE_HEADER_MULTI_READ) #define _TRACE_KVM_MAIN_H #include <linux/tracepoint.h> #undef TRACE_SYSTEM #define TRACE_SYSTEM kvm #define ERSN(x) { KVM_EXIT_##x, "KVM_EXIT_" #x } #define kvm_trace_exit_reason \ ERSN(UNKNOWN), ERSN(EXCEPTION), ERSN(IO), ERSN(HYPERCALL), \ ERSN(DEBUG), ERSN(HLT), ERSN(MMIO), ERSN(IRQ_WINDOW_OPEN), \ ERSN(SHUTDOWN), ERSN(FAIL_ENTRY), ERSN(INTR), ERSN(SET_TPR), \ ERSN(TPR_ACCESS), ERSN(S390_SIEIC), ERSN(S390_RESET), ERSN(DCR),\ ERSN(NMI), ERSN(INTERNAL_ERROR), ERSN(OSI), ERSN(PAPR_HCALL), \ ERSN(S390_UCONTROL), ERSN(WATCHDOG), ERSN(S390_TSCH), ERSN(EPR),\ ERSN(SYSTEM_EVENT), ERSN(S390_STSI), ERSN(IOAPIC_EOI), \ ERSN(HYPERV), ERSN(ARM_NISV), ERSN(X86_RDMSR), ERSN(X86_WRMSR) TRACE_EVENT(kvm_userspace_exit, TP_PROTO(__u32 reason, int errno), TP_ARGS(reason, errno), TP_STRUCT__entry( __field( __u32, reason ) __field( int, errno ) ), TP_fast_assign( __entry->reason = reason; __entry->errno = errno; ), TP_printk("reason %s (%d)", __entry->errno < 0 ? (__entry->errno == -EINTR ? "restart" : "error") : __print_symbolic(__entry->reason, kvm_trace_exit_reason), __entry->errno < 0 ? -__entry->errno : __entry->reason) ); TRACE_EVENT(kvm_vcpu_wakeup, TP_PROTO(__u64 ns, bool waited, bool valid), TP_ARGS(ns, waited, valid), TP_STRUCT__entry( __field( __u64, ns ) __field( bool, waited ) __field( bool, valid ) ), TP_fast_assign( __entry->ns = ns; __entry->waited = waited; __entry->valid = valid; ), TP_printk("%s time %lld ns, polling %s", __entry->waited ? "wait" : "poll", __entry->ns, __entry->valid ? "valid" : "invalid") ); #if defined(CONFIG_HAVE_KVM_IRQCHIP) TRACE_EVENT(kvm_set_irq, TP_PROTO(unsigned int gsi, int level, int irq_source_id), TP_ARGS(gsi, level, irq_source_id), TP_STRUCT__entry( __field( unsigned int, gsi ) __field( int, level ) __field( int, irq_source_id ) ), TP_fast_assign( __entry->gsi = gsi; __entry->level = level; __entry->irq_source_id = irq_source_id; ), TP_printk("gsi %u level %d source %d", __entry->gsi, __entry->level, __entry->irq_source_id) ); #endif /* defined(CONFIG_HAVE_KVM_IRQCHIP) */ #if defined(__KVM_HAVE_IOAPIC) #define kvm_deliver_mode \ {0x0, "Fixed"}, \ {0x1, "LowPrio"}, \ {0x2, "SMI"}, \ {0x3, "Res3"}, \ {0x4, "NMI"}, \ {0x5, "INIT"}, \ {0x6, "SIPI"}, \ {0x7, "ExtINT"} TRACE_EVENT(kvm_ioapic_set_irq, TP_PROTO(__u64 e, int pin, bool coalesced), TP_ARGS(e, pin, coalesced), TP_STRUCT__entry( __field( __u64, e ) __field( int, pin ) __field( bool, coalesced ) ), TP_fast_assign( __entry->e = e; __entry->pin = pin; __entry->coalesced = coalesced; ), TP_printk("pin %u dst %x vec %u (%s|%s|%s%s)%s", __entry->pin, (u8)(__entry->e >> 56), (u8)__entry->e, __print_symbolic((__entry->e >> 8 & 0x7), kvm_deliver_mode), (__entry->e & (1<<11)) ? "logical" : "physical", (__entry->e & (1<<15)) ? "level" : "edge", (__entry->e & (1<<16)) ? "|masked" : "", __entry->coalesced ? " (coalesced)" : "") ); TRACE_EVENT(kvm_ioapic_delayed_eoi_inj, TP_PROTO(__u64 e), TP_ARGS(e), TP_STRUCT__entry( __field( __u64, e ) ), TP_fast_assign( __entry->e = e; ), TP_printk("dst %x vec %u (%s|%s|%s%s)", (u8)(__entry->e >> 56), (u8)__entry->e, __print_symbolic((__entry->e >> 8 & 0x7), kvm_deliver_mode), (__entry->e & (1<<11)) ? "logical" : "physical", (__entry->e & (1<<15)) ? "level" : "edge", (__entry->e & (1<<16)) ? "|masked" : "") ); TRACE_EVENT(kvm_msi_set_irq, TP_PROTO(__u64 address, __u64 data), TP_ARGS(address, data), TP_STRUCT__entry( __field( __u64, address ) __field( __u64, data ) ), TP_fast_assign( __entry->address = address; __entry->data = data; ), TP_printk("dst %llx vec %u (%s|%s|%s%s)", (u8)(__entry->address >> 12) | ((__entry->address >> 32) & 0xffffff00), (u8)__entry->data, __print_symbolic((__entry->data >> 8 & 0x7), kvm_deliver_mode), (__entry->address & (1<<2)) ? "logical" : "physical", (__entry->data & (1<<15)) ? "level" : "edge", (__entry->address & (1<<3)) ? "|rh" : "") ); #define kvm_irqchips \ {KVM_IRQCHIP_PIC_MASTER, "PIC master"}, \ {KVM_IRQCHIP_PIC_SLAVE, "PIC slave"}, \ {KVM_IRQCHIP_IOAPIC, "IOAPIC"} #endif /* defined(__KVM_HAVE_IOAPIC) */ #if defined(CONFIG_HAVE_KVM_IRQCHIP) #ifdef kvm_irqchips #define kvm_ack_irq_string "irqchip %s pin %u" #define kvm_ack_irq_parm __print_symbolic(__entry->irqchip, kvm_irqchips), __entry->pin #else #define kvm_ack_irq_string "irqchip %d pin %u" #define kvm_ack_irq_parm __entry->irqchip, __entry->pin #endif TRACE_EVENT(kvm_ack_irq, TP_PROTO(unsigned int irqchip, unsigned int pin), TP_ARGS(irqchip, pin), TP_STRUCT__entry( __field( unsigned int, irqchip ) __field( unsigned int, pin ) ), TP_fast_assign( __entry->irqchip = irqchip; __entry->pin = pin; ), TP_printk(kvm_ack_irq_string, kvm_ack_irq_parm) ); #endif /* defined(CONFIG_HAVE_KVM_IRQCHIP) */ #define KVM_TRACE_MMIO_READ_UNSATISFIED 0 #define KVM_TRACE_MMIO_READ 1 #define KVM_TRACE_MMIO_WRITE 2 #define kvm_trace_symbol_mmio \ { KVM_TRACE_MMIO_READ_UNSATISFIED, "unsatisfied-read" }, \ { KVM_TRACE_MMIO_READ, "read" }, \ { KVM_TRACE_MMIO_WRITE, "write" } TRACE_EVENT(kvm_mmio, TP_PROTO(int type, int len, u64 gpa, void *val), TP_ARGS(type, len, gpa, val), TP_STRUCT__entry( __field( u32, type ) __field( u32, len ) __field( u64, gpa ) __field( u64, val ) ), TP_fast_assign( __entry->type = type; __entry->len = len; __entry->gpa = gpa; __entry->val = 0; if (val) memcpy(&__entry->val, val, min_t(u32, sizeof(__entry->val), len)); ), TP_printk("mmio %s len %u gpa 0x%llx val 0x%llx", __print_symbolic(__entry->type, kvm_trace_symbol_mmio), __entry->len, __entry->gpa, __entry->val) ); #define KVM_TRACE_IOCSR_READ_UNSATISFIED 0 #define KVM_TRACE_IOCSR_READ 1 #define KVM_TRACE_IOCSR_WRITE 2 #define kvm_trace_symbol_iocsr \ { KVM_TRACE_IOCSR_READ_UNSATISFIED, "unsatisfied-read" }, \ { KVM_TRACE_IOCSR_READ, "read" }, \ { KVM_TRACE_IOCSR_WRITE, "write" } TRACE_EVENT(kvm_iocsr, TP_PROTO(int type, int len, u64 gpa, void *val), TP_ARGS(type, len, gpa, val), TP_STRUCT__entry( __field( u32, type ) __field( u32, len ) __field( u64, gpa ) __field( u64, val ) ), TP_fast_assign( __entry->type = type; __entry->len = len; __entry->gpa = gpa; __entry->val = 0; if (val) memcpy(&__entry->val, val, min_t(u32, sizeof(__entry->val), len)); ), TP_printk("iocsr %s len %u gpa 0x%llx val 0x%llx", __print_symbolic(__entry->type, kvm_trace_symbol_iocsr), __entry->len, __entry->gpa, __entry->val) ); #define kvm_fpu_load_symbol \ {0, "unload"}, \ {1, "load"} TRACE_EVENT(kvm_fpu, TP_PROTO(int load), TP_ARGS(load), TP_STRUCT__entry( __field( u32, load ) ), TP_fast_assign( __entry->load = load; ), TP_printk("%s", __print_symbolic(__entry->load, kvm_fpu_load_symbol)) ); #ifdef CONFIG_KVM_ASYNC_PF DECLARE_EVENT_CLASS(kvm_async_get_page_class, TP_PROTO(u64 gva, u64 gfn), TP_ARGS(gva, gfn), TP_STRUCT__entry( __field(__u64, gva) __field(u64, gfn) ), TP_fast_assign( __entry->gva = gva; __entry->gfn = gfn; ), TP_printk("gva = %#llx, gfn = %#llx", __entry->gva, __entry->gfn) ); DEFINE_EVENT(kvm_async_get_page_class, kvm_try_async_get_page, TP_PROTO(u64 gva, u64 gfn), TP_ARGS(gva, gfn) ); DEFINE_EVENT(kvm_async_get_page_class, kvm_async_pf_repeated_fault, TP_PROTO(u64 gva, u64 gfn), TP_ARGS(gva, gfn) ); DECLARE_EVENT_CLASS(kvm_async_pf_nopresent_ready, TP_PROTO(u64 token, u64 gva), TP_ARGS(token, gva), TP_STRUCT__entry( __field(__u64, token) __field(__u64, gva) ), TP_fast_assign( __entry->token = token; __entry->gva = gva; ), TP_printk("token %#llx gva %#llx", __entry->token, __entry->gva) ); DEFINE_EVENT(kvm_async_pf_nopresent_ready, kvm_async_pf_not_present, TP_PROTO(u64 token, u64 gva), TP_ARGS(token, gva) ); DEFINE_EVENT(kvm_async_pf_nopresent_ready, kvm_async_pf_ready, TP_PROTO(u64 token, u64 gva), TP_ARGS(token, gva) ); TRACE_EVENT( kvm_async_pf_completed, TP_PROTO(unsigned long address, u64 gva), TP_ARGS(address, gva), TP_STRUCT__entry( __field(unsigned long, address) __field(u64, gva) ), TP_fast_assign( __entry->address = address; __entry->gva = gva; ), TP_printk("gva %#llx address %#lx", __entry->gva, __entry->address) ); #endif TRACE_EVENT(kvm_halt_poll_ns, TP_PROTO(bool grow, unsigned int vcpu_id, unsigned int new, unsigned int old), TP_ARGS(grow, vcpu_id, new, old), TP_STRUCT__entry( __field(bool, grow) __field(unsigned int, vcpu_id) __field(unsigned int, new) __field(unsigned int, old) ), TP_fast_assign( __entry->grow = grow; __entry->vcpu_id = vcpu_id; __entry->new = new; __entry->old = old; ), TP_printk("vcpu %u: halt_poll_ns %u (%s %u)", __entry->vcpu_id, __entry->new, __entry->grow ? "grow" : "shrink", __entry->old) ); #define trace_kvm_halt_poll_ns_grow(vcpu_id, new, old) \ trace_kvm_halt_poll_ns(true, vcpu_id, new, old) #define trace_kvm_halt_poll_ns_shrink(vcpu_id, new, old) \ trace_kvm_halt_poll_ns(false, vcpu_id, new, old) TRACE_EVENT(kvm_dirty_ring_push, TP_PROTO(struct kvm_dirty_ring *ring, u32 slot, u64 offset), TP_ARGS(ring, slot, offset), TP_STRUCT__entry( __field(int, index) __field(u32, dirty_index) __field(u32, reset_index) __field(u32, slot) __field(u64, offset) ), TP_fast_assign( __entry->index = ring->index; __entry->dirty_index = ring->dirty_index; __entry->reset_index = ring->reset_index; __entry->slot = slot; __entry->offset = offset; ), TP_printk("ring %d: dirty 0x%x reset 0x%x " "slot %u offset 0x%llx (used %u)", __entry->index, __entry->dirty_index, __entry->reset_index, __entry->slot, __entry->offset, __entry->dirty_index - __entry->reset_index) ); TRACE_EVENT(kvm_dirty_ring_reset, TP_PROTO(struct kvm_dirty_ring *ring), TP_ARGS(ring), TP_STRUCT__entry( __field(int, index) __field(u32, dirty_index) __field(u32, reset_index) ), TP_fast_assign( __entry->index = ring->index; __entry->dirty_index = ring->dirty_index; __entry->reset_index = ring->reset_index; ), TP_printk("ring %d: dirty 0x%x reset 0x%x (used %u)", __entry->index, __entry->dirty_index, __entry->reset_index, __entry->dirty_index - __entry->reset_index) ); TRACE_EVENT(kvm_dirty_ring_exit, TP_PROTO(struct kvm_vcpu *vcpu), TP_ARGS(vcpu), TP_STRUCT__entry( __field(int, vcpu_id) ), TP_fast_assign( __entry->vcpu_id = vcpu->vcpu_id; ), TP_printk("vcpu %d", __entry->vcpu_id) ); TRACE_EVENT(kvm_unmap_hva_range, TP_PROTO(unsigned long start, unsigned long end), TP_ARGS(start, end), TP_STRUCT__entry( __field( unsigned long, start ) __field( unsigned long, end ) ), TP_fast_assign( __entry->start = start; __entry->end = end; ), TP_printk("mmu notifier unmap range: %#016lx -- %#016lx", __entry->start, __entry->end) ); TRACE_EVENT(kvm_age_hva, TP_PROTO(unsigned long start, unsigned long end), TP_ARGS(start, end), TP_STRUCT__entry( __field( unsigned long, start ) __field( unsigned long, end ) ), TP_fast_assign( __entry->start = start; __entry->end = end; ), TP_printk("mmu notifier age hva: %#016lx -- %#016lx", __entry->start, __entry->end) ); TRACE_EVENT(kvm_test_age_hva, TP_PROTO(unsigned long hva), TP_ARGS(hva), TP_STRUCT__entry( __field( unsigned long, hva ) ), TP_fast_assign( __entry->hva = hva; ), TP_printk("mmu notifier test age hva: %#016lx", __entry->hva) ); #endif /* _TRACE_KVM_MAIN_H */ /* This part must be outside protection */ #include <trace/define_trace.h>
1849 335 25 1176 899 872 39 39 1000 1538 923 896 175 1533 808 320 1877 1856 38 1877 1096 362 260 137 697 1184 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 /* SPDX-License-Identifier: GPL-2.0-only */ /* * Based on arch/arm/include/asm/atomic.h * * Copyright (C) 1996 Russell King. * Copyright (C) 2002 Deep Blue Solutions Ltd. * Copyright (C) 2012 ARM Ltd. */ #ifndef __ASM_ATOMIC_LSE_H #define __ASM_ATOMIC_LSE_H #define ATOMIC_OP(op, asm_op) \ static __always_inline void \ __lse_atomic_##op(int i, atomic_t *v) \ { \ asm volatile( \ __LSE_PREAMBLE \ " " #asm_op " %w[i], %[v]\n" \ : [v] "+Q" (v->counter) \ : [i] "r" (i)); \ } ATOMIC_OP(andnot, stclr) ATOMIC_OP(or, stset) ATOMIC_OP(xor, steor) ATOMIC_OP(add, stadd) static __always_inline void __lse_atomic_sub(int i, atomic_t *v) { __lse_atomic_add(-i, v); } #undef ATOMIC_OP #define ATOMIC_FETCH_OP(name, mb, op, asm_op, cl...) \ static __always_inline int \ __lse_atomic_fetch_##op##name(int i, atomic_t *v) \ { \ int old; \ \ asm volatile( \ __LSE_PREAMBLE \ " " #asm_op #mb " %w[i], %w[old], %[v]" \ : [v] "+Q" (v->counter), \ [old] "=r" (old) \ : [i] "r" (i) \ : cl); \ \ return old; \ } #define ATOMIC_FETCH_OPS(op, asm_op) \ ATOMIC_FETCH_OP(_relaxed, , op, asm_op) \ ATOMIC_FETCH_OP(_acquire, a, op, asm_op, "memory") \ ATOMIC_FETCH_OP(_release, l, op, asm_op, "memory") \ ATOMIC_FETCH_OP( , al, op, asm_op, "memory") ATOMIC_FETCH_OPS(andnot, ldclr) ATOMIC_FETCH_OPS(or, ldset) ATOMIC_FETCH_OPS(xor, ldeor) ATOMIC_FETCH_OPS(add, ldadd) #undef ATOMIC_FETCH_OP #undef ATOMIC_FETCH_OPS #define ATOMIC_FETCH_OP_SUB(name) \ static __always_inline int \ __lse_atomic_fetch_sub##name(int i, atomic_t *v) \ { \ return __lse_atomic_fetch_add##name(-i, v); \ } ATOMIC_FETCH_OP_SUB(_relaxed) ATOMIC_FETCH_OP_SUB(_acquire) ATOMIC_FETCH_OP_SUB(_release) ATOMIC_FETCH_OP_SUB( ) #undef ATOMIC_FETCH_OP_SUB #define ATOMIC_OP_ADD_SUB_RETURN(name) \ static __always_inline int \ __lse_atomic_add_return##name(int i, atomic_t *v) \ { \ return __lse_atomic_fetch_add##name(i, v) + i; \ } \ \ static __always_inline int \ __lse_atomic_sub_return##name(int i, atomic_t *v) \ { \ return __lse_atomic_fetch_sub(i, v) - i; \ } ATOMIC_OP_ADD_SUB_RETURN(_relaxed) ATOMIC_OP_ADD_SUB_RETURN(_acquire) ATOMIC_OP_ADD_SUB_RETURN(_release) ATOMIC_OP_ADD_SUB_RETURN( ) #undef ATOMIC_OP_ADD_SUB_RETURN static __always_inline void __lse_atomic_and(int i, atomic_t *v) { return __lse_atomic_andnot(~i, v); } #define ATOMIC_FETCH_OP_AND(name, mb, cl...) \ static __always_inline int \ __lse_atomic_fetch_and##name(int i, atomic_t *v) \ { \ return __lse_atomic_fetch_andnot##name(~i, v); \ } ATOMIC_FETCH_OP_AND(_relaxed, ) ATOMIC_FETCH_OP_AND(_acquire, a, "memory") ATOMIC_FETCH_OP_AND(_release, l, "memory") ATOMIC_FETCH_OP_AND( , al, "memory") #undef ATOMIC_FETCH_OP_AND #define ATOMIC64_OP(op, asm_op) \ static __always_inline void \ __lse_atomic64_##op(s64 i, atomic64_t *v) \ { \ asm volatile( \ __LSE_PREAMBLE \ " " #asm_op " %[i], %[v]\n" \ : [v] "+Q" (v->counter) \ : [i] "r" (i)); \ } ATOMIC64_OP(andnot, stclr) ATOMIC64_OP(or, stset) ATOMIC64_OP(xor, steor) ATOMIC64_OP(add, stadd) static __always_inline void __lse_atomic64_sub(s64 i, atomic64_t *v) { __lse_atomic64_add(-i, v); } #undef ATOMIC64_OP #define ATOMIC64_FETCH_OP(name, mb, op, asm_op, cl...) \ static __always_inline long \ __lse_atomic64_fetch_##op##name(s64 i, atomic64_t *v) \ { \ s64 old; \ \ asm volatile( \ __LSE_PREAMBLE \ " " #asm_op #mb " %[i], %[old], %[v]" \ : [v] "+Q" (v->counter), \ [old] "=r" (old) \ : [i] "r" (i) \ : cl); \ \ return old; \ } #define ATOMIC64_FETCH_OPS(op, asm_op) \ ATOMIC64_FETCH_OP(_relaxed, , op, asm_op) \ ATOMIC64_FETCH_OP(_acquire, a, op, asm_op, "memory") \ ATOMIC64_FETCH_OP(_release, l, op, asm_op, "memory") \ ATOMIC64_FETCH_OP( , al, op, asm_op, "memory") ATOMIC64_FETCH_OPS(andnot, ldclr) ATOMIC64_FETCH_OPS(or, ldset) ATOMIC64_FETCH_OPS(xor, ldeor) ATOMIC64_FETCH_OPS(add, ldadd) #undef ATOMIC64_FETCH_OP #undef ATOMIC64_FETCH_OPS #define ATOMIC64_FETCH_OP_SUB(name) \ static __always_inline long \ __lse_atomic64_fetch_sub##name(s64 i, atomic64_t *v) \ { \ return __lse_atomic64_fetch_add##name(-i, v); \ } ATOMIC64_FETCH_OP_SUB(_relaxed) ATOMIC64_FETCH_OP_SUB(_acquire) ATOMIC64_FETCH_OP_SUB(_release) ATOMIC64_FETCH_OP_SUB( ) #undef ATOMIC64_FETCH_OP_SUB #define ATOMIC64_OP_ADD_SUB_RETURN(name) \ static __always_inline long \ __lse_atomic64_add_return##name(s64 i, atomic64_t *v) \ { \ return __lse_atomic64_fetch_add##name(i, v) + i; \ } \ \ static __always_inline long \ __lse_atomic64_sub_return##name(s64 i, atomic64_t *v) \ { \ return __lse_atomic64_fetch_sub##name(i, v) - i; \ } ATOMIC64_OP_ADD_SUB_RETURN(_relaxed) ATOMIC64_OP_ADD_SUB_RETURN(_acquire) ATOMIC64_OP_ADD_SUB_RETURN(_release) ATOMIC64_OP_ADD_SUB_RETURN( ) #undef ATOMIC64_OP_ADD_SUB_RETURN static __always_inline void __lse_atomic64_and(s64 i, atomic64_t *v) { return __lse_atomic64_andnot(~i, v); } #define ATOMIC64_FETCH_OP_AND(name, mb, cl...) \ static __always_inline long \ __lse_atomic64_fetch_and##name(s64 i, atomic64_t *v) \ { \ return __lse_atomic64_fetch_andnot##name(~i, v); \ } ATOMIC64_FETCH_OP_AND(_relaxed, ) ATOMIC64_FETCH_OP_AND(_acquire, a, "memory") ATOMIC64_FETCH_OP_AND(_release, l, "memory") ATOMIC64_FETCH_OP_AND( , al, "memory") #undef ATOMIC64_FETCH_OP_AND static __always_inline s64 __lse_atomic64_dec_if_positive(atomic64_t *v) { unsigned long tmp; asm volatile( __LSE_PREAMBLE "1: ldr %x[tmp], %[v]\n" " subs %[ret], %x[tmp], #1\n" " b.lt 2f\n" " casal %x[tmp], %[ret], %[v]\n" " sub %x[tmp], %x[tmp], #1\n" " sub %x[tmp], %x[tmp], %[ret]\n" " cbnz %x[tmp], 1b\n" "2:" : [ret] "+&r" (v), [v] "+Q" (v->counter), [tmp] "=&r" (tmp) : : "cc", "memory"); return (long)v; } #define __CMPXCHG_CASE(w, sfx, name, sz, mb, cl...) \ static __always_inline u##sz \ __lse__cmpxchg_case_##name##sz(volatile void *ptr, \ u##sz old, \ u##sz new) \ { \ asm volatile( \ __LSE_PREAMBLE \ " cas" #mb #sfx " %" #w "[old], %" #w "[new], %[v]\n" \ : [v] "+Q" (*(u##sz *)ptr), \ [old] "+r" (old) \ : [new] "rZ" (new) \ : cl); \ \ return old; \ } __CMPXCHG_CASE(w, b, , 8, ) __CMPXCHG_CASE(w, h, , 16, ) __CMPXCHG_CASE(w, , , 32, ) __CMPXCHG_CASE(x, , , 64, ) __CMPXCHG_CASE(w, b, acq_, 8, a, "memory") __CMPXCHG_CASE(w, h, acq_, 16, a, "memory") __CMPXCHG_CASE(w, , acq_, 32, a, "memory") __CMPXCHG_CASE(x, , acq_, 64, a, "memory") __CMPXCHG_CASE(w, b, rel_, 8, l, "memory") __CMPXCHG_CASE(w, h, rel_, 16, l, "memory") __CMPXCHG_CASE(w, , rel_, 32, l, "memory") __CMPXCHG_CASE(x, , rel_, 64, l, "memory") __CMPXCHG_CASE(w, b, mb_, 8, al, "memory") __CMPXCHG_CASE(w, h, mb_, 16, al, "memory") __CMPXCHG_CASE(w, , mb_, 32, al, "memory") __CMPXCHG_CASE(x, , mb_, 64, al, "memory") #undef __CMPXCHG_CASE #define __CMPXCHG128(name, mb, cl...) \ static __always_inline u128 \ __lse__cmpxchg128##name(volatile u128 *ptr, u128 old, u128 new) \ { \ union __u128_halves r, o = { .full = (old) }, \ n = { .full = (new) }; \ register unsigned long x0 asm ("x0") = o.low; \ register unsigned long x1 asm ("x1") = o.high; \ register unsigned long x2 asm ("x2") = n.low; \ register unsigned long x3 asm ("x3") = n.high; \ register unsigned long x4 asm ("x4") = (unsigned long)ptr; \ \ asm volatile( \ __LSE_PREAMBLE \ " casp" #mb "\t%[old1], %[old2], %[new1], %[new2], %[v]\n"\ : [old1] "+&r" (x0), [old2] "+&r" (x1), \ [v] "+Q" (*(u128 *)ptr) \ : [new1] "r" (x2), [new2] "r" (x3), [ptr] "r" (x4), \ [oldval1] "r" (o.low), [oldval2] "r" (o.high) \ : cl); \ \ r.low = x0; r.high = x1; \ \ return r.full; \ } __CMPXCHG128( , ) __CMPXCHG128(_mb, al, "memory") #undef __CMPXCHG128 #endif /* __ASM_ATOMIC_LSE_H */
295 295 249 310 311 305 295 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 // SPDX-License-Identifier: GPL-2.0 /* * VMID allocator. * * Based on Arm64 ASID allocator algorithm. * Please refer arch/arm64/mm/context.c for detailed * comments on algorithm. * * Copyright (C) 2002-2003 Deep Blue Solutions Ltd, all rights reserved. * Copyright (C) 2012 ARM Ltd. */ #include <linux/bitfield.h> #include <linux/bitops.h> #include <asm/kvm_asm.h> #include <asm/kvm_mmu.h> unsigned int __ro_after_init kvm_arm_vmid_bits; static DEFINE_RAW_SPINLOCK(cpu_vmid_lock); static atomic64_t vmid_generation; static unsigned long *vmid_map; static DEFINE_PER_CPU(atomic64_t, active_vmids); static DEFINE_PER_CPU(u64, reserved_vmids); #define VMID_MASK (~GENMASK(kvm_arm_vmid_bits - 1, 0)) #define VMID_FIRST_VERSION (1UL << kvm_arm_vmid_bits) #define NUM_USER_VMIDS VMID_FIRST_VERSION #define vmid2idx(vmid) ((vmid) & ~VMID_MASK) #define idx2vmid(idx) vmid2idx(idx) /* * As vmid #0 is always reserved, we will never allocate one * as below and can be treated as invalid. This is used to * set the active_vmids on vCPU schedule out. */ #define VMID_ACTIVE_INVALID VMID_FIRST_VERSION #define vmid_gen_match(vmid) \ (!(((vmid) ^ atomic64_read(&vmid_generation)) >> kvm_arm_vmid_bits)) static void flush_context(void) { int cpu; u64 vmid; bitmap_zero(vmid_map, NUM_USER_VMIDS); for_each_possible_cpu(cpu) { vmid = atomic64_xchg_relaxed(&per_cpu(active_vmids, cpu), 0); /* Preserve reserved VMID */ if (vmid == 0) vmid = per_cpu(reserved_vmids, cpu); __set_bit(vmid2idx(vmid), vmid_map); per_cpu(reserved_vmids, cpu) = vmid; } /* * Unlike ASID allocator, we expect less frequent rollover in * case of VMIDs. Hence, instead of marking the CPU as * flush_pending and issuing a local context invalidation on * the next context-switch, we broadcast TLB flush + I-cache * invalidation over the inner shareable domain on rollover. */ kvm_call_hyp(__kvm_flush_vm_context); } static bool check_update_reserved_vmid(u64 vmid, u64 newvmid) { int cpu; bool hit = false; /* * Iterate over the set of reserved VMIDs looking for a match * and update to use newvmid (i.e. the same VMID in the current * generation). */ for_each_possible_cpu(cpu) { if (per_cpu(reserved_vmids, cpu) == vmid) { hit = true; per_cpu(reserved_vmids, cpu) = newvmid; } } return hit; } static u64 new_vmid(struct kvm_vmid *kvm_vmid) { static u32 cur_idx = 1; u64 vmid = atomic64_read(&kvm_vmid->id); u64 generation = atomic64_read(&vmid_generation); if (vmid != 0) { u64 newvmid = generation | (vmid & ~VMID_MASK); if (check_update_reserved_vmid(vmid, newvmid)) { atomic64_set(&kvm_vmid->id, newvmid); return newvmid; } if (!__test_and_set_bit(vmid2idx(vmid), vmid_map)) { atomic64_set(&kvm_vmid->id, newvmid); return newvmid; } } vmid = find_next_zero_bit(vmid_map, NUM_USER_VMIDS, cur_idx); if (vmid != NUM_USER_VMIDS) goto set_vmid; /* We're out of VMIDs, so increment the global generation count */ generation = atomic64_add_return_relaxed(VMID_FIRST_VERSION, &vmid_generation); flush_context(); /* We have more VMIDs than CPUs, so this will always succeed */ vmid = find_next_zero_bit(vmid_map, NUM_USER_VMIDS, 1); set_vmid: __set_bit(vmid, vmid_map); cur_idx = vmid; vmid = idx2vmid(vmid) | generation; atomic64_set(&kvm_vmid->id, vmid); return vmid; } /* Called from vCPU sched out with preemption disabled */ void kvm_arm_vmid_clear_active(void) { atomic64_set(this_cpu_ptr(&active_vmids), VMID_ACTIVE_INVALID); } void kvm_arm_vmid_update(struct kvm_vmid *kvm_vmid) { unsigned long flags; u64 vmid, old_active_vmid; vmid = atomic64_read(&kvm_vmid->id); /* * Please refer comments in check_and_switch_context() in * arch/arm64/mm/context.c. * * Unlike ASID allocator, we set the active_vmids to * VMID_ACTIVE_INVALID on vCPU schedule out to avoid * reserving the VMID space needlessly on rollover. * Hence explicitly check here for a "!= 0" to * handle the sync with a concurrent rollover. */ old_active_vmid = atomic64_read(this_cpu_ptr(&active_vmids)); if (old_active_vmid != 0 && vmid_gen_match(vmid) && 0 != atomic64_cmpxchg_relaxed(this_cpu_ptr(&active_vmids), old_active_vmid, vmid)) return; raw_spin_lock_irqsave(&cpu_vmid_lock, flags); /* Check that our VMID belongs to the current generation. */ vmid = atomic64_read(&kvm_vmid->id); if (!vmid_gen_match(vmid)) vmid = new_vmid(kvm_vmid); atomic64_set(this_cpu_ptr(&active_vmids), vmid); raw_spin_unlock_irqrestore(&cpu_vmid_lock, flags); } /* * Initialize the VMID allocator */ int __init kvm_arm_vmid_alloc_init(void) { kvm_arm_vmid_bits = kvm_get_vmid_bits(); /* * Expect allocation after rollover to fail if we don't have * at least one more VMID than CPUs. VMID #0 is always reserved. */ WARN_ON(NUM_USER_VMIDS - 1 <= num_possible_cpus()); atomic64_set(&vmid_generation, VMID_FIRST_VERSION); vmid_map = bitmap_zalloc(NUM_USER_VMIDS, GFP_KERNEL); if (!vmid_map) return -ENOMEM; return 0; } void __init kvm_arm_vmid_alloc_free(void) { bitmap_free(vmid_map); }
6 6 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 1517 1518 1519 1520 1521 1522 1523 1524 1525 1526 1527 1528 1529 1530 1531 1532 1533 1534 1535 1536 1537 1538 1539 1540 1541 1542 1543 1544 1545 1546 1547 1548 1549 1550 1551 1552 1553 1554 1555 1556 1557 1558 1559 1560 1561 1562 1563 1564 1565 1566 1567 1568 1569 1570 1571 1572 1573 1574 1575 1576 1577 1578 1579 1580 1581 1582 1583 1584 1585 1586 1587 1588 1589 1590 1591 1592 1593 1594 1595 1596 1597 1598 1599 1600 1601 1602 1603 1604 1605 1606 1607 1608 1609 1610 1611 1612 1613 1614 1615 1616 1617 1618 1619 1620 1621 1622 1623 1624 1625 1626 1627 1628 1629 1630 1631 1632 1633 1634 1635 1636 1637 1638 1639 1640 1641 1642 1643 1644 1645 1646 1647 1648 1649 1650 1651 1652 1653 1654 1655 1656 1657 1658 1659 1660 1661 1662 1663 1664 1665 1666 1667 1668 1669 1670 1671 1672 1673 1674 1675 1676 1677 1678 1679 1680 1681 1682 1683 1684 1685 1686 1687 1688 1689 1690 1691 1692 1693 1694 1695 1696 1697 1698 1699 1700 1701 1702 1703 1704 1705 1706 1707 1708 1709 1710 1711 1712 1713 1714 1715 1716 1717 1718 1719 1720 1721 1722 1723 1724 1725 1726 1727 1728 1729 1730 1731 1732 1733 1734 1735 1736 1737 1738 1739 1740 1741 1742 1743 1744 1745 1746 1747 1748 1749 1750 1751 1752 1753 1754 1755 1756 1757 1758 1759 1760 1761 1762 1763 1764 1765 1766 1767 1768 1769 1770 1771 1772 1773 1774 1775 1776 1777 1778 1779 1780 1781 1782 1783 1784 1785 1786 1787 1788 1789 1790 1791 1792 1793 1794 1795 1796 1797 1798 1799 1800 1801 1802 1803 1804 1805 1806 1807 1808 1809 1810 // SPDX-License-Identifier: GPL-2.0 /* XDP sockets * * AF_XDP sockets allows a channel between XDP programs and userspace * applications. * Copyright(c) 2018 Intel Corporation. * * Author(s): Björn Töpel <bjorn.topel@intel.com> * Magnus Karlsson <magnus.karlsson@intel.com> */ #define pr_fmt(fmt) "AF_XDP: %s: " fmt, __func__ #include <linux/if_xdp.h> #include <linux/init.h> #include <linux/sched/mm.h> #include <linux/sched/signal.h> #include <linux/sched/task.h> #include <linux/socket.h> #include <linux/file.h> #include <linux/uaccess.h> #include <linux/net.h> #include <linux/netdevice.h> #include <linux/rculist.h> #include <linux/vmalloc.h> #include <net/xdp_sock_drv.h> #include <net/busy_poll.h> #include <net/netdev_lock.h> #include <net/netdev_rx_queue.h> #include <net/xdp.h> #include "xsk_queue.h" #include "xdp_umem.h" #include "xsk.h" #define TX_BATCH_SIZE 32 #define MAX_PER_SOCKET_BUDGET (TX_BATCH_SIZE) void xsk_set_rx_need_wakeup(struct xsk_buff_pool *pool) { if (pool->cached_need_wakeup & XDP_WAKEUP_RX) return; pool->fq->ring->flags |= XDP_RING_NEED_WAKEUP; pool->cached_need_wakeup |= XDP_WAKEUP_RX; } EXPORT_SYMBOL(xsk_set_rx_need_wakeup); void xsk_set_tx_need_wakeup(struct xsk_buff_pool *pool) { struct xdp_sock *xs; if (pool->cached_need_wakeup & XDP_WAKEUP_TX) return; rcu_read_lock(); list_for_each_entry_rcu(xs, &pool->xsk_tx_list, tx_list) { xs->tx->ring->flags |= XDP_RING_NEED_WAKEUP; } rcu_read_unlock(); pool->cached_need_wakeup |= XDP_WAKEUP_TX; } EXPORT_SYMBOL(xsk_set_tx_need_wakeup); void xsk_clear_rx_need_wakeup(struct xsk_buff_pool *pool) { if (!(pool->cached_need_wakeup & XDP_WAKEUP_RX)) return; pool->fq->ring->flags &= ~XDP_RING_NEED_WAKEUP; pool->cached_need_wakeup &= ~XDP_WAKEUP_RX; } EXPORT_SYMBOL(xsk_clear_rx_need_wakeup); void xsk_clear_tx_need_wakeup(struct xsk_buff_pool *pool) { struct xdp_sock *xs; if (!(pool->cached_need_wakeup & XDP_WAKEUP_TX)) return; rcu_read_lock(); list_for_each_entry_rcu(xs, &pool->xsk_tx_list, tx_list) { xs->tx->ring->flags &= ~XDP_RING_NEED_WAKEUP; } rcu_read_unlock(); pool->cached_need_wakeup &= ~XDP_WAKEUP_TX; } EXPORT_SYMBOL(xsk_clear_tx_need_wakeup); bool xsk_uses_need_wakeup(struct xsk_buff_pool *pool) { return pool->uses_need_wakeup; } EXPORT_SYMBOL(xsk_uses_need_wakeup); struct xsk_buff_pool *xsk_get_pool_from_qid(struct net_device *dev, u16 queue_id) { if (queue_id < dev->real_num_rx_queues) return dev->_rx[queue_id].pool; if (queue_id < dev->real_num_tx_queues) return dev->_tx[queue_id].pool; return NULL; } EXPORT_SYMBOL(xsk_get_pool_from_qid); void xsk_clear_pool_at_qid(struct net_device *dev, u16 queue_id) { if (queue_id < dev->num_rx_queues) dev->_rx[queue_id].pool = NULL; if (queue_id < dev->num_tx_queues) dev->_tx[queue_id].pool = NULL; } /* The buffer pool is stored both in the _rx struct and the _tx struct as we do * not know if the device has more tx queues than rx, or the opposite. * This might also change during run time. */ int xsk_reg_pool_at_qid(struct net_device *dev, struct xsk_buff_pool *pool, u16 queue_id) { if (queue_id >= max_t(unsigned int, dev->real_num_rx_queues, dev->real_num_tx_queues)) return -EINVAL; if (queue_id < dev->real_num_rx_queues) dev->_rx[queue_id].pool = pool; if (queue_id < dev->real_num_tx_queues) dev->_tx[queue_id].pool = pool; return 0; } static int __xsk_rcv_zc(struct xdp_sock *xs, struct xdp_buff_xsk *xskb, u32 len, u32 flags) { u64 addr; int err; addr = xp_get_handle(xskb, xskb->pool); err = xskq_prod_reserve_desc(xs->rx, addr, len, flags); if (err) { xs->rx_queue_full++; return err; } xp_release(xskb); return 0; } static int xsk_rcv_zc(struct xdp_sock *xs, struct xdp_buff *xdp, u32 len) { struct xdp_buff_xsk *xskb = container_of(xdp, struct xdp_buff_xsk, xdp); u32 frags = xdp_buff_has_frags(xdp); struct xdp_buff_xsk *pos, *tmp; struct list_head *xskb_list; u32 contd = 0; int err; if (frags) contd = XDP_PKT_CONTD; err = __xsk_rcv_zc(xs, xskb, len, contd); if (err) goto err; if (likely(!frags)) return 0; xskb_list = &xskb->pool->xskb_list; list_for_each_entry_safe(pos, tmp, xskb_list, list_node) { if (list_is_singular(xskb_list)) contd = 0; len = pos->xdp.data_end - pos->xdp.data; err = __xsk_rcv_zc(xs, pos, len, contd); if (err) goto err; list_del(&pos->list_node); } return 0; err: xsk_buff_free(xdp); return err; } static void *xsk_copy_xdp_start(struct xdp_buff *from) { if (unlikely(xdp_data_meta_unsupported(from))) return from->data; else return from->data_meta; } static u32 xsk_copy_xdp(void *to, void **from, u32 to_len, u32 *from_len, skb_frag_t **frag, u32 rem) { u32 copied = 0; while (1) { u32 copy_len = min_t(u32, *from_len, to_len); memcpy(to, *from, copy_len); copied += copy_len; if (rem == copied) return copied; if (*from_len == copy_len) { *from = skb_frag_address(*frag); *from_len = skb_frag_size((*frag)++); } else { *from += copy_len; *from_len -= copy_len; } if (to_len == copy_len) return copied; to_len -= copy_len; to += copy_len; } } static int __xsk_rcv(struct xdp_sock *xs, struct xdp_buff *xdp, u32 len) { u32 frame_size = xsk_pool_get_rx_frame_size(xs->pool); void *copy_from = xsk_copy_xdp_start(xdp), *copy_to; u32 from_len, meta_len, rem, num_desc; struct xdp_buff_xsk *xskb; struct xdp_buff *xsk_xdp; skb_frag_t *frag; from_len = xdp->data_end - copy_from; meta_len = xdp->data - copy_from; rem = len + meta_len; if (len <= frame_size && !xdp_buff_has_frags(xdp)) { int err; xsk_xdp = xsk_buff_alloc(xs->pool); if (!xsk_xdp) { xs->rx_dropped++; return -ENOMEM; } memcpy(xsk_xdp->data - meta_len, copy_from, rem); xskb = container_of(xsk_xdp, struct xdp_buff_xsk, xdp); err = __xsk_rcv_zc(xs, xskb, len, 0); if (err) { xsk_buff_free(xsk_xdp); return err; } return 0; } num_desc = (len - 1) / frame_size + 1; if (!xsk_buff_can_alloc(xs->pool, num_desc)) { xs->rx_dropped++; return -ENOMEM; } if (xskq_prod_nb_free(xs->rx, num_desc) < num_desc) { xs->rx_queue_full++; return -ENOBUFS; } if (xdp_buff_has_frags(xdp)) { struct skb_shared_info *sinfo; sinfo = xdp_get_shared_info_from_buff(xdp); frag = &sinfo->frags[0]; } do { u32 to_len = frame_size + meta_len; u32 copied; xsk_xdp = xsk_buff_alloc(xs->pool); copy_to = xsk_xdp->data - meta_len; copied = xsk_copy_xdp(copy_to, &copy_from, to_len, &from_len, &frag, rem); rem -= copied; xskb = container_of(xsk_xdp, struct xdp_buff_xsk, xdp); __xsk_rcv_zc(xs, xskb, copied - meta_len, rem ? XDP_PKT_CONTD : 0); meta_len = 0; } while (rem); return 0; } static bool xsk_tx_writeable(struct xdp_sock *xs) { if (xskq_cons_present_entries(xs->tx) > xs->tx->nentries / 2) return false; return true; } static bool xsk_is_bound(struct xdp_sock *xs) { if (READ_ONCE(xs->state) == XSK_BOUND) { /* Matches smp_wmb() in bind(). */ smp_rmb(); return true; } return false; } static int xsk_rcv_check(struct xdp_sock *xs, struct xdp_buff *xdp, u32 len) { if (!xsk_is_bound(xs)) return -ENXIO; if (xs->dev != xdp->rxq->dev || xs->queue_id != xdp->rxq->queue_index) return -EINVAL; if (len > xsk_pool_get_rx_frame_size(xs->pool) && !xs->sg) { xs->rx_dropped++; return -ENOSPC; } return 0; } static void xsk_flush(struct xdp_sock *xs) { xskq_prod_submit(xs->rx); __xskq_cons_release(xs->pool->fq); sock_def_readable(&xs->sk); } int xsk_generic_rcv(struct xdp_sock *xs, struct xdp_buff *xdp) { u32 len = xdp_get_buff_len(xdp); int err; err = xsk_rcv_check(xs, xdp, len); if (!err) { spin_lock_bh(&xs->pool->rx_lock); err = __xsk_rcv(xs, xdp, len); xsk_flush(xs); spin_unlock_bh(&xs->pool->rx_lock); } return err; } static int xsk_rcv(struct xdp_sock *xs, struct xdp_buff *xdp) { u32 len = xdp_get_buff_len(xdp); int err; err = xsk_rcv_check(xs, xdp, len); if (err) return err; if (xdp->rxq->mem.type == MEM_TYPE_XSK_BUFF_POOL) { len = xdp->data_end - xdp->data; return xsk_rcv_zc(xs, xdp, len); } err = __xsk_rcv(xs, xdp, len); if (!err) xdp_return_buff(xdp); return err; } int __xsk_map_redirect(struct xdp_sock *xs, struct xdp_buff *xdp) { int err; err = xsk_rcv(xs, xdp); if (err) return err; if (!xs->flush_node.prev) { struct list_head *flush_list = bpf_net_ctx_get_xskmap_flush_list(); list_add(&xs->flush_node, flush_list); } return 0; } void __xsk_map_flush(struct list_head *flush_list) { struct xdp_sock *xs, *tmp; list_for_each_entry_safe(xs, tmp, flush_list, flush_node) { xsk_flush(xs); __list_del_clearprev(&xs->flush_node); } } void xsk_tx_completed(struct xsk_buff_pool *pool, u32 nb_entries) { xskq_prod_submit_n(pool->cq, nb_entries); } EXPORT_SYMBOL(xsk_tx_completed); void xsk_tx_release(struct xsk_buff_pool *pool) { struct xdp_sock *xs; rcu_read_lock(); list_for_each_entry_rcu(xs, &pool->xsk_tx_list, tx_list) { __xskq_cons_release(xs->tx); if (xsk_tx_writeable(xs)) xs->sk.sk_write_space(&xs->sk); } rcu_read_unlock(); } EXPORT_SYMBOL(xsk_tx_release); bool xsk_tx_peek_desc(struct xsk_buff_pool *pool, struct xdp_desc *desc) { bool budget_exhausted = false; struct xdp_sock *xs; rcu_read_lock(); again: list_for_each_entry_rcu(xs, &pool->xsk_tx_list, tx_list) { if (xs->tx_budget_spent >= MAX_PER_SOCKET_BUDGET) { budget_exhausted = true; continue; } if (!xskq_cons_peek_desc(xs->tx, desc, pool)) { if (xskq_has_descs(xs->tx)) xskq_cons_release(xs->tx); continue; } xs->tx_budget_spent++; /* This is the backpressure mechanism for the Tx path. * Reserve space in the completion queue and only proceed * if there is space in it. This avoids having to implement * any buffering in the Tx path. */ if (xskq_prod_reserve_addr(pool->cq, desc->addr)) goto out; xskq_cons_release(xs->tx); rcu_read_unlock(); return true; } if (budget_exhausted) { list_for_each_entry_rcu(xs, &pool->xsk_tx_list, tx_list) xs->tx_budget_spent = 0; budget_exhausted = false; goto again; } out: rcu_read_unlock(); return false; } EXPORT_SYMBOL(xsk_tx_peek_desc); static u32 xsk_tx_peek_release_fallback(struct xsk_buff_pool *pool, u32 max_entries) { struct xdp_desc *descs = pool->tx_descs; u32 nb_pkts = 0; while (nb_pkts < max_entries && xsk_tx_peek_desc(pool, &descs[nb_pkts])) nb_pkts++; xsk_tx_release(pool); return nb_pkts; } u32 xsk_tx_peek_release_desc_batch(struct xsk_buff_pool *pool, u32 nb_pkts) { struct xdp_sock *xs; rcu_read_lock(); if (!list_is_singular(&pool->xsk_tx_list)) { /* Fallback to the non-batched version */ rcu_read_unlock(); return xsk_tx_peek_release_fallback(pool, nb_pkts); } xs = list_first_or_null_rcu(&pool->xsk_tx_list, struct xdp_sock, tx_list); if (!xs) { nb_pkts = 0; goto out; } nb_pkts = xskq_cons_nb_entries(xs->tx, nb_pkts); /* This is the backpressure mechanism for the Tx path. Try to * reserve space in the completion queue for all packets, but * if there are fewer slots available, just process that many * packets. This avoids having to implement any buffering in * the Tx path. */ nb_pkts = xskq_prod_nb_free(pool->cq, nb_pkts); if (!nb_pkts) goto out; nb_pkts = xskq_cons_read_desc_batch(xs->tx, pool, nb_pkts); if (!nb_pkts) { xs->tx->queue_empty_descs++; goto out; } __xskq_cons_release(xs->tx); xskq_prod_write_addr_batch(pool->cq, pool->tx_descs, nb_pkts); xs->sk.sk_write_space(&xs->sk); out: rcu_read_unlock(); return nb_pkts; } EXPORT_SYMBOL(xsk_tx_peek_release_desc_batch); static int xsk_wakeup(struct xdp_sock *xs, u8 flags) { struct net_device *dev = xs->dev; return dev->netdev_ops->ndo_xsk_wakeup(dev, xs->queue_id, flags); } static int xsk_cq_reserve_addr_locked(struct xsk_buff_pool *pool, u64 addr) { unsigned long flags; int ret; spin_lock_irqsave(&pool->cq_lock, flags); ret = xskq_prod_reserve_addr(pool->cq, addr); spin_unlock_irqrestore(&pool->cq_lock, flags); return ret; } static void xsk_cq_submit_locked(struct xsk_buff_pool *pool, u32 n) { unsigned long flags; spin_lock_irqsave(&pool->cq_lock, flags); xskq_prod_submit_n(pool->cq, n); spin_unlock_irqrestore(&pool->cq_lock, flags); } static void xsk_cq_cancel_locked(struct xsk_buff_pool *pool, u32 n) { unsigned long flags; spin_lock_irqsave(&pool->cq_lock, flags); xskq_prod_cancel_n(pool->cq, n); spin_unlock_irqrestore(&pool->cq_lock, flags); } static u32 xsk_get_num_desc(struct sk_buff *skb) { return skb ? (long)skb_shinfo(skb)->destructor_arg : 0; } static void xsk_destruct_skb(struct sk_buff *skb) { struct xsk_tx_metadata_compl *compl = &skb_shinfo(skb)->xsk_meta; if (compl->tx_timestamp) { /* sw completion timestamp, not a real one */ *compl->tx_timestamp = ktime_get_tai_fast_ns(); } xsk_cq_submit_locked(xdp_sk(skb->sk)->pool, xsk_get_num_desc(skb)); sock_wfree(skb); } static void xsk_set_destructor_arg(struct sk_buff *skb) { long num = xsk_get_num_desc(xdp_sk(skb->sk)->skb) + 1; skb_shinfo(skb)->destructor_arg = (void *)num; } static void xsk_consume_skb(struct sk_buff *skb) { struct xdp_sock *xs = xdp_sk(skb->sk); skb->destructor = sock_wfree; xsk_cq_cancel_locked(xs->pool, xsk_get_num_desc(skb)); /* Free skb without triggering the perf drop trace */ consume_skb(skb); xs->skb = NULL; } static void xsk_drop_skb(struct sk_buff *skb) { xdp_sk(skb->sk)->tx->invalid_descs += xsk_get_num_desc(skb); xsk_consume_skb(skb); } static struct sk_buff *xsk_build_skb_zerocopy(struct xdp_sock *xs, struct xdp_desc *desc) { struct xsk_buff_pool *pool = xs->pool; u32 hr, len, ts, offset, copy, copied; struct sk_buff *skb = xs->skb; struct page *page; void *buffer; int err, i; u64 addr; if (!skb) { hr = max(NET_SKB_PAD, L1_CACHE_ALIGN(xs->dev->needed_headroom)); skb = sock_alloc_send_skb(&xs->sk, hr, 1, &err); if (unlikely(!skb)) return ERR_PTR(err); skb_reserve(skb, hr); } addr = desc->addr; len = desc->len; ts = pool->unaligned ? len : pool->chunk_size; buffer = xsk_buff_raw_get_data(pool, addr); offset = offset_in_page(buffer); addr = buffer - pool->addrs; for (copied = 0, i = skb_shinfo(skb)->nr_frags; copied < len; i++) { if (unlikely(i >= MAX_SKB_FRAGS)) return ERR_PTR(-EOVERFLOW); page = pool->umem->pgs[addr >> PAGE_SHIFT]; get_page(page); copy = min_t(u32, PAGE_SIZE - offset, len - copied); skb_fill_page_desc(skb, i, page, offset, copy); copied += copy; addr += copy; offset = 0; } skb->len += len; skb->data_len += len; skb->truesize += ts; refcount_add(ts, &xs->sk.sk_wmem_alloc); return skb; } static struct sk_buff *xsk_build_skb(struct xdp_sock *xs, struct xdp_desc *desc) { struct xsk_tx_metadata *meta = NULL; struct net_device *dev = xs->dev; struct sk_buff *skb = xs->skb; bool first_frag = false; int err; if (dev->priv_flags & IFF_TX_SKB_NO_LINEAR) { skb = xsk_build_skb_zerocopy(xs, desc); if (IS_ERR(skb)) { err = PTR_ERR(skb); goto free_err; } } else { u32 hr, tr, len; void *buffer; buffer = xsk_buff_raw_get_data(xs->pool, desc->addr); len = desc->len; if (!skb) { first_frag = true; hr = max(NET_SKB_PAD, L1_CACHE_ALIGN(dev->needed_headroom)); tr = dev->needed_tailroom; skb = sock_alloc_send_skb(&xs->sk, hr + len + tr, 1, &err); if (unlikely(!skb)) goto free_err; skb_reserve(skb, hr); skb_put(skb, len); err = skb_store_bits(skb, 0, buffer, len); if (unlikely(err)) goto free_err; } else { int nr_frags = skb_shinfo(skb)->nr_frags; struct page *page; u8 *vaddr; if (unlikely(nr_frags == (MAX_SKB_FRAGS - 1) && xp_mb_desc(desc))) { err = -EOVERFLOW; goto free_err; } page = alloc_page(xs->sk.sk_allocation); if (unlikely(!page)) { err = -EAGAIN; goto free_err; } vaddr = kmap_local_page(page); memcpy(vaddr, buffer, len); kunmap_local(vaddr); skb_add_rx_frag(skb, nr_frags, page, 0, len, PAGE_SIZE); refcount_add(PAGE_SIZE, &xs->sk.sk_wmem_alloc); } if (first_frag && desc->options & XDP_TX_METADATA) { if (unlikely(xs->pool->tx_metadata_len == 0)) { err = -EINVAL; goto free_err; } meta = buffer - xs->pool->tx_metadata_len; if (unlikely(!xsk_buff_valid_tx_metadata(meta))) { err = -EINVAL; goto free_err; } if (meta->flags & XDP_TXMD_FLAGS_CHECKSUM) { if (unlikely(meta->request.csum_start + meta->request.csum_offset + sizeof(__sum16) > len)) { err = -EINVAL; goto free_err; } skb->csum_start = hr + meta->request.csum_start; skb->csum_offset = meta->request.csum_offset; skb->ip_summed = CHECKSUM_PARTIAL; if (unlikely(xs->pool->tx_sw_csum)) { err = skb_checksum_help(skb); if (err) goto free_err; } } if (meta->flags & XDP_TXMD_FLAGS_LAUNCH_TIME) skb->skb_mstamp_ns = meta->request.launch_time; } } skb->dev = dev; skb->priority = READ_ONCE(xs->sk.sk_priority); skb->mark = READ_ONCE(xs->sk.sk_mark); skb->destructor = xsk_destruct_skb; xsk_tx_metadata_to_compl(meta, &skb_shinfo(skb)->xsk_meta); xsk_set_destructor_arg(skb); return skb; free_err: if (first_frag && skb) kfree_skb(skb); if (err == -EOVERFLOW) { /* Drop the packet */ xsk_set_destructor_arg(xs->skb); xsk_drop_skb(xs->skb); xskq_cons_release(xs->tx); } else { /* Let application retry */ xsk_cq_cancel_locked(xs->pool, 1); } return ERR_PTR(err); } static int __xsk_generic_xmit(struct sock *sk) { struct xdp_sock *xs = xdp_sk(sk); u32 max_batch = TX_BATCH_SIZE; bool sent_frame = false; struct xdp_desc desc; struct sk_buff *skb; int err = 0; mutex_lock(&xs->mutex); /* Since we dropped the RCU read lock, the socket state might have changed. */ if (unlikely(!xsk_is_bound(xs))) { err = -ENXIO; goto out; } if (xs->queue_id >= xs->dev->real_num_tx_queues) goto out; while (xskq_cons_peek_desc(xs->tx, &desc, xs->pool)) { if (max_batch-- == 0) { err = -EAGAIN; goto out; } /* This is the backpressure mechanism for the Tx path. * Reserve space in the completion queue and only proceed * if there is space in it. This avoids having to implement * any buffering in the Tx path. */ err = xsk_cq_reserve_addr_locked(xs->pool, desc.addr); if (err) { err = -EAGAIN; goto out; } skb = xsk_build_skb(xs, &desc); if (IS_ERR(skb)) { err = PTR_ERR(skb); if (err != -EOVERFLOW) goto out; err = 0; continue; } xskq_cons_release(xs->tx); if (xp_mb_desc(&desc)) { xs->skb = skb; continue; } err = __dev_direct_xmit(skb, xs->queue_id); if (err == NETDEV_TX_BUSY) { /* Tell user-space to retry the send */ xskq_cons_cancel_n(xs->tx, xsk_get_num_desc(skb)); xsk_consume_skb(skb); err = -EAGAIN; goto out; } /* Ignore NET_XMIT_CN as packet might have been sent */ if (err == NET_XMIT_DROP) { /* SKB completed but not sent */ err = -EBUSY; xs->skb = NULL; goto out; } sent_frame = true; xs->skb = NULL; } if (xskq_has_descs(xs->tx)) { if (xs->skb) xsk_drop_skb(xs->skb); xskq_cons_release(xs->tx); } out: if (sent_frame) if (xsk_tx_writeable(xs)) sk->sk_write_space(sk); mutex_unlock(&xs->mutex); return err; } static int xsk_generic_xmit(struct sock *sk) { int ret; /* Drop the RCU lock since the SKB path might sleep. */ rcu_read_unlock(); ret = __xsk_generic_xmit(sk); /* Reaquire RCU lock before going into common code. */ rcu_read_lock(); return ret; } static bool xsk_no_wakeup(struct sock *sk) { #ifdef CONFIG_NET_RX_BUSY_POLL /* Prefer busy-polling, skip the wakeup. */ return READ_ONCE(sk->sk_prefer_busy_poll) && READ_ONCE(sk->sk_ll_usec) && napi_id_valid(READ_ONCE(sk->sk_napi_id)); #else return false; #endif } static int xsk_check_common(struct xdp_sock *xs) { if (unlikely(!xsk_is_bound(xs))) return -ENXIO; if (unlikely(!(xs->dev->flags & IFF_UP))) return -ENETDOWN; return 0; } static int __xsk_sendmsg(struct socket *sock, struct msghdr *m, size_t total_len) { bool need_wait = !(m->msg_flags & MSG_DONTWAIT); struct sock *sk = sock->sk; struct xdp_sock *xs = xdp_sk(sk); struct xsk_buff_pool *pool; int err; err = xsk_check_common(xs); if (err) return err; if (unlikely(need_wait)) return -EOPNOTSUPP; if (unlikely(!xs->tx)) return -ENOBUFS; if (sk_can_busy_loop(sk)) sk_busy_loop(sk, 1); /* only support non-blocking sockets */ if (xs->zc && xsk_no_wakeup(sk)) return 0; pool = xs->pool; if (pool->cached_need_wakeup & XDP_WAKEUP_TX) { if (xs->zc) return xsk_wakeup(xs, XDP_WAKEUP_TX); return xsk_generic_xmit(sk); } return 0; } static int xsk_sendmsg(struct socket *sock, struct msghdr *m, size_t total_len) { int ret; rcu_read_lock(); ret = __xsk_sendmsg(sock, m, total_len); rcu_read_unlock(); return ret; } static int __xsk_recvmsg(struct socket *sock, struct msghdr *m, size_t len, int flags) { bool need_wait = !(flags & MSG_DONTWAIT); struct sock *sk = sock->sk; struct xdp_sock *xs = xdp_sk(sk); int err; err = xsk_check_common(xs); if (err) return err; if (unlikely(!xs->rx)) return -ENOBUFS; if (unlikely(need_wait)) return -EOPNOTSUPP; if (sk_can_busy_loop(sk)) sk_busy_loop(sk, 1); /* only support non-blocking sockets */ if (xsk_no_wakeup(sk)) return 0; if (xs->pool->cached_need_wakeup & XDP_WAKEUP_RX && xs->zc) return xsk_wakeup(xs, XDP_WAKEUP_RX); return 0; } static int xsk_recvmsg(struct socket *sock, struct msghdr *m, size_t len, int flags) { int ret; rcu_read_lock(); ret = __xsk_recvmsg(sock, m, len, flags); rcu_read_unlock(); return ret; } static __poll_t xsk_poll(struct file *file, struct socket *sock, struct poll_table_struct *wait) { __poll_t mask = 0; struct sock *sk = sock->sk; struct xdp_sock *xs = xdp_sk(sk); struct xsk_buff_pool *pool; sock_poll_wait(file, sock, wait); rcu_read_lock(); if (xsk_check_common(xs)) goto out; pool = xs->pool; if (pool->cached_need_wakeup) { if (xs->zc) xsk_wakeup(xs, pool->cached_need_wakeup); else if (xs->tx) /* Poll needs to drive Tx also in copy mode */ xsk_generic_xmit(sk); } if (xs->rx && !xskq_prod_is_empty(xs->rx)) mask |= EPOLLIN | EPOLLRDNORM; if (xs->tx && xsk_tx_writeable(xs)) mask |= EPOLLOUT | EPOLLWRNORM; out: rcu_read_unlock(); return mask; } static int xsk_init_queue(u32 entries, struct xsk_queue **queue, bool umem_queue) { struct xsk_queue *q; if (entries == 0 || *queue || !is_power_of_2(entries)) return -EINVAL; q = xskq_create(entries, umem_queue); if (!q) return -ENOMEM; /* Make sure queue is ready before it can be seen by others */ smp_wmb(); WRITE_ONCE(*queue, q); return 0; } static void xsk_unbind_dev(struct xdp_sock *xs) { struct net_device *dev = xs->dev; if (xs->state != XSK_BOUND) return; WRITE_ONCE(xs->state, XSK_UNBOUND); /* Wait for driver to stop using the xdp socket. */ xp_del_xsk(xs->pool, xs); synchronize_net(); dev_put(dev); } static struct xsk_map *xsk_get_map_list_entry(struct xdp_sock *xs, struct xdp_sock __rcu ***map_entry) { struct xsk_map *map = NULL; struct xsk_map_node *node; *map_entry = NULL; spin_lock_bh(&xs->map_list_lock); node = list_first_entry_or_null(&xs->map_list, struct xsk_map_node, node); if (node) { bpf_map_inc(&node->map->map); map = node->map; *map_entry = node->map_entry; } spin_unlock_bh(&xs->map_list_lock); return map; } static void xsk_delete_from_maps(struct xdp_sock *xs) { /* This function removes the current XDP socket from all the * maps it resides in. We need to take extra care here, due to * the two locks involved. Each map has a lock synchronizing * updates to the entries, and each socket has a lock that * synchronizes access to the list of maps (map_list). For * deadlock avoidance the locks need to be taken in the order * "map lock"->"socket map list lock". We start off by * accessing the socket map list, and take a reference to the * map to guarantee existence between the * xsk_get_map_list_entry() and xsk_map_try_sock_delete() * calls. Then we ask the map to remove the socket, which * tries to remove the socket from the map. Note that there * might be updates to the map between * xsk_get_map_list_entry() and xsk_map_try_sock_delete(). */ struct xdp_sock __rcu **map_entry = NULL; struct xsk_map *map; while ((map = xsk_get_map_list_entry(xs, &map_entry))) { xsk_map_try_sock_delete(map, xs, map_entry); bpf_map_put(&map->map); } } static int xsk_release(struct socket *sock) { struct sock *sk = sock->sk; struct xdp_sock *xs = xdp_sk(sk); struct net *net; if (!sk) return 0; net = sock_net(sk); if (xs->skb) xsk_drop_skb(xs->skb); mutex_lock(&net->xdp.lock); sk_del_node_init_rcu(sk); mutex_unlock(&net->xdp.lock); sock_prot_inuse_add(net, sk->sk_prot, -1); xsk_delete_from_maps(xs); mutex_lock(&xs->mutex); xsk_unbind_dev(xs); mutex_unlock(&xs->mutex); xskq_destroy(xs->rx); xskq_destroy(xs->tx); xskq_destroy(xs->fq_tmp); xskq_destroy(xs->cq_tmp); sock_orphan(sk); sock->sk = NULL; sock_put(sk); return 0; } static struct socket *xsk_lookup_xsk_from_fd(int fd) { struct socket *sock; int err; sock = sockfd_lookup(fd, &err); if (!sock) return ERR_PTR(-ENOTSOCK); if (sock->sk->sk_family != PF_XDP) { sockfd_put(sock); return ERR_PTR(-ENOPROTOOPT); } return sock; } static bool xsk_validate_queues(struct xdp_sock *xs) { return xs->fq_tmp && xs->cq_tmp; } static int xsk_bind(struct socket *sock, struct sockaddr *addr, int addr_len) { struct sockaddr_xdp *sxdp = (struct sockaddr_xdp *)addr; struct sock *sk = sock->sk; struct xdp_sock *xs = xdp_sk(sk); struct net_device *dev; int bound_dev_if; u32 flags, qid; int err = 0; if (addr_len < sizeof(struct sockaddr_xdp)) return -EINVAL; if (sxdp->sxdp_family != AF_XDP) return -EINVAL; flags = sxdp->sxdp_flags; if (flags & ~(XDP_SHARED_UMEM | XDP_COPY | XDP_ZEROCOPY | XDP_USE_NEED_WAKEUP | XDP_USE_SG)) return -EINVAL; bound_dev_if = READ_ONCE(sk->sk_bound_dev_if); if (bound_dev_if && bound_dev_if != sxdp->sxdp_ifindex) return -EINVAL; rtnl_lock(); mutex_lock(&xs->mutex); if (xs->state != XSK_READY) { err = -EBUSY; goto out_release; } dev = dev_get_by_index(sock_net(sk), sxdp->sxdp_ifindex); if (!dev) { err = -ENODEV; goto out_release; } netdev_lock_ops(dev); if (!xs->rx && !xs->tx) { err = -EINVAL; goto out_unlock; } qid = sxdp->sxdp_queue_id; if (flags & XDP_SHARED_UMEM) { struct xdp_sock *umem_xs; struct socket *sock; if ((flags & XDP_COPY) || (flags & XDP_ZEROCOPY) || (flags & XDP_USE_NEED_WAKEUP) || (flags & XDP_USE_SG)) { /* Cannot specify flags for shared sockets. */ err = -EINVAL; goto out_unlock; } if (xs->umem) { /* We have already our own. */ err = -EINVAL; goto out_unlock; } sock = xsk_lookup_xsk_from_fd(sxdp->sxdp_shared_umem_fd); if (IS_ERR(sock)) { err = PTR_ERR(sock); goto out_unlock; } umem_xs = xdp_sk(sock->sk); if (!xsk_is_bound(umem_xs)) { err = -EBADF; sockfd_put(sock); goto out_unlock; } if (umem_xs->queue_id != qid || umem_xs->dev != dev) { /* Share the umem with another socket on another qid * and/or device. */ xs->pool = xp_create_and_assign_umem(xs, umem_xs->umem); if (!xs->pool) { err = -ENOMEM; sockfd_put(sock); goto out_unlock; } err = xp_assign_dev_shared(xs->pool, umem_xs, dev, qid); if (err) { xp_destroy(xs->pool); xs->pool = NULL; sockfd_put(sock); goto out_unlock; } } else { /* Share the buffer pool with the other socket. */ if (xs->fq_tmp || xs->cq_tmp) { /* Do not allow setting your own fq or cq. */ err = -EINVAL; sockfd_put(sock); goto out_unlock; } xp_get_pool(umem_xs->pool); xs->pool = umem_xs->pool; /* If underlying shared umem was created without Tx * ring, allocate Tx descs array that Tx batching API * utilizes */ if (xs->tx && !xs->pool->tx_descs) { err = xp_alloc_tx_descs(xs->pool, xs); if (err) { xp_put_pool(xs->pool); xs->pool = NULL; sockfd_put(sock); goto out_unlock; } } } xdp_get_umem(umem_xs->umem); WRITE_ONCE(xs->umem, umem_xs->umem); sockfd_put(sock); } else if (!xs->umem || !xsk_validate_queues(xs)) { err = -EINVAL; goto out_unlock; } else { /* This xsk has its own umem. */ xs->pool = xp_create_and_assign_umem(xs, xs->umem); if (!xs->pool) { err = -ENOMEM; goto out_unlock; } err = xp_assign_dev(xs->pool, dev, qid, flags); if (err) { xp_destroy(xs->pool); xs->pool = NULL; goto out_unlock; } } /* FQ and CQ are now owned by the buffer pool and cleaned up with it. */ xs->fq_tmp = NULL; xs->cq_tmp = NULL; xs->dev = dev; xs->zc = xs->umem->zc; xs->sg = !!(xs->umem->flags & XDP_UMEM_SG_FLAG); xs->queue_id = qid; xp_add_xsk(xs->pool, xs); if (qid < dev->real_num_rx_queues) { struct netdev_rx_queue *rxq; rxq = __netif_get_rx_queue(dev, qid); if (rxq->napi) __sk_mark_napi_id_once(sk, rxq->napi->napi_id); } out_unlock: if (err) { dev_put(dev); } else { /* Matches smp_rmb() in bind() for shared umem * sockets, and xsk_is_bound(). */ smp_wmb(); WRITE_ONCE(xs->state, XSK_BOUND); } netdev_unlock_ops(dev); out_release: mutex_unlock(&xs->mutex); rtnl_unlock(); return err; } struct xdp_umem_reg_v1 { __u64 addr; /* Start of packet data area */ __u64 len; /* Length of packet data area */ __u32 chunk_size; __u32 headroom; }; static int xsk_setsockopt(struct socket *sock, int level, int optname, sockptr_t optval, unsigned int optlen) { struct sock *sk = sock->sk; struct xdp_sock *xs = xdp_sk(sk); int err; if (level != SOL_XDP) return -ENOPROTOOPT; switch (optname) { case XDP_RX_RING: case XDP_TX_RING: { struct xsk_queue **q; int entries; if (optlen < sizeof(entries)) return -EINVAL; if (copy_from_sockptr(&entries, optval, sizeof(entries))) return -EFAULT; mutex_lock(&xs->mutex); if (xs->state != XSK_READY) { mutex_unlock(&xs->mutex); return -EBUSY; } q = (optname == XDP_TX_RING) ? &xs->tx : &xs->rx; err = xsk_init_queue(entries, q, false); if (!err && optname == XDP_TX_RING) /* Tx needs to be explicitly woken up the first time */ xs->tx->ring->flags |= XDP_RING_NEED_WAKEUP; mutex_unlock(&xs->mutex); return err; } case XDP_UMEM_REG: { size_t mr_size = sizeof(struct xdp_umem_reg); struct xdp_umem_reg mr = {}; struct xdp_umem *umem; if (optlen < sizeof(struct xdp_umem_reg_v1)) return -EINVAL; else if (optlen < sizeof(mr)) mr_size = sizeof(struct xdp_umem_reg_v1); BUILD_BUG_ON(sizeof(struct xdp_umem_reg_v1) >= sizeof(struct xdp_umem_reg)); /* Make sure the last field of the struct doesn't have * uninitialized padding. All padding has to be explicit * and has to be set to zero by the userspace to make * struct xdp_umem_reg extensible in the future. */ BUILD_BUG_ON(offsetof(struct xdp_umem_reg, tx_metadata_len) + sizeof_field(struct xdp_umem_reg, tx_metadata_len) != sizeof(struct xdp_umem_reg)); if (copy_from_sockptr(&mr, optval, mr_size)) return -EFAULT; mutex_lock(&xs->mutex); if (xs->state != XSK_READY || xs->umem) { mutex_unlock(&xs->mutex); return -EBUSY; } umem = xdp_umem_create(&mr); if (IS_ERR(umem)) { mutex_unlock(&xs->mutex); return PTR_ERR(umem); } /* Make sure umem is ready before it can be seen by others */ smp_wmb(); WRITE_ONCE(xs->umem, umem); mutex_unlock(&xs->mutex); return 0; } case XDP_UMEM_FILL_RING: case XDP_UMEM_COMPLETION_RING: { struct xsk_queue **q; int entries; if (optlen < sizeof(entries)) return -EINVAL; if (copy_from_sockptr(&entries, optval, sizeof(entries))) return -EFAULT; mutex_lock(&xs->mutex); if (xs->state != XSK_READY) { mutex_unlock(&xs->mutex); return -EBUSY; } q = (optname == XDP_UMEM_FILL_RING) ? &xs->fq_tmp : &xs->cq_tmp; err = xsk_init_queue(entries, q, true); mutex_unlock(&xs->mutex); return err; } default: break; } return -ENOPROTOOPT; } static void xsk_enter_rxtx_offsets(struct xdp_ring_offset_v1 *ring) { ring->producer = offsetof(struct xdp_rxtx_ring, ptrs.producer); ring->consumer = offsetof(struct xdp_rxtx_ring, ptrs.consumer); ring->desc = offsetof(struct xdp_rxtx_ring, desc); } static void xsk_enter_umem_offsets(struct xdp_ring_offset_v1 *ring) { ring->producer = offsetof(struct xdp_umem_ring, ptrs.producer); ring->consumer = offsetof(struct xdp_umem_ring, ptrs.consumer); ring->desc = offsetof(struct xdp_umem_ring, desc); } struct xdp_statistics_v1 { __u64 rx_dropped; __u64 rx_invalid_descs; __u64 tx_invalid_descs; }; static int xsk_getsockopt(struct socket *sock, int level, int optname, char __user *optval, int __user *optlen) { struct sock *sk = sock->sk; struct xdp_sock *xs = xdp_sk(sk); int len; if (level != SOL_XDP) return -ENOPROTOOPT; if (get_user(len, optlen)) return -EFAULT; if (len < 0) return -EINVAL; switch (optname) { case XDP_STATISTICS: { struct xdp_statistics stats = {}; bool extra_stats = true; size_t stats_size; if (len < sizeof(struct xdp_statistics_v1)) { return -EINVAL; } else if (len < sizeof(stats)) { extra_stats = false; stats_size = sizeof(struct xdp_statistics_v1); } else { stats_size = sizeof(stats); } mutex_lock(&xs->mutex); stats.rx_dropped = xs->rx_dropped; if (extra_stats) { stats.rx_ring_full = xs->rx_queue_full; stats.rx_fill_ring_empty_descs = xs->pool ? xskq_nb_queue_empty_descs(xs->pool->fq) : 0; stats.tx_ring_empty_descs = xskq_nb_queue_empty_descs(xs->tx); } else { stats.rx_dropped += xs->rx_queue_full; } stats.rx_invalid_descs = xskq_nb_invalid_descs(xs->rx); stats.tx_invalid_descs = xskq_nb_invalid_descs(xs->tx); mutex_unlock(&xs->mutex); if (copy_to_user(optval, &stats, stats_size)) return -EFAULT; if (put_user(stats_size, optlen)) return -EFAULT; return 0; } case XDP_MMAP_OFFSETS: { struct xdp_mmap_offsets off; struct xdp_mmap_offsets_v1 off_v1; bool flags_supported = true; void *to_copy; if (len < sizeof(off_v1)) return -EINVAL; else if (len < sizeof(off)) flags_supported = false; if (flags_supported) { /* xdp_ring_offset is identical to xdp_ring_offset_v1 * except for the flags field added to the end. */ xsk_enter_rxtx_offsets((struct xdp_ring_offset_v1 *) &off.rx); xsk_enter_rxtx_offsets((struct xdp_ring_offset_v1 *) &off.tx); xsk_enter_umem_offsets((struct xdp_ring_offset_v1 *) &off.fr); xsk_enter_umem_offsets((struct xdp_ring_offset_v1 *) &off.cr); off.rx.flags = offsetof(struct xdp_rxtx_ring, ptrs.flags); off.tx.flags = offsetof(struct xdp_rxtx_ring, ptrs.flags); off.fr.flags = offsetof(struct xdp_umem_ring, ptrs.flags); off.cr.flags = offsetof(struct xdp_umem_ring, ptrs.flags); len = sizeof(off); to_copy = &off; } else { xsk_enter_rxtx_offsets(&off_v1.rx); xsk_enter_rxtx_offsets(&off_v1.tx); xsk_enter_umem_offsets(&off_v1.fr); xsk_enter_umem_offsets(&off_v1.cr); len = sizeof(off_v1); to_copy = &off_v1; } if (copy_to_user(optval, to_copy, len)) return -EFAULT; if (put_user(len, optlen)) return -EFAULT; return 0; } case XDP_OPTIONS: { struct xdp_options opts = {}; if (len < sizeof(opts)) return -EINVAL; mutex_lock(&xs->mutex); if (xs->zc) opts.flags |= XDP_OPTIONS_ZEROCOPY; mutex_unlock(&xs->mutex); len = sizeof(opts); if (copy_to_user(optval, &opts, len)) return -EFAULT; if (put_user(len, optlen)) return -EFAULT; return 0; } default: break; } return -EOPNOTSUPP; } static int xsk_mmap(struct file *file, struct socket *sock, struct vm_area_struct *vma) { loff_t offset = (loff_t)vma->vm_pgoff << PAGE_SHIFT; unsigned long size = vma->vm_end - vma->vm_start; struct xdp_sock *xs = xdp_sk(sock->sk); int state = READ_ONCE(xs->state); struct xsk_queue *q = NULL; if (state != XSK_READY && state != XSK_BOUND) return -EBUSY; if (offset == XDP_PGOFF_RX_RING) { q = READ_ONCE(xs->rx); } else if (offset == XDP_PGOFF_TX_RING) { q = READ_ONCE(xs->tx); } else { /* Matches the smp_wmb() in XDP_UMEM_REG */ smp_rmb(); if (offset == XDP_UMEM_PGOFF_FILL_RING) q = state == XSK_READY ? READ_ONCE(xs->fq_tmp) : READ_ONCE(xs->pool->fq); else if (offset == XDP_UMEM_PGOFF_COMPLETION_RING) q = state == XSK_READY ? READ_ONCE(xs->cq_tmp) : READ_ONCE(xs->pool->cq); } if (!q) return -EINVAL; /* Matches the smp_wmb() in xsk_init_queue */ smp_rmb(); if (size > q->ring_vmalloc_size) return -EINVAL; return remap_vmalloc_range(vma, q->ring, 0); } static int xsk_notifier(struct notifier_block *this, unsigned long msg, void *ptr) { struct net_device *dev = netdev_notifier_info_to_dev(ptr); struct net *net = dev_net(dev); struct sock *sk; switch (msg) { case NETDEV_UNREGISTER: mutex_lock(&net->xdp.lock); sk_for_each(sk, &net->xdp.list) { struct xdp_sock *xs = xdp_sk(sk); mutex_lock(&xs->mutex); if (xs->dev == dev) { sk->sk_err = ENETDOWN; if (!sock_flag(sk, SOCK_DEAD)) sk_error_report(sk); xsk_unbind_dev(xs); /* Clear device references. */ xp_clear_dev(xs->pool); } mutex_unlock(&xs->mutex); } mutex_unlock(&net->xdp.lock); break; } return NOTIFY_DONE; } static struct proto xsk_proto = { .name = "XDP", .owner = THIS_MODULE, .obj_size = sizeof(struct xdp_sock), }; static const struct proto_ops xsk_proto_ops = { .family = PF_XDP, .owner = THIS_MODULE, .release = xsk_release, .bind = xsk_bind, .connect = sock_no_connect, .socketpair = sock_no_socketpair, .accept = sock_no_accept, .getname = sock_no_getname, .poll = xsk_poll, .ioctl = sock_no_ioctl, .listen = sock_no_listen, .shutdown = sock_no_shutdown, .setsockopt = xsk_setsockopt, .getsockopt = xsk_getsockopt, .sendmsg = xsk_sendmsg, .recvmsg = xsk_recvmsg, .mmap = xsk_mmap, }; static void xsk_destruct(struct sock *sk) { struct xdp_sock *xs = xdp_sk(sk); if (!sock_flag(sk, SOCK_DEAD)) return; if (!xp_put_pool(xs->pool)) xdp_put_umem(xs->umem, !xs->pool); } static int xsk_create(struct net *net, struct socket *sock, int protocol, int kern) { struct xdp_sock *xs; struct sock *sk; if (!ns_capable(net->user_ns, CAP_NET_RAW)) return -EPERM; if (sock->type != SOCK_RAW) return -ESOCKTNOSUPPORT; if (protocol) return -EPROTONOSUPPORT; sock->state = SS_UNCONNECTED; sk = sk_alloc(net, PF_XDP, GFP_KERNEL, &xsk_proto, kern); if (!sk) return -ENOBUFS; sock->ops = &xsk_proto_ops; sock_init_data(sock, sk); sk->sk_family = PF_XDP; sk->sk_destruct = xsk_destruct; sock_set_flag(sk, SOCK_RCU_FREE); xs = xdp_sk(sk); xs->state = XSK_READY; mutex_init(&xs->mutex); INIT_LIST_HEAD(&xs->map_list); spin_lock_init(&xs->map_list_lock); mutex_lock(&net->xdp.lock); sk_add_node_rcu(sk, &net->xdp.list); mutex_unlock(&net->xdp.lock); sock_prot_inuse_add(net, &xsk_proto, 1); return 0; } static const struct net_proto_family xsk_family_ops = { .family = PF_XDP, .create = xsk_create, .owner = THIS_MODULE, }; static struct notifier_block xsk_netdev_notifier = { .notifier_call = xsk_notifier, }; static int __net_init xsk_net_init(struct net *net) { mutex_init(&net->xdp.lock); INIT_HLIST_HEAD(&net->xdp.list); return 0; } static void __net_exit xsk_net_exit(struct net *net) { WARN_ON_ONCE(!hlist_empty(&net->xdp.list)); } static struct pernet_operations xsk_net_ops = { .init = xsk_net_init, .exit = xsk_net_exit, }; static int __init xsk_init(void) { int err; err = proto_register(&xsk_proto, 0 /* no slab */); if (err) goto out; err = sock_register(&xsk_family_ops); if (err) goto out_proto; err = register_pernet_subsys(&xsk_net_ops); if (err) goto out_sk; err = register_netdevice_notifier(&xsk_netdev_notifier); if (err) goto out_pernet; return 0; out_pernet: unregister_pernet_subsys(&xsk_net_ops); out_sk: sock_unregister(PF_XDP); out_proto: proto_unregister(&xsk_proto); out: return err; } fs_initcall(xsk_init);
24 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 /* SPDX-License-Identifier: GPL-2.0 */ /* File: linux/xattr.h Extended attributes handling. Copyright (C) 2001 by Andreas Gruenbacher <a.gruenbacher@computer.org> Copyright (c) 2001-2002 Silicon Graphics, Inc. All Rights Reserved. Copyright (c) 2004 Red Hat, Inc., James Morris <jmorris@redhat.com> */ #ifndef _LINUX_XATTR_H #define _LINUX_XATTR_H #include <linux/slab.h> #include <linux/types.h> #include <linux/spinlock.h> #include <linux/mm.h> #include <linux/user_namespace.h> #include <uapi/linux/xattr.h> /* List of all open_how "versions". */ #define XATTR_ARGS_SIZE_VER0 16 /* sizeof first published struct */ #define XATTR_ARGS_SIZE_LATEST XATTR_ARGS_SIZE_VER0 struct inode; struct dentry; static inline bool is_posix_acl_xattr(const char *name) { return (strcmp(name, XATTR_NAME_POSIX_ACL_ACCESS) == 0) || (strcmp(name, XATTR_NAME_POSIX_ACL_DEFAULT) == 0); } /* * struct xattr_handler: When @name is set, match attributes with exactly that * name. When @prefix is set instead, match attributes with that prefix and * with a non-empty suffix. */ struct xattr_handler { const char *name; const char *prefix; int flags; /* fs private flags */ bool (*list)(struct dentry *dentry); int (*get)(const struct xattr_handler *, struct dentry *dentry, struct inode *inode, const char *name, void *buffer, size_t size); int (*set)(const struct xattr_handler *, struct mnt_idmap *idmap, struct dentry *dentry, struct inode *inode, const char *name, const void *buffer, size_t size, int flags); }; /** * xattr_handler_can_list - check whether xattr can be listed * @handler: handler for this type of xattr * @dentry: dentry whose inode xattr to list * * Determine whether the xattr associated with @dentry can be listed given * @handler. * * Return: true if xattr can be listed, false if not. */ static inline bool xattr_handler_can_list(const struct xattr_handler *handler, struct dentry *dentry) { return handler && (!handler->list || handler->list(dentry)); } const char *xattr_full_name(const struct xattr_handler *, const char *); struct xattr { const char *name; void *value; size_t value_len; }; ssize_t __vfs_getxattr(struct dentry *, struct inode *, const char *, void *, size_t); ssize_t vfs_getxattr(struct mnt_idmap *, struct dentry *, const char *, void *, size_t); ssize_t vfs_listxattr(struct dentry *d, char *list, size_t size); int __vfs_setxattr(struct mnt_idmap *, struct dentry *, struct inode *, const char *, const void *, size_t, int); int __vfs_setxattr_noperm(struct mnt_idmap *, struct dentry *, const char *, const void *, size_t, int); int __vfs_setxattr_locked(struct mnt_idmap *, struct dentry *, const char *, const void *, size_t, int, struct inode **); int vfs_setxattr(struct mnt_idmap *, struct dentry *, const char *, const void *, size_t, int); int __vfs_removexattr(struct mnt_idmap *, struct dentry *, const char *); int __vfs_removexattr_locked(struct mnt_idmap *, struct dentry *, const char *, struct inode **); int vfs_removexattr(struct mnt_idmap *, struct dentry *, const char *); ssize_t generic_listxattr(struct dentry *dentry, char *buffer, size_t buffer_size); int vfs_getxattr_alloc(struct mnt_idmap *idmap, struct dentry *dentry, const char *name, char **xattr_value, size_t size, gfp_t flags); int xattr_supports_user_prefix(struct inode *inode); static inline const char *xattr_prefix(const struct xattr_handler *handler) { return handler->prefix ?: handler->name; } struct simple_xattrs { struct rb_root rb_root; rwlock_t lock; }; struct simple_xattr { struct rb_node rb_node; char *name; size_t size; char value[]; }; void simple_xattrs_init(struct simple_xattrs *xattrs); void simple_xattrs_free(struct simple_xattrs *xattrs, size_t *freed_space); size_t simple_xattr_space(const char *name, size_t size); struct simple_xattr *simple_xattr_alloc(const void *value, size_t size); void simple_xattr_free(struct simple_xattr *xattr); int simple_xattr_get(struct simple_xattrs *xattrs, const char *name, void *buffer, size_t size); struct simple_xattr *simple_xattr_set(struct simple_xattrs *xattrs, const char *name, const void *value, size_t size, int flags); ssize_t simple_xattr_list(struct inode *inode, struct simple_xattrs *xattrs, char *buffer, size_t size); void simple_xattr_add(struct simple_xattrs *xattrs, struct simple_xattr *new_xattr); int xattr_list_one(char **buffer, ssize_t *remaining_size, const char *name); #endif /* _LINUX_XATTR_H */
1 1 1 1 1 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 /* * llc_station.c - station component of LLC * * Copyright (c) 1997 by Procom Technology, Inc. * 2001-2003 by Arnaldo Carvalho de Melo <acme@conectiva.com.br> * * This program can be redistributed or modified under the terms of the * GNU General Public License as published by the Free Software Foundation. * This program is distributed without any warranty or implied warranty * of merchantability or fitness for a particular purpose. * * See the GNU General Public License for more details. */ #include <linux/init.h> #include <linux/module.h> #include <linux/slab.h> #include <net/llc.h> #include <net/llc_sap.h> #include <net/llc_conn.h> #include <net/llc_c_ac.h> #include <net/llc_s_ac.h> #include <net/llc_c_ev.h> #include <net/llc_c_st.h> #include <net/llc_s_ev.h> #include <net/llc_s_st.h> #include <net/llc_pdu.h> static int llc_stat_ev_rx_null_dsap_xid_c(struct sk_buff *skb) { struct llc_pdu_un *pdu = llc_pdu_un_hdr(skb); return LLC_PDU_IS_CMD(pdu) && /* command PDU */ LLC_PDU_TYPE_IS_U(pdu) && /* U type PDU */ LLC_U_PDU_CMD(pdu) == LLC_1_PDU_CMD_XID && !pdu->dsap; /* NULL DSAP value */ } static int llc_stat_ev_rx_null_dsap_test_c(struct sk_buff *skb) { struct llc_pdu_un *pdu = llc_pdu_un_hdr(skb); return LLC_PDU_IS_CMD(pdu) && /* command PDU */ LLC_PDU_TYPE_IS_U(pdu) && /* U type PDU */ LLC_U_PDU_CMD(pdu) == LLC_1_PDU_CMD_TEST && !pdu->dsap; /* NULL DSAP */ } static int llc_station_ac_send_xid_r(struct sk_buff *skb) { u8 mac_da[ETH_ALEN], dsap; int rc = 1; struct sk_buff *nskb = llc_alloc_frame(NULL, skb->dev, LLC_PDU_TYPE_U, sizeof(struct llc_xid_info)); if (!nskb) goto out; llc_pdu_decode_sa(skb, mac_da); llc_pdu_decode_ssap(skb, &dsap); llc_pdu_header_init(nskb, LLC_PDU_TYPE_U, 0, dsap, LLC_PDU_RSP); llc_pdu_init_as_xid_rsp(nskb, LLC_XID_NULL_CLASS_2, 127); rc = llc_mac_hdr_init(nskb, skb->dev->dev_addr, mac_da); if (unlikely(rc)) goto free; dev_queue_xmit(nskb); out: return rc; free: kfree_skb(nskb); goto out; } static int llc_station_ac_send_test_r(struct sk_buff *skb) { u8 mac_da[ETH_ALEN], dsap; int rc = 1; u32 data_size; struct sk_buff *nskb; if (skb->mac_len < ETH_HLEN) goto out; /* The test request command is type U (llc_len = 3) */ data_size = ntohs(eth_hdr(skb)->h_proto) - 3; nskb = llc_alloc_frame(NULL, skb->dev, LLC_PDU_TYPE_U, data_size); if (!nskb) goto out; llc_pdu_decode_sa(skb, mac_da); llc_pdu_decode_ssap(skb, &dsap); llc_pdu_header_init(nskb, LLC_PDU_TYPE_U, 0, dsap, LLC_PDU_RSP); llc_pdu_init_as_test_rsp(nskb, skb); rc = llc_mac_hdr_init(nskb, skb->dev->dev_addr, mac_da); if (unlikely(rc)) goto free; dev_queue_xmit(nskb); out: return rc; free: kfree_skb(nskb); goto out; } /** * llc_station_rcv - send received pdu to the station state machine * @skb: received frame. * * Sends data unit to station state machine. */ static void llc_station_rcv(struct sk_buff *skb) { if (llc_stat_ev_rx_null_dsap_xid_c(skb)) llc_station_ac_send_xid_r(skb); else if (llc_stat_ev_rx_null_dsap_test_c(skb)) llc_station_ac_send_test_r(skb); kfree_skb(skb); } void __init llc_station_init(void) { llc_set_station_handler(llc_station_rcv); } void llc_station_exit(void) { llc_set_station_handler(NULL); }
298 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 /* SPDX-License-Identifier: GPL-2.0-only */ /* * Copyright (C) 2015, 2016 ARM Ltd. */ #ifndef __KVM_ARM_VGIC_H #define __KVM_ARM_VGIC_H #include <linux/bits.h> #include <linux/kvm.h> #include <linux/irqreturn.h> #include <linux/kref.h> #include <linux/mutex.h> #include <linux/spinlock.h> #include <linux/static_key.h> #include <linux/types.h> #include <linux/xarray.h> #include <kvm/iodev.h> #include <linux/list.h> #include <linux/jump_label.h> #include <linux/irqchip/arm-gic-v4.h> #define VGIC_V3_MAX_CPUS 512 #define VGIC_V2_MAX_CPUS 8 #define VGIC_NR_IRQS_LEGACY 256 #define VGIC_NR_SGIS 16 #define VGIC_NR_PPIS 16 #define VGIC_NR_PRIVATE_IRQS (VGIC_NR_SGIS + VGIC_NR_PPIS) #define VGIC_MAX_SPI 1019 #define VGIC_MAX_RESERVED 1023 #define VGIC_MIN_LPI 8192 #define KVM_IRQCHIP_NUM_PINS (1020 - 32) #define irq_is_ppi(irq) ((irq) >= VGIC_NR_SGIS && (irq) < VGIC_NR_PRIVATE_IRQS) #define irq_is_spi(irq) ((irq) >= VGIC_NR_PRIVATE_IRQS && \ (irq) <= VGIC_MAX_SPI) enum vgic_type { VGIC_V2, /* Good ol' GICv2 */ VGIC_V3, /* New fancy GICv3 */ VGIC_V5, /* Newer, fancier GICv5 */ }; /* same for all guests, as depending only on the _host's_ GIC model */ struct vgic_global { /* type of the host GIC */ enum vgic_type type; /* Physical address of vgic virtual cpu interface */ phys_addr_t vcpu_base; /* GICV mapping, kernel VA */ void __iomem *vcpu_base_va; /* GICV mapping, HYP VA */ void __iomem *vcpu_hyp_va; /* virtual control interface mapping, kernel VA */ void __iomem *vctrl_base; /* virtual control interface mapping, HYP VA */ void __iomem *vctrl_hyp; /* Number of implemented list registers */ int nr_lr; /* Maintenance IRQ number */ unsigned int maint_irq; /* maximum number of VCPUs allowed (GICv2 limits us to 8) */ int max_gic_vcpus; /* Only needed for the legacy KVM_CREATE_IRQCHIP */ bool can_emulate_gicv2; /* Hardware has GICv4? */ bool has_gicv4; bool has_gicv4_1; /* Pseudo GICv3 from outer space */ bool no_hw_deactivation; /* GICv3 system register CPU interface */ struct static_key_false gicv3_cpuif; /* GICv3 compat mode on a GICv5 host */ bool has_gcie_v3_compat; u32 ich_vtr_el2; }; extern struct vgic_global kvm_vgic_global_state; #define VGIC_V2_MAX_LRS (1 << 6) #define VGIC_V3_MAX_LRS 16 #define VGIC_V3_LR_INDEX(lr) (VGIC_V3_MAX_LRS - 1 - lr) enum vgic_irq_config { VGIC_CONFIG_EDGE = 0, VGIC_CONFIG_LEVEL }; /* * Per-irq ops overriding some common behavious. * * Always called in non-preemptible section and the functions can use * kvm_arm_get_running_vcpu() to get the vcpu pointer for private IRQs. */ struct irq_ops { /* Per interrupt flags for special-cased interrupts */ unsigned long flags; #define VGIC_IRQ_SW_RESAMPLE BIT(0) /* Clear the active state for resampling */ /* * Callback function pointer to in-kernel devices that can tell us the * state of the input level of mapped level-triggered IRQ faster than * peaking into the physical GIC. */ bool (*get_input_level)(int vintid); }; struct vgic_irq { raw_spinlock_t irq_lock; /* Protects the content of the struct */ struct rcu_head rcu; struct list_head ap_list; struct kvm_vcpu *vcpu; /* SGIs and PPIs: The VCPU * SPIs and LPIs: The VCPU whose ap_list * this is queued on. */ struct kvm_vcpu *target_vcpu; /* The VCPU that this interrupt should * be sent to, as a result of the * targets reg (v2) or the * affinity reg (v3). */ u32 intid; /* Guest visible INTID */ bool line_level; /* Level only */ bool pending_latch; /* The pending latch state used to calculate * the pending state for both level * and edge triggered IRQs. */ bool active; /* not used for LPIs */ bool enabled; bool hw; /* Tied to HW IRQ */ struct kref refcount; /* Used for LPIs */ u32 hwintid; /* HW INTID number */ unsigned int host_irq; /* linux irq corresponding to hwintid */ union { u8 targets; /* GICv2 target VCPUs mask */ u32 mpidr; /* GICv3 target VCPU */ }; u8 source; /* GICv2 SGIs only */ u8 active_source; /* GICv2 SGIs only */ u8 priority; u8 group; /* 0 == group 0, 1 == group 1 */ enum vgic_irq_config config; /* Level or edge */ struct irq_ops *ops; void *owner; /* Opaque pointer to reserve an interrupt for in-kernel devices. */ }; static inline bool vgic_irq_needs_resampling(struct vgic_irq *irq) { return irq->ops && (irq->ops->flags & VGIC_IRQ_SW_RESAMPLE); } struct vgic_register_region; struct vgic_its; enum iodev_type { IODEV_CPUIF, IODEV_DIST, IODEV_REDIST, IODEV_ITS }; struct vgic_io_device { gpa_t base_addr; union { struct kvm_vcpu *redist_vcpu; struct vgic_its *its; }; const struct vgic_register_region *regions; enum iodev_type iodev_type; int nr_regions; struct kvm_io_device dev; }; struct vgic_its { /* The base address of the ITS control register frame */ gpa_t vgic_its_base; bool enabled; struct vgic_io_device iodev; struct kvm_device *dev; /* These registers correspond to GITS_BASER{0,1} */ u64 baser_device_table; u64 baser_coll_table; /* Protects the command queue */ struct mutex cmd_lock; u64 cbaser; u32 creadr; u32 cwriter; /* migration ABI revision in use */ u32 abi_rev; /* Protects the device and collection lists */ struct mutex its_lock; struct list_head device_list; struct list_head collection_list; /* * Caches the (device_id, event_id) -> vgic_irq translation for * LPIs that are mapped and enabled. */ struct xarray translation_cache; }; struct vgic_state_iter; struct vgic_redist_region { u32 index; gpa_t base; u32 count; /* number of redistributors or 0 if single region */ u32 free_index; /* index of the next free redistributor */ struct list_head list; }; struct vgic_dist { bool in_kernel; bool ready; bool initialized; /* vGIC model the kernel emulates for the guest (GICv2 or GICv3) */ u32 vgic_model; /* Implementation revision as reported in the GICD_IIDR */ u32 implementation_rev; #define KVM_VGIC_IMP_REV_2 2 /* GICv2 restorable groups */ #define KVM_VGIC_IMP_REV_3 3 /* GICv3 GICR_CTLR.{IW,CES,RWP} */ #define KVM_VGIC_IMP_REV_LATEST KVM_VGIC_IMP_REV_3 /* Userspace can write to GICv2 IGROUPR */ bool v2_groups_user_writable; /* Do injected MSIs require an additional device ID? */ bool msis_require_devid; int nr_spis; /* The GIC maintenance IRQ for nested hypervisors. */ u32 mi_intid; /* base addresses in guest physical address space: */ gpa_t vgic_dist_base; /* distributor */ union { /* either a GICv2 CPU interface */ gpa_t vgic_cpu_base; /* or a number of GICv3 redistributor regions */ struct list_head rd_regions; }; /* distributor enabled */ bool enabled; /* Supports SGIs without active state */ bool nassgicap; /* Wants SGIs without active state */ bool nassgireq; struct vgic_irq *spis; struct vgic_io_device dist_iodev; bool has_its; bool table_write_in_progress; /* * Contains the attributes and gpa of the LPI configuration table. * Since we report GICR_TYPER.CommonLPIAff as 0b00, we can share * one address across all redistributors. * GICv3 spec: IHI 0069E 6.1.1 "LPI Configuration tables" */ u64 propbaser; #define LPI_XA_MARK_DEBUG_ITER XA_MARK_0 struct xarray lpi_xa; /* used by vgic-debug */ struct vgic_state_iter *iter; /* * GICv4 ITS per-VM data, containing the IRQ domain, the VPE * array, the property table pointer as well as allocation * data. This essentially ties the Linux IRQ core and ITS * together, and avoids leaking KVM's data structures anywhere * else. */ struct its_vm its_vm; }; struct vgic_v2_cpu_if { u32 vgic_hcr; u32 vgic_vmcr; u32 vgic_apr; u32 vgic_lr[VGIC_V2_MAX_LRS]; unsigned int used_lrs; }; struct vgic_v3_cpu_if { u32 vgic_hcr; u32 vgic_vmcr; u32 vgic_sre; /* Restored only, change ignored */ u32 vgic_ap0r[4]; u32 vgic_ap1r[4]; u64 vgic_lr[VGIC_V3_MAX_LRS]; /* * GICv4 ITS per-VPE data, containing the doorbell IRQ, the * pending table pointer, the its_vm pointer and a few other * HW specific things. As for the its_vm structure, this is * linking the Linux IRQ subsystem and the ITS together. */ struct its_vpe its_vpe; unsigned int used_lrs; }; struct vgic_cpu { /* CPU vif control registers for world switch */ union { struct vgic_v2_cpu_if vgic_v2; struct vgic_v3_cpu_if vgic_v3; }; struct vgic_irq *private_irqs; raw_spinlock_t ap_list_lock; /* Protects the ap_list */ /* * List of IRQs that this VCPU should consider because they are either * Active or Pending (hence the name; AP list), or because they recently * were one of the two and need to be migrated off this list to another * VCPU. */ struct list_head ap_list_head; /* * Members below are used with GICv3 emulation only and represent * parts of the redistributor. */ struct vgic_io_device rd_iodev; struct vgic_redist_region *rdreg; u32 rdreg_index; atomic_t syncr_busy; /* Contains the attributes and gpa of the LPI pending tables. */ u64 pendbaser; /* GICR_CTLR.{ENABLE_LPIS,RWP} */ atomic_t ctlr; /* Cache guest priority bits */ u32 num_pri_bits; /* Cache guest interrupt ID bits */ u32 num_id_bits; }; extern struct static_key_false vgic_v2_cpuif_trap; extern struct static_key_false vgic_v3_cpuif_trap; int kvm_set_legacy_vgic_v2_addr(struct kvm *kvm, struct kvm_arm_device_addr *dev_addr); void kvm_vgic_early_init(struct kvm *kvm); int kvm_vgic_vcpu_init(struct kvm_vcpu *vcpu); int kvm_vgic_vcpu_nv_init(struct kvm_vcpu *vcpu); int kvm_vgic_create(struct kvm *kvm, u32 type); void kvm_vgic_destroy(struct kvm *kvm); void kvm_vgic_vcpu_destroy(struct kvm_vcpu *vcpu); int kvm_vgic_map_resources(struct kvm *kvm); int kvm_vgic_hyp_init(void); void kvm_vgic_init_cpu_hardware(void); int kvm_vgic_inject_irq(struct kvm *kvm, struct kvm_vcpu *vcpu, unsigned int intid, bool level, void *owner); int kvm_vgic_map_phys_irq(struct kvm_vcpu *vcpu, unsigned int host_irq, u32 vintid, struct irq_ops *ops); int kvm_vgic_unmap_phys_irq(struct kvm_vcpu *vcpu, unsigned int vintid); int kvm_vgic_get_map(struct kvm_vcpu *vcpu, unsigned int vintid); bool kvm_vgic_map_is_active(struct kvm_vcpu *vcpu, unsigned int vintid); int kvm_vgic_vcpu_pending_irq(struct kvm_vcpu *vcpu); void kvm_vgic_load(struct kvm_vcpu *vcpu); void kvm_vgic_put(struct kvm_vcpu *vcpu); u16 vgic_v3_get_eisr(struct kvm_vcpu *vcpu); u16 vgic_v3_get_elrsr(struct kvm_vcpu *vcpu); u64 vgic_v3_get_misr(struct kvm_vcpu *vcpu); #define irqchip_in_kernel(k) (!!((k)->arch.vgic.in_kernel)) #define vgic_initialized(k) ((k)->arch.vgic.initialized) #define vgic_ready(k) ((k)->arch.vgic.ready) #define vgic_valid_spi(k, i) (((i) >= VGIC_NR_PRIVATE_IRQS) && \ ((i) < (k)->arch.vgic.nr_spis + VGIC_NR_PRIVATE_IRQS)) bool kvm_vcpu_has_pending_irqs(struct kvm_vcpu *vcpu); void kvm_vgic_sync_hwstate(struct kvm_vcpu *vcpu); void kvm_vgic_flush_hwstate(struct kvm_vcpu *vcpu); void kvm_vgic_reset_mapped_irq(struct kvm_vcpu *vcpu, u32 vintid); void vgic_v3_dispatch_sgi(struct kvm_vcpu *vcpu, u64 reg, bool allow_group1); /** * kvm_vgic_get_max_vcpus - Get the maximum number of VCPUs allowed by HW * * The host's GIC naturally limits the maximum amount of VCPUs a guest * can use. */ static inline int kvm_vgic_get_max_vcpus(void) { return kvm_vgic_global_state.max_gic_vcpus; } /** * kvm_vgic_setup_default_irq_routing: * Setup a default flat gsi routing table mapping all SPIs */ int kvm_vgic_setup_default_irq_routing(struct kvm *kvm); int kvm_vgic_set_owner(struct kvm_vcpu *vcpu, unsigned int intid, void *owner); struct kvm_kernel_irq_routing_entry; int kvm_vgic_v4_set_forwarding(struct kvm *kvm, int irq, struct kvm_kernel_irq_routing_entry *irq_entry); int kvm_vgic_v4_unset_forwarding(struct kvm *kvm, int host_irq); int vgic_v4_load(struct kvm_vcpu *vcpu); void vgic_v4_commit(struct kvm_vcpu *vcpu); int vgic_v4_put(struct kvm_vcpu *vcpu); bool vgic_state_is_nested(struct kvm_vcpu *vcpu); /* CPU HP callbacks */ void kvm_vgic_cpu_up(void); void kvm_vgic_cpu_down(void); #endif /* __KVM_ARM_VGIC_H */
1 1 1 1 1 1 1 1 1 1 1 6 5 1 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 1517 1518 1519 1520 // SPDX-License-Identifier: GPL-2.0-only #include <net/netdev_lock.h> #include <net/netdev_queues.h> #include <net/sock.h> #include <linux/ethtool_netlink.h> #include <linux/phy_link_topology.h> #include <linux/pm_runtime.h> #include "netlink.h" #include "module_fw.h" static struct genl_family ethtool_genl_family; static bool ethnl_ok __read_mostly; static u32 ethnl_bcast_seq; #define ETHTOOL_FLAGS_BASIC (ETHTOOL_FLAG_COMPACT_BITSETS | \ ETHTOOL_FLAG_OMIT_REPLY) #define ETHTOOL_FLAGS_STATS (ETHTOOL_FLAGS_BASIC | ETHTOOL_FLAG_STATS) const struct nla_policy ethnl_header_policy[] = { [ETHTOOL_A_HEADER_DEV_INDEX] = { .type = NLA_U32 }, [ETHTOOL_A_HEADER_DEV_NAME] = { .type = NLA_NUL_STRING, .len = ALTIFNAMSIZ - 1 }, [ETHTOOL_A_HEADER_FLAGS] = NLA_POLICY_MASK(NLA_U32, ETHTOOL_FLAGS_BASIC), }; const struct nla_policy ethnl_header_policy_stats[] = { [ETHTOOL_A_HEADER_DEV_INDEX] = { .type = NLA_U32 }, [ETHTOOL_A_HEADER_DEV_NAME] = { .type = NLA_NUL_STRING, .len = ALTIFNAMSIZ - 1 }, [ETHTOOL_A_HEADER_FLAGS] = NLA_POLICY_MASK(NLA_U32, ETHTOOL_FLAGS_STATS), }; const struct nla_policy ethnl_header_policy_phy[] = { [ETHTOOL_A_HEADER_DEV_INDEX] = { .type = NLA_U32 }, [ETHTOOL_A_HEADER_DEV_NAME] = { .type = NLA_NUL_STRING, .len = ALTIFNAMSIZ - 1 }, [ETHTOOL_A_HEADER_FLAGS] = NLA_POLICY_MASK(NLA_U32, ETHTOOL_FLAGS_BASIC), [ETHTOOL_A_HEADER_PHY_INDEX] = NLA_POLICY_MIN(NLA_U32, 1), }; const struct nla_policy ethnl_header_policy_phy_stats[] = { [ETHTOOL_A_HEADER_DEV_INDEX] = { .type = NLA_U32 }, [ETHTOOL_A_HEADER_DEV_NAME] = { .type = NLA_NUL_STRING, .len = ALTIFNAMSIZ - 1 }, [ETHTOOL_A_HEADER_FLAGS] = NLA_POLICY_MASK(NLA_U32, ETHTOOL_FLAGS_STATS), [ETHTOOL_A_HEADER_PHY_INDEX] = NLA_POLICY_MIN(NLA_U32, 1), }; int ethnl_sock_priv_set(struct sk_buff *skb, struct net_device *dev, u32 portid, enum ethnl_sock_type type) { struct ethnl_sock_priv *sk_priv; sk_priv = genl_sk_priv_get(&ethtool_genl_family, NETLINK_CB(skb).sk); if (IS_ERR(sk_priv)) return PTR_ERR(sk_priv); sk_priv->dev = dev; sk_priv->portid = portid; sk_priv->type = type; return 0; } static void ethnl_sock_priv_destroy(void *priv) { struct ethnl_sock_priv *sk_priv = priv; switch (sk_priv->type) { case ETHTOOL_SOCK_TYPE_MODULE_FW_FLASH: ethnl_module_fw_flash_sock_destroy(sk_priv); break; default: break; } } int ethnl_ops_begin(struct net_device *dev) { int ret; if (!dev) return -ENODEV; if (dev->dev.parent) pm_runtime_get_sync(dev->dev.parent); netdev_ops_assert_locked(dev); if (!netif_device_present(dev) || dev->reg_state >= NETREG_UNREGISTERING) { ret = -ENODEV; goto err; } if (dev->ethtool_ops->begin) { ret = dev->ethtool_ops->begin(dev); if (ret) goto err; } return 0; err: if (dev->dev.parent) pm_runtime_put(dev->dev.parent); return ret; } void ethnl_ops_complete(struct net_device *dev) { if (dev->ethtool_ops->complete) dev->ethtool_ops->complete(dev); if (dev->dev.parent) pm_runtime_put(dev->dev.parent); } /** * ethnl_parse_header_dev_get() - parse request header * @req_info: structure to put results into * @header: nest attribute with request header * @net: request netns * @extack: netlink extack for error reporting * @require_dev: fail if no device identified in header * * Parse request header in nested attribute @nest and puts results into * the structure pointed to by @req_info. Extack from @info is used for error * reporting. If req_info->dev is not null on return, reference to it has * been taken. If error is returned, *req_info is null initialized and no * reference is held. * * Return: 0 on success or negative error code */ int ethnl_parse_header_dev_get(struct ethnl_req_info *req_info, const struct nlattr *header, struct net *net, struct netlink_ext_ack *extack, bool require_dev) { struct nlattr *tb[ARRAY_SIZE(ethnl_header_policy_phy)]; const struct nlattr *devname_attr; struct net_device *dev = NULL; u32 flags = 0; int ret; if (!header) { if (!require_dev) return 0; NL_SET_ERR_MSG(extack, "request header missing"); return -EINVAL; } /* No validation here, command policy should have a nested policy set * for the header, therefore validation should have already been done. */ ret = nla_parse_nested(tb, ARRAY_SIZE(ethnl_header_policy_phy) - 1, header, NULL, extack); if (ret < 0) return ret; if (tb[ETHTOOL_A_HEADER_FLAGS]) flags = nla_get_u32(tb[ETHTOOL_A_HEADER_FLAGS]); devname_attr = tb[ETHTOOL_A_HEADER_DEV_NAME]; if (tb[ETHTOOL_A_HEADER_DEV_INDEX]) { u32 ifindex = nla_get_u32(tb[ETHTOOL_A_HEADER_DEV_INDEX]); dev = netdev_get_by_index(net, ifindex, &req_info->dev_tracker, GFP_KERNEL); if (!dev) { NL_SET_ERR_MSG_ATTR(extack, tb[ETHTOOL_A_HEADER_DEV_INDEX], "no device matches ifindex"); return -ENODEV; } /* if both ifindex and ifname are passed, they must match */ if (devname_attr && strncmp(dev->name, nla_data(devname_attr), IFNAMSIZ)) { netdev_put(dev, &req_info->dev_tracker); NL_SET_ERR_MSG_ATTR(extack, header, "ifindex and name do not match"); return -ENODEV; } } else if (devname_attr) { dev = netdev_get_by_name(net, nla_data(devname_attr), &req_info->dev_tracker, GFP_KERNEL); if (!dev) { NL_SET_ERR_MSG_ATTR(extack, devname_attr, "no device matches name"); return -ENODEV; } } else if (require_dev) { NL_SET_ERR_MSG_ATTR(extack, header, "neither ifindex nor name specified"); return -EINVAL; } if (tb[ETHTOOL_A_HEADER_PHY_INDEX]) { if (dev) { req_info->phy_index = nla_get_u32(tb[ETHTOOL_A_HEADER_PHY_INDEX]); } else { NL_SET_ERR_MSG_ATTR(extack, header, "phy_index set without a netdev"); return -EINVAL; } } req_info->dev = dev; req_info->flags = flags; return 0; } struct phy_device *ethnl_req_get_phydev(const struct ethnl_req_info *req_info, struct nlattr **tb, unsigned int header, struct netlink_ext_ack *extack) { struct phy_device *phydev; ASSERT_RTNL(); if (!req_info->dev) return NULL; if (!req_info->phy_index) return req_info->dev->phydev; phydev = phy_link_topo_get_phy(req_info->dev, req_info->phy_index); if (!phydev && tb) { NL_SET_ERR_MSG_ATTR(extack, tb[header], "no phy matching phyindex"); return ERR_PTR(-ENODEV); } return phydev; } /** * ethnl_fill_reply_header() - Put common header into a reply message * @skb: skb with the message * @dev: network device to describe in header * @attrtype: attribute type to use for the nest * * Create a nested attribute with attributes describing given network device. * * Return: 0 on success, error value (-EMSGSIZE only) on error */ int ethnl_fill_reply_header(struct sk_buff *skb, struct net_device *dev, u16 attrtype) { struct nlattr *nest; if (!dev) return 0; nest = nla_nest_start(skb, attrtype); if (!nest) return -EMSGSIZE; if (nla_put_u32(skb, ETHTOOL_A_HEADER_DEV_INDEX, (u32)dev->ifindex) || nla_put_string(skb, ETHTOOL_A_HEADER_DEV_NAME, dev->name)) goto nla_put_failure; /* If more attributes are put into reply header, ethnl_header_size() * must be updated to account for them. */ nla_nest_end(skb, nest); return 0; nla_put_failure: nla_nest_cancel(skb, nest); return -EMSGSIZE; } /** * ethnl_reply_init() - Create skb for a reply and fill device identification * @payload: payload length (without netlink and genetlink header) * @dev: device the reply is about (may be null) * @cmd: ETHTOOL_MSG_* message type for reply * @hdr_attrtype: attribute type for common header * @info: genetlink info of the received packet we respond to * @ehdrp: place to store payload pointer returned by genlmsg_new() * * Return: pointer to allocated skb on success, NULL on error */ struct sk_buff *ethnl_reply_init(size_t payload, struct net_device *dev, u8 cmd, u16 hdr_attrtype, struct genl_info *info, void **ehdrp) { struct sk_buff *skb; skb = genlmsg_new(payload, GFP_KERNEL); if (!skb) goto err; *ehdrp = genlmsg_put_reply(skb, info, &ethtool_genl_family, 0, cmd); if (!*ehdrp) goto err_free; if (dev) { int ret; ret = ethnl_fill_reply_header(skb, dev, hdr_attrtype); if (ret < 0) goto err_free; } return skb; err_free: nlmsg_free(skb); err: if (info) GENL_SET_ERR_MSG(info, "failed to setup reply message"); return NULL; } void *ethnl_dump_put(struct sk_buff *skb, struct netlink_callback *cb, u8 cmd) { return genlmsg_put(skb, NETLINK_CB(cb->skb).portid, cb->nlh->nlmsg_seq, &ethtool_genl_family, 0, cmd); } void *ethnl_bcastmsg_put(struct sk_buff *skb, u8 cmd) { return genlmsg_put(skb, 0, ++ethnl_bcast_seq, &ethtool_genl_family, 0, cmd); } void *ethnl_unicast_put(struct sk_buff *skb, u32 portid, u32 seq, u8 cmd) { return genlmsg_put(skb, portid, seq, &ethtool_genl_family, 0, cmd); } int ethnl_multicast(struct sk_buff *skb, struct net_device *dev) { return genlmsg_multicast_netns(&ethtool_genl_family, dev_net(dev), skb, 0, ETHNL_MCGRP_MONITOR, GFP_KERNEL); } /* GET request helpers */ /** * struct ethnl_dump_ctx - context structure for generic dumpit() callback * @ops: request ops of currently processed message type * @req_info: parsed request header of processed request * @reply_data: data needed to compose the reply * @pos_ifindex: saved iteration position - ifindex * * These parameters are kept in struct netlink_callback as context preserved * between iterations. They are initialized by ethnl_default_start() and used * in ethnl_default_dumpit() and ethnl_default_done(). */ struct ethnl_dump_ctx { const struct ethnl_request_ops *ops; struct ethnl_req_info *req_info; struct ethnl_reply_data *reply_data; unsigned long pos_ifindex; }; /** * struct ethnl_perphy_dump_ctx - context for dumpit() PHY-aware callbacks * @ethnl_ctx: generic ethnl context * @ifindex: For Filtered DUMP requests, the ifindex of the targeted netdev * @pos_phyindex: iterator position for multi-msg DUMP */ struct ethnl_perphy_dump_ctx { struct ethnl_dump_ctx ethnl_ctx; unsigned int ifindex; unsigned long pos_phyindex; }; static const struct ethnl_request_ops * ethnl_default_requests[__ETHTOOL_MSG_USER_CNT] = { [ETHTOOL_MSG_STRSET_GET] = &ethnl_strset_request_ops, [ETHTOOL_MSG_LINKINFO_GET] = &ethnl_linkinfo_request_ops, [ETHTOOL_MSG_LINKINFO_SET] = &ethnl_linkinfo_request_ops, [ETHTOOL_MSG_LINKMODES_GET] = &ethnl_linkmodes_request_ops, [ETHTOOL_MSG_LINKMODES_SET] = &ethnl_linkmodes_request_ops, [ETHTOOL_MSG_LINKSTATE_GET] = &ethnl_linkstate_request_ops, [ETHTOOL_MSG_DEBUG_GET] = &ethnl_debug_request_ops, [ETHTOOL_MSG_DEBUG_SET] = &ethnl_debug_request_ops, [ETHTOOL_MSG_WOL_GET] = &ethnl_wol_request_ops, [ETHTOOL_MSG_WOL_SET] = &ethnl_wol_request_ops, [ETHTOOL_MSG_FEATURES_GET] = &ethnl_features_request_ops, [ETHTOOL_MSG_PRIVFLAGS_GET] = &ethnl_privflags_request_ops, [ETHTOOL_MSG_PRIVFLAGS_SET] = &ethnl_privflags_request_ops, [ETHTOOL_MSG_RINGS_GET] = &ethnl_rings_request_ops, [ETHTOOL_MSG_RINGS_SET] = &ethnl_rings_request_ops, [ETHTOOL_MSG_CHANNELS_GET] = &ethnl_channels_request_ops, [ETHTOOL_MSG_CHANNELS_SET] = &ethnl_channels_request_ops, [ETHTOOL_MSG_COALESCE_GET] = &ethnl_coalesce_request_ops, [ETHTOOL_MSG_COALESCE_SET] = &ethnl_coalesce_request_ops, [ETHTOOL_MSG_PAUSE_GET] = &ethnl_pause_request_ops, [ETHTOOL_MSG_PAUSE_SET] = &ethnl_pause_request_ops, [ETHTOOL_MSG_EEE_GET] = &ethnl_eee_request_ops, [ETHTOOL_MSG_EEE_SET] = &ethnl_eee_request_ops, [ETHTOOL_MSG_FEC_GET] = &ethnl_fec_request_ops, [ETHTOOL_MSG_FEC_SET] = &ethnl_fec_request_ops, [ETHTOOL_MSG_TSINFO_GET] = &ethnl_tsinfo_request_ops, [ETHTOOL_MSG_MODULE_EEPROM_GET] = &ethnl_module_eeprom_request_ops, [ETHTOOL_MSG_STATS_GET] = &ethnl_stats_request_ops, [ETHTOOL_MSG_PHC_VCLOCKS_GET] = &ethnl_phc_vclocks_request_ops, [ETHTOOL_MSG_MODULE_GET] = &ethnl_module_request_ops, [ETHTOOL_MSG_MODULE_SET] = &ethnl_module_request_ops, [ETHTOOL_MSG_PSE_GET] = &ethnl_pse_request_ops, [ETHTOOL_MSG_PSE_SET] = &ethnl_pse_request_ops, [ETHTOOL_MSG_RSS_GET] = &ethnl_rss_request_ops, [ETHTOOL_MSG_PLCA_GET_CFG] = &ethnl_plca_cfg_request_ops, [ETHTOOL_MSG_PLCA_SET_CFG] = &ethnl_plca_cfg_request_ops, [ETHTOOL_MSG_PLCA_GET_STATUS] = &ethnl_plca_status_request_ops, [ETHTOOL_MSG_MM_GET] = &ethnl_mm_request_ops, [ETHTOOL_MSG_MM_SET] = &ethnl_mm_request_ops, [ETHTOOL_MSG_TSCONFIG_GET] = &ethnl_tsconfig_request_ops, [ETHTOOL_MSG_TSCONFIG_SET] = &ethnl_tsconfig_request_ops, [ETHTOOL_MSG_PHY_GET] = &ethnl_phy_request_ops, }; static struct ethnl_dump_ctx *ethnl_dump_context(struct netlink_callback *cb) { return (struct ethnl_dump_ctx *)cb->ctx; } static struct ethnl_perphy_dump_ctx * ethnl_perphy_dump_context(struct netlink_callback *cb) { return (struct ethnl_perphy_dump_ctx *)cb->ctx; } /** * ethnl_default_parse() - Parse request message * @req_info: pointer to structure to put data into * @info: genl_info from the request * @request_ops: struct request_ops for request type * @require_dev: fail if no device identified in header * * Parse universal request header and call request specific ->parse_request() * callback (if defined) to parse the rest of the message. * * Return: 0 on success or negative error code */ static int ethnl_default_parse(struct ethnl_req_info *req_info, const struct genl_info *info, const struct ethnl_request_ops *request_ops, bool require_dev) { struct nlattr **tb = info->attrs; int ret; ret = ethnl_parse_header_dev_get(req_info, tb[request_ops->hdr_attr], genl_info_net(info), info->extack, require_dev); if (ret < 0) return ret; if (request_ops->parse_request) { ret = request_ops->parse_request(req_info, tb, info->extack); if (ret < 0) return ret; } return 0; } /** * ethnl_init_reply_data() - Initialize reply data for GET request * @reply_data: pointer to embedded struct ethnl_reply_data * @ops: instance of struct ethnl_request_ops describing the layout * @dev: network device to initialize the reply for * * Fills the reply data part with zeros and sets the dev member. Must be called * before calling the ->fill_reply() callback (for each iteration when handling * dump requests). */ static void ethnl_init_reply_data(struct ethnl_reply_data *reply_data, const struct ethnl_request_ops *ops, struct net_device *dev) { memset(reply_data, 0, ops->reply_data_size); reply_data->dev = dev; } /* default ->doit() handler for GET type requests */ static int ethnl_default_doit(struct sk_buff *skb, struct genl_info *info) { struct ethnl_reply_data *reply_data = NULL; struct ethnl_req_info *req_info = NULL; const u8 cmd = info->genlhdr->cmd; const struct ethnl_request_ops *ops; int hdr_len, reply_len; struct sk_buff *rskb; void *reply_payload; int ret; ops = ethnl_default_requests[cmd]; if (WARN_ONCE(!ops, "cmd %u has no ethnl_request_ops\n", cmd)) return -EOPNOTSUPP; if (GENL_REQ_ATTR_CHECK(info, ops->hdr_attr)) return -EINVAL; req_info = kzalloc(ops->req_info_size, GFP_KERNEL); if (!req_info) return -ENOMEM; reply_data = kmalloc(ops->reply_data_size, GFP_KERNEL); if (!reply_data) { kfree(req_info); return -ENOMEM; } ret = ethnl_default_parse(req_info, info, ops, !ops->allow_nodev_do); if (ret < 0) goto err_dev; ethnl_init_reply_data(reply_data, ops, req_info->dev); rtnl_lock(); if (req_info->dev) netdev_lock_ops(req_info->dev); ret = ops->prepare_data(req_info, reply_data, info); if (req_info->dev) netdev_unlock_ops(req_info->dev); rtnl_unlock(); if (ret < 0) goto err_dev; ret = ops->reply_size(req_info, reply_data); if (ret < 0) goto err_cleanup; reply_len = ret; ret = -ENOMEM; rskb = ethnl_reply_init(reply_len + ethnl_reply_header_size(), req_info->dev, ops->reply_cmd, ops->hdr_attr, info, &reply_payload); if (!rskb) goto err_cleanup; hdr_len = rskb->len; ret = ops->fill_reply(rskb, req_info, reply_data); if (ret < 0) goto err_msg; WARN_ONCE(rskb->len - hdr_len > reply_len, "ethnl cmd %d: calculated reply length %d, but consumed %d\n", cmd, reply_len, rskb->len - hdr_len); if (ops->cleanup_data) ops->cleanup_data(reply_data); genlmsg_end(rskb, reply_payload); netdev_put(req_info->dev, &req_info->dev_tracker); kfree(reply_data); kfree(req_info); return genlmsg_reply(rskb, info); err_msg: WARN_ONCE(ret == -EMSGSIZE, "calculated message payload length (%d) not sufficient\n", reply_len); nlmsg_free(rskb); err_cleanup: if (ops->cleanup_data) ops->cleanup_data(reply_data); err_dev: netdev_put(req_info->dev, &req_info->dev_tracker); kfree(reply_data); kfree(req_info); return ret; } static int ethnl_default_dump_one(struct sk_buff *skb, struct net_device *dev, const struct ethnl_dump_ctx *ctx, const struct genl_info *info) { void *ehdr; int ret; ehdr = genlmsg_put(skb, info->snd_portid, info->snd_seq, &ethtool_genl_family, NLM_F_MULTI, ctx->ops->reply_cmd); if (!ehdr) return -EMSGSIZE; ethnl_init_reply_data(ctx->reply_data, ctx->ops, dev); rtnl_lock(); netdev_lock_ops(dev); ret = ctx->ops->prepare_data(ctx->req_info, ctx->reply_data, info); netdev_unlock_ops(dev); rtnl_unlock(); if (ret < 0) goto out_cancel; ret = ethnl_fill_reply_header(skb, dev, ctx->ops->hdr_attr); if (ret < 0) goto out; ret = ctx->ops->fill_reply(skb, ctx->req_info, ctx->reply_data); out: if (ctx->ops->cleanup_data) ctx->ops->cleanup_data(ctx->reply_data); out_cancel: ctx->reply_data->dev = NULL; if (ret < 0) genlmsg_cancel(skb, ehdr); else genlmsg_end(skb, ehdr); return ret; } /* Default ->dumpit() handler for GET requests. */ static int ethnl_default_dumpit(struct sk_buff *skb, struct netlink_callback *cb) { struct ethnl_dump_ctx *ctx = ethnl_dump_context(cb); struct net *net = sock_net(skb->sk); netdevice_tracker dev_tracker; struct net_device *dev; int ret = 0; rcu_read_lock(); for_each_netdev_dump(net, dev, ctx->pos_ifindex) { netdev_hold(dev, &dev_tracker, GFP_ATOMIC); rcu_read_unlock(); ret = ethnl_default_dump_one(skb, dev, ctx, genl_info_dump(cb)); rcu_read_lock(); netdev_put(dev, &dev_tracker); if (ret < 0 && ret != -EOPNOTSUPP) { if (likely(skb->len)) ret = skb->len; break; } ret = 0; } rcu_read_unlock(); return ret; } /* generic ->start() handler for GET requests */ static int ethnl_default_start(struct netlink_callback *cb) { const struct genl_dumpit_info *info = genl_dumpit_info(cb); struct ethnl_dump_ctx *ctx = ethnl_dump_context(cb); struct ethnl_reply_data *reply_data; const struct ethnl_request_ops *ops; struct ethnl_req_info *req_info; struct genlmsghdr *ghdr; int ret; BUILD_BUG_ON(sizeof(*ctx) > sizeof(cb->ctx)); ghdr = nlmsg_data(cb->nlh); ops = ethnl_default_requests[ghdr->cmd]; if (WARN_ONCE(!ops, "cmd %u has no ethnl_request_ops\n", ghdr->cmd)) return -EOPNOTSUPP; req_info = kzalloc(ops->req_info_size, GFP_KERNEL); if (!req_info) return -ENOMEM; reply_data = kmalloc(ops->reply_data_size, GFP_KERNEL); if (!reply_data) { ret = -ENOMEM; goto free_req_info; } ret = ethnl_default_parse(req_info, &info->info, ops, false); if (req_info->dev) { /* We ignore device specification in dump requests but as the * same parser as for non-dump (doit) requests is used, it * would take reference to the device if it finds one */ netdev_put(req_info->dev, &req_info->dev_tracker); req_info->dev = NULL; } if (ret < 0) goto free_reply_data; ctx->ops = ops; ctx->req_info = req_info; ctx->reply_data = reply_data; ctx->pos_ifindex = 0; return 0; free_reply_data: kfree(reply_data); free_req_info: kfree(req_info); return ret; } /* per-PHY ->start() handler for GET requests */ static int ethnl_perphy_start(struct netlink_callback *cb) { struct ethnl_perphy_dump_ctx *phy_ctx = ethnl_perphy_dump_context(cb); const struct genl_dumpit_info *info = genl_dumpit_info(cb); struct ethnl_dump_ctx *ctx = &phy_ctx->ethnl_ctx; struct ethnl_reply_data *reply_data; const struct ethnl_request_ops *ops; struct ethnl_req_info *req_info; struct genlmsghdr *ghdr; int ret; BUILD_BUG_ON(sizeof(*ctx) > sizeof(cb->ctx)); ghdr = nlmsg_data(cb->nlh); ops = ethnl_default_requests[ghdr->cmd]; if (WARN_ONCE(!ops, "cmd %u has no ethnl_request_ops\n", ghdr->cmd)) return -EOPNOTSUPP; req_info = kzalloc(ops->req_info_size, GFP_KERNEL); if (!req_info) return -ENOMEM; reply_data = kmalloc(ops->reply_data_size, GFP_KERNEL); if (!reply_data) { ret = -ENOMEM; goto free_req_info; } /* Unlike per-dev dump, don't ignore dev. The dump handler * will notice it and dump PHYs from given dev. We only keep track of * the dev's ifindex, .dumpit() will grab and release the netdev itself. */ ret = ethnl_default_parse(req_info, &info->info, ops, false); if (req_info->dev) { phy_ctx->ifindex = req_info->dev->ifindex; netdev_put(req_info->dev, &req_info->dev_tracker); req_info->dev = NULL; } if (ret < 0) goto free_reply_data; ctx->ops = ops; ctx->req_info = req_info; ctx->reply_data = reply_data; ctx->pos_ifindex = 0; return 0; free_reply_data: kfree(reply_data); free_req_info: kfree(req_info); return ret; } static int ethnl_perphy_dump_one_dev(struct sk_buff *skb, struct ethnl_perphy_dump_ctx *ctx, const struct genl_info *info) { struct ethnl_dump_ctx *ethnl_ctx = &ctx->ethnl_ctx; struct net_device *dev = ethnl_ctx->req_info->dev; struct phy_device_node *pdn; int ret; if (!dev->link_topo) return 0; xa_for_each_start(&dev->link_topo->phys, ctx->pos_phyindex, pdn, ctx->pos_phyindex) { ethnl_ctx->req_info->phy_index = ctx->pos_phyindex; /* We can re-use the original dump_one as ->prepare_data in * commands use ethnl_req_get_phydev(), which gets the PHY from * the req_info->phy_index */ ret = ethnl_default_dump_one(skb, dev, ethnl_ctx, info); if (ret) return ret; } ctx->pos_phyindex = 0; return 0; } static int ethnl_perphy_dump_all_dev(struct sk_buff *skb, struct ethnl_perphy_dump_ctx *ctx, const struct genl_info *info) { struct ethnl_dump_ctx *ethnl_ctx = &ctx->ethnl_ctx; struct net *net = sock_net(skb->sk); netdevice_tracker dev_tracker; struct net_device *dev; int ret = 0; rcu_read_lock(); for_each_netdev_dump(net, dev, ethnl_ctx->pos_ifindex) { netdev_hold(dev, &dev_tracker, GFP_ATOMIC); rcu_read_unlock(); /* per-PHY commands use ethnl_req_get_phydev(), which needs the * net_device in the req_info */ ethnl_ctx->req_info->dev = dev; ret = ethnl_perphy_dump_one_dev(skb, ctx, info); rcu_read_lock(); netdev_put(dev, &dev_tracker); ethnl_ctx->req_info->dev = NULL; if (ret < 0 && ret != -EOPNOTSUPP) { if (likely(skb->len)) ret = skb->len; break; } ret = 0; } rcu_read_unlock(); return ret; } /* per-PHY ->dumpit() handler for GET requests. */ static int ethnl_perphy_dumpit(struct sk_buff *skb, struct netlink_callback *cb) { struct ethnl_perphy_dump_ctx *ctx = ethnl_perphy_dump_context(cb); const struct genl_dumpit_info *info = genl_dumpit_info(cb); struct ethnl_dump_ctx *ethnl_ctx = &ctx->ethnl_ctx; int ret = 0; if (ctx->ifindex) { netdevice_tracker dev_tracker; struct net_device *dev; dev = netdev_get_by_index(genl_info_net(&info->info), ctx->ifindex, &dev_tracker, GFP_KERNEL); if (!dev) return -ENODEV; ethnl_ctx->req_info->dev = dev; ret = ethnl_perphy_dump_one_dev(skb, ctx, genl_info_dump(cb)); if (ret < 0 && ret != -EOPNOTSUPP && likely(skb->len)) ret = skb->len; netdev_put(dev, &dev_tracker); } else { ret = ethnl_perphy_dump_all_dev(skb, ctx, genl_info_dump(cb)); } return ret; } /* per-PHY ->done() handler for GET requests */ static int ethnl_perphy_done(struct netlink_callback *cb) { struct ethnl_perphy_dump_ctx *ctx = ethnl_perphy_dump_context(cb); struct ethnl_dump_ctx *ethnl_ctx = &ctx->ethnl_ctx; kfree(ethnl_ctx->reply_data); kfree(ethnl_ctx->req_info); return 0; } /* default ->done() handler for GET requests */ static int ethnl_default_done(struct netlink_callback *cb) { struct ethnl_dump_ctx *ctx = ethnl_dump_context(cb); kfree(ctx->reply_data); kfree(ctx->req_info); return 0; } static int ethnl_default_set_doit(struct sk_buff *skb, struct genl_info *info) { const struct ethnl_request_ops *ops; struct ethnl_req_info req_info = {}; const u8 cmd = info->genlhdr->cmd; struct net_device *dev; int ret; ops = ethnl_default_requests[cmd]; if (WARN_ONCE(!ops, "cmd %u has no ethnl_request_ops\n", cmd)) return -EOPNOTSUPP; if (GENL_REQ_ATTR_CHECK(info, ops->hdr_attr)) return -EINVAL; ret = ethnl_parse_header_dev_get(&req_info, info->attrs[ops->hdr_attr], genl_info_net(info), info->extack, true); if (ret < 0) return ret; if (ops->set_validate) { ret = ops->set_validate(&req_info, info); /* 0 means nothing to do */ if (ret <= 0) goto out_dev; } dev = req_info.dev; rtnl_lock(); netdev_lock_ops(dev); dev->cfg_pending = kmemdup(dev->cfg, sizeof(*dev->cfg), GFP_KERNEL_ACCOUNT); if (!dev->cfg_pending) { ret = -ENOMEM; goto out_tie_cfg; } ret = ethnl_ops_begin(dev); if (ret < 0) goto out_free_cfg; ret = ops->set(&req_info, info); if (ret < 0) goto out_ops; swap(dev->cfg, dev->cfg_pending); if (!ret) goto out_ops; ethtool_notify(dev, ops->set_ntf_cmd, NULL); ret = 0; out_ops: ethnl_ops_complete(dev); out_free_cfg: kfree(dev->cfg_pending); out_tie_cfg: dev->cfg_pending = dev->cfg; netdev_unlock_ops(dev); rtnl_unlock(); out_dev: ethnl_parse_header_dev_put(&req_info); return ret; } static const struct ethnl_request_ops * ethnl_default_notify_ops[ETHTOOL_MSG_KERNEL_MAX + 1] = { [ETHTOOL_MSG_LINKINFO_NTF] = &ethnl_linkinfo_request_ops, [ETHTOOL_MSG_LINKMODES_NTF] = &ethnl_linkmodes_request_ops, [ETHTOOL_MSG_DEBUG_NTF] = &ethnl_debug_request_ops, [ETHTOOL_MSG_WOL_NTF] = &ethnl_wol_request_ops, [ETHTOOL_MSG_FEATURES_NTF] = &ethnl_features_request_ops, [ETHTOOL_MSG_PRIVFLAGS_NTF] = &ethnl_privflags_request_ops, [ETHTOOL_MSG_RINGS_NTF] = &ethnl_rings_request_ops, [ETHTOOL_MSG_CHANNELS_NTF] = &ethnl_channels_request_ops, [ETHTOOL_MSG_COALESCE_NTF] = &ethnl_coalesce_request_ops, [ETHTOOL_MSG_PAUSE_NTF] = &ethnl_pause_request_ops, [ETHTOOL_MSG_EEE_NTF] = &ethnl_eee_request_ops, [ETHTOOL_MSG_FEC_NTF] = &ethnl_fec_request_ops, [ETHTOOL_MSG_MODULE_NTF] = &ethnl_module_request_ops, [ETHTOOL_MSG_PLCA_NTF] = &ethnl_plca_cfg_request_ops, [ETHTOOL_MSG_MM_NTF] = &ethnl_mm_request_ops, }; /* default notification handler */ static void ethnl_default_notify(struct net_device *dev, unsigned int cmd, const void *data) { struct ethnl_reply_data *reply_data; const struct ethnl_request_ops *ops; struct ethnl_req_info *req_info; struct genl_info info; struct sk_buff *skb; void *reply_payload; int reply_len; int ret; genl_info_init_ntf(&info, &ethtool_genl_family, cmd); if (WARN_ONCE(cmd > ETHTOOL_MSG_KERNEL_MAX || !ethnl_default_notify_ops[cmd], "unexpected notification type %u\n", cmd)) return; ops = ethnl_default_notify_ops[cmd]; req_info = kzalloc(ops->req_info_size, GFP_KERNEL); if (!req_info) return; reply_data = kmalloc(ops->reply_data_size, GFP_KERNEL); if (!reply_data) { kfree(req_info); return; } req_info->dev = dev; req_info->flags |= ETHTOOL_FLAG_COMPACT_BITSETS; netdev_ops_assert_locked(dev); ethnl_init_reply_data(reply_data, ops, dev); ret = ops->prepare_data(req_info, reply_data, &info); if (ret < 0) goto err_rep; ret = ops->reply_size(req_info, reply_data); if (ret < 0) goto err_cleanup; reply_len = ret + ethnl_reply_header_size(); skb = genlmsg_new(reply_len, GFP_KERNEL); if (!skb) goto err_cleanup; reply_payload = ethnl_bcastmsg_put(skb, cmd); if (!reply_payload) goto err_skb; ret = ethnl_fill_reply_header(skb, dev, ops->hdr_attr); if (ret < 0) goto err_msg; ret = ops->fill_reply(skb, req_info, reply_data); if (ret < 0) goto err_msg; if (ops->cleanup_data) ops->cleanup_data(reply_data); genlmsg_end(skb, reply_payload); kfree(reply_data); kfree(req_info); ethnl_multicast(skb, dev); return; err_msg: WARN_ONCE(ret == -EMSGSIZE, "calculated message payload length (%d) not sufficient\n", reply_len); err_skb: nlmsg_free(skb); err_cleanup: if (ops->cleanup_data) ops->cleanup_data(reply_data); err_rep: kfree(reply_data); kfree(req_info); return; } /* notifications */ typedef void (*ethnl_notify_handler_t)(struct net_device *dev, unsigned int cmd, const void *data); static const ethnl_notify_handler_t ethnl_notify_handlers[] = { [ETHTOOL_MSG_LINKINFO_NTF] = ethnl_default_notify, [ETHTOOL_MSG_LINKMODES_NTF] = ethnl_default_notify, [ETHTOOL_MSG_DEBUG_NTF] = ethnl_default_notify, [ETHTOOL_MSG_WOL_NTF] = ethnl_default_notify, [ETHTOOL_MSG_FEATURES_NTF] = ethnl_default_notify, [ETHTOOL_MSG_PRIVFLAGS_NTF] = ethnl_default_notify, [ETHTOOL_MSG_RINGS_NTF] = ethnl_default_notify, [ETHTOOL_MSG_CHANNELS_NTF] = ethnl_default_notify, [ETHTOOL_MSG_COALESCE_NTF] = ethnl_default_notify, [ETHTOOL_MSG_PAUSE_NTF] = ethnl_default_notify, [ETHTOOL_MSG_EEE_NTF] = ethnl_default_notify, [ETHTOOL_MSG_FEC_NTF] = ethnl_default_notify, [ETHTOOL_MSG_MODULE_NTF] = ethnl_default_notify, [ETHTOOL_MSG_PLCA_NTF] = ethnl_default_notify, [ETHTOOL_MSG_MM_NTF] = ethnl_default_notify, }; void ethtool_notify(struct net_device *dev, unsigned int cmd, const void *data) { if (unlikely(!ethnl_ok)) return; ASSERT_RTNL(); if (likely(cmd < ARRAY_SIZE(ethnl_notify_handlers) && ethnl_notify_handlers[cmd])) ethnl_notify_handlers[cmd](dev, cmd, data); else WARN_ONCE(1, "notification %u not implemented (dev=%s)\n", cmd, netdev_name(dev)); } EXPORT_SYMBOL(ethtool_notify); static void ethnl_notify_features(struct netdev_notifier_info *info) { struct net_device *dev = netdev_notifier_info_to_dev(info); ethtool_notify(dev, ETHTOOL_MSG_FEATURES_NTF, NULL); } static int ethnl_netdev_event(struct notifier_block *this, unsigned long event, void *ptr) { struct netdev_notifier_info *info = ptr; struct netlink_ext_ack *extack; struct net_device *dev; dev = netdev_notifier_info_to_dev(info); extack = netdev_notifier_info_to_extack(info); switch (event) { case NETDEV_FEAT_CHANGE: ethnl_notify_features(ptr); break; case NETDEV_PRE_UP: if (dev->ethtool->module_fw_flash_in_progress) { NL_SET_ERR_MSG(extack, "Can't set port up while flashing module firmware"); return NOTIFY_BAD; } } return NOTIFY_DONE; } static struct notifier_block ethnl_netdev_notifier = { .notifier_call = ethnl_netdev_event, }; /* genetlink setup */ static const struct genl_ops ethtool_genl_ops[] = { { .cmd = ETHTOOL_MSG_STRSET_GET, .doit = ethnl_default_doit, .start = ethnl_default_start, .dumpit = ethnl_default_dumpit, .done = ethnl_default_done, .policy = ethnl_strset_get_policy, .maxattr = ARRAY_SIZE(ethnl_strset_get_policy) - 1, }, { .cmd = ETHTOOL_MSG_LINKINFO_GET, .doit = ethnl_default_doit, .start = ethnl_default_start, .dumpit = ethnl_default_dumpit, .done = ethnl_default_done, .policy = ethnl_linkinfo_get_policy, .maxattr = ARRAY_SIZE(ethnl_linkinfo_get_policy) - 1, }, { .cmd = ETHTOOL_MSG_LINKINFO_SET, .flags = GENL_UNS_ADMIN_PERM, .doit = ethnl_default_set_doit, .policy = ethnl_linkinfo_set_policy, .maxattr = ARRAY_SIZE(ethnl_linkinfo_set_policy) - 1, }, { .cmd = ETHTOOL_MSG_LINKMODES_GET, .doit = ethnl_default_doit, .start = ethnl_default_start, .dumpit = ethnl_default_dumpit, .done = ethnl_default_done, .policy = ethnl_linkmodes_get_policy, .maxattr = ARRAY_SIZE(ethnl_linkmodes_get_policy) - 1, }, { .cmd = ETHTOOL_MSG_LINKMODES_SET, .flags = GENL_UNS_ADMIN_PERM, .doit = ethnl_default_set_doit, .policy = ethnl_linkmodes_set_policy, .maxattr = ARRAY_SIZE(ethnl_linkmodes_set_policy) - 1, }, { .cmd = ETHTOOL_MSG_LINKSTATE_GET, .doit = ethnl_default_doit, .start = ethnl_default_start, .dumpit = ethnl_default_dumpit, .done = ethnl_default_done, .policy = ethnl_linkstate_get_policy, .maxattr = ARRAY_SIZE(ethnl_linkstate_get_policy) - 1, }, { .cmd = ETHTOOL_MSG_DEBUG_GET, .doit = ethnl_default_doit, .start = ethnl_default_start, .dumpit = ethnl_default_dumpit, .done = ethnl_default_done, .policy = ethnl_debug_get_policy, .maxattr = ARRAY_SIZE(ethnl_debug_get_policy) - 1, }, { .cmd = ETHTOOL_MSG_DEBUG_SET, .flags = GENL_UNS_ADMIN_PERM, .doit = ethnl_default_set_doit, .policy = ethnl_debug_set_policy, .maxattr = ARRAY_SIZE(ethnl_debug_set_policy) - 1, }, { .cmd = ETHTOOL_MSG_WOL_GET, .flags = GENL_UNS_ADMIN_PERM, .doit = ethnl_default_doit, .start = ethnl_default_start, .dumpit = ethnl_default_dumpit, .done = ethnl_default_done, .policy = ethnl_wol_get_policy, .maxattr = ARRAY_SIZE(ethnl_wol_get_policy) - 1, }, { .cmd = ETHTOOL_MSG_WOL_SET, .flags = GENL_UNS_ADMIN_PERM, .doit = ethnl_default_set_doit, .policy = ethnl_wol_set_policy, .maxattr = ARRAY_SIZE(ethnl_wol_set_policy) - 1, }, { .cmd = ETHTOOL_MSG_FEATURES_GET, .doit = ethnl_default_doit, .start = ethnl_default_start, .dumpit = ethnl_default_dumpit, .done = ethnl_default_done, .policy = ethnl_features_get_policy, .maxattr = ARRAY_SIZE(ethnl_features_get_policy) - 1, }, { .cmd = ETHTOOL_MSG_FEATURES_SET, .flags = GENL_UNS_ADMIN_PERM, .doit = ethnl_set_features, .policy = ethnl_features_set_policy, .maxattr = ARRAY_SIZE(ethnl_features_set_policy) - 1, }, { .cmd = ETHTOOL_MSG_PRIVFLAGS_GET, .doit = ethnl_default_doit, .start = ethnl_default_start, .dumpit = ethnl_default_dumpit, .done = ethnl_default_done, .policy = ethnl_privflags_get_policy, .maxattr = ARRAY_SIZE(ethnl_privflags_get_policy) - 1, }, { .cmd = ETHTOOL_MSG_PRIVFLAGS_SET, .flags = GENL_UNS_ADMIN_PERM, .doit = ethnl_default_set_doit, .policy = ethnl_privflags_set_policy, .maxattr = ARRAY_SIZE(ethnl_privflags_set_policy) - 1, }, { .cmd = ETHTOOL_MSG_RINGS_GET, .doit = ethnl_default_doit, .start = ethnl_default_start, .dumpit = ethnl_default_dumpit, .done = ethnl_default_done, .policy = ethnl_rings_get_policy, .maxattr = ARRAY_SIZE(ethnl_rings_get_policy) - 1, }, { .cmd = ETHTOOL_MSG_RINGS_SET, .flags = GENL_UNS_ADMIN_PERM, .doit = ethnl_default_set_doit, .policy = ethnl_rings_set_policy, .maxattr = ARRAY_SIZE(ethnl_rings_set_policy) - 1, }, { .cmd = ETHTOOL_MSG_CHANNELS_GET, .doit = ethnl_default_doit, .start = ethnl_default_start, .dumpit = ethnl_default_dumpit, .done = ethnl_default_done, .policy = ethnl_channels_get_policy, .maxattr = ARRAY_SIZE(ethnl_channels_get_policy) - 1, }, { .cmd = ETHTOOL_MSG_CHANNELS_SET, .flags = GENL_UNS_ADMIN_PERM, .doit = ethnl_default_set_doit, .policy = ethnl_channels_set_policy, .maxattr = ARRAY_SIZE(ethnl_channels_set_policy) - 1, }, { .cmd = ETHTOOL_MSG_COALESCE_GET, .doit = ethnl_default_doit, .start = ethnl_default_start, .dumpit = ethnl_default_dumpit, .done = ethnl_default_done, .policy = ethnl_coalesce_get_policy, .maxattr = ARRAY_SIZE(ethnl_coalesce_get_policy) - 1, }, { .cmd = ETHTOOL_MSG_COALESCE_SET, .flags = GENL_UNS_ADMIN_PERM, .doit = ethnl_default_set_doit, .policy = ethnl_coalesce_set_policy, .maxattr = ARRAY_SIZE(ethnl_coalesce_set_policy) - 1, }, { .cmd = ETHTOOL_MSG_PAUSE_GET, .doit = ethnl_default_doit, .start = ethnl_default_start, .dumpit = ethnl_default_dumpit, .done = ethnl_default_done, .policy = ethnl_pause_get_policy, .maxattr = ARRAY_SIZE(ethnl_pause_get_policy) - 1, }, { .cmd = ETHTOOL_MSG_PAUSE_SET, .flags = GENL_UNS_ADMIN_PERM, .doit = ethnl_default_set_doit, .policy = ethnl_pause_set_policy, .maxattr = ARRAY_SIZE(ethnl_pause_set_policy) - 1, }, { .cmd = ETHTOOL_MSG_EEE_GET, .doit = ethnl_default_doit, .start = ethnl_default_start, .dumpit = ethnl_default_dumpit, .done = ethnl_default_done, .policy = ethnl_eee_get_policy, .maxattr = ARRAY_SIZE(ethnl_eee_get_policy) - 1, }, { .cmd = ETHTOOL_MSG_EEE_SET, .flags = GENL_UNS_ADMIN_PERM, .doit = ethnl_default_set_doit, .policy = ethnl_eee_set_policy, .maxattr = ARRAY_SIZE(ethnl_eee_set_policy) - 1, }, { .cmd = ETHTOOL_MSG_TSINFO_GET, .doit = ethnl_default_doit, .start = ethnl_tsinfo_start, .dumpit = ethnl_tsinfo_dumpit, .done = ethnl_tsinfo_done, .policy = ethnl_tsinfo_get_policy, .maxattr = ARRAY_SIZE(ethnl_tsinfo_get_policy) - 1, }, { .cmd = ETHTOOL_MSG_CABLE_TEST_ACT, .flags = GENL_UNS_ADMIN_PERM, .doit = ethnl_act_cable_test, .policy = ethnl_cable_test_act_policy, .maxattr = ARRAY_SIZE(ethnl_cable_test_act_policy) - 1, }, { .cmd = ETHTOOL_MSG_CABLE_TEST_TDR_ACT, .flags = GENL_UNS_ADMIN_PERM, .doit = ethnl_act_cable_test_tdr, .policy = ethnl_cable_test_tdr_act_policy, .maxattr = ARRAY_SIZE(ethnl_cable_test_tdr_act_policy) - 1, }, { .cmd = ETHTOOL_MSG_TUNNEL_INFO_GET, .doit = ethnl_tunnel_info_doit, .start = ethnl_tunnel_info_start, .dumpit = ethnl_tunnel_info_dumpit, .policy = ethnl_tunnel_info_get_policy, .maxattr = ARRAY_SIZE(ethnl_tunnel_info_get_policy) - 1, }, { .cmd = ETHTOOL_MSG_FEC_GET, .doit = ethnl_default_doit, .start = ethnl_default_start, .dumpit = ethnl_default_dumpit, .done = ethnl_default_done, .policy = ethnl_fec_get_policy, .maxattr = ARRAY_SIZE(ethnl_fec_get_policy) - 1, }, { .cmd = ETHTOOL_MSG_FEC_SET, .flags = GENL_UNS_ADMIN_PERM, .doit = ethnl_default_set_doit, .policy = ethnl_fec_set_policy, .maxattr = ARRAY_SIZE(ethnl_fec_set_policy) - 1, }, { .cmd = ETHTOOL_MSG_MODULE_EEPROM_GET, .flags = GENL_UNS_ADMIN_PERM, .doit = ethnl_default_doit, .start = ethnl_default_start, .dumpit = ethnl_default_dumpit, .done = ethnl_default_done, .policy = ethnl_module_eeprom_get_policy, .maxattr = ARRAY_SIZE(ethnl_module_eeprom_get_policy) - 1, }, { .cmd = ETHTOOL_MSG_STATS_GET, .doit = ethnl_default_doit, .start = ethnl_default_start, .dumpit = ethnl_default_dumpit, .done = ethnl_default_done, .policy = ethnl_stats_get_policy, .maxattr = ARRAY_SIZE(ethnl_stats_get_policy) - 1, }, { .cmd = ETHTOOL_MSG_PHC_VCLOCKS_GET, .doit = ethnl_default_doit, .start = ethnl_default_start, .dumpit = ethnl_default_dumpit, .done = ethnl_default_done, .policy = ethnl_phc_vclocks_get_policy, .maxattr = ARRAY_SIZE(ethnl_phc_vclocks_get_policy) - 1, }, { .cmd = ETHTOOL_MSG_MODULE_GET, .doit = ethnl_default_doit, .start = ethnl_default_start, .dumpit = ethnl_default_dumpit, .done = ethnl_default_done, .policy = ethnl_module_get_policy, .maxattr = ARRAY_SIZE(ethnl_module_get_policy) - 1, }, { .cmd = ETHTOOL_MSG_MODULE_SET, .flags = GENL_UNS_ADMIN_PERM, .doit = ethnl_default_set_doit, .policy = ethnl_module_set_policy, .maxattr = ARRAY_SIZE(ethnl_module_set_policy) - 1, }, { .cmd = ETHTOOL_MSG_PSE_GET, .doit = ethnl_default_doit, .start = ethnl_perphy_start, .dumpit = ethnl_perphy_dumpit, .done = ethnl_perphy_done, .policy = ethnl_pse_get_policy, .maxattr = ARRAY_SIZE(ethnl_pse_get_policy) - 1, }, { .cmd = ETHTOOL_MSG_PSE_SET, .flags = GENL_UNS_ADMIN_PERM, .doit = ethnl_default_set_doit, .policy = ethnl_pse_set_policy, .maxattr = ARRAY_SIZE(ethnl_pse_set_policy) - 1, }, { .cmd = ETHTOOL_MSG_RSS_GET, .doit = ethnl_default_doit, .start = ethnl_rss_dump_start, .dumpit = ethnl_rss_dumpit, .policy = ethnl_rss_get_policy, .maxattr = ARRAY_SIZE(ethnl_rss_get_policy) - 1, }, { .cmd = ETHTOOL_MSG_PLCA_GET_CFG, .doit = ethnl_default_doit, .start = ethnl_perphy_start, .dumpit = ethnl_perphy_dumpit, .done = ethnl_perphy_done, .policy = ethnl_plca_get_cfg_policy, .maxattr = ARRAY_SIZE(ethnl_plca_get_cfg_policy) - 1, }, { .cmd = ETHTOOL_MSG_PLCA_SET_CFG, .flags = GENL_UNS_ADMIN_PERM, .doit = ethnl_default_set_doit, .policy = ethnl_plca_set_cfg_policy, .maxattr = ARRAY_SIZE(ethnl_plca_set_cfg_policy) - 1, }, { .cmd = ETHTOOL_MSG_PLCA_GET_STATUS, .doit = ethnl_default_doit, .start = ethnl_perphy_start, .dumpit = ethnl_perphy_dumpit, .done = ethnl_perphy_done, .policy = ethnl_plca_get_status_policy, .maxattr = ARRAY_SIZE(ethnl_plca_get_status_policy) - 1, }, { .cmd = ETHTOOL_MSG_MM_GET, .doit = ethnl_default_doit, .start = ethnl_default_start, .dumpit = ethnl_default_dumpit, .done = ethnl_default_done, .policy = ethnl_mm_get_policy, .maxattr = ARRAY_SIZE(ethnl_mm_get_policy) - 1, }, { .cmd = ETHTOOL_MSG_MM_SET, .flags = GENL_UNS_ADMIN_PERM, .doit = ethnl_default_set_doit, .policy = ethnl_mm_set_policy, .maxattr = ARRAY_SIZE(ethnl_mm_set_policy) - 1, }, { .cmd = ETHTOOL_MSG_MODULE_FW_FLASH_ACT, .flags = GENL_UNS_ADMIN_PERM, .doit = ethnl_act_module_fw_flash, .policy = ethnl_module_fw_flash_act_policy, .maxattr = ARRAY_SIZE(ethnl_module_fw_flash_act_policy) - 1, }, { .cmd = ETHTOOL_MSG_PHY_GET, .doit = ethnl_default_doit, .start = ethnl_perphy_start, .dumpit = ethnl_perphy_dumpit, .done = ethnl_perphy_done, .policy = ethnl_phy_get_policy, .maxattr = ARRAY_SIZE(ethnl_phy_get_policy) - 1, }, { .cmd = ETHTOOL_MSG_TSCONFIG_GET, .doit = ethnl_default_doit, .start = ethnl_default_start, .dumpit = ethnl_default_dumpit, .done = ethnl_default_done, .policy = ethnl_tsconfig_get_policy, .maxattr = ARRAY_SIZE(ethnl_tsconfig_get_policy) - 1, }, { .cmd = ETHTOOL_MSG_TSCONFIG_SET, .flags = GENL_UNS_ADMIN_PERM, .doit = ethnl_default_set_doit, .policy = ethnl_tsconfig_set_policy, .maxattr = ARRAY_SIZE(ethnl_tsconfig_set_policy) - 1, }, }; static const struct genl_multicast_group ethtool_nl_mcgrps[] = { [ETHNL_MCGRP_MONITOR] = { .name = ETHTOOL_MCGRP_MONITOR_NAME }, }; static struct genl_family ethtool_genl_family __ro_after_init = { .name = ETHTOOL_GENL_NAME, .version = ETHTOOL_GENL_VERSION, .netnsok = true, .parallel_ops = true, .ops = ethtool_genl_ops, .n_ops = ARRAY_SIZE(ethtool_genl_ops), .resv_start_op = ETHTOOL_MSG_MODULE_GET + 1, .mcgrps = ethtool_nl_mcgrps, .n_mcgrps = ARRAY_SIZE(ethtool_nl_mcgrps), .sock_priv_size = sizeof(struct ethnl_sock_priv), .sock_priv_destroy = ethnl_sock_priv_destroy, }; /* module setup */ static int __init ethnl_init(void) { int ret; ret = genl_register_family(&ethtool_genl_family); if (WARN(ret < 0, "ethtool: genetlink family registration failed")) return ret; ethnl_ok = true; ret = register_netdevice_notifier(&ethnl_netdev_notifier); WARN(ret < 0, "ethtool: net device notifier registration failed"); return ret; } subsys_initcall(ethnl_init);
1 265 266 378 265 273 258 266 378 345 2 378 31 235 376 378 376 377 273 273 268 18 377 376 378 355 378 378 378 377 378 378 2 2 2 2 294 294 294 269 98 147 147 266 266 266 1 266 265 266 265 265 266 266 13 13 267 236 266 265 266 266 2 264 266 265 266 235 236 265 258 258 266 236 266 265 264 266 67 65 15 15 13 13 13 13 13 13 67 67 109 97 2 13 14 13 98 13 13 13 1 1 1 294 294 27 27 3 3 27 27 27 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 1517 1518 1519 1520 1521 1522 1523 1524 1525 1526 1527 1528 1529 1530 1531 1532 1533 1534 1535 1536 1537 1538 1539 1540 1541 1542 1543 1544 1545 1546 1547 1548 1549 1550 1551 1552 1553 1554 1555 1556 1557 1558 1559 1560 1561 1562 1563 1564 1565 1566 1567 1568 1569 1570 1571 1572 1573 1574 1575 1576 1577 1578 1579 1580 1581 1582 1583 1584 1585 1586 1587 1588 1589 1590 1591 1592 1593 // SPDX-License-Identifier: GPL-2.0-only /* * Stand-alone page-table allocator for hyp stage-1 and guest stage-2. * No bombay mix was harmed in the writing of this file. * * Copyright (C) 2020 Google LLC * Author: Will Deacon <will@kernel.org> */ #include <linux/bitfield.h> #include <asm/kvm_pgtable.h> #include <asm/stage2_pgtable.h> struct kvm_pgtable_walk_data { struct kvm_pgtable_walker *walker; const u64 start; u64 addr; const u64 end; }; static bool kvm_pgtable_walk_skip_bbm_tlbi(const struct kvm_pgtable_visit_ctx *ctx) { return unlikely(ctx->flags & KVM_PGTABLE_WALK_SKIP_BBM_TLBI); } static bool kvm_pgtable_walk_skip_cmo(const struct kvm_pgtable_visit_ctx *ctx) { return unlikely(ctx->flags & KVM_PGTABLE_WALK_SKIP_CMO); } static bool kvm_block_mapping_supported(const struct kvm_pgtable_visit_ctx *ctx, u64 phys) { u64 granule = kvm_granule_size(ctx->level); if (!kvm_level_supports_block_mapping(ctx->level)) return false; if (granule > (ctx->end - ctx->addr)) return false; if (!IS_ALIGNED(phys, granule)) return false; return IS_ALIGNED(ctx->addr, granule); } static u32 kvm_pgtable_idx(struct kvm_pgtable_walk_data *data, s8 level) { u64 shift = kvm_granule_shift(level); u64 mask = BIT(PAGE_SHIFT - 3) - 1; return (data->addr >> shift) & mask; } static u32 kvm_pgd_page_idx(struct kvm_pgtable *pgt, u64 addr) { u64 shift = kvm_granule_shift(pgt->start_level - 1); /* May underflow */ u64 mask = BIT(pgt->ia_bits) - 1; return (addr & mask) >> shift; } static u32 kvm_pgd_pages(u32 ia_bits, s8 start_level) { struct kvm_pgtable pgt = { .ia_bits = ia_bits, .start_level = start_level, }; return kvm_pgd_page_idx(&pgt, -1ULL) + 1; } static bool kvm_pte_table(kvm_pte_t pte, s8 level) { if (level == KVM_PGTABLE_LAST_LEVEL) return false; if (!kvm_pte_valid(pte)) return false; return FIELD_GET(KVM_PTE_TYPE, pte) == KVM_PTE_TYPE_TABLE; } static kvm_pte_t *kvm_pte_follow(kvm_pte_t pte, struct kvm_pgtable_mm_ops *mm_ops) { return mm_ops->phys_to_virt(kvm_pte_to_phys(pte)); } static void kvm_clear_pte(kvm_pte_t *ptep) { WRITE_ONCE(*ptep, 0); } static kvm_pte_t kvm_init_table_pte(kvm_pte_t *childp, struct kvm_pgtable_mm_ops *mm_ops) { kvm_pte_t pte = kvm_phys_to_pte(mm_ops->virt_to_phys(childp)); pte |= FIELD_PREP(KVM_PTE_TYPE, KVM_PTE_TYPE_TABLE); pte |= KVM_PTE_VALID; return pte; } static kvm_pte_t kvm_init_valid_leaf_pte(u64 pa, kvm_pte_t attr, s8 level) { kvm_pte_t pte = kvm_phys_to_pte(pa); u64 type = (level == KVM_PGTABLE_LAST_LEVEL) ? KVM_PTE_TYPE_PAGE : KVM_PTE_TYPE_BLOCK; pte |= attr & (KVM_PTE_LEAF_ATTR_LO | KVM_PTE_LEAF_ATTR_HI); pte |= FIELD_PREP(KVM_PTE_TYPE, type); pte |= KVM_PTE_VALID; return pte; } static kvm_pte_t kvm_init_invalid_leaf_owner(u8 owner_id) { return FIELD_PREP(KVM_INVALID_PTE_OWNER_MASK, owner_id); } static int kvm_pgtable_visitor_cb(struct kvm_pgtable_walk_data *data, const struct kvm_pgtable_visit_ctx *ctx, enum kvm_pgtable_walk_flags visit) { struct kvm_pgtable_walker *walker = data->walker; /* Ensure the appropriate lock is held (e.g. RCU lock for stage-2 MMU) */ WARN_ON_ONCE(kvm_pgtable_walk_shared(ctx) && !kvm_pgtable_walk_lock_held()); return walker->cb(ctx, visit); } static bool kvm_pgtable_walk_continue(const struct kvm_pgtable_walker *walker, int r) { /* * Visitor callbacks return EAGAIN when the conditions that led to a * fault are no longer reflected in the page tables due to a race to * update a PTE. In the context of a fault handler this is interpreted * as a signal to retry guest execution. * * Ignore the return code altogether for walkers outside a fault handler * (e.g. write protecting a range of memory) and chug along with the * page table walk. */ if (r == -EAGAIN) return !(walker->flags & KVM_PGTABLE_WALK_HANDLE_FAULT); return !r; } static int __kvm_pgtable_walk(struct kvm_pgtable_walk_data *data, struct kvm_pgtable_mm_ops *mm_ops, kvm_pteref_t pgtable, s8 level); static inline int __kvm_pgtable_visit(struct kvm_pgtable_walk_data *data, struct kvm_pgtable_mm_ops *mm_ops, kvm_pteref_t pteref, s8 level) { enum kvm_pgtable_walk_flags flags = data->walker->flags; kvm_pte_t *ptep = kvm_dereference_pteref(data->walker, pteref); struct kvm_pgtable_visit_ctx ctx = { .ptep = ptep, .old = READ_ONCE(*ptep), .arg = data->walker->arg, .mm_ops = mm_ops, .start = data->start, .addr = data->addr, .end = data->end, .level = level, .flags = flags, }; int ret = 0; bool reload = false; kvm_pteref_t childp; bool table = kvm_pte_table(ctx.old, level); if (table && (ctx.flags & KVM_PGTABLE_WALK_TABLE_PRE)) { ret = kvm_pgtable_visitor_cb(data, &ctx, KVM_PGTABLE_WALK_TABLE_PRE); reload = true; } if (!table && (ctx.flags & KVM_PGTABLE_WALK_LEAF)) { ret = kvm_pgtable_visitor_cb(data, &ctx, KVM_PGTABLE_WALK_LEAF); reload = true; } /* * Reload the page table after invoking the walker callback for leaf * entries or after pre-order traversal, to allow the walker to descend * into a newly installed or replaced table. */ if (reload) { ctx.old = READ_ONCE(*ptep); table = kvm_pte_table(ctx.old, level); } if (!kvm_pgtable_walk_continue(data->walker, ret)) goto out; if (!table) { data->addr = ALIGN_DOWN(data->addr, kvm_granule_size(level)); data->addr += kvm_granule_size(level); goto out; } childp = (kvm_pteref_t)kvm_pte_follow(ctx.old, mm_ops); ret = __kvm_pgtable_walk(data, mm_ops, childp, level + 1); if (!kvm_pgtable_walk_continue(data->walker, ret)) goto out; if (ctx.flags & KVM_PGTABLE_WALK_TABLE_POST) ret = kvm_pgtable_visitor_cb(data, &ctx, KVM_PGTABLE_WALK_TABLE_POST); out: if (kvm_pgtable_walk_continue(data->walker, ret)) return 0; return ret; } static int __kvm_pgtable_walk(struct kvm_pgtable_walk_data *data, struct kvm_pgtable_mm_ops *mm_ops, kvm_pteref_t pgtable, s8 level) { u32 idx; int ret = 0; if (WARN_ON_ONCE(level < KVM_PGTABLE_FIRST_LEVEL || level > KVM_PGTABLE_LAST_LEVEL)) return -EINVAL; for (idx = kvm_pgtable_idx(data, level); idx < PTRS_PER_PTE; ++idx) { kvm_pteref_t pteref = &pgtable[idx]; if (data->addr >= data->end) break; ret = __kvm_pgtable_visit(data, mm_ops, pteref, level); if (ret) break; } return ret; } static int _kvm_pgtable_walk(struct kvm_pgtable *pgt, struct kvm_pgtable_walk_data *data) { u32 idx; int ret = 0; u64 limit = BIT(pgt->ia_bits); if (data->addr > limit || data->end > limit) return -ERANGE; if (!pgt->pgd) return -EINVAL; for (idx = kvm_pgd_page_idx(pgt, data->addr); data->addr < data->end; ++idx) { kvm_pteref_t pteref = &pgt->pgd[idx * PTRS_PER_PTE]; ret = __kvm_pgtable_walk(data, pgt->mm_ops, pteref, pgt->start_level); if (ret) break; } return ret; } int kvm_pgtable_walk(struct kvm_pgtable *pgt, u64 addr, u64 size, struct kvm_pgtable_walker *walker) { struct kvm_pgtable_walk_data walk_data = { .start = ALIGN_DOWN(addr, PAGE_SIZE), .addr = ALIGN_DOWN(addr, PAGE_SIZE), .end = PAGE_ALIGN(walk_data.addr + size), .walker = walker, }; int r; r = kvm_pgtable_walk_begin(walker); if (r) return r; r = _kvm_pgtable_walk(pgt, &walk_data); kvm_pgtable_walk_end(walker); return r; } struct leaf_walk_data { kvm_pte_t pte; s8 level; }; static int leaf_walker(const struct kvm_pgtable_visit_ctx *ctx, enum kvm_pgtable_walk_flags visit) { struct leaf_walk_data *data = ctx->arg; data->pte = ctx->old; data->level = ctx->level; return 0; } int kvm_pgtable_get_leaf(struct kvm_pgtable *pgt, u64 addr, kvm_pte_t *ptep, s8 *level) { struct leaf_walk_data data; struct kvm_pgtable_walker walker = { .cb = leaf_walker, .flags = KVM_PGTABLE_WALK_LEAF, .arg = &data, }; int ret; ret = kvm_pgtable_walk(pgt, ALIGN_DOWN(addr, PAGE_SIZE), PAGE_SIZE, &walker); if (!ret) { if (ptep) *ptep = data.pte; if (level) *level = data.level; } return ret; } struct hyp_map_data { const u64 phys; kvm_pte_t attr; }; static int hyp_set_prot_attr(enum kvm_pgtable_prot prot, kvm_pte_t *ptep) { bool device = prot & KVM_PGTABLE_PROT_DEVICE; u32 mtype = device ? MT_DEVICE_nGnRE : MT_NORMAL; kvm_pte_t attr = FIELD_PREP(KVM_PTE_LEAF_ATTR_LO_S1_ATTRIDX, mtype); u32 sh = KVM_PTE_LEAF_ATTR_LO_S1_SH_IS; u32 ap = (prot & KVM_PGTABLE_PROT_W) ? KVM_PTE_LEAF_ATTR_LO_S1_AP_RW : KVM_PTE_LEAF_ATTR_LO_S1_AP_RO; if (!(prot & KVM_PGTABLE_PROT_R)) return -EINVAL; if (prot & KVM_PGTABLE_PROT_X) { if (prot & KVM_PGTABLE_PROT_W) return -EINVAL; if (device) return -EINVAL; if (system_supports_bti_kernel()) attr |= KVM_PTE_LEAF_ATTR_HI_S1_GP; } else { attr |= KVM_PTE_LEAF_ATTR_HI_S1_XN; } attr |= FIELD_PREP(KVM_PTE_LEAF_ATTR_LO_S1_AP, ap); if (!kvm_lpa2_is_enabled()) attr |= FIELD_PREP(KVM_PTE_LEAF_ATTR_LO_S1_SH, sh); attr |= KVM_PTE_LEAF_ATTR_LO_S1_AF; attr |= prot & KVM_PTE_LEAF_ATTR_HI_SW; *ptep = attr; return 0; } enum kvm_pgtable_prot kvm_pgtable_hyp_pte_prot(kvm_pte_t pte) { enum kvm_pgtable_prot prot = pte & KVM_PTE_LEAF_ATTR_HI_SW; u32 ap; if (!kvm_pte_valid(pte)) return prot; if (!(pte & KVM_PTE_LEAF_ATTR_HI_S1_XN)) prot |= KVM_PGTABLE_PROT_X; ap = FIELD_GET(KVM_PTE_LEAF_ATTR_LO_S1_AP, pte); if (ap == KVM_PTE_LEAF_ATTR_LO_S1_AP_RO) prot |= KVM_PGTABLE_PROT_R; else if (ap == KVM_PTE_LEAF_ATTR_LO_S1_AP_RW) prot |= KVM_PGTABLE_PROT_RW; return prot; } static bool hyp_map_walker_try_leaf(const struct kvm_pgtable_visit_ctx *ctx, struct hyp_map_data *data) { u64 phys = data->phys + (ctx->addr - ctx->start); kvm_pte_t new; if (!kvm_block_mapping_supported(ctx, phys)) return false; new = kvm_init_valid_leaf_pte(phys, data->attr, ctx->level); if (ctx->old == new) return true; if (!kvm_pte_valid(ctx->old)) ctx->mm_ops->get_page(ctx->ptep); else if (WARN_ON((ctx->old ^ new) & ~KVM_PTE_LEAF_ATTR_HI_SW)) return false; smp_store_release(ctx->ptep, new); return true; } static int hyp_map_walker(const struct kvm_pgtable_visit_ctx *ctx, enum kvm_pgtable_walk_flags visit) { kvm_pte_t *childp, new; struct hyp_map_data *data = ctx->arg; struct kvm_pgtable_mm_ops *mm_ops = ctx->mm_ops; if (hyp_map_walker_try_leaf(ctx, data)) return 0; if (WARN_ON(ctx->level == KVM_PGTABLE_LAST_LEVEL)) return -EINVAL; childp = (kvm_pte_t *)mm_ops->zalloc_page(NULL); if (!childp) return -ENOMEM; new = kvm_init_table_pte(childp, mm_ops); mm_ops->get_page(ctx->ptep); smp_store_release(ctx->ptep, new); return 0; } int kvm_pgtable_hyp_map(struct kvm_pgtable *pgt, u64 addr, u64 size, u64 phys, enum kvm_pgtable_prot prot) { int ret; struct hyp_map_data map_data = { .phys = ALIGN_DOWN(phys, PAGE_SIZE), }; struct kvm_pgtable_walker walker = { .cb = hyp_map_walker, .flags = KVM_PGTABLE_WALK_LEAF, .arg = &map_data, }; ret = hyp_set_prot_attr(prot, &map_data.attr); if (ret) return ret; ret = kvm_pgtable_walk(pgt, addr, size, &walker); dsb(ishst); isb(); return ret; } static int hyp_unmap_walker(const struct kvm_pgtable_visit_ctx *ctx, enum kvm_pgtable_walk_flags visit) { kvm_pte_t *childp = NULL; u64 granule = kvm_granule_size(ctx->level); u64 *unmapped = ctx->arg; struct kvm_pgtable_mm_ops *mm_ops = ctx->mm_ops; if (!kvm_pte_valid(ctx->old)) return -EINVAL; if (kvm_pte_table(ctx->old, ctx->level)) { childp = kvm_pte_follow(ctx->old, mm_ops); if (mm_ops->page_count(childp) != 1) return 0; kvm_clear_pte(ctx->ptep); dsb(ishst); __tlbi_level(vae2is, __TLBI_VADDR(ctx->addr, 0), TLBI_TTL_UNKNOWN); } else { if (ctx->end - ctx->addr < granule) return -EINVAL; kvm_clear_pte(ctx->ptep); dsb(ishst); __tlbi_level(vale2is, __TLBI_VADDR(ctx->addr, 0), ctx->level); *unmapped += granule; } dsb(ish); isb(); mm_ops->put_page(ctx->ptep); if (childp) mm_ops->put_page(childp); return 0; } u64 kvm_pgtable_hyp_unmap(struct kvm_pgtable *pgt, u64 addr, u64 size) { u64 unmapped = 0; struct kvm_pgtable_walker walker = { .cb = hyp_unmap_walker, .arg = &unmapped, .flags = KVM_PGTABLE_WALK_LEAF | KVM_PGTABLE_WALK_TABLE_POST, }; if (!pgt->mm_ops->page_count) return 0; kvm_pgtable_walk(pgt, addr, size, &walker); return unmapped; } int kvm_pgtable_hyp_init(struct kvm_pgtable *pgt, u32 va_bits, struct kvm_pgtable_mm_ops *mm_ops) { s8 start_level = KVM_PGTABLE_LAST_LEVEL + 1 - ARM64_HW_PGTABLE_LEVELS(va_bits); if (start_level < KVM_PGTABLE_FIRST_LEVEL || start_level > KVM_PGTABLE_LAST_LEVEL) return -EINVAL; pgt->pgd = (kvm_pteref_t)mm_ops->zalloc_page(NULL); if (!pgt->pgd) return -ENOMEM; pgt->ia_bits = va_bits; pgt->start_level = start_level; pgt->mm_ops = mm_ops; pgt->mmu = NULL; pgt->force_pte_cb = NULL; return 0; } static int hyp_free_walker(const struct kvm_pgtable_visit_ctx *ctx, enum kvm_pgtable_walk_flags visit) { struct kvm_pgtable_mm_ops *mm_ops = ctx->mm_ops; if (!kvm_pte_valid(ctx->old)) return 0; mm_ops->put_page(ctx->ptep); if (kvm_pte_table(ctx->old, ctx->level)) mm_ops->put_page(kvm_pte_follow(ctx->old, mm_ops)); return 0; } void kvm_pgtable_hyp_destroy(struct kvm_pgtable *pgt) { struct kvm_pgtable_walker walker = { .cb = hyp_free_walker, .flags = KVM_PGTABLE_WALK_LEAF | KVM_PGTABLE_WALK_TABLE_POST, }; WARN_ON(kvm_pgtable_walk(pgt, 0, BIT(pgt->ia_bits), &walker)); pgt->mm_ops->put_page(kvm_dereference_pteref(&walker, pgt->pgd)); pgt->pgd = NULL; } struct stage2_map_data { const u64 phys; kvm_pte_t attr; u8 owner_id; kvm_pte_t *anchor; kvm_pte_t *childp; struct kvm_s2_mmu *mmu; void *memcache; /* Force mappings to page granularity */ bool force_pte; /* Walk should update owner_id only */ bool annotation; }; u64 kvm_get_vtcr(u64 mmfr0, u64 mmfr1, u32 phys_shift) { u64 vtcr = VTCR_EL2_FLAGS; s8 lvls; vtcr |= kvm_get_parange(mmfr0) << VTCR_EL2_PS_SHIFT; vtcr |= VTCR_EL2_T0SZ(phys_shift); /* * Use a minimum 2 level page table to prevent splitting * host PMD huge pages at stage2. */ lvls = stage2_pgtable_levels(phys_shift); if (lvls < 2) lvls = 2; /* * When LPA2 is enabled, the HW supports an extra level of translation * (for 5 in total) when using 4K pages. It also introduces VTCR_EL2.SL2 * to as an addition to SL0 to enable encoding this extra start level. * However, since we always use concatenated pages for the first level * lookup, we will never need this extra level and therefore do not need * to touch SL2. */ vtcr |= VTCR_EL2_LVLS_TO_SL0(lvls); #ifdef CONFIG_ARM64_HW_AFDBM /* * Enable the Hardware Access Flag management, unconditionally * on all CPUs. In systems that have asymmetric support for the feature * this allows KVM to leverage hardware support on the subset of cores * that implement the feature. * * The architecture requires VTCR_EL2.HA to be RES0 (thus ignored by * hardware) on implementations that do not advertise support for the * feature. As such, setting HA unconditionally is safe, unless you * happen to be running on a design that has unadvertised support for * HAFDBS. Here be dragons. */ if (!cpus_have_final_cap(ARM64_WORKAROUND_AMPERE_AC03_CPU_38)) vtcr |= VTCR_EL2_HA; #endif /* CONFIG_ARM64_HW_AFDBM */ if (kvm_lpa2_is_enabled()) vtcr |= VTCR_EL2_DS; /* Set the vmid bits */ vtcr |= (get_vmid_bits(mmfr1) == 16) ? VTCR_EL2_VS_16BIT : VTCR_EL2_VS_8BIT; return vtcr; } static bool stage2_has_fwb(struct kvm_pgtable *pgt) { if (!cpus_have_final_cap(ARM64_HAS_STAGE2_FWB)) return false; return !(pgt->flags & KVM_PGTABLE_S2_NOFWB); } void kvm_tlb_flush_vmid_range(struct kvm_s2_mmu *mmu, phys_addr_t addr, size_t size) { unsigned long pages, inval_pages; if (!system_supports_tlb_range()) { kvm_call_hyp(__kvm_tlb_flush_vmid, mmu); return; } pages = size >> PAGE_SHIFT; while (pages > 0) { inval_pages = min(pages, MAX_TLBI_RANGE_PAGES); kvm_call_hyp(__kvm_tlb_flush_vmid_range, mmu, addr, inval_pages); addr += inval_pages << PAGE_SHIFT; pages -= inval_pages; } } #define KVM_S2_MEMATTR(pgt, attr) PAGE_S2_MEMATTR(attr, stage2_has_fwb(pgt)) static int stage2_set_prot_attr(struct kvm_pgtable *pgt, enum kvm_pgtable_prot prot, kvm_pte_t *ptep) { kvm_pte_t attr; u32 sh = KVM_PTE_LEAF_ATTR_LO_S2_SH_IS; switch (prot & (KVM_PGTABLE_PROT_DEVICE | KVM_PGTABLE_PROT_NORMAL_NC)) { case KVM_PGTABLE_PROT_DEVICE | KVM_PGTABLE_PROT_NORMAL_NC: return -EINVAL; case KVM_PGTABLE_PROT_DEVICE: if (prot & KVM_PGTABLE_PROT_X) return -EINVAL; attr = KVM_S2_MEMATTR(pgt, DEVICE_nGnRE); break; case KVM_PGTABLE_PROT_NORMAL_NC: if (prot & KVM_PGTABLE_PROT_X) return -EINVAL; attr = KVM_S2_MEMATTR(pgt, NORMAL_NC); break; default: attr = KVM_S2_MEMATTR(pgt, NORMAL); } if (!(prot & KVM_PGTABLE_PROT_X)) attr |= KVM_PTE_LEAF_ATTR_HI_S2_XN; if (prot & KVM_PGTABLE_PROT_R) attr |= KVM_PTE_LEAF_ATTR_LO_S2_S2AP_R; if (prot & KVM_PGTABLE_PROT_W) attr |= KVM_PTE_LEAF_ATTR_LO_S2_S2AP_W; if (!kvm_lpa2_is_enabled()) attr |= FIELD_PREP(KVM_PTE_LEAF_ATTR_LO_S2_SH, sh); attr |= KVM_PTE_LEAF_ATTR_LO_S2_AF; attr |= prot & KVM_PTE_LEAF_ATTR_HI_SW; *ptep = attr; return 0; } enum kvm_pgtable_prot kvm_pgtable_stage2_pte_prot(kvm_pte_t pte) { enum kvm_pgtable_prot prot = pte & KVM_PTE_LEAF_ATTR_HI_SW; if (!kvm_pte_valid(pte)) return prot; if (pte & KVM_PTE_LEAF_ATTR_LO_S2_S2AP_R) prot |= KVM_PGTABLE_PROT_R; if (pte & KVM_PTE_LEAF_ATTR_LO_S2_S2AP_W) prot |= KVM_PGTABLE_PROT_W; if (!(pte & KVM_PTE_LEAF_ATTR_HI_S2_XN)) prot |= KVM_PGTABLE_PROT_X; return prot; } static bool stage2_pte_needs_update(kvm_pte_t old, kvm_pte_t new) { if (!kvm_pte_valid(old) || !kvm_pte_valid(new)) return true; return ((old ^ new) & (~KVM_PTE_LEAF_ATTR_S2_PERMS)); } static bool stage2_pte_is_counted(kvm_pte_t pte) { /* * The refcount tracks valid entries as well as invalid entries if they * encode ownership of a page to another entity than the page-table * owner, whose id is 0. */ return !!pte; } static bool stage2_pte_is_locked(kvm_pte_t pte) { return !kvm_pte_valid(pte) && (pte & KVM_INVALID_PTE_LOCKED); } static bool stage2_try_set_pte(const struct kvm_pgtable_visit_ctx *ctx, kvm_pte_t new) { if (!kvm_pgtable_walk_shared(ctx)) { WRITE_ONCE(*ctx->ptep, new); return true; } return cmpxchg(ctx->ptep, ctx->old, new) == ctx->old; } /** * stage2_try_break_pte() - Invalidates a pte according to the * 'break-before-make' requirements of the * architecture. * * @ctx: context of the visited pte. * @mmu: stage-2 mmu * * Returns: true if the pte was successfully broken. * * If the removed pte was valid, performs the necessary serialization and TLB * invalidation for the old value. For counted ptes, drops the reference count * on the containing table page. */ static bool stage2_try_break_pte(const struct kvm_pgtable_visit_ctx *ctx, struct kvm_s2_mmu *mmu) { struct kvm_pgtable_mm_ops *mm_ops = ctx->mm_ops; if (stage2_pte_is_locked(ctx->old)) { /* * Should never occur if this walker has exclusive access to the * page tables. */ WARN_ON(!kvm_pgtable_walk_shared(ctx)); return false; } if (!stage2_try_set_pte(ctx, KVM_INVALID_PTE_LOCKED)) return false; if (!kvm_pgtable_walk_skip_bbm_tlbi(ctx)) { /* * Perform the appropriate TLB invalidation based on the * evicted pte value (if any). */ if (kvm_pte_table(ctx->old, ctx->level)) { u64 size = kvm_granule_size(ctx->level); u64 addr = ALIGN_DOWN(ctx->addr, size); kvm_tlb_flush_vmid_range(mmu, addr, size); } else if (kvm_pte_valid(ctx->old)) { kvm_call_hyp(__kvm_tlb_flush_vmid_ipa, mmu, ctx->addr, ctx->level); } } if (stage2_pte_is_counted(ctx->old)) mm_ops->put_page(ctx->ptep); return true; } static void stage2_make_pte(const struct kvm_pgtable_visit_ctx *ctx, kvm_pte_t new) { struct kvm_pgtable_mm_ops *mm_ops = ctx->mm_ops; WARN_ON(!stage2_pte_is_locked(*ctx->ptep)); if (stage2_pte_is_counted(new)) mm_ops->get_page(ctx->ptep); smp_store_release(ctx->ptep, new); } static bool stage2_unmap_defer_tlb_flush(struct kvm_pgtable *pgt) { /* * If FEAT_TLBIRANGE is implemented, defer the individual * TLB invalidations until the entire walk is finished, and * then use the range-based TLBI instructions to do the * invalidations. Condition deferred TLB invalidation on the * system supporting FWB as the optimization is entirely * pointless when the unmap walker needs to perform CMOs. */ return system_supports_tlb_range() && stage2_has_fwb(pgt); } static void stage2_unmap_put_pte(const struct kvm_pgtable_visit_ctx *ctx, struct kvm_s2_mmu *mmu, struct kvm_pgtable_mm_ops *mm_ops) { struct kvm_pgtable *pgt = ctx->arg; /* * Clear the existing PTE, and perform break-before-make if it was * valid. Depending on the system support, defer the TLB maintenance * for the same until the entire unmap walk is completed. */ if (kvm_pte_valid(ctx->old)) { kvm_clear_pte(ctx->ptep); if (kvm_pte_table(ctx->old, ctx->level)) { kvm_call_hyp(__kvm_tlb_flush_vmid_ipa, mmu, ctx->addr, TLBI_TTL_UNKNOWN); } else if (!stage2_unmap_defer_tlb_flush(pgt)) { kvm_call_hyp(__kvm_tlb_flush_vmid_ipa, mmu, ctx->addr, ctx->level); } } mm_ops->put_page(ctx->ptep); } static bool stage2_pte_cacheable(struct kvm_pgtable *pgt, kvm_pte_t pte) { u64 memattr = pte & KVM_PTE_LEAF_ATTR_LO_S2_MEMATTR; return kvm_pte_valid(pte) && memattr == KVM_S2_MEMATTR(pgt, NORMAL); } static bool stage2_pte_executable(kvm_pte_t pte) { return kvm_pte_valid(pte) && !(pte & KVM_PTE_LEAF_ATTR_HI_S2_XN); } static u64 stage2_map_walker_phys_addr(const struct kvm_pgtable_visit_ctx *ctx, const struct stage2_map_data *data) { u64 phys = data->phys; /* Work out the correct PA based on how far the walk has gotten */ return phys + (ctx->addr - ctx->start); } static bool stage2_leaf_mapping_allowed(const struct kvm_pgtable_visit_ctx *ctx, struct stage2_map_data *data) { u64 phys = stage2_map_walker_phys_addr(ctx, data); if (data->force_pte && ctx->level < KVM_PGTABLE_LAST_LEVEL) return false; if (data->annotation) return true; return kvm_block_mapping_supported(ctx, phys); } static int stage2_map_walker_try_leaf(const struct kvm_pgtable_visit_ctx *ctx, struct stage2_map_data *data) { kvm_pte_t new; u64 phys = stage2_map_walker_phys_addr(ctx, data); u64 granule = kvm_granule_size(ctx->level); struct kvm_pgtable *pgt = data->mmu->pgt; struct kvm_pgtable_mm_ops *mm_ops = ctx->mm_ops; if (!stage2_leaf_mapping_allowed(ctx, data)) return -E2BIG; if (!data->annotation) new = kvm_init_valid_leaf_pte(phys, data->attr, ctx->level); else new = kvm_init_invalid_leaf_owner(data->owner_id); /* * Skip updating the PTE if we are trying to recreate the exact * same mapping or only change the access permissions. Instead, * the vCPU will exit one more time from guest if still needed * and then go through the path of relaxing permissions. */ if (!stage2_pte_needs_update(ctx->old, new)) return -EAGAIN; /* If we're only changing software bits, then store them and go! */ if (!kvm_pgtable_walk_shared(ctx) && !((ctx->old ^ new) & ~KVM_PTE_LEAF_ATTR_HI_SW)) { bool old_is_counted = stage2_pte_is_counted(ctx->old); if (old_is_counted != stage2_pte_is_counted(new)) { if (old_is_counted) mm_ops->put_page(ctx->ptep); else mm_ops->get_page(ctx->ptep); } WARN_ON_ONCE(!stage2_try_set_pte(ctx, new)); return 0; } if (!stage2_try_break_pte(ctx, data->mmu)) return -EAGAIN; /* Perform CMOs before installation of the guest stage-2 PTE */ if (!kvm_pgtable_walk_skip_cmo(ctx) && mm_ops->dcache_clean_inval_poc && stage2_pte_cacheable(pgt, new)) mm_ops->dcache_clean_inval_poc(kvm_pte_follow(new, mm_ops), granule); if (!kvm_pgtable_walk_skip_cmo(ctx) && mm_ops->icache_inval_pou && stage2_pte_executable(new)) mm_ops->icache_inval_pou(kvm_pte_follow(new, mm_ops), granule); stage2_make_pte(ctx, new); return 0; } static int stage2_map_walk_table_pre(const struct kvm_pgtable_visit_ctx *ctx, struct stage2_map_data *data) { struct kvm_pgtable_mm_ops *mm_ops = ctx->mm_ops; kvm_pte_t *childp = kvm_pte_follow(ctx->old, mm_ops); int ret; if (!stage2_leaf_mapping_allowed(ctx, data)) return 0; ret = stage2_map_walker_try_leaf(ctx, data); if (ret) return ret; mm_ops->free_unlinked_table(childp, ctx->level); return 0; } static int stage2_map_walk_leaf(const struct kvm_pgtable_visit_ctx *ctx, struct stage2_map_data *data) { struct kvm_pgtable_mm_ops *mm_ops = ctx->mm_ops; kvm_pte_t *childp, new; int ret; ret = stage2_map_walker_try_leaf(ctx, data); if (ret != -E2BIG) return ret; if (WARN_ON(ctx->level == KVM_PGTABLE_LAST_LEVEL)) return -EINVAL; if (!data->memcache) return -ENOMEM; childp = mm_ops->zalloc_page(data->memcache); if (!childp) return -ENOMEM; if (!stage2_try_break_pte(ctx, data->mmu)) { mm_ops->put_page(childp); return -EAGAIN; } /* * If we've run into an existing block mapping then replace it with * a table. Accesses beyond 'end' that fall within the new table * will be mapped lazily. */ new = kvm_init_table_pte(childp, mm_ops); stage2_make_pte(ctx, new); return 0; } /* * The TABLE_PRE callback runs for table entries on the way down, looking * for table entries which we could conceivably replace with a block entry * for this mapping. If it finds one it replaces the entry and calls * kvm_pgtable_mm_ops::free_unlinked_table() to tear down the detached table. * * Otherwise, the LEAF callback performs the mapping at the existing leaves * instead. */ static int stage2_map_walker(const struct kvm_pgtable_visit_ctx *ctx, enum kvm_pgtable_walk_flags visit) { struct stage2_map_data *data = ctx->arg; switch (visit) { case KVM_PGTABLE_WALK_TABLE_PRE: return stage2_map_walk_table_pre(ctx, data); case KVM_PGTABLE_WALK_LEAF: return stage2_map_walk_leaf(ctx, data); default: return -EINVAL; } } int kvm_pgtable_stage2_map(struct kvm_pgtable *pgt, u64 addr, u64 size, u64 phys, enum kvm_pgtable_prot prot, void *mc, enum kvm_pgtable_walk_flags flags) { int ret; struct stage2_map_data map_data = { .phys = ALIGN_DOWN(phys, PAGE_SIZE), .mmu = pgt->mmu, .memcache = mc, .force_pte = pgt->force_pte_cb && pgt->force_pte_cb(addr, addr + size, prot), }; struct kvm_pgtable_walker walker = { .cb = stage2_map_walker, .flags = flags | KVM_PGTABLE_WALK_TABLE_PRE | KVM_PGTABLE_WALK_LEAF, .arg = &map_data, }; if (WARN_ON((pgt->flags & KVM_PGTABLE_S2_IDMAP) && (addr != phys))) return -EINVAL; ret = stage2_set_prot_attr(pgt, prot, &map_data.attr); if (ret) return ret; ret = kvm_pgtable_walk(pgt, addr, size, &walker); dsb(ishst); return ret; } int kvm_pgtable_stage2_set_owner(struct kvm_pgtable *pgt, u64 addr, u64 size, void *mc, u8 owner_id) { int ret; struct stage2_map_data map_data = { .mmu = pgt->mmu, .memcache = mc, .owner_id = owner_id, .force_pte = true, .annotation = true, }; struct kvm_pgtable_walker walker = { .cb = stage2_map_walker, .flags = KVM_PGTABLE_WALK_TABLE_PRE | KVM_PGTABLE_WALK_LEAF, .arg = &map_data, }; if (owner_id > KVM_MAX_OWNER_ID) return -EINVAL; ret = kvm_pgtable_walk(pgt, addr, size, &walker); return ret; } static int stage2_unmap_walker(const struct kvm_pgtable_visit_ctx *ctx, enum kvm_pgtable_walk_flags visit) { struct kvm_pgtable *pgt = ctx->arg; struct kvm_s2_mmu *mmu = pgt->mmu; struct kvm_pgtable_mm_ops *mm_ops = ctx->mm_ops; kvm_pte_t *childp = NULL; bool need_flush = false; if (!kvm_pte_valid(ctx->old)) { if (stage2_pte_is_counted(ctx->old)) { kvm_clear_pte(ctx->ptep); mm_ops->put_page(ctx->ptep); } return 0; } if (kvm_pte_table(ctx->old, ctx->level)) { childp = kvm_pte_follow(ctx->old, mm_ops); if (mm_ops->page_count(childp) != 1) return 0; } else if (stage2_pte_cacheable(pgt, ctx->old)) { need_flush = !stage2_has_fwb(pgt); } /* * This is similar to the map() path in that we unmap the entire * block entry and rely on the remaining portions being faulted * back lazily. */ stage2_unmap_put_pte(ctx, mmu, mm_ops); if (need_flush && mm_ops->dcache_clean_inval_poc) mm_ops->dcache_clean_inval_poc(kvm_pte_follow(ctx->old, mm_ops), kvm_granule_size(ctx->level)); if (childp) mm_ops->put_page(childp); return 0; } int kvm_pgtable_stage2_unmap(struct kvm_pgtable *pgt, u64 addr, u64 size) { int ret; struct kvm_pgtable_walker walker = { .cb = stage2_unmap_walker, .arg = pgt, .flags = KVM_PGTABLE_WALK_LEAF | KVM_PGTABLE_WALK_TABLE_POST, }; ret = kvm_pgtable_walk(pgt, addr, size, &walker); if (stage2_unmap_defer_tlb_flush(pgt)) /* Perform the deferred TLB invalidations */ kvm_tlb_flush_vmid_range(pgt->mmu, addr, size); return ret; } struct stage2_attr_data { kvm_pte_t attr_set; kvm_pte_t attr_clr; kvm_pte_t pte; s8 level; }; static int stage2_attr_walker(const struct kvm_pgtable_visit_ctx *ctx, enum kvm_pgtable_walk_flags visit) { kvm_pte_t pte = ctx->old; struct stage2_attr_data *data = ctx->arg; struct kvm_pgtable_mm_ops *mm_ops = ctx->mm_ops; if (!kvm_pte_valid(ctx->old)) return -EAGAIN; data->level = ctx->level; data->pte = pte; pte &= ~data->attr_clr; pte |= data->attr_set; /* * We may race with the CPU trying to set the access flag here, * but worst-case the access flag update gets lost and will be * set on the next access instead. */ if (data->pte != pte) { /* * Invalidate instruction cache before updating the guest * stage-2 PTE if we are going to add executable permission. */ if (mm_ops->icache_inval_pou && stage2_pte_executable(pte) && !stage2_pte_executable(ctx->old)) mm_ops->icache_inval_pou(kvm_pte_follow(pte, mm_ops), kvm_granule_size(ctx->level)); if (!stage2_try_set_pte(ctx, pte)) return -EAGAIN; } return 0; } static int stage2_update_leaf_attrs(struct kvm_pgtable *pgt, u64 addr, u64 size, kvm_pte_t attr_set, kvm_pte_t attr_clr, kvm_pte_t *orig_pte, s8 *level, enum kvm_pgtable_walk_flags flags) { int ret; kvm_pte_t attr_mask = KVM_PTE_LEAF_ATTR_LO | KVM_PTE_LEAF_ATTR_HI; struct stage2_attr_data data = { .attr_set = attr_set & attr_mask, .attr_clr = attr_clr & attr_mask, }; struct kvm_pgtable_walker walker = { .cb = stage2_attr_walker, .arg = &data, .flags = flags | KVM_PGTABLE_WALK_LEAF, }; ret = kvm_pgtable_walk(pgt, addr, size, &walker); if (ret) return ret; if (orig_pte) *orig_pte = data.pte; if (level) *level = data.level; return 0; } int kvm_pgtable_stage2_wrprotect(struct kvm_pgtable *pgt, u64 addr, u64 size) { return stage2_update_leaf_attrs(pgt, addr, size, 0, KVM_PTE_LEAF_ATTR_LO_S2_S2AP_W, NULL, NULL, 0); } void kvm_pgtable_stage2_mkyoung(struct kvm_pgtable *pgt, u64 addr, enum kvm_pgtable_walk_flags flags) { int ret; ret = stage2_update_leaf_attrs(pgt, addr, 1, KVM_PTE_LEAF_ATTR_LO_S2_AF, 0, NULL, NULL, flags); if (!ret) dsb(ishst); } struct stage2_age_data { bool mkold; bool young; }; static int stage2_age_walker(const struct kvm_pgtable_visit_ctx *ctx, enum kvm_pgtable_walk_flags visit) { kvm_pte_t new = ctx->old & ~KVM_PTE_LEAF_ATTR_LO_S2_AF; struct stage2_age_data *data = ctx->arg; if (!kvm_pte_valid(ctx->old) || new == ctx->old) return 0; data->young = true; /* * stage2_age_walker() is always called while holding the MMU lock for * write, so this will always succeed. Nonetheless, this deliberately * follows the race detection pattern of the other stage-2 walkers in * case the locking mechanics of the MMU notifiers is ever changed. */ if (data->mkold && !stage2_try_set_pte(ctx, new)) return -EAGAIN; /* * "But where's the TLBI?!", you scream. * "Over in the core code", I sigh. * * See the '->clear_flush_young()' callback on the KVM mmu notifier. */ return 0; } bool kvm_pgtable_stage2_test_clear_young(struct kvm_pgtable *pgt, u64 addr, u64 size, bool mkold) { struct stage2_age_data data = { .mkold = mkold, }; struct kvm_pgtable_walker walker = { .cb = stage2_age_walker, .arg = &data, .flags = KVM_PGTABLE_WALK_LEAF, }; WARN_ON(kvm_pgtable_walk(pgt, addr, size, &walker)); return data.young; } int kvm_pgtable_stage2_relax_perms(struct kvm_pgtable *pgt, u64 addr, enum kvm_pgtable_prot prot, enum kvm_pgtable_walk_flags flags) { int ret; s8 level; kvm_pte_t set = 0, clr = 0; if (prot & KVM_PTE_LEAF_ATTR_HI_SW) return -EINVAL; if (prot & KVM_PGTABLE_PROT_R) set |= KVM_PTE_LEAF_ATTR_LO_S2_S2AP_R; if (prot & KVM_PGTABLE_PROT_W) set |= KVM_PTE_LEAF_ATTR_LO_S2_S2AP_W; if (prot & KVM_PGTABLE_PROT_X) clr |= KVM_PTE_LEAF_ATTR_HI_S2_XN; ret = stage2_update_leaf_attrs(pgt, addr, 1, set, clr, NULL, &level, flags); if (!ret || ret == -EAGAIN) kvm_call_hyp(__kvm_tlb_flush_vmid_ipa_nsh, pgt->mmu, addr, level); return ret; } static int stage2_flush_walker(const struct kvm_pgtable_visit_ctx *ctx, enum kvm_pgtable_walk_flags visit) { struct kvm_pgtable *pgt = ctx->arg; struct kvm_pgtable_mm_ops *mm_ops = pgt->mm_ops; if (!stage2_pte_cacheable(pgt, ctx->old)) return 0; if (mm_ops->dcache_clean_inval_poc) mm_ops->dcache_clean_inval_poc(kvm_pte_follow(ctx->old, mm_ops), kvm_granule_size(ctx->level)); return 0; } int kvm_pgtable_stage2_flush(struct kvm_pgtable *pgt, u64 addr, u64 size) { struct kvm_pgtable_walker walker = { .cb = stage2_flush_walker, .flags = KVM_PGTABLE_WALK_LEAF, .arg = pgt, }; if (stage2_has_fwb(pgt)) return 0; return kvm_pgtable_walk(pgt, addr, size, &walker); } kvm_pte_t *kvm_pgtable_stage2_create_unlinked(struct kvm_pgtable *pgt, u64 phys, s8 level, enum kvm_pgtable_prot prot, void *mc, bool force_pte) { struct stage2_map_data map_data = { .phys = phys, .mmu = pgt->mmu, .memcache = mc, .force_pte = force_pte, }; struct kvm_pgtable_walker walker = { .cb = stage2_map_walker, .flags = KVM_PGTABLE_WALK_LEAF | KVM_PGTABLE_WALK_SKIP_BBM_TLBI | KVM_PGTABLE_WALK_SKIP_CMO, .arg = &map_data, }; /* * The input address (.addr) is irrelevant for walking an * unlinked table. Construct an ambiguous IA range to map * kvm_granule_size(level) worth of memory. */ struct kvm_pgtable_walk_data data = { .walker = &walker, .addr = 0, .end = kvm_granule_size(level), }; struct kvm_pgtable_mm_ops *mm_ops = pgt->mm_ops; kvm_pte_t *pgtable; int ret; if (!IS_ALIGNED(phys, kvm_granule_size(level))) return ERR_PTR(-EINVAL); ret = stage2_set_prot_attr(pgt, prot, &map_data.attr); if (ret) return ERR_PTR(ret); pgtable = mm_ops->zalloc_page(mc); if (!pgtable) return ERR_PTR(-ENOMEM); ret = __kvm_pgtable_walk(&data, mm_ops, (kvm_pteref_t)pgtable, level + 1); if (ret) { kvm_pgtable_stage2_free_unlinked(mm_ops, pgtable, level); return ERR_PTR(ret); } return pgtable; } /* * Get the number of page-tables needed to replace a block with a * fully populated tree up to the PTE entries. Note that @level is * interpreted as in "level @level entry". */ static int stage2_block_get_nr_page_tables(s8 level) { switch (level) { case 1: return PTRS_PER_PTE + 1; case 2: return 1; case 3: return 0; default: WARN_ON_ONCE(level < KVM_PGTABLE_MIN_BLOCK_LEVEL || level > KVM_PGTABLE_LAST_LEVEL); return -EINVAL; }; } static int stage2_split_walker(const struct kvm_pgtable_visit_ctx *ctx, enum kvm_pgtable_walk_flags visit) { struct kvm_pgtable_mm_ops *mm_ops = ctx->mm_ops; struct kvm_mmu_memory_cache *mc = ctx->arg; struct kvm_s2_mmu *mmu; kvm_pte_t pte = ctx->old, new, *childp; enum kvm_pgtable_prot prot; s8 level = ctx->level; bool force_pte; int nr_pages; u64 phys; /* No huge-pages exist at the last level */ if (level == KVM_PGTABLE_LAST_LEVEL) return 0; /* We only split valid block mappings */ if (!kvm_pte_valid(pte)) return 0; nr_pages = stage2_block_get_nr_page_tables(level); if (nr_pages < 0) return nr_pages; if (mc->nobjs >= nr_pages) { /* Build a tree mapped down to the PTE granularity. */ force_pte = true; } else { /* * Don't force PTEs, so create_unlinked() below does * not populate the tree up to the PTE level. The * consequence is that the call will require a single * page of level 2 entries at level 1, or a single * page of PTEs at level 2. If we are at level 1, the * PTEs will be created recursively. */ force_pte = false; nr_pages = 1; } if (mc->nobjs < nr_pages) return -ENOMEM; mmu = container_of(mc, struct kvm_s2_mmu, split_page_cache); phys = kvm_pte_to_phys(pte); prot = kvm_pgtable_stage2_pte_prot(pte); childp = kvm_pgtable_stage2_create_unlinked(mmu->pgt, phys, level, prot, mc, force_pte); if (IS_ERR(childp)) return PTR_ERR(childp); if (!stage2_try_break_pte(ctx, mmu)) { kvm_pgtable_stage2_free_unlinked(mm_ops, childp, level); return -EAGAIN; } /* * Note, the contents of the page table are guaranteed to be made * visible before the new PTE is assigned because stage2_make_pte() * writes the PTE using smp_store_release(). */ new = kvm_init_table_pte(childp, mm_ops); stage2_make_pte(ctx, new); return 0; } int kvm_pgtable_stage2_split(struct kvm_pgtable *pgt, u64 addr, u64 size, struct kvm_mmu_memory_cache *mc) { struct kvm_pgtable_walker walker = { .cb = stage2_split_walker, .flags = KVM_PGTABLE_WALK_LEAF, .arg = mc, }; int ret; ret = kvm_pgtable_walk(pgt, addr, size, &walker); dsb(ishst); return ret; } int __kvm_pgtable_stage2_init(struct kvm_pgtable *pgt, struct kvm_s2_mmu *mmu, struct kvm_pgtable_mm_ops *mm_ops, enum kvm_pgtable_stage2_flags flags, kvm_pgtable_force_pte_cb_t force_pte_cb) { size_t pgd_sz; u64 vtcr = mmu->vtcr; u32 ia_bits = VTCR_EL2_IPA(vtcr); u32 sl0 = FIELD_GET(VTCR_EL2_SL0_MASK, vtcr); s8 start_level = VTCR_EL2_TGRAN_SL0_BASE - sl0; pgd_sz = kvm_pgd_pages(ia_bits, start_level) * PAGE_SIZE; pgt->pgd = (kvm_pteref_t)mm_ops->zalloc_pages_exact(pgd_sz); if (!pgt->pgd) return -ENOMEM; pgt->ia_bits = ia_bits; pgt->start_level = start_level; pgt->mm_ops = mm_ops; pgt->mmu = mmu; pgt->flags = flags; pgt->force_pte_cb = force_pte_cb; /* Ensure zeroed PGD pages are visible to the hardware walker */ dsb(ishst); return 0; } size_t kvm_pgtable_stage2_pgd_size(u64 vtcr) { u32 ia_bits = VTCR_EL2_IPA(vtcr); u32 sl0 = FIELD_GET(VTCR_EL2_SL0_MASK, vtcr); s8 start_level = VTCR_EL2_TGRAN_SL0_BASE - sl0; return kvm_pgd_pages(ia_bits, start_level) * PAGE_SIZE; } static int stage2_free_walker(const struct kvm_pgtable_visit_ctx *ctx, enum kvm_pgtable_walk_flags visit) { struct kvm_pgtable_mm_ops *mm_ops = ctx->mm_ops; if (!stage2_pte_is_counted(ctx->old)) return 0; mm_ops->put_page(ctx->ptep); if (kvm_pte_table(ctx->old, ctx->level)) mm_ops->put_page(kvm_pte_follow(ctx->old, mm_ops)); return 0; } void kvm_pgtable_stage2_destroy(struct kvm_pgtable *pgt) { size_t pgd_sz; struct kvm_pgtable_walker walker = { .cb = stage2_free_walker, .flags = KVM_PGTABLE_WALK_LEAF | KVM_PGTABLE_WALK_TABLE_POST, }; WARN_ON(kvm_pgtable_walk(pgt, 0, BIT(pgt->ia_bits), &walker)); pgd_sz = kvm_pgd_pages(pgt->ia_bits, pgt->start_level) * PAGE_SIZE; pgt->mm_ops->free_pages_exact(kvm_dereference_pteref(&walker, pgt->pgd), pgd_sz); pgt->pgd = NULL; } void kvm_pgtable_stage2_free_unlinked(struct kvm_pgtable_mm_ops *mm_ops, void *pgtable, s8 level) { kvm_pteref_t ptep = (kvm_pteref_t)pgtable; struct kvm_pgtable_walker walker = { .cb = stage2_free_walker, .flags = KVM_PGTABLE_WALK_LEAF | KVM_PGTABLE_WALK_TABLE_POST, }; struct kvm_pgtable_walk_data data = { .walker = &walker, /* * At this point the IPA really doesn't matter, as the page * table being traversed has already been removed from the stage * 2. Set an appropriate range to cover the entire page table. */ .addr = 0, .end = kvm_granule_size(level), }; WARN_ON(__kvm_pgtable_walk(&data, mm_ops, ptep, level + 1)); WARN_ON(mm_ops->page_count(pgtable) != 1); mm_ops->put_page(pgtable); }
26 26 26 44 44 44 4 4 4 4 4 1 4 4 2 2 299 301 241 299 301 302 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 // SPDX-License-Identifier: GPL-2.0-only /* * Copyright (C) 2012,2013 - ARM Ltd * Author: Marc Zyngier <marc.zyngier@arm.com> * * Derived from arch/arm/kvm/handle_exit.c: * Copyright (C) 2012 - Virtual Open Systems and Columbia University * Author: Christoffer Dall <c.dall@virtualopensystems.com> */ #include <linux/kvm.h> #include <linux/kvm_host.h> #include <linux/ubsan.h> #include <asm/esr.h> #include <asm/exception.h> #include <asm/kvm_asm.h> #include <asm/kvm_emulate.h> #include <asm/kvm_mmu.h> #include <asm/kvm_nested.h> #include <asm/debug-monitors.h> #include <asm/stacktrace/nvhe.h> #include <asm/traps.h> #include <kvm/arm_hypercalls.h> #define CREATE_TRACE_POINTS #include "trace_handle_exit.h" typedef int (*exit_handle_fn)(struct kvm_vcpu *); static void kvm_handle_guest_serror(struct kvm_vcpu *vcpu, u64 esr) { if (!arm64_is_ras_serror(esr) || arm64_is_fatal_ras_serror(NULL, esr)) kvm_inject_serror(vcpu); } static int handle_hvc(struct kvm_vcpu *vcpu) { trace_kvm_hvc_arm64(*vcpu_pc(vcpu), vcpu_get_reg(vcpu, 0), kvm_vcpu_hvc_get_imm(vcpu)); vcpu->stat.hvc_exit_stat++; /* Forward hvc instructions to the virtual EL2 if the guest has EL2. */ if (vcpu_has_nv(vcpu)) { if (vcpu_read_sys_reg(vcpu, HCR_EL2) & HCR_HCD) kvm_inject_undefined(vcpu); else kvm_inject_nested_sync(vcpu, kvm_vcpu_get_esr(vcpu)); return 1; } return kvm_smccc_call_handler(vcpu); } static int handle_smc(struct kvm_vcpu *vcpu) { /* * Forward this trapped smc instruction to the virtual EL2 if * the guest has asked for it. */ if (forward_smc_trap(vcpu)) return 1; /* * "If an SMC instruction executed at Non-secure EL1 is * trapped to EL2 because HCR_EL2.TSC is 1, the exception is a * Trap exception, not a Secure Monitor Call exception [...]" * * We need to advance the PC after the trap, as it would * otherwise return to the same address. Furthermore, pre-incrementing * the PC before potentially exiting to userspace maintains the same * abstraction for both SMCs and HVCs. */ kvm_incr_pc(vcpu); /* * SMCs with a nonzero immediate are reserved according to DEN0028E 2.9 * "SMC and HVC immediate value". */ if (kvm_vcpu_hvc_get_imm(vcpu)) { vcpu_set_reg(vcpu, 0, ~0UL); return 1; } /* * If imm is zero then it is likely an SMCCC call. * * Note that on ARMv8.3, even if EL3 is not implemented, SMC executed * at Non-secure EL1 is trapped to EL2 if HCR_EL2.TSC==1, rather than * being treated as UNDEFINED. */ return kvm_smccc_call_handler(vcpu); } /* * This handles the cases where the system does not support FP/ASIMD or when * we are running nested virtualization and the guest hypervisor is trapping * FP/ASIMD accesses by its guest guest. * * All other handling of guest vs. host FP/ASIMD register state is handled in * fixup_guest_exit(). */ static int kvm_handle_fpasimd(struct kvm_vcpu *vcpu) { if (guest_hyp_fpsimd_traps_enabled(vcpu)) return kvm_inject_nested_sync(vcpu, kvm_vcpu_get_esr(vcpu)); /* This is the case when the system doesn't support FP/ASIMD. */ kvm_inject_undefined(vcpu); return 1; } /** * kvm_handle_wfx - handle a wait-for-interrupts or wait-for-event * instruction executed by a guest * * @vcpu: the vcpu pointer * * WFE[T]: Yield the CPU and come back to this vcpu when the scheduler * decides to. * WFI: Simply call kvm_vcpu_halt(), which will halt execution of * world-switches and schedule other host processes until there is an * incoming IRQ or FIQ to the VM. * WFIT: Same as WFI, with a timed wakeup implemented as a background timer * * WF{I,E}T can immediately return if the deadline has already expired. */ static int kvm_handle_wfx(struct kvm_vcpu *vcpu) { u64 esr = kvm_vcpu_get_esr(vcpu); bool is_wfe = !!(esr & ESR_ELx_WFx_ISS_WFE); if (guest_hyp_wfx_traps_enabled(vcpu)) return kvm_inject_nested_sync(vcpu, kvm_vcpu_get_esr(vcpu)); if (is_wfe) { trace_kvm_wfx_arm64(*vcpu_pc(vcpu), true); vcpu->stat.wfe_exit_stat++; } else { trace_kvm_wfx_arm64(*vcpu_pc(vcpu), false); vcpu->stat.wfi_exit_stat++; } if (esr & ESR_ELx_WFx_ISS_WFxT) { if (esr & ESR_ELx_WFx_ISS_RV) { u64 val, now; now = kvm_arm_timer_get_reg(vcpu, KVM_REG_ARM_TIMER_CNT); val = vcpu_get_reg(vcpu, kvm_vcpu_sys_get_rt(vcpu)); if (now >= val) goto out; } else { /* Treat WFxT as WFx if RN is invalid */ esr &= ~ESR_ELx_WFx_ISS_WFxT; } } if (esr & ESR_ELx_WFx_ISS_WFE) { kvm_vcpu_on_spin(vcpu, vcpu_mode_priv(vcpu)); } else { if (esr & ESR_ELx_WFx_ISS_WFxT) vcpu_set_flag(vcpu, IN_WFIT); kvm_vcpu_wfi(vcpu); } out: kvm_incr_pc(vcpu); return 1; } /** * kvm_handle_guest_debug - handle a debug exception instruction * * @vcpu: the vcpu pointer * * We route all debug exceptions through the same handler. If both the * guest and host are using the same debug facilities it will be up to * userspace to re-inject the correct exception for guest delivery. * * @return: 0 (while setting vcpu->run->exit_reason) */ static int kvm_handle_guest_debug(struct kvm_vcpu *vcpu) { struct kvm_run *run = vcpu->run; u64 esr = kvm_vcpu_get_esr(vcpu); if (!vcpu->guest_debug && forward_debug_exception(vcpu)) return 1; run->exit_reason = KVM_EXIT_DEBUG; run->debug.arch.hsr = lower_32_bits(esr); run->debug.arch.hsr_high = upper_32_bits(esr); run->flags = KVM_DEBUG_ARCH_HSR_HIGH_VALID; switch (ESR_ELx_EC(esr)) { case ESR_ELx_EC_WATCHPT_LOW: run->debug.arch.far = vcpu->arch.fault.far_el2; break; case ESR_ELx_EC_SOFTSTP_LOW: *vcpu_cpsr(vcpu) |= DBG_SPSR_SS; break; } return 0; } static int kvm_handle_unknown_ec(struct kvm_vcpu *vcpu) { u64 esr = kvm_vcpu_get_esr(vcpu); kvm_pr_unimpl("Unknown exception class: esr: %#016llx -- %s\n", esr, esr_get_class_string(esr)); kvm_inject_undefined(vcpu); return 1; } /* * Guest access to SVE registers should be routed to this handler only * when the system doesn't support SVE. */ static int handle_sve(struct kvm_vcpu *vcpu) { if (guest_hyp_sve_traps_enabled(vcpu)) return kvm_inject_nested_sync(vcpu, kvm_vcpu_get_esr(vcpu)); kvm_inject_undefined(vcpu); return 1; } /* * Two possibilities to handle a trapping ptrauth instruction: * * - Guest usage of a ptrauth instruction (which the guest EL1 did not * turn into a NOP). If we get here, it is because we didn't enable * ptrauth for the guest. This results in an UNDEF, as it isn't * supposed to use ptrauth without being told it could. * * - Running an L2 NV guest while L1 has left HCR_EL2.API==0, and for * which we reinject the exception into L1. * * Anything else is an emulation bug (hence the WARN_ON + UNDEF). */ static int kvm_handle_ptrauth(struct kvm_vcpu *vcpu) { if (!vcpu_has_ptrauth(vcpu)) { kvm_inject_undefined(vcpu); return 1; } if (is_nested_ctxt(vcpu)) { kvm_inject_nested_sync(vcpu, kvm_vcpu_get_esr(vcpu)); return 1; } /* Really shouldn't be here! */ WARN_ON_ONCE(1); kvm_inject_undefined(vcpu); return 1; } static int kvm_handle_eret(struct kvm_vcpu *vcpu) { if (esr_iss_is_eretax(kvm_vcpu_get_esr(vcpu)) && !vcpu_has_ptrauth(vcpu)) return kvm_handle_ptrauth(vcpu); /* * If we got here, two possibilities: * * - the guest is in EL2, and we need to fully emulate ERET * * - the guest is in EL1, and we need to reinject the * exception into the L1 hypervisor. * * If KVM ever traps ERET for its own use, we'll have to * revisit this. */ if (is_hyp_ctxt(vcpu)) kvm_emulate_nested_eret(vcpu); else kvm_inject_nested_sync(vcpu, kvm_vcpu_get_esr(vcpu)); return 1; } static int handle_svc(struct kvm_vcpu *vcpu) { /* * So far, SVC traps only for NV via HFGITR_EL2. A SVC from a * 32bit guest would be caught by vpcu_mode_is_bad_32bit(), so * we should only have to deal with a 64 bit exception. */ kvm_inject_nested_sync(vcpu, kvm_vcpu_get_esr(vcpu)); return 1; } static int kvm_handle_gcs(struct kvm_vcpu *vcpu) { /* We don't expect GCS, so treat it with contempt */ if (kvm_has_feat(vcpu->kvm, ID_AA64PFR1_EL1, GCS, IMP)) WARN_ON_ONCE(1); kvm_inject_undefined(vcpu); return 1; } static int handle_other(struct kvm_vcpu *vcpu) { bool allowed, fwd = is_nested_ctxt(vcpu); u64 hcrx = __vcpu_sys_reg(vcpu, HCRX_EL2); u64 esr = kvm_vcpu_get_esr(vcpu); u64 iss = ESR_ELx_ISS(esr); struct kvm *kvm = vcpu->kvm; /* * We only trap for two reasons: * * - the feature is disabled, and the only outcome is to * generate an UNDEF. * * - the feature is enabled, but a NV guest wants to trap the * feature used by its L2 guest. We forward the exception in * this case. * * What we don't expect is to end-up here if the guest is * expected be be able to directly use the feature, hence the * WARN_ON below. */ switch (iss) { case ESR_ELx_ISS_OTHER_ST64BV: allowed = kvm_has_feat(kvm, ID_AA64ISAR1_EL1, LS64, LS64_V); fwd &= !(hcrx & HCRX_EL2_EnASR); break; case ESR_ELx_ISS_OTHER_ST64BV0: allowed = kvm_has_feat(kvm, ID_AA64ISAR1_EL1, LS64, LS64_ACCDATA); fwd &= !(hcrx & HCRX_EL2_EnAS0); break; case ESR_ELx_ISS_OTHER_LDST64B: allowed = kvm_has_feat(kvm, ID_AA64ISAR1_EL1, LS64, LS64); fwd &= !(hcrx & HCRX_EL2_EnALS); break; case ESR_ELx_ISS_OTHER_TSBCSYNC: allowed = kvm_has_feat(kvm, ID_AA64DFR0_EL1, TraceBuffer, TRBE_V1P1); fwd &= (__vcpu_sys_reg(vcpu, HFGITR2_EL2) & HFGITR2_EL2_TSBCSYNC); break; case ESR_ELx_ISS_OTHER_PSBCSYNC: allowed = kvm_has_feat(kvm, ID_AA64DFR0_EL1, PMSVer, V1P5); fwd &= (__vcpu_sys_reg(vcpu, HFGITR_EL2) & HFGITR_EL2_PSBCSYNC); break; default: /* Clearly, we're missing something. */ WARN_ON_ONCE(1); allowed = false; } WARN_ON_ONCE(allowed && !fwd); if (allowed && fwd) kvm_inject_nested_sync(vcpu, esr); else kvm_inject_undefined(vcpu); return 1; } static exit_handle_fn arm_exit_handlers[] = { [0 ... ESR_ELx_EC_MAX] = kvm_handle_unknown_ec, [ESR_ELx_EC_WFx] = kvm_handle_wfx, [ESR_ELx_EC_CP15_32] = kvm_handle_cp15_32, [ESR_ELx_EC_CP15_64] = kvm_handle_cp15_64, [ESR_ELx_EC_CP14_MR] = kvm_handle_cp14_32, [ESR_ELx_EC_CP14_LS] = kvm_handle_cp14_load_store, [ESR_ELx_EC_CP10_ID] = kvm_handle_cp10_id, [ESR_ELx_EC_CP14_64] = kvm_handle_cp14_64, [ESR_ELx_EC_OTHER] = handle_other, [ESR_ELx_EC_HVC32] = handle_hvc, [ESR_ELx_EC_SMC32] = handle_smc, [ESR_ELx_EC_HVC64] = handle_hvc, [ESR_ELx_EC_SMC64] = handle_smc, [ESR_ELx_EC_SVC64] = handle_svc, [ESR_ELx_EC_SYS64] = kvm_handle_sys_reg, [ESR_ELx_EC_SVE] = handle_sve, [ESR_ELx_EC_ERET] = kvm_handle_eret, [ESR_ELx_EC_IABT_LOW] = kvm_handle_guest_abort, [ESR_ELx_EC_DABT_LOW] = kvm_handle_guest_abort, [ESR_ELx_EC_DABT_CUR] = kvm_handle_vncr_abort, [ESR_ELx_EC_SOFTSTP_LOW]= kvm_handle_guest_debug, [ESR_ELx_EC_WATCHPT_LOW]= kvm_handle_guest_debug, [ESR_ELx_EC_BREAKPT_LOW]= kvm_handle_guest_debug, [ESR_ELx_EC_BKPT32] = kvm_handle_guest_debug, [ESR_ELx_EC_BRK64] = kvm_handle_guest_debug, [ESR_ELx_EC_FP_ASIMD] = kvm_handle_fpasimd, [ESR_ELx_EC_PAC] = kvm_handle_ptrauth, [ESR_ELx_EC_GCS] = kvm_handle_gcs, }; static exit_handle_fn kvm_get_exit_handler(struct kvm_vcpu *vcpu) { u64 esr = kvm_vcpu_get_esr(vcpu); u8 esr_ec = ESR_ELx_EC(esr); return arm_exit_handlers[esr_ec]; } /* * We may be single-stepping an emulated instruction. If the emulation * has been completed in the kernel, we can return to userspace with a * KVM_EXIT_DEBUG, otherwise userspace needs to complete its * emulation first. */ static int handle_trap_exceptions(struct kvm_vcpu *vcpu) { int handled; /* * See ARM ARM B1.14.1: "Hyp traps on instructions * that fail their condition code check" */ if (!kvm_condition_valid(vcpu)) { kvm_incr_pc(vcpu); handled = 1; } else { exit_handle_fn exit_handler; exit_handler = kvm_get_exit_handler(vcpu); handled = exit_handler(vcpu); } return handled; } /* * Return > 0 to return to guest, < 0 on error, 0 (and set exit_reason) on * proper exit to userspace. */ int handle_exit(struct kvm_vcpu *vcpu, int exception_index) { struct kvm_run *run = vcpu->run; if (ARM_SERROR_PENDING(exception_index)) { /* * The SError is handled by handle_exit_early(). If the guest * survives it will re-execute the original instruction. */ return 1; } exception_index = ARM_EXCEPTION_CODE(exception_index); switch (exception_index) { case ARM_EXCEPTION_IRQ: return 1; case ARM_EXCEPTION_EL1_SERROR: return 1; case ARM_EXCEPTION_TRAP: return handle_trap_exceptions(vcpu); case ARM_EXCEPTION_HYP_GONE: /* * EL2 has been reset to the hyp-stub. This happens when a guest * is pre-emptied by kvm_reboot()'s shutdown call. */ run->exit_reason = KVM_EXIT_FAIL_ENTRY; return 0; case ARM_EXCEPTION_IL: /* * We attempted an illegal exception return. Guest state must * have been corrupted somehow. Give up. */ run->exit_reason = KVM_EXIT_FAIL_ENTRY; return -EINVAL; default: kvm_pr_unimpl("Unsupported exception type: %d", exception_index); run->exit_reason = KVM_EXIT_INTERNAL_ERROR; return 0; } } /* For exit types that need handling before we can be preempted */ void handle_exit_early(struct kvm_vcpu *vcpu, int exception_index) { if (ARM_SERROR_PENDING(exception_index)) { if (this_cpu_has_cap(ARM64_HAS_RAS_EXTN)) { u64 disr = kvm_vcpu_get_disr(vcpu); kvm_handle_guest_serror(vcpu, disr_to_esr(disr)); } else { kvm_inject_serror(vcpu); } return; } exception_index = ARM_EXCEPTION_CODE(exception_index); if (exception_index == ARM_EXCEPTION_EL1_SERROR) kvm_handle_guest_serror(vcpu, kvm_vcpu_get_esr(vcpu)); } static void print_nvhe_hyp_panic(const char *name, u64 panic_addr) { kvm_err("nVHE hyp %s at: [<%016llx>] %pB!\n", name, panic_addr, (void *)(panic_addr + kaslr_offset())); } static void kvm_nvhe_report_cfi_failure(u64 panic_addr) { print_nvhe_hyp_panic("CFI failure", panic_addr); if (IS_ENABLED(CONFIG_CFI_PERMISSIVE)) kvm_err(" (CONFIG_CFI_PERMISSIVE ignored for hyp failures)\n"); } void __noreturn __cold nvhe_hyp_panic_handler(u64 esr, u64 spsr, u64 elr_virt, u64 elr_phys, u64 par, uintptr_t vcpu, u64 far, u64 hpfar) { u64 elr_in_kimg = __phys_to_kimg(elr_phys); u64 hyp_offset = elr_in_kimg - kaslr_offset() - elr_virt; u64 mode = spsr & PSR_MODE_MASK; u64 panic_addr = elr_virt + hyp_offset; if (mode != PSR_MODE_EL2t && mode != PSR_MODE_EL2h) { kvm_err("Invalid host exception to nVHE hyp!\n"); } else if (ESR_ELx_EC(esr) == ESR_ELx_EC_BRK64 && esr_brk_comment(esr) == BUG_BRK_IMM) { const char *file = NULL; unsigned int line = 0; /* All hyp bugs, including warnings, are treated as fatal. */ if (!is_protected_kvm_enabled() || IS_ENABLED(CONFIG_NVHE_EL2_DEBUG)) { struct bug_entry *bug = find_bug(elr_in_kimg); if (bug) bug_get_file_line(bug, &file, &line); } if (file) kvm_err("nVHE hyp BUG at: %s:%u!\n", file, line); else print_nvhe_hyp_panic("BUG", panic_addr); } else if (IS_ENABLED(CONFIG_CFI_CLANG) && esr_is_cfi_brk(esr)) { kvm_nvhe_report_cfi_failure(panic_addr); } else if (IS_ENABLED(CONFIG_UBSAN_KVM_EL2) && ESR_ELx_EC(esr) == ESR_ELx_EC_BRK64 && esr_is_ubsan_brk(esr)) { print_nvhe_hyp_panic(report_ubsan_failure(esr & UBSAN_BRK_MASK), panic_addr); } else { print_nvhe_hyp_panic("panic", panic_addr); } /* Dump the nVHE hypervisor backtrace */ kvm_nvhe_dump_backtrace(hyp_offset); /* * Hyp has panicked and we're going to handle that by panicking the * kernel. The kernel offset will be revealed in the panic so we're * also safe to reveal the hyp offset as a debugging aid for translating * hyp VAs to vmlinux addresses. */ kvm_err("Hyp Offset: 0x%llx\n", hyp_offset); panic("HYP panic:\nPS:%08llx PC:%016llx ESR:%016llx\nFAR:%016llx HPFAR:%016llx PAR:%016llx\nVCPU:%016lx\n", spsr, elr_virt, esr, far, hpfar, par, vcpu); }
267 267 255 255 307 305 306 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 // SPDX-License-Identifier: GPL-2.0-or-later /* * Functions for initialisaing, allocating, freeing and duplicating VMAs. Shared * between CONFIG_MMU and non-CONFIG_MMU kernel configurations. */ #include "vma_internal.h" #include "vma.h" /* SLAB cache for vm_area_struct structures */ static struct kmem_cache *vm_area_cachep; void __init vma_state_init(void) { struct kmem_cache_args args = { .use_freeptr_offset = true, .freeptr_offset = offsetof(struct vm_area_struct, vm_freeptr), }; vm_area_cachep = kmem_cache_create("vm_area_struct", sizeof(struct vm_area_struct), &args, SLAB_HWCACHE_ALIGN|SLAB_PANIC|SLAB_TYPESAFE_BY_RCU| SLAB_ACCOUNT); } struct vm_area_struct *vm_area_alloc(struct mm_struct *mm) { struct vm_area_struct *vma; vma = kmem_cache_alloc(vm_area_cachep, GFP_KERNEL); if (!vma) return NULL; vma_init(vma, mm); return vma; } static void vm_area_init_from(const struct vm_area_struct *src, struct vm_area_struct *dest) { dest->vm_mm = src->vm_mm; dest->vm_ops = src->vm_ops; dest->vm_start = src->vm_start; dest->vm_end = src->vm_end; dest->anon_vma = src->anon_vma; dest->vm_pgoff = src->vm_pgoff; dest->vm_file = src->vm_file; dest->vm_private_data = src->vm_private_data; vm_flags_init(dest, src->vm_flags); memcpy(&dest->vm_page_prot, &src->vm_page_prot, sizeof(dest->vm_page_prot)); /* * src->shared.rb may be modified concurrently when called from * dup_mmap(), but the clone will reinitialize it. */ data_race(memcpy(&dest->shared, &src->shared, sizeof(dest->shared))); memcpy(&dest->vm_userfaultfd_ctx, &src->vm_userfaultfd_ctx, sizeof(dest->vm_userfaultfd_ctx)); #ifdef CONFIG_ANON_VMA_NAME dest->anon_name = src->anon_name; #endif #ifdef CONFIG_SWAP memcpy(&dest->swap_readahead_info, &src->swap_readahead_info, sizeof(dest->swap_readahead_info)); #endif #ifndef CONFIG_MMU dest->vm_region = src->vm_region; #endif #ifdef CONFIG_NUMA dest->vm_policy = src->vm_policy; #endif #ifdef __HAVE_PFNMAP_TRACKING dest->pfnmap_track_ctx = NULL; #endif } #ifdef __HAVE_PFNMAP_TRACKING static inline int vma_pfnmap_track_ctx_dup(struct vm_area_struct *orig, struct vm_area_struct *new) { struct pfnmap_track_ctx *ctx = orig->pfnmap_track_ctx; if (likely(!ctx)) return 0; /* * We don't expect to ever hit this. If ever required, we would have * to duplicate the tracking. */ if (unlikely(kref_read(&ctx->kref) >= REFCOUNT_MAX)) return -ENOMEM; kref_get(&ctx->kref); new->pfnmap_track_ctx = ctx; return 0; } static inline void vma_pfnmap_track_ctx_release(struct vm_area_struct *vma) { struct pfnmap_track_ctx *ctx = vma->pfnmap_track_ctx; if (likely(!ctx)) return; kref_put(&ctx->kref, pfnmap_track_ctx_release); vma->pfnmap_track_ctx = NULL; } #else static inline int vma_pfnmap_track_ctx_dup(struct vm_area_struct *orig, struct vm_area_struct *new) { return 0; } static inline void vma_pfnmap_track_ctx_release(struct vm_area_struct *vma) { } #endif struct vm_area_struct *vm_area_dup(struct vm_area_struct *orig) { struct vm_area_struct *new = kmem_cache_alloc(vm_area_cachep, GFP_KERNEL); if (!new) return NULL; ASSERT_EXCLUSIVE_WRITER(orig->vm_flags); ASSERT_EXCLUSIVE_WRITER(orig->vm_file); vm_area_init_from(orig, new); if (vma_pfnmap_track_ctx_dup(orig, new)) { kmem_cache_free(vm_area_cachep, new); return NULL; } vma_lock_init(new, true); INIT_LIST_HEAD(&new->anon_vma_chain); vma_numab_state_init(new); dup_anon_vma_name(orig, new); return new; } void vm_area_free(struct vm_area_struct *vma) { /* The vma should be detached while being destroyed. */ vma_assert_detached(vma); vma_numab_state_free(vma); free_anon_vma_name(vma); vma_pfnmap_track_ctx_release(vma); kmem_cache_free(vm_area_cachep, vma); }
3 3 3 3 3 3 3 3 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 // SPDX-License-Identifier: GPL-2.0 /* * Copyright (C) 1991, 1992 Linus Torvalds * Copyright (C) 1994, Karl Keyte: Added support for disk statistics * Elevator latency, (C) 2000 Andrea Arcangeli <andrea@suse.de> SuSE * Queue request tables / lock, selectable elevator, Jens Axboe <axboe@suse.de> * kernel-doc documentation started by NeilBrown <neilb@cse.unsw.edu.au> * - July2000 * bio rewrite, highmem i/o, etc, Jens Axboe <axboe@suse.de> - may 2001 */ /* * This handles all read/write requests to block devices */ #include <linux/kernel.h> #include <linux/module.h> #include <linux/bio.h> #include <linux/blkdev.h> #include <linux/blk-pm.h> #include <linux/blk-integrity.h> #include <linux/highmem.h> #include <linux/mm.h> #include <linux/pagemap.h> #include <linux/kernel_stat.h> #include <linux/string.h> #include <linux/init.h> #include <linux/completion.h> #include <linux/slab.h> #include <linux/swap.h> #include <linux/writeback.h> #include <linux/task_io_accounting_ops.h> #include <linux/fault-inject.h> #include <linux/list_sort.h> #include <linux/delay.h> #include <linux/ratelimit.h> #include <linux/pm_runtime.h> #include <linux/t10-pi.h> #include <linux/debugfs.h> #include <linux/bpf.h> #include <linux/part_stat.h> #include <linux/sched/sysctl.h> #include <linux/blk-crypto.h> #define CREATE_TRACE_POINTS #include <trace/events/block.h> #include "blk.h" #include "blk-mq-sched.h" #include "blk-pm.h" #include "blk-cgroup.h" #include "blk-throttle.h" #include "blk-ioprio.h" struct dentry *blk_debugfs_root; EXPORT_TRACEPOINT_SYMBOL_GPL(block_bio_remap); EXPORT_TRACEPOINT_SYMBOL_GPL(block_rq_remap); EXPORT_TRACEPOINT_SYMBOL_GPL(block_bio_complete); EXPORT_TRACEPOINT_SYMBOL_GPL(block_split); EXPORT_TRACEPOINT_SYMBOL_GPL(block_unplug); EXPORT_TRACEPOINT_SYMBOL_GPL(block_rq_insert); static DEFINE_IDA(blk_queue_ida); /* * For queue allocation */ static struct kmem_cache *blk_requestq_cachep; /* * Controlling structure to kblockd */ static struct workqueue_struct *kblockd_workqueue; /** * blk_queue_flag_set - atomically set a queue flag * @flag: flag to be set * @q: request queue */ void blk_queue_flag_set(unsigned int flag, struct request_queue *q) { set_bit(flag, &q->queue_flags); } EXPORT_SYMBOL(blk_queue_flag_set); /** * blk_queue_flag_clear - atomically clear a queue flag * @flag: flag to be cleared * @q: request queue */ void blk_queue_flag_clear(unsigned int flag, struct request_queue *q) { clear_bit(flag, &q->queue_flags); } EXPORT_SYMBOL(blk_queue_flag_clear); #define REQ_OP_NAME(name) [REQ_OP_##name] = #name static const char *const blk_op_name[] = { REQ_OP_NAME(READ), REQ_OP_NAME(WRITE), REQ_OP_NAME(FLUSH), REQ_OP_NAME(DISCARD), REQ_OP_NAME(SECURE_ERASE), REQ_OP_NAME(ZONE_RESET), REQ_OP_NAME(ZONE_RESET_ALL), REQ_OP_NAME(ZONE_OPEN), REQ_OP_NAME(ZONE_CLOSE), REQ_OP_NAME(ZONE_FINISH), REQ_OP_NAME(ZONE_APPEND), REQ_OP_NAME(WRITE_ZEROES), REQ_OP_NAME(DRV_IN), REQ_OP_NAME(DRV_OUT), }; #undef REQ_OP_NAME /** * blk_op_str - Return string XXX in the REQ_OP_XXX. * @op: REQ_OP_XXX. * * Description: Centralize block layer function to convert REQ_OP_XXX into * string format. Useful in the debugging and tracing bio or request. For * invalid REQ_OP_XXX it returns string "UNKNOWN". */ inline const char *blk_op_str(enum req_op op) { const char *op_str = "UNKNOWN"; if (op < ARRAY_SIZE(blk_op_name) && blk_op_name[op]) op_str = blk_op_name[op]; return op_str; } EXPORT_SYMBOL_GPL(blk_op_str); static const struct { int errno; const char *name; } blk_errors[] = { [BLK_STS_OK] = { 0, "" }, [BLK_STS_NOTSUPP] = { -EOPNOTSUPP, "operation not supported" }, [BLK_STS_TIMEOUT] = { -ETIMEDOUT, "timeout" }, [BLK_STS_NOSPC] = { -ENOSPC, "critical space allocation" }, [BLK_STS_TRANSPORT] = { -ENOLINK, "recoverable transport" }, [BLK_STS_TARGET] = { -EREMOTEIO, "critical target" }, [BLK_STS_RESV_CONFLICT] = { -EBADE, "reservation conflict" }, [BLK_STS_MEDIUM] = { -ENODATA, "critical medium" }, [BLK_STS_PROTECTION] = { -EILSEQ, "protection" }, [BLK_STS_RESOURCE] = { -ENOMEM, "kernel resource" }, [BLK_STS_DEV_RESOURCE] = { -EBUSY, "device resource" }, [BLK_STS_AGAIN] = { -EAGAIN, "nonblocking retry" }, [BLK_STS_OFFLINE] = { -ENODEV, "device offline" }, /* device mapper special case, should not leak out: */ [BLK_STS_DM_REQUEUE] = { -EREMCHG, "dm internal retry" }, /* zone device specific errors */ [BLK_STS_ZONE_OPEN_RESOURCE] = { -ETOOMANYREFS, "open zones exceeded" }, [BLK_STS_ZONE_ACTIVE_RESOURCE] = { -EOVERFLOW, "active zones exceeded" }, /* Command duration limit device-side timeout */ [BLK_STS_DURATION_LIMIT] = { -ETIME, "duration limit exceeded" }, [BLK_STS_INVAL] = { -EINVAL, "invalid" }, /* everything else not covered above: */ [BLK_STS_IOERR] = { -EIO, "I/O" }, }; blk_status_t errno_to_blk_status(int errno) { int i; for (i = 0; i < ARRAY_SIZE(blk_errors); i++) { if (blk_errors[i].errno == errno) return (__force blk_status_t)i; } return BLK_STS_IOERR; } EXPORT_SYMBOL_GPL(errno_to_blk_status); int blk_status_to_errno(blk_status_t status) { int idx = (__force int)status; if (WARN_ON_ONCE(idx >= ARRAY_SIZE(blk_errors))) return -EIO; return blk_errors[idx].errno; } EXPORT_SYMBOL_GPL(blk_status_to_errno); const char *blk_status_to_str(blk_status_t status) { int idx = (__force int)status; if (WARN_ON_ONCE(idx >= ARRAY_SIZE(blk_errors))) return "<null>"; return blk_errors[idx].name; } EXPORT_SYMBOL_GPL(blk_status_to_str); /** * blk_sync_queue - cancel any pending callbacks on a queue * @q: the queue * * Description: * The block layer may perform asynchronous callback activity * on a queue, such as calling the unplug function after a timeout. * A block device may call blk_sync_queue to ensure that any * such activity is cancelled, thus allowing it to release resources * that the callbacks might use. The caller must already have made sure * that its ->submit_bio will not re-add plugging prior to calling * this function. * * This function does not cancel any asynchronous activity arising * out of elevator or throttling code. That would require elevator_exit() * and blkcg_exit_queue() to be called with queue lock initialized. * */ void blk_sync_queue(struct request_queue *q) { timer_delete_sync(&q->timeout); cancel_work_sync(&q->timeout_work); } EXPORT_SYMBOL(blk_sync_queue); /** * blk_set_pm_only - increment pm_only counter * @q: request queue pointer */ void blk_set_pm_only(struct request_queue *q) { atomic_inc(&q->pm_only); } EXPORT_SYMBOL_GPL(blk_set_pm_only); void blk_clear_pm_only(struct request_queue *q) { int pm_only; pm_only = atomic_dec_return(&q->pm_only); WARN_ON_ONCE(pm_only < 0); if (pm_only == 0) wake_up_all(&q->mq_freeze_wq); } EXPORT_SYMBOL_GPL(blk_clear_pm_only); static void blk_free_queue_rcu(struct rcu_head *rcu_head) { struct request_queue *q = container_of(rcu_head, struct request_queue, rcu_head); percpu_ref_exit(&q->q_usage_counter); kmem_cache_free(blk_requestq_cachep, q); } static void blk_free_queue(struct request_queue *q) { blk_free_queue_stats(q->stats); if (queue_is_mq(q)) blk_mq_release(q); ida_free(&blk_queue_ida, q->id); lockdep_unregister_key(&q->io_lock_cls_key); lockdep_unregister_key(&q->q_lock_cls_key); call_rcu(&q->rcu_head, blk_free_queue_rcu); } /** * blk_put_queue - decrement the request_queue refcount * @q: the request_queue structure to decrement the refcount for * * Decrements the refcount of the request_queue and free it when the refcount * reaches 0. */ void blk_put_queue(struct request_queue *q) { if (refcount_dec_and_test(&q->refs)) blk_free_queue(q); } EXPORT_SYMBOL(blk_put_queue); bool blk_queue_start_drain(struct request_queue *q) { /* * When queue DYING flag is set, we need to block new req * entering queue, so we call blk_freeze_queue_start() to * prevent I/O from crossing blk_queue_enter(). */ bool freeze = __blk_freeze_queue_start(q, current); if (queue_is_mq(q)) blk_mq_wake_waiters(q); /* Make blk_queue_enter() reexamine the DYING flag. */ wake_up_all(&q->mq_freeze_wq); return freeze; } /** * blk_queue_enter() - try to increase q->q_usage_counter * @q: request queue pointer * @flags: BLK_MQ_REQ_NOWAIT and/or BLK_MQ_REQ_PM */ int blk_queue_enter(struct request_queue *q, blk_mq_req_flags_t flags) { const bool pm = flags & BLK_MQ_REQ_PM; while (!blk_try_enter_queue(q, pm)) { if (flags & BLK_MQ_REQ_NOWAIT) return -EAGAIN; /* * read pair of barrier in blk_freeze_queue_start(), we need to * order reading __PERCPU_REF_DEAD flag of .q_usage_counter and * reading .mq_freeze_depth or queue dying flag, otherwise the * following wait may never return if the two reads are * reordered. */ smp_rmb(); wait_event(q->mq_freeze_wq, (!q->mq_freeze_depth && blk_pm_resume_queue(pm, q)) || blk_queue_dying(q)); if (blk_queue_dying(q)) return -ENODEV; } rwsem_acquire_read(&q->q_lockdep_map, 0, 0, _RET_IP_); rwsem_release(&q->q_lockdep_map, _RET_IP_); return 0; } int __bio_queue_enter(struct request_queue *q, struct bio *bio) { while (!blk_try_enter_queue(q, false)) { struct gendisk *disk = bio->bi_bdev->bd_disk; if (bio->bi_opf & REQ_NOWAIT) { if (test_bit(GD_DEAD, &disk->state)) goto dead; bio_wouldblock_error(bio); return -EAGAIN; } /* * read pair of barrier in blk_freeze_queue_start(), we need to * order reading __PERCPU_REF_DEAD flag of .q_usage_counter and * reading .mq_freeze_depth or queue dying flag, otherwise the * following wait may never return if the two reads are * reordered. */ smp_rmb(); wait_event(q->mq_freeze_wq, (!q->mq_freeze_depth && blk_pm_resume_queue(false, q)) || test_bit(GD_DEAD, &disk->state)); if (test_bit(GD_DEAD, &disk->state)) goto dead; } rwsem_acquire_read(&q->io_lockdep_map, 0, 0, _RET_IP_); rwsem_release(&q->io_lockdep_map, _RET_IP_); return 0; dead: bio_io_error(bio); return -ENODEV; } void blk_queue_exit(struct request_queue *q) { percpu_ref_put(&q->q_usage_counter); } static void blk_queue_usage_counter_release(struct percpu_ref *ref) { struct request_queue *q = container_of(ref, struct request_queue, q_usage_counter); wake_up_all(&q->mq_freeze_wq); } static void blk_rq_timed_out_timer(struct timer_list *t) { struct request_queue *q = timer_container_of(q, t, timeout); kblockd_schedule_work(&q->timeout_work); } static void blk_timeout_work(struct work_struct *work) { } struct request_queue *blk_alloc_queue(struct queue_limits *lim, int node_id) { struct request_queue *q; int error; q = kmem_cache_alloc_node(blk_requestq_cachep, GFP_KERNEL | __GFP_ZERO, node_id); if (!q) return ERR_PTR(-ENOMEM); q->last_merge = NULL; q->id = ida_alloc(&blk_queue_ida, GFP_KERNEL); if (q->id < 0) { error = q->id; goto fail_q; } q->stats = blk_alloc_queue_stats(); if (!q->stats) { error = -ENOMEM; goto fail_id; } error = blk_set_default_limits(lim); if (error) goto fail_stats; q->limits = *lim; q->node = node_id; atomic_set(&q->nr_active_requests_shared_tags, 0); timer_setup(&q->timeout, blk_rq_timed_out_timer, 0); INIT_WORK(&q->timeout_work, blk_timeout_work); INIT_LIST_HEAD(&q->icq_list); refcount_set(&q->refs, 1); mutex_init(&q->debugfs_mutex); mutex_init(&q->elevator_lock); mutex_init(&q->sysfs_lock); mutex_init(&q->limits_lock); mutex_init(&q->rq_qos_mutex); spin_lock_init(&q->queue_lock); init_waitqueue_head(&q->mq_freeze_wq); mutex_init(&q->mq_freeze_lock); blkg_init_queue(q); /* * Init percpu_ref in atomic mode so that it's faster to shutdown. * See blk_register_queue() for details. */ error = percpu_ref_init(&q->q_usage_counter, blk_queue_usage_counter_release, PERCPU_REF_INIT_ATOMIC, GFP_KERNEL); if (error) goto fail_stats; lockdep_register_key(&q->io_lock_cls_key); lockdep_register_key(&q->q_lock_cls_key); lockdep_init_map(&q->io_lockdep_map, "&q->q_usage_counter(io)", &q->io_lock_cls_key, 0); lockdep_init_map(&q->q_lockdep_map, "&q->q_usage_counter(queue)", &q->q_lock_cls_key, 0); /* Teach lockdep about lock ordering (reclaim WRT queue freeze lock). */ fs_reclaim_acquire(GFP_KERNEL); rwsem_acquire_read(&q->io_lockdep_map, 0, 0, _RET_IP_); rwsem_release(&q->io_lockdep_map, _RET_IP_); fs_reclaim_release(GFP_KERNEL); q->nr_requests = BLKDEV_DEFAULT_RQ; return q; fail_stats: blk_free_queue_stats(q->stats); fail_id: ida_free(&blk_queue_ida, q->id); fail_q: kmem_cache_free(blk_requestq_cachep, q); return ERR_PTR(error); } /** * blk_get_queue - increment the request_queue refcount * @q: the request_queue structure to increment the refcount for * * Increment the refcount of the request_queue kobject. * * Context: Any context. */ bool blk_get_queue(struct request_queue *q) { if (unlikely(blk_queue_dying(q))) return false; refcount_inc(&q->refs); return true; } EXPORT_SYMBOL(blk_get_queue); #ifdef CONFIG_FAIL_MAKE_REQUEST static DECLARE_FAULT_ATTR(fail_make_request); static int __init setup_fail_make_request(char *str) { return setup_fault_attr(&fail_make_request, str); } __setup("fail_make_request=", setup_fail_make_request); bool should_fail_request(struct block_device *part, unsigned int bytes) { return bdev_test_flag(part, BD_MAKE_IT_FAIL) && should_fail(&fail_make_request, bytes); } static int __init fail_make_request_debugfs(void) { struct dentry *dir = fault_create_debugfs_attr("fail_make_request", NULL, &fail_make_request); return PTR_ERR_OR_ZERO(dir); } late_initcall(fail_make_request_debugfs); #endif /* CONFIG_FAIL_MAKE_REQUEST */ static inline void bio_check_ro(struct bio *bio) { if (op_is_write(bio_op(bio)) && bdev_read_only(bio->bi_bdev)) { if (op_is_flush(bio->bi_opf) && !bio_sectors(bio)) return; if (bdev_test_flag(bio->bi_bdev, BD_RO_WARNED)) return; bdev_set_flag(bio->bi_bdev, BD_RO_WARNED); /* * Use ioctl to set underlying disk of raid/dm to read-only * will trigger this. */ pr_warn("Trying to write to read-only block-device %pg\n", bio->bi_bdev); } } static noinline int should_fail_bio(struct bio *bio) { if (should_fail_request(bdev_whole(bio->bi_bdev), bio->bi_iter.bi_size)) return -EIO; return 0; } ALLOW_ERROR_INJECTION(should_fail_bio, ERRNO); /* * Check whether this bio extends beyond the end of the device or partition. * This may well happen - the kernel calls bread() without checking the size of * the device, e.g., when mounting a file system. */ static inline int bio_check_eod(struct bio *bio) { sector_t maxsector = bdev_nr_sectors(bio->bi_bdev); unsigned int nr_sectors = bio_sectors(bio); if (nr_sectors && (nr_sectors > maxsector || bio->bi_iter.bi_sector > maxsector - nr_sectors)) { pr_info_ratelimited("%s: attempt to access beyond end of device\n" "%pg: rw=%d, sector=%llu, nr_sectors = %u limit=%llu\n", current->comm, bio->bi_bdev, bio->bi_opf, bio->bi_iter.bi_sector, nr_sectors, maxsector); return -EIO; } return 0; } /* * Remap block n of partition p to block n+start(p) of the disk. */ static int blk_partition_remap(struct bio *bio) { struct block_device *p = bio->bi_bdev; if (unlikely(should_fail_request(p, bio->bi_iter.bi_size))) return -EIO; if (bio_sectors(bio)) { bio->bi_iter.bi_sector += p->bd_start_sect; trace_block_bio_remap(bio, p->bd_dev, bio->bi_iter.bi_sector - p->bd_start_sect); } bio_set_flag(bio, BIO_REMAPPED); return 0; } /* * Check write append to a zoned block device. */ static inline blk_status_t blk_check_zone_append(struct request_queue *q, struct bio *bio) { int nr_sectors = bio_sectors(bio); /* Only applicable to zoned block devices */ if (!bdev_is_zoned(bio->bi_bdev)) return BLK_STS_NOTSUPP; /* The bio sector must point to the start of a sequential zone */ if (!bdev_is_zone_start(bio->bi_bdev, bio->bi_iter.bi_sector)) return BLK_STS_IOERR; /* * Not allowed to cross zone boundaries. Otherwise, the BIO will be * split and could result in non-contiguous sectors being written in * different zones. */ if (nr_sectors > q->limits.chunk_sectors) return BLK_STS_IOERR; /* Make sure the BIO is small enough and will not get split */ if (nr_sectors > q->limits.max_zone_append_sectors) return BLK_STS_IOERR; bio->bi_opf |= REQ_NOMERGE; return BLK_STS_OK; } static void __submit_bio(struct bio *bio) { /* If plug is not used, add new plug here to cache nsecs time. */ struct blk_plug plug; if (unlikely(!blk_crypto_bio_prep(&bio))) return; blk_start_plug(&plug); if (!bdev_test_flag(bio->bi_bdev, BD_HAS_SUBMIT_BIO)) { blk_mq_submit_bio(bio); } else if (likely(bio_queue_enter(bio) == 0)) { struct gendisk *disk = bio->bi_bdev->bd_disk; if ((bio->bi_opf & REQ_POLLED) && !(disk->queue->limits.features & BLK_FEAT_POLL)) { bio->bi_status = BLK_STS_NOTSUPP; bio_endio(bio); } else { disk->fops->submit_bio(bio); } blk_queue_exit(disk->queue); } blk_finish_plug(&plug); } /* * The loop in this function may be a bit non-obvious, and so deserves some * explanation: * * - Before entering the loop, bio->bi_next is NULL (as all callers ensure * that), so we have a list with a single bio. * - We pretend that we have just taken it off a longer list, so we assign * bio_list to a pointer to the bio_list_on_stack, thus initialising the * bio_list of new bios to be added. ->submit_bio() may indeed add some more * bios through a recursive call to submit_bio_noacct. If it did, we find a * non-NULL value in bio_list and re-enter the loop from the top. * - In this case we really did just take the bio of the top of the list (no * pretending) and so remove it from bio_list, and call into ->submit_bio() * again. * * bio_list_on_stack[0] contains bios submitted by the current ->submit_bio. * bio_list_on_stack[1] contains bios that were submitted before the current * ->submit_bio, but that haven't been processed yet. */ static void __submit_bio_noacct(struct bio *bio) { struct bio_list bio_list_on_stack[2]; BUG_ON(bio->bi_next); bio_list_init(&bio_list_on_stack[0]); current->bio_list = bio_list_on_stack; do { struct request_queue *q = bdev_get_queue(bio->bi_bdev); struct bio_list lower, same; /* * Create a fresh bio_list for all subordinate requests. */ bio_list_on_stack[1] = bio_list_on_stack[0]; bio_list_init(&bio_list_on_stack[0]); __submit_bio(bio); /* * Sort new bios into those for a lower level and those for the * same level. */ bio_list_init(&lower); bio_list_init(&same); while ((bio = bio_list_pop(&bio_list_on_stack[0])) != NULL) if (q == bdev_get_queue(bio->bi_bdev)) bio_list_add(&same, bio); else bio_list_add(&lower, bio); /* * Now assemble so we handle the lowest level first. */ bio_list_merge(&bio_list_on_stack[0], &lower); bio_list_merge(&bio_list_on_stack[0], &same); bio_list_merge(&bio_list_on_stack[0], &bio_list_on_stack[1]); } while ((bio = bio_list_pop(&bio_list_on_stack[0]))); current->bio_list = NULL; } static void __submit_bio_noacct_mq(struct bio *bio) { struct bio_list bio_list[2] = { }; current->bio_list = bio_list; do { __submit_bio(bio); } while ((bio = bio_list_pop(&bio_list[0]))); current->bio_list = NULL; } void submit_bio_noacct_nocheck(struct bio *bio) { blk_cgroup_bio_start(bio); blkcg_bio_issue_init(bio); if (!bio_flagged(bio, BIO_TRACE_COMPLETION)) { trace_block_bio_queue(bio); /* * Now that enqueuing has been traced, we need to trace * completion as well. */ bio_set_flag(bio, BIO_TRACE_COMPLETION); } /* * We only want one ->submit_bio to be active at a time, else stack * usage with stacked devices could be a problem. Use current->bio_list * to collect a list of requests submited by a ->submit_bio method while * it is active, and then process them after it returned. */ if (current->bio_list) bio_list_add(&current->bio_list[0], bio); else if (!bdev_test_flag(bio->bi_bdev, BD_HAS_SUBMIT_BIO)) __submit_bio_noacct_mq(bio); else __submit_bio_noacct(bio); } static blk_status_t blk_validate_atomic_write_op_size(struct request_queue *q, struct bio *bio) { if (bio->bi_iter.bi_size > queue_atomic_write_unit_max_bytes(q)) return BLK_STS_INVAL; if (bio->bi_iter.bi_size % queue_atomic_write_unit_min_bytes(q)) return BLK_STS_INVAL; return BLK_STS_OK; } /** * submit_bio_noacct - re-submit a bio to the block device layer for I/O * @bio: The bio describing the location in memory and on the device. * * This is a version of submit_bio() that shall only be used for I/O that is * resubmitted to lower level drivers by stacking block drivers. All file * systems and other upper level users of the block layer should use * submit_bio() instead. */ void submit_bio_noacct(struct bio *bio) { struct block_device *bdev = bio->bi_bdev; struct request_queue *q = bdev_get_queue(bdev); blk_status_t status = BLK_STS_IOERR; might_sleep(); /* * For a REQ_NOWAIT based request, return -EOPNOTSUPP * if queue does not support NOWAIT. */ if ((bio->bi_opf & REQ_NOWAIT) && !bdev_nowait(bdev)) goto not_supported; if (should_fail_bio(bio)) goto end_io; bio_check_ro(bio); if (!bio_flagged(bio, BIO_REMAPPED)) { if (unlikely(bio_check_eod(bio))) goto end_io; if (bdev_is_partition(bdev) && unlikely(blk_partition_remap(bio))) goto end_io; } /* * Filter flush bio's early so that bio based drivers without flush * support don't have to worry about them. */ if (op_is_flush(bio->bi_opf)) { if (WARN_ON_ONCE(bio_op(bio) != REQ_OP_WRITE && bio_op(bio) != REQ_OP_ZONE_APPEND)) goto end_io; if (!bdev_write_cache(bdev)) { bio->bi_opf &= ~(REQ_PREFLUSH | REQ_FUA); if (!bio_sectors(bio)) { status = BLK_STS_OK; goto end_io; } } } switch (bio_op(bio)) { case REQ_OP_READ: break; case REQ_OP_WRITE: if (bio->bi_opf & REQ_ATOMIC) { status = blk_validate_atomic_write_op_size(q, bio); if (status != BLK_STS_OK) goto end_io; } break; case REQ_OP_FLUSH: /* * REQ_OP_FLUSH can't be submitted through bios, it is only * synthetized in struct request by the flush state machine. */ goto not_supported; case REQ_OP_DISCARD: if (!bdev_max_discard_sectors(bdev)) goto not_supported; break; case REQ_OP_SECURE_ERASE: if (!bdev_max_secure_erase_sectors(bdev)) goto not_supported; break; case REQ_OP_ZONE_APPEND: status = blk_check_zone_append(q, bio); if (status != BLK_STS_OK) goto end_io; break; case REQ_OP_WRITE_ZEROES: if (!q->limits.max_write_zeroes_sectors) goto not_supported; break; case REQ_OP_ZONE_RESET: case REQ_OP_ZONE_OPEN: case REQ_OP_ZONE_CLOSE: case REQ_OP_ZONE_FINISH: case REQ_OP_ZONE_RESET_ALL: if (!bdev_is_zoned(bio->bi_bdev)) goto not_supported; break; case REQ_OP_DRV_IN: case REQ_OP_DRV_OUT: /* * Driver private operations are only used with passthrough * requests. */ fallthrough; default: goto not_supported; } if (blk_throtl_bio(bio)) return; submit_bio_noacct_nocheck(bio); return; not_supported: status = BLK_STS_NOTSUPP; end_io: bio->bi_status = status; bio_endio(bio); } EXPORT_SYMBOL(submit_bio_noacct); static void bio_set_ioprio(struct bio *bio) { /* Nobody set ioprio so far? Initialize it based on task's nice value */ if (IOPRIO_PRIO_CLASS(bio->bi_ioprio) == IOPRIO_CLASS_NONE) bio->bi_ioprio = get_current_ioprio(); blkcg_set_ioprio(bio); } /** * submit_bio - submit a bio to the block device layer for I/O * @bio: The &struct bio which describes the I/O * * submit_bio() is used to submit I/O requests to block devices. It is passed a * fully set up &struct bio that describes the I/O that needs to be done. The * bio will be send to the device described by the bi_bdev field. * * The success/failure status of the request, along with notification of * completion, is delivered asynchronously through the ->bi_end_io() callback * in @bio. The bio must NOT be touched by the caller until ->bi_end_io() has * been called. */ void submit_bio(struct bio *bio) { if (bio_op(bio) == REQ_OP_READ) { task_io_account_read(bio->bi_iter.bi_size); count_vm_events(PGPGIN, bio_sectors(bio)); } else if (bio_op(bio) == REQ_OP_WRITE) { count_vm_events(PGPGOUT, bio_sectors(bio)); } bio_set_ioprio(bio); submit_bio_noacct(bio); } EXPORT_SYMBOL(submit_bio); /** * bio_poll - poll for BIO completions * @bio: bio to poll for * @iob: batches of IO * @flags: BLK_POLL_* flags that control the behavior * * Poll for completions on queue associated with the bio. Returns number of * completed entries found. * * Note: the caller must either be the context that submitted @bio, or * be in a RCU critical section to prevent freeing of @bio. */ int bio_poll(struct bio *bio, struct io_comp_batch *iob, unsigned int flags) { blk_qc_t cookie = READ_ONCE(bio->bi_cookie); struct block_device *bdev; struct request_queue *q; int ret = 0; bdev = READ_ONCE(bio->bi_bdev); if (!bdev) return 0; q = bdev_get_queue(bdev); if (cookie == BLK_QC_T_NONE) return 0; blk_flush_plug(current->plug, false); /* * We need to be able to enter a frozen queue, similar to how * timeouts also need to do that. If that is blocked, then we can * have pending IO when a queue freeze is started, and then the * wait for the freeze to finish will wait for polled requests to * timeout as the poller is preventer from entering the queue and * completing them. As long as we prevent new IO from being queued, * that should be all that matters. */ if (!percpu_ref_tryget(&q->q_usage_counter)) return 0; if (queue_is_mq(q)) { ret = blk_mq_poll(q, cookie, iob, flags); } else { struct gendisk *disk = q->disk; if ((q->limits.features & BLK_FEAT_POLL) && disk && disk->fops->poll_bio) ret = disk->fops->poll_bio(bio, iob, flags); } blk_queue_exit(q); return ret; } EXPORT_SYMBOL_GPL(bio_poll); /* * Helper to implement file_operations.iopoll. Requires the bio to be stored * in iocb->private, and cleared before freeing the bio. */ int iocb_bio_iopoll(struct kiocb *kiocb, struct io_comp_batch *iob, unsigned int flags) { struct bio *bio; int ret = 0; /* * Note: the bio cache only uses SLAB_TYPESAFE_BY_RCU, so bio can * point to a freshly allocated bio at this point. If that happens * we have a few cases to consider: * * 1) the bio is beeing initialized and bi_bdev is NULL. We can just * simply nothing in this case * 2) the bio points to a not poll enabled device. bio_poll will catch * this and return 0 * 3) the bio points to a poll capable device, including but not * limited to the one that the original bio pointed to. In this * case we will call into the actual poll method and poll for I/O, * even if we don't need to, but it won't cause harm either. * * For cases 2) and 3) above the RCU grace period ensures that bi_bdev * is still allocated. Because partitions hold a reference to the whole * device bdev and thus disk, the disk is also still valid. Grabbing * a reference to the queue in bio_poll() ensures the hctxs and requests * are still valid as well. */ rcu_read_lock(); bio = READ_ONCE(kiocb->private); if (bio) ret = bio_poll(bio, iob, flags); rcu_read_unlock(); return ret; } EXPORT_SYMBOL_GPL(iocb_bio_iopoll); void update_io_ticks(struct block_device *part, unsigned long now, bool end) { unsigned long stamp; again: stamp = READ_ONCE(part->bd_stamp); if (unlikely(time_after(now, stamp)) && likely(try_cmpxchg(&part->bd_stamp, &stamp, now)) && (end || bdev_count_inflight(part))) __part_stat_add(part, io_ticks, now - stamp); if (bdev_is_partition(part)) { part = bdev_whole(part); goto again; } } unsigned long bdev_start_io_acct(struct block_device *bdev, enum req_op op, unsigned long start_time) { part_stat_lock(); update_io_ticks(bdev, start_time, false); part_stat_local_inc(bdev, in_flight[op_is_write(op)]); part_stat_unlock(); return start_time; } EXPORT_SYMBOL(bdev_start_io_acct); /** * bio_start_io_acct - start I/O accounting for bio based drivers * @bio: bio to start account for * * Returns the start time that should be passed back to bio_end_io_acct(). */ unsigned long bio_start_io_acct(struct bio *bio) { return bdev_start_io_acct(bio->bi_bdev, bio_op(bio), jiffies); } EXPORT_SYMBOL_GPL(bio_start_io_acct); void bdev_end_io_acct(struct block_device *bdev, enum req_op op, unsigned int sectors, unsigned long start_time) { const int sgrp = op_stat_group(op); unsigned long now = READ_ONCE(jiffies); unsigned long duration = now - start_time; part_stat_lock(); update_io_ticks(bdev, now, true); part_stat_inc(bdev, ios[sgrp]); part_stat_add(bdev, sectors[sgrp], sectors); part_stat_add(bdev, nsecs[sgrp], jiffies_to_nsecs(duration)); part_stat_local_dec(bdev, in_flight[op_is_write(op)]); part_stat_unlock(); } EXPORT_SYMBOL(bdev_end_io_acct); void bio_end_io_acct_remapped(struct bio *bio, unsigned long start_time, struct block_device *orig_bdev) { bdev_end_io_acct(orig_bdev, bio_op(bio), bio_sectors(bio), start_time); } EXPORT_SYMBOL_GPL(bio_end_io_acct_remapped); /** * blk_lld_busy - Check if underlying low-level drivers of a device are busy * @q : the queue of the device being checked * * Description: * Check if underlying low-level drivers of a device are busy. * If the drivers want to export their busy state, they must set own * exporting function using blk_queue_lld_busy() first. * * Basically, this function is used only by request stacking drivers * to stop dispatching requests to underlying devices when underlying * devices are busy. This behavior helps more I/O merging on the queue * of the request stacking driver and prevents I/O throughput regression * on burst I/O load. * * Return: * 0 - Not busy (The request stacking driver should dispatch request) * 1 - Busy (The request stacking driver should stop dispatching request) */ int blk_lld_busy(struct request_queue *q) { if (queue_is_mq(q) && q->mq_ops->busy) return q->mq_ops->busy(q); return 0; } EXPORT_SYMBOL_GPL(blk_lld_busy); int kblockd_schedule_work(struct work_struct *work) { return queue_work(kblockd_workqueue, work); } EXPORT_SYMBOL(kblockd_schedule_work); int kblockd_mod_delayed_work_on(int cpu, struct delayed_work *dwork, unsigned long delay) { return mod_delayed_work_on(cpu, kblockd_workqueue, dwork, delay); } EXPORT_SYMBOL(kblockd_mod_delayed_work_on); void blk_start_plug_nr_ios(struct blk_plug *plug, unsigned short nr_ios) { struct task_struct *tsk = current; /* * If this is a nested plug, don't actually assign it. */ if (tsk->plug) return; plug->cur_ktime = 0; rq_list_init(&plug->mq_list); rq_list_init(&plug->cached_rqs); plug->nr_ios = min_t(unsigned short, nr_ios, BLK_MAX_REQUEST_COUNT); plug->rq_count = 0; plug->multiple_queues = false; plug->has_elevator = false; INIT_LIST_HEAD(&plug->cb_list); /* * Store ordering should not be needed here, since a potential * preempt will imply a full memory barrier */ tsk->plug = plug; } /** * blk_start_plug - initialize blk_plug and track it inside the task_struct * @plug: The &struct blk_plug that needs to be initialized * * Description: * blk_start_plug() indicates to the block layer an intent by the caller * to submit multiple I/O requests in a batch. The block layer may use * this hint to defer submitting I/Os from the caller until blk_finish_plug() * is called. However, the block layer may choose to submit requests * before a call to blk_finish_plug() if the number of queued I/Os * exceeds %BLK_MAX_REQUEST_COUNT, or if the size of the I/O is larger than * %BLK_PLUG_FLUSH_SIZE. The queued I/Os may also be submitted early if * the task schedules (see below). * * Tracking blk_plug inside the task_struct will help with auto-flushing the * pending I/O should the task end up blocking between blk_start_plug() and * blk_finish_plug(). This is important from a performance perspective, but * also ensures that we don't deadlock. For instance, if the task is blocking * for a memory allocation, memory reclaim could end up wanting to free a * page belonging to that request that is currently residing in our private * plug. By flushing the pending I/O when the process goes to sleep, we avoid * this kind of deadlock. */ void blk_start_plug(struct blk_plug *plug) { blk_start_plug_nr_ios(plug, 1); } EXPORT_SYMBOL(blk_start_plug); static void flush_plug_callbacks(struct blk_plug *plug, bool from_schedule) { LIST_HEAD(callbacks); while (!list_empty(&plug->cb_list)) { list_splice_init(&plug->cb_list, &callbacks); while (!list_empty(&callbacks)) { struct blk_plug_cb *cb = list_first_entry(&callbacks, struct blk_plug_cb, list); list_del(&cb->list); cb->callback(cb, from_schedule); } } } struct blk_plug_cb *blk_check_plugged(blk_plug_cb_fn unplug, void *data, int size) { struct blk_plug *plug = current->plug; struct blk_plug_cb *cb; if (!plug) return NULL; list_for_each_entry(cb, &plug->cb_list, list) if (cb->callback == unplug && cb->data == data) return cb; /* Not currently on the callback list */ BUG_ON(size < sizeof(*cb)); cb = kzalloc(size, GFP_ATOMIC); if (cb) { cb->data = data; cb->callback = unplug; list_add(&cb->list, &plug->cb_list); } return cb; } EXPORT_SYMBOL(blk_check_plugged); void __blk_flush_plug(struct blk_plug *plug, bool from_schedule) { if (!list_empty(&plug->cb_list)) flush_plug_callbacks(plug, from_schedule); blk_mq_flush_plug_list(plug, from_schedule); /* * Unconditionally flush out cached requests, even if the unplug * event came from schedule. Since we know hold references to the * queue for cached requests, we don't want a blocked task holding * up a queue freeze/quiesce event. */ if (unlikely(!rq_list_empty(&plug->cached_rqs))) blk_mq_free_plug_rqs(plug); plug->cur_ktime = 0; current->flags &= ~PF_BLOCK_TS; } /** * blk_finish_plug - mark the end of a batch of submitted I/O * @plug: The &struct blk_plug passed to blk_start_plug() * * Description: * Indicate that a batch of I/O submissions is complete. This function * must be paired with an initial call to blk_start_plug(). The intent * is to allow the block layer to optimize I/O submission. See the * documentation for blk_start_plug() for more information. */ void blk_finish_plug(struct blk_plug *plug) { if (plug == current->plug) { __blk_flush_plug(plug, false); current->plug = NULL; } } EXPORT_SYMBOL(blk_finish_plug); void blk_io_schedule(void) { /* Prevent hang_check timer from firing at us during very long I/O */ unsigned long timeout = sysctl_hung_task_timeout_secs * HZ / 2; if (timeout) io_schedule_timeout(timeout); else io_schedule(); } EXPORT_SYMBOL_GPL(blk_io_schedule); int __init blk_dev_init(void) { BUILD_BUG_ON((__force u32)REQ_OP_LAST >= (1 << REQ_OP_BITS)); BUILD_BUG_ON(REQ_OP_BITS + REQ_FLAG_BITS > 8 * sizeof_field(struct request, cmd_flags)); BUILD_BUG_ON(REQ_OP_BITS + REQ_FLAG_BITS > 8 * sizeof_field(struct bio, bi_opf)); /* used for unplugging and affects IO latency/throughput - HIGHPRI */ kblockd_workqueue = alloc_workqueue("kblockd", WQ_MEM_RECLAIM | WQ_HIGHPRI, 0); if (!kblockd_workqueue) panic("Failed to create kblockd\n"); blk_requestq_cachep = KMEM_CACHE(request_queue, SLAB_PANIC); blk_debugfs_root = debugfs_create_dir("block", NULL); return 0; }
3 3 3 3 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 // SPDX-License-Identifier: GPL-2.0 /* * Workingset detection * * Copyright (C) 2013 Red Hat, Inc., Johannes Weiner */ #include <linux/memcontrol.h> #include <linux/mm_inline.h> #include <linux/writeback.h> #include <linux/shmem_fs.h> #include <linux/pagemap.h> #include <linux/atomic.h> #include <linux/module.h> #include <linux/swap.h> #include <linux/dax.h> #include <linux/fs.h> #include <linux/mm.h> #include "internal.h" /* * Double CLOCK lists * * Per node, two clock lists are maintained for file pages: the * inactive and the active list. Freshly faulted pages start out at * the head of the inactive list and page reclaim scans pages from the * tail. Pages that are accessed multiple times on the inactive list * are promoted to the active list, to protect them from reclaim, * whereas active pages are demoted to the inactive list when the * active list grows too big. * * fault ------------------------+ * | * +--------------+ | +-------------+ * reclaim <- | inactive | <-+-- demotion | active | <--+ * +--------------+ +-------------+ | * | | * +-------------- promotion ------------------+ * * * Access frequency and refault distance * * A workload is thrashing when its pages are frequently used but they * are evicted from the inactive list every time before another access * would have promoted them to the active list. * * In cases where the average access distance between thrashing pages * is bigger than the size of memory there is nothing that can be * done - the thrashing set could never fit into memory under any * circumstance. * * However, the average access distance could be bigger than the * inactive list, yet smaller than the size of memory. In this case, * the set could fit into memory if it weren't for the currently * active pages - which may be used more, hopefully less frequently: * * +-memory available to cache-+ * | | * +-inactive------+-active----+ * a b | c d e f g h i | J K L M N | * +---------------+-----------+ * * It is prohibitively expensive to accurately track access frequency * of pages. But a reasonable approximation can be made to measure * thrashing on the inactive list, after which refaulting pages can be * activated optimistically to compete with the existing active pages. * * Approximating inactive page access frequency - Observations: * * 1. When a page is accessed for the first time, it is added to the * head of the inactive list, slides every existing inactive page * towards the tail by one slot, and pushes the current tail page * out of memory. * * 2. When a page is accessed for the second time, it is promoted to * the active list, shrinking the inactive list by one slot. This * also slides all inactive pages that were faulted into the cache * more recently than the activated page towards the tail of the * inactive list. * * Thus: * * 1. The sum of evictions and activations between any two points in * time indicate the minimum number of inactive pages accessed in * between. * * 2. Moving one inactive page N page slots towards the tail of the * list requires at least N inactive page accesses. * * Combining these: * * 1. When a page is finally evicted from memory, the number of * inactive pages accessed while the page was in cache is at least * the number of page slots on the inactive list. * * 2. In addition, measuring the sum of evictions and activations (E) * at the time of a page's eviction, and comparing it to another * reading (R) at the time the page faults back into memory tells * the minimum number of accesses while the page was not cached. * This is called the refault distance. * * Because the first access of the page was the fault and the second * access the refault, we combine the in-cache distance with the * out-of-cache distance to get the complete minimum access distance * of this page: * * NR_inactive + (R - E) * * And knowing the minimum access distance of a page, we can easily * tell if the page would be able to stay in cache assuming all page * slots in the cache were available: * * NR_inactive + (R - E) <= NR_inactive + NR_active * * If we have swap we should consider about NR_inactive_anon and * NR_active_anon, so for page cache and anonymous respectively: * * NR_inactive_file + (R - E) <= NR_inactive_file + NR_active_file * + NR_inactive_anon + NR_active_anon * * NR_inactive_anon + (R - E) <= NR_inactive_anon + NR_active_anon * + NR_inactive_file + NR_active_file * * Which can be further simplified to: * * (R - E) <= NR_active_file + NR_inactive_anon + NR_active_anon * * (R - E) <= NR_active_anon + NR_inactive_file + NR_active_file * * Put into words, the refault distance (out-of-cache) can be seen as * a deficit in inactive list space (in-cache). If the inactive list * had (R - E) more page slots, the page would not have been evicted * in between accesses, but activated instead. And on a full system, * the only thing eating into inactive list space is active pages. * * * Refaulting inactive pages * * All that is known about the active list is that the pages have been * accessed more than once in the past. This means that at any given * time there is actually a good chance that pages on the active list * are no longer in active use. * * So when a refault distance of (R - E) is observed and there are at * least (R - E) pages in the userspace workingset, the refaulting page * is activated optimistically in the hope that (R - E) pages are actually * used less frequently than the refaulting page - or even not used at * all anymore. * * That means if inactive cache is refaulting with a suitable refault * distance, we assume the cache workingset is transitioning and put * pressure on the current workingset. * * If this is wrong and demotion kicks in, the pages which are truly * used more frequently will be reactivated while the less frequently * used once will be evicted from memory. * * But if this is right, the stale pages will be pushed out of memory * and the used pages get to stay in cache. * * Refaulting active pages * * If on the other hand the refaulting pages have recently been * deactivated, it means that the active list is no longer protecting * actively used cache from reclaim. The cache is NOT transitioning to * a different workingset; the existing workingset is thrashing in the * space allocated to the page cache. * * * Implementation * * For each node's LRU lists, a counter for inactive evictions and * activations is maintained (node->nonresident_age). * * On eviction, a snapshot of this counter (along with some bits to * identify the node) is stored in the now empty page cache * slot of the evicted page. This is called a shadow entry. * * On cache misses for which there are shadow entries, an eligible * refault distance will immediately activate the refaulting page. */ #define WORKINGSET_SHIFT 1 #define EVICTION_SHIFT ((BITS_PER_LONG - BITS_PER_XA_VALUE) + \ WORKINGSET_SHIFT + NODES_SHIFT + \ MEM_CGROUP_ID_SHIFT) #define EVICTION_MASK (~0UL >> EVICTION_SHIFT) /* * Eviction timestamps need to be able to cover the full range of * actionable refaults. However, bits are tight in the xarray * entry, and after storing the identifier for the lruvec there might * not be enough left to represent every single actionable refault. In * that case, we have to sacrifice granularity for distance, and group * evictions into coarser buckets by shaving off lower timestamp bits. */ static unsigned int bucket_order __read_mostly; static void *pack_shadow(int memcgid, pg_data_t *pgdat, unsigned long eviction, bool workingset) { eviction &= EVICTION_MASK; eviction = (eviction << MEM_CGROUP_ID_SHIFT) | memcgid; eviction = (eviction << NODES_SHIFT) | pgdat->node_id; eviction = (eviction << WORKINGSET_SHIFT) | workingset; return xa_mk_value(eviction); } static void unpack_shadow(void *shadow, int *memcgidp, pg_data_t **pgdat, unsigned long *evictionp, bool *workingsetp) { unsigned long entry = xa_to_value(shadow); int memcgid, nid; bool workingset; workingset = entry & ((1UL << WORKINGSET_SHIFT) - 1); entry >>= WORKINGSET_SHIFT; nid = entry & ((1UL << NODES_SHIFT) - 1); entry >>= NODES_SHIFT; memcgid = entry & ((1UL << MEM_CGROUP_ID_SHIFT) - 1); entry >>= MEM_CGROUP_ID_SHIFT; *memcgidp = memcgid; *pgdat = NODE_DATA(nid); *evictionp = entry; *workingsetp = workingset; } #ifdef CONFIG_LRU_GEN static void *lru_gen_eviction(struct folio *folio) { int hist; unsigned long token; unsigned long min_seq; struct lruvec *lruvec; struct lru_gen_folio *lrugen; int type = folio_is_file_lru(folio); int delta = folio_nr_pages(folio); int refs = folio_lru_refs(folio); bool workingset = folio_test_workingset(folio); int tier = lru_tier_from_refs(refs, workingset); struct mem_cgroup *memcg = folio_memcg(folio); struct pglist_data *pgdat = folio_pgdat(folio); BUILD_BUG_ON(LRU_GEN_WIDTH + LRU_REFS_WIDTH > BITS_PER_LONG - EVICTION_SHIFT); lruvec = mem_cgroup_lruvec(memcg, pgdat); lrugen = &lruvec->lrugen; min_seq = READ_ONCE(lrugen->min_seq[type]); token = (min_seq << LRU_REFS_WIDTH) | max(refs - 1, 0); hist = lru_hist_from_seq(min_seq); atomic_long_add(delta, &lrugen->evicted[hist][type][tier]); return pack_shadow(mem_cgroup_id(memcg), pgdat, token, workingset); } /* * Tests if the shadow entry is for a folio that was recently evicted. * Fills in @lruvec, @token, @workingset with the values unpacked from shadow. */ static bool lru_gen_test_recent(void *shadow, struct lruvec **lruvec, unsigned long *token, bool *workingset) { int memcg_id; unsigned long max_seq; struct mem_cgroup *memcg; struct pglist_data *pgdat; unpack_shadow(shadow, &memcg_id, &pgdat, token, workingset); memcg = mem_cgroup_from_id(memcg_id); *lruvec = mem_cgroup_lruvec(memcg, pgdat); max_seq = READ_ONCE((*lruvec)->lrugen.max_seq); max_seq &= EVICTION_MASK >> LRU_REFS_WIDTH; return abs_diff(max_seq, *token >> LRU_REFS_WIDTH) < MAX_NR_GENS; } static void lru_gen_refault(struct folio *folio, void *shadow) { bool recent; int hist, tier, refs; bool workingset; unsigned long token; struct lruvec *lruvec; struct lru_gen_folio *lrugen; int type = folio_is_file_lru(folio); int delta = folio_nr_pages(folio); rcu_read_lock(); recent = lru_gen_test_recent(shadow, &lruvec, &token, &workingset); if (lruvec != folio_lruvec(folio)) goto unlock; mod_lruvec_state(lruvec, WORKINGSET_REFAULT_BASE + type, delta); if (!recent) goto unlock; lrugen = &lruvec->lrugen; hist = lru_hist_from_seq(READ_ONCE(lrugen->min_seq[type])); refs = (token & (BIT(LRU_REFS_WIDTH) - 1)) + 1; tier = lru_tier_from_refs(refs, workingset); atomic_long_add(delta, &lrugen->refaulted[hist][type][tier]); /* see folio_add_lru() where folio_set_active() will be called */ if (lru_gen_in_fault()) mod_lruvec_state(lruvec, WORKINGSET_ACTIVATE_BASE + type, delta); if (workingset) { folio_set_workingset(folio); mod_lruvec_state(lruvec, WORKINGSET_RESTORE_BASE + type, delta); } else set_mask_bits(&folio->flags, LRU_REFS_MASK, (refs - 1UL) << LRU_REFS_PGOFF); unlock: rcu_read_unlock(); } #else /* !CONFIG_LRU_GEN */ static void *lru_gen_eviction(struct folio *folio) { return NULL; } static bool lru_gen_test_recent(void *shadow, struct lruvec **lruvec, unsigned long *token, bool *workingset) { return false; } static void lru_gen_refault(struct folio *folio, void *shadow) { } #endif /* CONFIG_LRU_GEN */ /** * workingset_age_nonresident - age non-resident entries as LRU ages * @lruvec: the lruvec that was aged * @nr_pages: the number of pages to count * * As in-memory pages are aged, non-resident pages need to be aged as * well, in order for the refault distances later on to be comparable * to the in-memory dimensions. This function allows reclaim and LRU * operations to drive the non-resident aging along in parallel. */ void workingset_age_nonresident(struct lruvec *lruvec, unsigned long nr_pages) { /* * Reclaiming a cgroup means reclaiming all its children in a * round-robin fashion. That means that each cgroup has an LRU * order that is composed of the LRU orders of its child * cgroups; and every page has an LRU position not just in the * cgroup that owns it, but in all of that group's ancestors. * * So when the physical inactive list of a leaf cgroup ages, * the virtual inactive lists of all its parents, including * the root cgroup's, age as well. */ do { atomic_long_add(nr_pages, &lruvec->nonresident_age); } while ((lruvec = parent_lruvec(lruvec))); } /** * workingset_eviction - note the eviction of a folio from memory * @target_memcg: the cgroup that is causing the reclaim * @folio: the folio being evicted * * Return: a shadow entry to be stored in @folio->mapping->i_pages in place * of the evicted @folio so that a later refault can be detected. */ void *workingset_eviction(struct folio *folio, struct mem_cgroup *target_memcg) { struct pglist_data *pgdat = folio_pgdat(folio); unsigned long eviction; struct lruvec *lruvec; int memcgid; /* Folio is fully exclusive and pins folio's memory cgroup pointer */ VM_BUG_ON_FOLIO(folio_test_lru(folio), folio); VM_BUG_ON_FOLIO(folio_ref_count(folio), folio); VM_BUG_ON_FOLIO(!folio_test_locked(folio), folio); if (lru_gen_enabled()) return lru_gen_eviction(folio); lruvec = mem_cgroup_lruvec(target_memcg, pgdat); /* XXX: target_memcg can be NULL, go through lruvec */ memcgid = mem_cgroup_id(lruvec_memcg(lruvec)); eviction = atomic_long_read(&lruvec->nonresident_age); eviction >>= bucket_order; workingset_age_nonresident(lruvec, folio_nr_pages(folio)); return pack_shadow(memcgid, pgdat, eviction, folio_test_workingset(folio)); } /** * workingset_test_recent - tests if the shadow entry is for a folio that was * recently evicted. Also fills in @workingset with the value unpacked from * shadow. * @shadow: the shadow entry to be tested. * @file: whether the corresponding folio is from the file lru. * @workingset: where the workingset value unpacked from shadow should * be stored. * @flush: whether to flush cgroup rstat. * * Return: true if the shadow is for a recently evicted folio; false otherwise. */ bool workingset_test_recent(void *shadow, bool file, bool *workingset, bool flush) { struct mem_cgroup *eviction_memcg; struct lruvec *eviction_lruvec; unsigned long refault_distance; unsigned long workingset_size; unsigned long refault; int memcgid; struct pglist_data *pgdat; unsigned long eviction; if (lru_gen_enabled()) { bool recent; rcu_read_lock(); recent = lru_gen_test_recent(shadow, &eviction_lruvec, &eviction, workingset); rcu_read_unlock(); return recent; } rcu_read_lock(); unpack_shadow(shadow, &memcgid, &pgdat, &eviction, workingset); eviction <<= bucket_order; /* * Look up the memcg associated with the stored ID. It might * have been deleted since the folio's eviction. * * Note that in rare events the ID could have been recycled * for a new cgroup that refaults a shared folio. This is * impossible to tell from the available data. However, this * should be a rare and limited disturbance, and activations * are always speculative anyway. Ultimately, it's the aging * algorithm's job to shake out the minimum access frequency * for the active cache. * * XXX: On !CONFIG_MEMCG, this will always return NULL; it * would be better if the root_mem_cgroup existed in all * configurations instead. */ eviction_memcg = mem_cgroup_from_id(memcgid); if (!mem_cgroup_tryget(eviction_memcg)) eviction_memcg = NULL; rcu_read_unlock(); if (!mem_cgroup_disabled() && !eviction_memcg) return false; /* * Flush stats (and potentially sleep) outside the RCU read section. * * Note that workingset_test_recent() itself might be called in RCU read * section (for e.g, in cachestat) - these callers need to skip flushing * stats (via the flush argument). * * XXX: With per-memcg flushing and thresholding, is ratelimiting * still needed here? */ if (flush) mem_cgroup_flush_stats_ratelimited(eviction_memcg); eviction_lruvec = mem_cgroup_lruvec(eviction_memcg, pgdat); refault = atomic_long_read(&eviction_lruvec->nonresident_age); /* * Calculate the refault distance * * The unsigned subtraction here gives an accurate distance * across nonresident_age overflows in most cases. There is a * special case: usually, shadow entries have a short lifetime * and are either refaulted or reclaimed along with the inode * before they get too old. But it is not impossible for the * nonresident_age to lap a shadow entry in the field, which * can then result in a false small refault distance, leading * to a false activation should this old entry actually * refault again. However, earlier kernels used to deactivate * unconditionally with *every* reclaim invocation for the * longest time, so the occasional inappropriate activation * leading to pressure on the active list is not a problem. */ refault_distance = (refault - eviction) & EVICTION_MASK; /* * Compare the distance to the existing workingset size. We * don't activate pages that couldn't stay resident even if * all the memory was available to the workingset. Whether * workingset competition needs to consider anon or not depends * on having free swap space. */ workingset_size = lruvec_page_state(eviction_lruvec, NR_ACTIVE_FILE); if (!file) { workingset_size += lruvec_page_state(eviction_lruvec, NR_INACTIVE_FILE); } if (mem_cgroup_get_nr_swap_pages(eviction_memcg) > 0) { workingset_size += lruvec_page_state(eviction_lruvec, NR_ACTIVE_ANON); if (file) { workingset_size += lruvec_page_state(eviction_lruvec, NR_INACTIVE_ANON); } } mem_cgroup_put(eviction_memcg); return refault_distance <= workingset_size; } /** * workingset_refault - Evaluate the refault of a previously evicted folio. * @folio: The freshly allocated replacement folio. * @shadow: Shadow entry of the evicted folio. * * Calculates and evaluates the refault distance of the previously * evicted folio in the context of the node and the memcg whose memory * pressure caused the eviction. */ void workingset_refault(struct folio *folio, void *shadow) { bool file = folio_is_file_lru(folio); struct pglist_data *pgdat; struct mem_cgroup *memcg; struct lruvec *lruvec; bool workingset; long nr; VM_BUG_ON_FOLIO(!folio_test_locked(folio), folio); if (lru_gen_enabled()) { lru_gen_refault(folio, shadow); return; } /* * The activation decision for this folio is made at the level * where the eviction occurred, as that is where the LRU order * during folio reclaim is being determined. * * However, the cgroup that will own the folio is the one that * is actually experiencing the refault event. Make sure the folio is * locked to guarantee folio_memcg() stability throughout. */ nr = folio_nr_pages(folio); memcg = folio_memcg(folio); pgdat = folio_pgdat(folio); lruvec = mem_cgroup_lruvec(memcg, pgdat); mod_lruvec_state(lruvec, WORKINGSET_REFAULT_BASE + file, nr); if (!workingset_test_recent(shadow, file, &workingset, true)) return; folio_set_active(folio); workingset_age_nonresident(lruvec, nr); mod_lruvec_state(lruvec, WORKINGSET_ACTIVATE_BASE + file, nr); /* Folio was active prior to eviction */ if (workingset) { folio_set_workingset(folio); /* * XXX: Move to folio_add_lru() when it supports new vs * putback */ lru_note_cost_refault(folio); mod_lruvec_state(lruvec, WORKINGSET_RESTORE_BASE + file, nr); } } /** * workingset_activation - note a page activation * @folio: Folio that is being activated. */ void workingset_activation(struct folio *folio) { /* * Filter non-memcg pages here, e.g. unmap can call * mark_page_accessed() on VDSO pages. */ if (mem_cgroup_disabled() || folio_memcg_charged(folio)) workingset_age_nonresident(folio_lruvec(folio), folio_nr_pages(folio)); } /* * Shadow entries reflect the share of the working set that does not * fit into memory, so their number depends on the access pattern of * the workload. In most cases, they will refault or get reclaimed * along with the inode, but a (malicious) workload that streams * through files with a total size several times that of available * memory, while preventing the inodes from being reclaimed, can * create excessive amounts of shadow nodes. To keep a lid on this, * track shadow nodes and reclaim them when they grow way past the * point where they would still be useful. */ struct list_lru shadow_nodes; void workingset_update_node(struct xa_node *node) { struct page *page = virt_to_page(node); /* * Track non-empty nodes that contain only shadow entries; * unlink those that contain pages or are being freed. * * Avoid acquiring the list_lru lock when the nodes are * already where they should be. The list_empty() test is safe * as node->private_list is protected by the i_pages lock. */ lockdep_assert_held(&node->array->xa_lock); if (node->count && node->count == node->nr_values) { if (list_empty(&node->private_list)) { list_lru_add_obj(&shadow_nodes, &node->private_list); __inc_node_page_state(page, WORKINGSET_NODES); } } else { if (!list_empty(&node->private_list)) { list_lru_del_obj(&shadow_nodes, &node->private_list); __dec_node_page_state(page, WORKINGSET_NODES); } } } static unsigned long count_shadow_nodes(struct shrinker *shrinker, struct shrink_control *sc) { unsigned long max_nodes; unsigned long nodes; unsigned long pages; nodes = list_lru_shrink_count(&shadow_nodes, sc); if (!nodes) return SHRINK_EMPTY; /* * Approximate a reasonable limit for the nodes * containing shadow entries. We don't need to keep more * shadow entries than possible pages on the active list, * since refault distances bigger than that are dismissed. * * The size of the active list converges toward 100% of * overall page cache as memory grows, with only a tiny * inactive list. Assume the total cache size for that. * * Nodes might be sparsely populated, with only one shadow * entry in the extreme case. Obviously, we cannot keep one * node for every eligible shadow entry, so compromise on a * worst-case density of 1/8th. Below that, not all eligible * refaults can be detected anymore. * * On 64-bit with 7 xa_nodes per page and 64 slots * each, this will reclaim shadow entries when they consume * ~1.8% of available memory: * * PAGE_SIZE / xa_nodes / node_entries * 8 / PAGE_SIZE */ #ifdef CONFIG_MEMCG if (sc->memcg) { struct lruvec *lruvec; int i; mem_cgroup_flush_stats_ratelimited(sc->memcg); lruvec = mem_cgroup_lruvec(sc->memcg, NODE_DATA(sc->nid)); for (pages = 0, i = 0; i < NR_LRU_LISTS; i++) pages += lruvec_page_state_local(lruvec, NR_LRU_BASE + i); pages += lruvec_page_state_local( lruvec, NR_SLAB_RECLAIMABLE_B) >> PAGE_SHIFT; pages += lruvec_page_state_local( lruvec, NR_SLAB_UNRECLAIMABLE_B) >> PAGE_SHIFT; } else #endif pages = node_present_pages(sc->nid); max_nodes = pages >> (XA_CHUNK_SHIFT - 3); if (nodes <= max_nodes) return 0; return nodes - max_nodes; } static enum lru_status shadow_lru_isolate(struct list_head *item, struct list_lru_one *lru, void *arg) __must_hold(lru->lock) { struct xa_node *node = container_of(item, struct xa_node, private_list); struct address_space *mapping; int ret; /* * Page cache insertions and deletions synchronously maintain * the shadow node LRU under the i_pages lock and the * &lru->lock. Because the page cache tree is emptied before * the inode can be destroyed, holding the &lru->lock pins any * address_space that has nodes on the LRU. * * We can then safely transition to the i_pages lock to * pin only the address_space of the particular node we want * to reclaim, take the node off-LRU, and drop the &lru->lock. */ mapping = container_of(node->array, struct address_space, i_pages); /* Coming from the list, invert the lock order */ if (!xa_trylock(&mapping->i_pages)) { spin_unlock_irq(&lru->lock); ret = LRU_RETRY; goto out; } /* For page cache we need to hold i_lock */ if (mapping->host != NULL) { if (!spin_trylock(&mapping->host->i_lock)) { xa_unlock(&mapping->i_pages); spin_unlock_irq(&lru->lock); ret = LRU_RETRY; goto out; } } list_lru_isolate(lru, item); __dec_node_page_state(virt_to_page(node), WORKINGSET_NODES); spin_unlock(&lru->lock); /* * The nodes should only contain one or more shadow entries, * no pages, so we expect to be able to remove them all and * delete and free the empty node afterwards. */ if (WARN_ON_ONCE(!node->nr_values)) goto out_invalid; if (WARN_ON_ONCE(node->count != node->nr_values)) goto out_invalid; xa_delete_node(node, workingset_update_node); __inc_lruvec_kmem_state(node, WORKINGSET_NODERECLAIM); out_invalid: xa_unlock_irq(&mapping->i_pages); if (mapping->host != NULL) { if (mapping_shrinkable(mapping)) inode_add_lru(mapping->host); spin_unlock(&mapping->host->i_lock); } ret = LRU_REMOVED_RETRY; out: cond_resched(); return ret; } static unsigned long scan_shadow_nodes(struct shrinker *shrinker, struct shrink_control *sc) { /* list_lru lock nests inside the IRQ-safe i_pages lock */ return list_lru_shrink_walk_irq(&shadow_nodes, sc, shadow_lru_isolate, NULL); } /* * Our list_lru->lock is IRQ-safe as it nests inside the IRQ-safe * i_pages lock. */ static struct lock_class_key shadow_nodes_key; static int __init workingset_init(void) { struct shrinker *workingset_shadow_shrinker; unsigned int timestamp_bits; unsigned int max_order; int ret = -ENOMEM; BUILD_BUG_ON(BITS_PER_LONG < EVICTION_SHIFT); /* * Calculate the eviction bucket size to cover the longest * actionable refault distance, which is currently half of * memory (totalram_pages/2). However, memory hotplug may add * some more pages at runtime, so keep working with up to * double the initial memory by using totalram_pages as-is. */ timestamp_bits = BITS_PER_LONG - EVICTION_SHIFT; max_order = fls_long(totalram_pages() - 1); if (max_order > timestamp_bits) bucket_order = max_order - timestamp_bits; pr_info("workingset: timestamp_bits=%d max_order=%d bucket_order=%u\n", timestamp_bits, max_order, bucket_order); workingset_shadow_shrinker = shrinker_alloc(SHRINKER_NUMA_AWARE | SHRINKER_MEMCG_AWARE, "mm-shadow"); if (!workingset_shadow_shrinker) goto err; ret = list_lru_init_memcg_key(&shadow_nodes, workingset_shadow_shrinker, &shadow_nodes_key); if (ret) goto err_list_lru; workingset_shadow_shrinker->count_objects = count_shadow_nodes; workingset_shadow_shrinker->scan_objects = scan_shadow_nodes; /* ->count reports only fully expendable nodes */ workingset_shadow_shrinker->seeks = 0; shrinker_register(workingset_shadow_shrinker); return 0; err_list_lru: shrinker_free(workingset_shadow_shrinker); err: return ret; } module_init(workingset_init);
625 340 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 /* SPDX-License-Identifier: GPL-2.0 */ /* Perform sanity checking for object sizes for uaccess.h and uio.h. */ #ifndef __LINUX_UCOPYSIZE_H__ #define __LINUX_UCOPYSIZE_H__ #include <linux/bug.h> #ifdef CONFIG_HARDENED_USERCOPY #include <linux/jump_label.h> extern void __check_object_size(const void *ptr, unsigned long n, bool to_user); DECLARE_STATIC_KEY_MAYBE(CONFIG_HARDENED_USERCOPY_DEFAULT_ON, validate_usercopy_range); static __always_inline void check_object_size(const void *ptr, unsigned long n, bool to_user) { if (!__builtin_constant_p(n) && static_branch_maybe(CONFIG_HARDENED_USERCOPY_DEFAULT_ON, &validate_usercopy_range)) { __check_object_size(ptr, n, to_user); } } #else static inline void check_object_size(const void *ptr, unsigned long n, bool to_user) { } #endif /* CONFIG_HARDENED_USERCOPY */ extern void __compiletime_error("copy source size is too small") __bad_copy_from(void); extern void __compiletime_error("copy destination size is too small") __bad_copy_to(void); void __copy_overflow(int size, unsigned long count); static inline void copy_overflow(int size, unsigned long count) { if (IS_ENABLED(CONFIG_BUG)) __copy_overflow(size, count); } static __always_inline __must_check bool check_copy_size(const void *addr, size_t bytes, bool is_source) { int sz = __builtin_object_size(addr, 0); if (unlikely(sz >= 0 && sz < bytes)) { if (!__builtin_constant_p(bytes)) copy_overflow(sz, bytes); else if (is_source) __bad_copy_from(); else __bad_copy_to(); return false; } if (WARN_ON_ONCE(bytes > INT_MAX)) return false; check_object_size(addr, bytes, is_source); return true; } #endif /* __LINUX_UCOPYSIZE_H__ */
3 3 3 3 3 3 3 3 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 1517 1518 1519 1520 1521 1522 1523 1524 1525 1526 1527 1528 1529 1530 1531 1532 1533 1534 1535 1536 1537 1538 1539 1540 1541 1542 1543 1544 1545 1546 1547 1548 1549 1550 1551 1552 1553 1554 1555 1556 1557 1558 1559 1560 1561 1562 1563 1564 1565 1566 1567 1568 1569 1570 1571 1572 1573 1574 1575 1576 1577 1578 1579 1580 1581 1582 1583 1584 1585 1586 1587 1588 1589 1590 1591 1592 1593 1594 1595 1596 1597 1598 1599 1600 1601 1602 1603 1604 1605 1606 1607 1608 1609 1610 1611 1612 1613 1614 1615 1616 1617 1618 1619 1620 1621 1622 1623 1624 1625 1626 1627 1628 1629 1630 1631 1632 1633 1634 1635 1636 1637 1638 1639 1640 1641 1642 1643 1644 1645 1646 1647 1648 1649 1650 1651 1652 1653 1654 1655 1656 1657 1658 1659 1660 1661 1662 1663 1664 1665 1666 1667 1668 1669 1670 1671 1672 1673 1674 1675 1676 1677 1678 1679 1680 1681 1682 1683 1684 1685 1686 1687 1688 1689 1690 1691 1692 1693 1694 1695 1696 1697 1698 1699 1700 1701 1702 1703 1704 1705 1706 1707 1708 1709 1710 1711 1712 1713 1714 1715 1716 1717 1718 1719 1720 1721 1722 1723 1724 1725 1726 1727 1728 1729 1730 1731 1732 1733 1734 1735 1736 1737 1738 1739 1740 1741 1742 1743 1744 1745 1746 1747 1748 1749 1750 1751 1752 1753 1754 1755 1756 1757 1758 1759 1760 1761 1762 1763 1764 1765 1766 1767 1768 1769 1770 1771 1772 1773 1774 1775 1776 1777 1778 1779 1780 1781 1782 1783 1784 1785 1786 1787 1788 1789 1790 1791 1792 1793 1794 1795 1796 1797 1798 1799 1800 1801 1802 1803 1804 1805 1806 1807 1808 1809 1810 1811 1812 1813 1814 1815 1816 1817 1818 1819 1820 1821 1822 1823 1824 1825 1826 1827 1828 1829 1830 1831 1832 1833 1834 1835 1836 1837 1838 1839 1840 1841 1842 1843 1844 1845 1846 1847 1848 1849 1850 1851 1852 1853 1854 1855 1856 1857 1858 1859 1860 1861 1862 1863 1864 1865 1866 1867 1868 1869 1870 1871 1872 1873 1874 1875 1876 1877 1878 1879 1880 1881 1882 1883 1884 1885 1886 1887 1888 1889 1890 1891 1892 1893 1894 1895 1896 1897 1898 1899 1900 1901 1902 1903 1904 1905 1906 1907 1908 1909 1910 1911 1912 1913 1914 1915 1916 1917 1918 1919 1920 1921 1922 1923 1924 1925 1926 1927 1928 1929 1930 1931 1932 1933 1934 1935 1936 1937 1938 1939 1940 1941 1942 1943 1944 1945 1946 1947 1948 1949 1950 1951 1952 1953 1954 1955 1956 1957 1958 1959 1960 1961 1962 1963 1964 1965 1966 1967 1968 1969 1970 1971 1972 1973 1974 1975 1976 1977 1978 1979 1980 1981 1982 1983 1984 1985 1986 1987 1988 1989 1990 1991 1992 1993 1994 1995 1996 1997 1998 1999 2000 2001 2002 2003 2004 2005 2006 2007 2008 2009 2010 2011 2012 2013 2014 2015 2016 2017 2018 2019 2020 2021 2022 2023 2024 2025 2026 2027 2028 2029 2030 2031 2032 2033 2034 2035 2036 2037 2038 2039 2040 2041 2042 2043 2044 2045 2046 2047 2048 2049 2050 2051 2052 2053 2054 2055 2056 2057 2058 2059 2060 2061 2062 2063 2064 2065 2066 2067 2068 2069 2070 2071 2072 2073 2074 2075 2076 2077 2078 2079 2080 2081 2082 2083 2084 2085 2086 2087 2088 2089 2090 2091 2092 2093 2094 2095 2096 2097 2098 2099 2100 2101 2102 2103 2104 2105 2106 2107 2108 2109 2110 2111 2112 2113 2114 2115 2116 2117 2118 2119 2120 2121 2122 2123 2124 2125 2126 2127 2128 2129 2130 2131 2132 2133 2134 2135 2136 2137 2138 2139 2140 2141 2142 2143 2144 2145 2146 2147 2148 2149 2150 2151 2152 2153 2154 2155 2156 2157 2158 2159 2160 2161 2162 2163 2164 2165 2166 2167 2168 2169 2170 2171 2172 2173 2174 2175 2176 2177 2178 2179 2180 2181 2182 2183 2184 2185 2186 2187 2188 2189 2190 2191 2192 2193 2194 2195 2196 2197 2198 2199 2200 2201 2202 2203 2204 2205 2206 2207 2208 2209 2210 2211 2212 2213 2214 2215 2216 2217 2218 2219 2220 2221 2222 2223 2224 2225 2226 2227 2228 2229 2230 2231 2232 2233 2234 2235 2236 2237 2238 2239 2240 2241 2242 2243 2244 2245 2246 2247 2248 2249 2250 2251 2252 2253 2254 2255 2256 2257 2258 2259 2260 2261 2262 2263 2264 2265 2266 2267 2268 2269 2270 2271 2272 2273 2274 2275 2276 2277 2278 2279 2280 2281 2282 2283 2284 2285 2286 2287 2288 2289 2290 2291 2292 2293 2294 2295 2296 2297 2298 2299 2300 2301 2302 2303 2304 2305 2306 2307 2308 2309 2310 2311 2312 2313 2314 2315 2316 2317 2318 2319 2320 2321 2322 2323 2324 2325 2326 2327 2328 2329 2330 2331 2332 2333 2334 2335 2336 2337 2338 2339 2340 2341 2342 2343 2344 2345 2346 2347 2348 2349 2350 2351 2352 2353 2354 2355 2356 2357 2358 2359 2360 2361 2362 2363 2364 2365 2366 2367 2368 2369 2370 2371 2372 2373 2374 2375 2376 2377 2378 2379 2380 2381 2382 2383 2384 2385 2386 2387 2388 2389 2390 2391 2392 2393 2394 2395 2396 2397 2398 2399 2400 2401 2402 2403 2404 2405 2406 2407 2408 2409 2410 2411 2412 2413 2414 2415 2416 2417 2418 2419 2420 2421 2422 2423 2424 2425 2426 2427 2428 2429 2430 2431 2432 2433 2434 2435 2436 2437 2438 2439 2440 2441 2442 2443 2444 2445 2446 2447 2448 2449 2450 2451 2452 2453 2454 2455 2456 2457 2458 2459 2460 2461 2462 2463 2464 2465 2466 2467 2468 2469 2470 2471 2472 2473 2474 2475 2476 2477 2478 2479 2480 2481 2482 2483 2484 2485 2486 2487 2488 2489 2490 2491 2492 2493 2494 2495 2496 2497 2498 2499 2500 2501 2502 2503 2504 2505 2506 2507 2508 2509 2510 2511 2512 2513 2514 2515 2516 2517 2518 2519 2520 2521 2522 2523 2524 2525 2526 2527 2528 2529 2530 2531 2532 2533 2534 2535 2536 2537 2538 2539 2540 2541 2542 2543 2544 2545 2546 2547 2548 2549 2550 2551 2552 2553 2554 2555 2556 2557 2558 2559 2560 2561 2562 2563 2564 2565 2566 2567 2568 2569 2570 2571 2572 2573 2574 2575 2576 2577 2578 2579 2580 2581 2582 2583 2584 2585 2586 2587 2588 2589 2590 2591 2592 2593 2594 2595 2596 2597 2598 2599 2600 2601 2602 2603 2604 2605 2606 2607 2608 2609 2610 2611 2612 2613 2614 2615 2616 2617 2618 2619 2620 2621 2622 2623 2624 2625 2626 2627 2628 2629 2630 2631 2632 2633 2634 2635 2636 2637 2638 2639 2640 2641 2642 2643 2644 2645 2646 2647 2648 2649 2650 2651 2652 2653 2654 2655 2656 2657 2658 2659 2660 2661 2662 2663 2664 2665 2666 2667 2668 2669 2670 2671 2672 2673 2674 2675 2676 2677 2678 2679 2680 2681 2682 2683 2684 2685 2686 2687 2688 2689 2690 2691 2692 2693 2694 2695 2696 2697 2698 2699 2700 2701 2702 2703 2704 2705 2706 2707 2708 2709 2710 2711 2712 2713 2714 2715 2716 2717 2718 2719 2720 2721 2722 2723 2724 2725 2726 2727 2728 2729 2730 2731 2732 2733 2734 2735 2736 2737 2738 2739 2740 2741 2742 2743 2744 2745 2746 2747 2748 2749 2750 2751 2752 2753 2754 2755 2756 2757 2758 2759 2760 2761 2762 2763 2764 2765 2766 2767 2768 2769 2770 2771 2772 2773 2774 2775 2776 2777 2778 2779 2780 2781 2782 2783 2784 2785 2786 2787 2788 2789 2790 2791 2792 2793 2794 2795 2796 2797 2798 2799 2800 2801 2802 2803 2804 2805 2806 2807 2808 2809 2810 2811 2812 2813 2814 2815 2816 2817 2818 2819 2820 2821 2822 2823 2824 2825 2826 2827 2828 2829 2830 2831 2832 2833 2834 2835 2836 2837 2838 2839 2840 2841 2842 2843 2844 2845 2846 2847 2848 2849 2850 2851 2852 2853 2854 2855 2856 2857 2858 2859 2860 2861 2862 2863 2864 2865 2866 2867 2868 2869 2870 2871 2872 2873 2874 2875 2876 2877 2878 2879 2880 2881 2882 2883 2884 2885 2886 2887 2888 2889 2890 2891 2892 2893 2894 2895 2896 2897 2898 2899 2900 2901 2902 2903 2904 2905 2906 2907 2908 2909 2910 2911 2912 2913 2914 2915 2916 2917 2918 2919 2920 2921 2922 2923 2924 2925 2926 2927 2928 2929 2930 2931 2932 2933 2934 2935 2936 2937 2938 2939 2940 2941 2942 2943 2944 2945 2946 2947 2948 2949 2950 2951 2952 2953 2954 2955 2956 2957 2958 2959 2960 2961 2962 2963 2964 2965 2966 2967 2968 2969 2970 2971 2972 2973 2974 2975 2976 2977 2978 2979 2980 2981 2982 2983 2984 2985 2986 2987 2988 2989 2990 2991 2992 2993 2994 2995 2996 2997 2998 2999 3000 3001 3002 3003 3004 3005 3006 3007 3008 3009 3010 3011 3012 3013 3014 3015 3016 3017 3018 3019 3020 3021 3022 3023 3024 3025 3026 3027 3028 3029 3030 3031 3032 3033 3034 3035 3036 3037 3038 3039 3040 3041 3042 3043 3044 3045 3046 3047 3048 3049 3050 3051 3052 3053 3054 3055 3056 3057 3058 3059 3060 3061 3062 3063 3064 3065 3066 3067 3068 3069 3070 3071 3072 3073 3074 3075 3076 3077 3078 3079 3080 3081 3082 3083 3084 3085 3086 3087 3088 3089 3090 3091 3092 3093 3094 3095 3096 3097 3098 3099 3100 3101 3102 3103 3104 3105 3106 3107 3108 3109 3110 3111 3112 3113 3114 3115 3116 3117 3118 3119 3120 3121 3122 3123 3124 3125 3126 3127 3128 3129 3130 3131 3132 3133 3134 3135 3136 3137 3138 3139 3140 3141 3142 3143 3144 3145 3146 3147 3148 3149 3150 3151 3152 3153 3154 3155 3156 3157 3158 3159 3160 3161 3162 3163 3164 3165 3166 3167 3168 3169 3170 3171 3172 3173 3174 3175 3176 3177 3178 3179 3180 3181 3182 3183 3184 3185 3186 3187 3188 3189 3190 3191 3192 3193 3194 3195 3196 3197 3198 3199 3200 3201 3202 3203 3204 3205 3206 3207 3208 3209 3210 3211 3212 3213 3214 3215 3216 3217 3218 3219 3220 3221 3222 3223 3224 3225 3226 3227 3228 3229 3230 3231 3232 3233 3234 3235 3236 3237 3238 3239 3240 3241 3242 3243 3244 3245 3246 3247 3248 3249 3250 3251 3252 3253 3254 3255 3256 3257 3258 3259 3260 3261 3262 3263 3264 3265 3266 3267 3268 3269 3270 3271 3272 3273 3274 3275 3276 3277 3278 3279 3280 3281 3282 3283 3284 3285 3286 3287 3288 3289 3290 3291 3292 3293 3294 3295 3296 3297 3298 3299 3300 3301 3302 3303 3304 3305 3306 3307 3308 3309 3310 3311 3312 3313 3314 3315 3316 3317 3318 3319 3320 3321 3322 3323 3324 3325 3326 3327 3328 3329 3330 3331 3332 3333 3334 3335 3336 3337 3338 3339 3340 3341 3342 3343 3344 3345 3346 3347 3348 3349 3350 3351 3352 3353 3354 3355 3356 3357 3358 3359 3360 3361 3362 3363 3364 3365 3366 3367 3368 3369 3370 3371 3372 3373 3374 3375 3376 3377 3378 3379 3380 3381 3382 3383 3384 3385 3386 3387 3388 3389 3390 3391 3392 3393 3394 3395 3396 3397 3398 3399 3400 3401 3402 3403 3404 3405 3406 3407 3408 3409 3410 3411 3412 3413 3414 3415 3416 3417 3418 3419 3420 3421 3422 3423 3424 3425 3426 3427 3428 3429 3430 3431 3432 3433 3434 3435 3436 3437 3438 3439 3440 3441 3442 3443 3444 3445 3446 3447 3448 3449 3450 3451 3452 3453 3454 3455 3456 3457 3458 3459 3460 3461 3462 3463 3464 3465 3466 3467 3468 3469 3470 3471 3472 3473 3474 3475 3476 3477 3478 3479 3480 3481 3482 3483 3484 3485 3486 3487 3488 3489 3490 3491 3492 3493 3494 3495 3496 3497 3498 3499 3500 3501 3502 3503 3504 3505 3506 3507 3508 3509 3510 3511 3512 3513 3514 3515 3516 3517 3518 3519 3520 3521 3522 3523 3524 3525 3526 3527 3528 3529 3530 3531 3532 3533 3534 3535 3536 3537 3538 3539 3540 3541 3542 3543 3544 3545 3546 3547 3548 3549 3550 3551 3552 3553 3554 3555 3556 3557 3558 3559 3560 3561 3562 3563 3564 3565 3566 3567 3568 3569 3570 3571 3572 3573 3574 3575 3576 3577 3578 3579 3580 3581 3582 3583 3584 3585 3586 3587 3588 3589 3590 3591 3592 3593 3594 3595 3596 3597 3598 3599 3600 3601 3602 3603 3604 3605 3606 3607 3608 3609 3610 3611 3612 3613 3614 3615 3616 3617 3618 3619 3620 3621 3622 3623 3624 3625 3626 3627 3628 3629 3630 3631 3632 3633 3634 3635 3636 3637 3638 3639 3640 3641 3642 3643 3644 3645 3646 3647 3648 3649 3650 3651 3652 3653 3654 3655 3656 3657 3658 3659 3660 3661 3662 3663 3664 3665 3666 3667 3668 3669 3670 3671 3672 3673 3674 3675 3676 3677 3678 3679 3680 3681 3682 3683 3684 3685 3686 3687 3688 3689 3690 3691 3692 3693 3694 3695 3696 3697 3698 3699 3700 3701 3702 3703 3704 3705 3706 3707 3708 3709 3710 3711 3712 3713 3714 3715 3716 3717 3718 3719 3720 3721 3722 3723 3724 3725 3726 3727 3728 3729 3730 3731 3732 3733 3734 3735 3736 3737 3738 3739 3740 3741 3742 3743 3744 3745 3746 3747 3748 3749 3750 3751 3752 3753 3754 3755 3756 3757 3758 3759 3760 3761 3762 3763 3764 3765 3766 3767 3768 3769 3770 3771 3772 3773 3774 3775 3776 3777 3778 3779 3780 3781 3782 3783 3784 3785 3786 3787 3788 3789 3790 3791 3792 3793 3794 3795 3796 3797 3798 3799 3800 3801 3802 3803 3804 3805 3806 3807 3808 3809 3810 3811 3812 3813 3814 3815 3816 3817 3818 3819 3820 3821 3822 3823 3824 3825 3826 3827 3828 3829 3830 3831 3832 3833 3834 3835 3836 3837 3838 3839 3840 3841 3842 3843 3844 3845 3846 3847 3848 3849 3850 3851 3852 3853 3854 3855 3856 3857 3858 3859 3860 3861 3862 3863 3864 3865 3866 3867 3868 3869 3870 3871 3872 3873 3874 3875 3876 3877 3878 3879 3880 3881 3882 3883 3884 3885 3886 3887 3888 3889 3890 3891 3892 3893 3894 3895 3896 3897 3898 3899 3900 3901 3902 3903 3904 3905 3906 3907 3908 3909 3910 3911 3912 3913 3914 3915 3916 3917 3918 3919 3920 3921 3922 3923 3924 3925 3926 3927 3928 3929 3930 3931 3932 3933 3934 3935 3936 3937 3938 3939 3940 3941 3942 3943 3944 3945 3946 3947 3948 3949 3950 3951 3952 3953 3954 3955 3956 3957 3958 3959 3960 3961 3962 3963 3964 3965 3966 3967 3968 3969 3970 3971 3972 3973 3974 3975 3976 3977 3978 3979 3980 3981 3982 3983 3984 3985 3986 3987 3988 3989 3990 3991 3992 3993 3994 3995 3996 3997 3998 3999 4000 4001 4002 4003 4004 4005 4006 4007 4008 4009 4010 4011 4012 4013 4014 4015 4016 4017 4018 4019 4020 4021 4022 4023 4024 4025 4026 4027 4028 4029 4030 4031 4032 4033 4034 4035 4036 4037 4038 4039 4040 4041 4042 4043 4044 4045 4046 4047 4048 4049 4050 4051 4052 4053 4054 4055 4056 4057 4058 4059 4060 4061 4062 4063 4064 4065 4066 4067 4068 4069 4070 4071 4072 4073 4074 4075 4076 4077 4078 4079 4080 4081 4082 4083 4084 4085 4086 4087 4088 4089 4090 4091 4092 4093 4094 4095 4096 4097 4098 4099 4100 4101 4102 4103 4104 4105 4106 4107 4108 4109 4110 4111 4112 4113 4114 4115 4116 4117 4118 4119 4120 4121 4122 4123 4124 4125 4126 4127 4128 4129 4130 4131 4132 4133 4134 4135 4136 4137 4138 4139 4140 4141 4142 4143 4144 4145 4146 4147 4148 4149 4150 4151 4152 4153 4154 4155 4156 4157 4158 4159 4160 4161 4162 4163 4164 4165 4166 4167 4168 4169 4170 4171 4172 4173 4174 4175 4176 4177 4178 4179 4180 4181 4182 4183 4184 4185 4186 4187 4188 4189 4190 4191 4192 4193 4194 4195 4196 4197 4198 4199 4200 4201 4202 4203 4204 4205 4206 4207 4208 4209 4210 4211 4212 4213 4214 4215 4216 4217 4218 4219 4220 4221 4222 4223 4224 4225 4226 4227 4228 4229 4230 4231 4232 4233 4234 4235 4236 4237 4238 4239 4240 4241 4242 4243 4244 4245 4246 4247 4248 4249 4250 4251 4252 4253 4254 4255 4256 4257 4258 4259 4260 4261 4262 4263 4264 4265 4266 4267 4268 4269 4270 4271 4272 4273 4274 4275 4276 4277 4278 4279 4280 4281 4282 4283 4284 4285 4286 4287 4288 4289 4290 4291 4292 4293 4294 4295 4296 4297 4298 4299 4300 4301 4302 4303 4304 4305 4306 4307 4308 4309 4310 4311 4312 4313 4314 4315 4316 4317 4318 4319 4320 4321 4322 4323 4324 4325 4326 4327 4328 4329 4330 4331 4332 4333 4334 4335 4336 4337 4338 4339 4340 4341 4342 4343 4344 4345 4346 4347 4348 4349 4350 4351 4352 4353 4354 4355 4356 4357 4358 4359 4360 4361 4362 4363 4364 4365 4366 4367 4368 4369 4370 4371 4372 4373 4374 4375 4376 4377 4378 4379 4380 4381 4382 4383 4384 4385 4386 4387 4388 4389 4390 4391 4392 4393 4394 4395 4396 4397 4398 4399 4400 4401 4402 4403 4404 4405 4406 4407 4408 4409 4410 4411 4412 4413 4414 4415 4416 4417 4418 4419 4420 4421 4422 4423 4424 4425 4426 4427 4428 4429 4430 4431 4432 4433 4434 4435 4436 4437 4438 4439 4440 4441 4442 4443 4444 4445 4446 4447 4448 4449 4450 4451 4452 4453 4454 4455 4456 4457 4458 4459 4460 4461 4462 4463 4464 4465 4466 4467 4468 4469 4470 4471 4472 4473 4474 4475 4476 4477 4478 4479 4480 4481 4482 4483 4484 4485 4486 4487 4488 4489 4490 4491 4492 4493 4494 4495 4496 4497 4498 4499 4500 4501 4502 4503 4504 4505 4506 4507 4508 4509 4510 4511 4512 4513 4514 4515 4516 4517 4518 4519 4520 4521 4522 4523 4524 4525 4526 4527 4528 4529 4530 4531 4532 4533 4534 4535 4536 4537 4538 4539 4540 4541 4542 4543 4544 4545 4546 4547 4548 4549 4550 4551 4552 4553 4554 4555 4556 4557 4558 4559 4560 4561 4562 4563 4564 4565 4566 4567 4568 4569 4570 4571 4572 4573 4574 4575 4576 4577 4578 4579 4580 4581 4582 4583 4584 4585 4586 4587 4588 4589 4590 4591 4592 4593 4594 4595 4596 4597 4598 4599 4600 4601 4602 4603 4604 4605 4606 4607 4608 4609 4610 4611 4612 4613 4614 4615 4616 4617 4618 4619 4620 4621 4622 4623 4624 4625 4626 4627 4628 4629 4630 4631 4632 4633 4634 4635 4636 4637 4638 4639 4640 4641 4642 4643 4644 4645 4646 4647 4648 4649 4650 4651 4652 4653 4654 4655 4656 4657 4658 4659 4660 4661 4662 4663 4664 4665 4666 4667 4668 4669 4670 4671 4672 4673 4674 4675 4676 4677 4678 4679 4680 4681 4682 4683 4684 4685 4686 4687 4688 4689 4690 4691 4692 4693 4694 4695 4696 4697 4698 4699 4700 4701 4702 4703 4704 4705 4706 4707 4708 4709 4710 4711 4712 4713 4714 4715 4716 4717 4718 4719 4720 4721 4722 4723 4724 4725 4726 4727 4728 4729 4730 4731 4732 4733 4734 4735 4736 4737 4738 4739 4740 4741 4742 4743 4744 4745 4746 4747 4748 4749 4750 4751 4752 4753 4754 4755 4756 4757 4758 4759 4760 4761 4762 4763 4764 4765 4766 4767 4768 4769 4770 4771 4772 4773 4774 4775 4776 4777 4778 4779 4780 4781 4782 4783 4784 4785 4786 4787 4788 4789 4790 4791 4792 4793 4794 4795 4796 4797 4798 4799 4800 4801 4802 4803 4804 4805 4806 4807 4808 4809 4810 4811 4812 4813 4814 4815 4816 4817 4818 4819 4820 4821 4822 4823 4824 4825 4826 4827 4828 4829 4830 4831 4832 4833 4834 4835 4836 4837 4838 4839 4840 4841 4842 4843 4844 4845 4846 4847 4848 4849 4850 4851 4852 4853 4854 4855 4856 4857 4858 4859 4860 4861 4862 4863 4864 4865 4866 4867 4868 4869 4870 4871 4872 4873 4874 4875 4876 4877 4878 4879 4880 4881 4882 4883 4884 4885 4886 4887 4888 4889 4890 4891 4892 4893 4894 4895 4896 4897 4898 4899 4900 4901 4902 4903 4904 4905 4906 4907 4908 4909 4910 4911 4912 4913 4914 4915 4916 4917 4918 4919 4920 4921 4922 4923 4924 4925 4926 4927 4928 4929 4930 4931 4932 4933 4934 4935 4936 4937 4938 4939 4940 4941 4942 4943 4944 4945 4946 4947 4948 4949 4950 4951 4952 4953 4954 4955 4956 4957 4958 4959 4960 4961 4962 4963 4964 4965 4966 4967 4968 4969 4970 4971 4972 4973 4974 4975 4976 4977 4978 4979 4980 4981 4982 4983 4984 4985 4986 4987 4988 4989 4990 4991 4992 4993 4994 4995 4996 4997 4998 4999 5000 5001 5002 5003 5004 5005 5006 5007 5008 5009 5010 5011 5012 5013 5014 5015 5016 5017 5018 5019 5020 5021 5022 5023 5024 5025 5026 5027 5028 5029 5030 5031 5032 5033 5034 5035 5036 5037 5038 5039 5040 5041 5042 5043 5044 5045 5046 5047 5048 5049 5050 5051 5052 5053 5054 5055 5056 5057 5058 5059 5060 5061 5062 5063 5064 5065 5066 5067 5068 5069 5070 5071 5072 5073 5074 5075 5076 5077 5078 5079 5080 5081 5082 5083 5084 5085 5086 5087 5088 5089 5090 5091 5092 5093 5094 5095 5096 5097 5098 5099 5100 5101 5102 5103 5104 5105 5106 5107 5108 5109 5110 5111 5112 5113 5114 5115 5116 5117 5118 5119 5120 5121 5122 5123 5124 5125 5126 5127 5128 5129 5130 5131 5132 5133 5134 5135 5136 5137 5138 5139 5140 5141 5142 5143 5144 5145 5146 5147 5148 5149 5150 5151 5152 5153 5154 5155 5156 5157 5158 5159 5160 5161 5162 5163 5164 5165 5166 5167 5168 5169 5170 5171 5172 5173 5174 5175 5176 5177 5178 5179 5180 5181 5182 5183 5184 5185 5186 5187 5188 5189 5190 5191 5192 5193 5194 5195 5196 5197 5198 5199 5200 5201 5202 5203 5204 5205 5206 5207 5208 5209 5210 5211 5212 5213 5214 5215 5216 5217 5218 5219 5220 5221 5222 5223 5224 5225 5226 5227 5228 5229 5230 5231 5232 5233 5234 5235 5236 5237 5238 5239 5240 5241 5242 5243 5244 5245 5246 5247 5248 5249 5250 5251 5252 5253 5254 5255 5256 5257 5258 5259 5260 5261 5262 5263 5264 5265 5266 5267 5268 5269 5270 5271 5272 5273 5274 5275 5276 5277 5278 5279 5280 5281 5282 5283 5284 5285 5286 5287 5288 5289 5290 5291 5292 5293 5294 5295 5296 5297 5298 5299 5300 5301 5302 5303 5304 5305 5306 5307 5308 5309 5310 5311 5312 5313 5314 5315 5316 5317 5318 5319 5320 5321 5322 5323 5324 5325 5326 5327 5328 5329 5330 5331 5332 5333 5334 5335 5336 5337 5338 5339 5340 5341 5342 5343 5344 5345 5346 5347 5348 5349 5350 5351 5352 5353 5354 5355 5356 5357 5358 5359 5360 5361 5362 5363 5364 5365 5366 5367 5368 5369 5370 5371 5372 5373 5374 5375 5376 5377 5378 5379 5380 5381 5382 5383 5384 5385 5386 5387 5388 5389 5390 5391 5392 5393 5394 5395 5396 5397 5398 5399 5400 5401 5402 5403 5404 5405 5406 5407 5408 5409 5410 5411 5412 5413 5414 5415 5416 5417 5418 5419 5420 5421 5422 5423 5424 5425 5426 5427 5428 5429 5430 5431 5432 5433 5434 5435 5436 5437 5438 5439 5440 5441 5442 5443 5444 5445 5446 5447 5448 5449 5450 5451 5452 5453 5454 5455 5456 5457 5458 5459 5460 5461 5462 5463 5464 5465 5466 5467 5468 5469 5470 5471 5472 5473 5474 5475 5476 5477 5478 5479 5480 5481 5482 5483 5484 5485 5486 5487 5488 5489 5490 5491 5492 5493 5494 5495 5496 5497 5498 5499 5500 5501 5502 5503 5504 5505 5506 5507 5508 5509 5510 5511 5512 5513 5514 5515 5516 5517 5518 5519 5520 5521 5522 5523 5524 5525 5526 5527 5528 5529 5530 5531 5532 5533 5534 5535 5536 5537 5538 5539 5540 5541 5542 5543 5544 5545 5546 5547 5548 5549 5550 5551 5552 5553 5554 5555 5556 5557 5558 5559 5560 5561 5562 5563 5564 5565 5566 5567 5568 5569 5570 5571 5572 5573 5574 5575 5576 5577 5578 5579 5580 5581 5582 5583 5584 5585 5586 5587 5588 5589 5590 5591 5592 5593 5594 5595 5596 5597 5598 5599 5600 5601 5602 5603 5604 5605 5606 5607 5608 5609 5610 5611 5612 5613 5614 5615 5616 5617 5618 5619 5620 5621 5622 5623 5624 5625 5626 5627 5628 5629 5630 5631 5632 5633 5634 5635 5636 5637 5638 5639 5640 5641 5642 5643 5644 5645 5646 5647 5648 5649 5650 5651 5652 5653 5654 5655 5656 5657 5658 5659 5660 5661 5662 5663 5664 5665 5666 5667 5668 5669 5670 5671 5672 5673 5674 5675 5676 5677 5678 5679 5680 5681 5682 5683 5684 5685 5686 5687 5688 5689 5690 5691 5692 5693 5694 5695 5696 5697 5698 5699 5700 5701 5702 5703 5704 5705 5706 5707 5708 5709 5710 5711 5712 5713 5714 5715 5716 5717 5718 5719 5720 5721 5722 5723 5724 5725 5726 5727 5728 5729 5730 5731 5732 5733 5734 5735 5736 5737 5738 5739 5740 5741 5742 5743 5744 5745 5746 5747 5748 5749 5750 5751 5752 5753 5754 5755 5756 5757 5758 5759 5760 5761 5762 5763 5764 5765 5766 5767 5768 5769 5770 5771 5772 5773 5774 5775 5776 5777 5778 5779 5780 5781 5782 5783 5784 5785 5786 5787 5788 5789 5790 5791 5792 5793 5794 5795 5796 5797 5798 5799 5800 5801 5802 5803 5804 5805 5806 5807 5808 5809 5810 5811 5812 5813 5814 5815 5816 5817 5818 5819 5820 5821 5822 5823 5824 5825 5826 5827 5828 5829 5830 5831 5832 5833 5834 5835 5836 5837 5838 5839 5840 5841 5842 5843 5844 5845 5846 5847 5848 5849 5850 5851 5852 5853 5854 5855 5856 5857 5858 5859 5860 5861 5862 5863 5864 5865 5866 5867 5868 5869 5870 5871 5872 5873 5874 5875 5876 5877 5878 5879 5880 5881 5882 5883 5884 5885 5886 5887 5888 5889 5890 5891 5892 5893 5894 5895 5896 5897 5898 5899 5900 5901 5902 5903 5904 5905 5906 5907 5908 5909 5910 5911 5912 5913 5914 5915 5916 5917 5918 5919 5920 5921 5922 5923 5924 5925 5926 5927 5928 5929 5930 5931 5932 5933 5934 5935 5936 5937 5938 5939 5940 5941 5942 5943 5944 5945 5946 5947 5948 5949 5950 5951 5952 5953 5954 5955 5956 5957 5958 5959 5960 5961 5962 5963 5964 5965 5966 5967 5968 5969 5970 5971 5972 5973 5974 5975 5976 5977 5978 5979 5980 5981 5982 5983 5984 5985 5986 5987 5988 5989 5990 5991 5992 5993 5994 5995 5996 5997 5998 5999 6000 6001 6002 6003 6004 6005 6006 6007 6008 6009 6010 6011 6012 6013 6014 6015 6016 6017 6018 6019 6020 6021 6022 6023 6024 6025 6026 6027 6028 6029 6030 6031 6032 6033 6034 6035 6036 6037 6038 6039 6040 6041 6042 6043 6044 6045 6046 6047 6048 6049 6050 6051 6052 6053 6054 6055 6056 6057 6058 6059 6060 6061 6062 6063 6064 6065 6066 6067 6068 6069 6070 6071 6072 6073 6074 6075 6076 6077 6078 6079 6080 6081 6082 6083 6084 6085 6086 6087 6088 6089 6090 6091 6092 6093 6094 6095 6096 6097 6098 6099 6100 6101 6102 6103 6104 6105 6106 6107 6108 6109 6110 6111 6112 6113 6114 6115 6116 6117 6118 6119 6120 6121 6122 6123 6124 6125 6126 6127 6128 6129 6130 6131 6132 6133 6134 6135 6136 6137 6138 6139 6140 6141 6142 6143 6144 6145 6146 6147 6148 6149 6150 6151 6152 6153 6154 6155 6156 6157 6158 6159 6160 6161 6162 6163 6164 6165 6166 6167 6168 6169 6170 6171 6172 6173 6174 6175 6176 6177 6178 6179 6180 6181 6182 6183 6184 6185 6186 6187 6188 6189 6190 6191 6192 6193 6194 6195 6196 6197 6198 6199 6200 6201 6202 6203 6204 6205 6206 6207 6208 6209 6210 6211 6212 6213 6214 6215 6216 6217 6218 6219 6220 6221 6222 6223 6224 6225 6226 6227 6228 6229 6230 6231 6232 6233 6234 6235 6236 6237 6238 6239 6240 6241 6242 6243 6244 6245 6246 6247 6248 6249 6250 6251 6252 6253 6254 6255 6256 6257 6258 6259 6260 6261 6262 6263 6264 6265 6266 6267 6268 6269 6270 6271 6272 6273 6274 6275 6276 6277 6278 6279 6280 6281 6282 6283 6284 6285 6286 6287 6288 6289 6290 6291 6292 6293 6294 6295 6296 6297 6298 6299 6300 6301 6302 6303 6304 6305 6306 6307 6308 6309 6310 6311 6312 6313 6314 6315 6316 6317 6318 6319 6320 6321 6322 6323 6324 6325 6326 6327 6328 6329 6330 6331 6332 6333 6334 6335 6336 6337 6338 6339 6340 6341 6342 6343 6344 6345 6346 6347 6348 6349 6350 6351 6352 6353 6354 6355 6356 6357 6358 6359 6360 6361 6362 6363 6364 6365 6366 6367 6368 6369 6370 6371 6372 6373 6374 6375 6376 6377 6378 6379 6380 6381 6382 6383 6384 6385 6386 6387 6388 6389 6390 6391 6392 6393 6394 6395 6396 6397 6398 6399 6400 6401 6402 6403 6404 6405 6406 6407 6408 6409 6410 6411 6412 6413 6414 6415 6416 6417 6418 6419 6420 6421 6422 6423 6424 6425 6426 6427 6428 6429 6430 6431 6432 6433 6434 6435 6436 6437 6438 6439 6440 6441 6442 6443 6444 6445 6446 6447 6448 6449 6450 6451 6452 6453 6454 6455 6456 6457 6458 6459 6460 6461 6462 6463 6464 6465 6466 6467 6468 6469 6470 6471 6472 6473 6474 6475 6476 6477 6478 6479 6480 6481 6482 6483 6484 6485 6486 6487 6488 6489 6490 6491 6492 6493 6494 6495 6496 6497 6498 6499 6500 6501 6502 6503 6504 6505 6506 6507 6508 6509 6510 6511 6512 6513 6514 6515 6516 6517 6518 6519 6520 6521 6522 6523 6524 6525 6526 6527 6528 6529 6530 6531 6532 6533 6534 6535 6536 6537 6538 6539 6540 6541 6542 6543 6544 6545 6546 6547 6548 6549 6550 6551 6552 6553 6554 6555 6556 6557 6558 6559 6560 6561 6562 6563 6564 6565 6566 6567 6568 6569 6570 6571 6572 6573 6574 6575 6576 6577 6578 6579 6580 6581 6582 6583 6584 6585 6586 6587 6588 6589 6590 6591 6592 6593 6594 6595 6596 6597 6598 6599 6600 6601 6602 6603 6604 6605 6606 6607 6608 6609 6610 6611 6612 6613 6614 6615 6616 6617 6618 6619 6620 6621 6622 6623 6624 6625 6626 6627 6628 6629 6630 6631 6632 6633 6634 6635 6636 6637 6638 6639 6640 6641 6642 6643 6644 6645 6646 6647 6648 6649 6650 6651 6652 6653 6654 6655 6656 6657 6658 6659 6660 6661 6662 6663 6664 6665 6666 6667 6668 6669 6670 6671 6672 6673 6674 6675 6676 6677 6678 6679 6680 6681 6682 6683 6684 6685 6686 6687 6688 6689 6690 // SPDX-License-Identifier: GPL-2.0 /* * linux/fs/ext4/inode.c * * Copyright (C) 1992, 1993, 1994, 1995 * Remy Card (card@masi.ibp.fr) * Laboratoire MASI - Institut Blaise Pascal * Universite Pierre et Marie Curie (Paris VI) * * from * * linux/fs/minix/inode.c * * Copyright (C) 1991, 1992 Linus Torvalds * * 64-bit file support on 64-bit platforms by Jakub Jelinek * (jj@sunsite.ms.mff.cuni.cz) * * Assorted race fixes, rewrite of ext4_get_block() by Al Viro, 2000 */ #include <linux/fs.h> #include <linux/mount.h> #include <linux/time.h> #include <linux/highuid.h> #include <linux/pagemap.h> #include <linux/dax.h> #include <linux/quotaops.h> #include <linux/string.h> #include <linux/buffer_head.h> #include <linux/writeback.h> #include <linux/pagevec.h> #include <linux/mpage.h> #include <linux/rmap.h> #include <linux/namei.h> #include <linux/uio.h> #include <linux/bio.h> #include <linux/workqueue.h> #include <linux/kernel.h> #include <linux/printk.h> #include <linux/slab.h> #include <linux/bitops.h> #include <linux/iomap.h> #include <linux/iversion.h> #include "ext4_jbd2.h" #include "xattr.h" #include "acl.h" #include "truncate.h" #include <trace/events/ext4.h> static void ext4_journalled_zero_new_buffers(handle_t *handle, struct inode *inode, struct folio *folio, unsigned from, unsigned to); static __u32 ext4_inode_csum(struct inode *inode, struct ext4_inode *raw, struct ext4_inode_info *ei) { __u32 csum; __u16 dummy_csum = 0; int offset = offsetof(struct ext4_inode, i_checksum_lo); unsigned int csum_size = sizeof(dummy_csum); csum = ext4_chksum(ei->i_csum_seed, (__u8 *)raw, offset); csum = ext4_chksum(csum, (__u8 *)&dummy_csum, csum_size); offset += csum_size; csum = ext4_chksum(csum, (__u8 *)raw + offset, EXT4_GOOD_OLD_INODE_SIZE - offset); if (EXT4_INODE_SIZE(inode->i_sb) > EXT4_GOOD_OLD_INODE_SIZE) { offset = offsetof(struct ext4_inode, i_checksum_hi); csum = ext4_chksum(csum, (__u8 *)raw + EXT4_GOOD_OLD_INODE_SIZE, offset - EXT4_GOOD_OLD_INODE_SIZE); if (EXT4_FITS_IN_INODE(raw, ei, i_checksum_hi)) { csum = ext4_chksum(csum, (__u8 *)&dummy_csum, csum_size); offset += csum_size; } csum = ext4_chksum(csum, (__u8 *)raw + offset, EXT4_INODE_SIZE(inode->i_sb) - offset); } return csum; } static int ext4_inode_csum_verify(struct inode *inode, struct ext4_inode *raw, struct ext4_inode_info *ei) { __u32 provided, calculated; if (EXT4_SB(inode->i_sb)->s_es->s_creator_os != cpu_to_le32(EXT4_OS_LINUX) || !ext4_has_feature_metadata_csum(inode->i_sb)) return 1; provided = le16_to_cpu(raw->i_checksum_lo); calculated = ext4_inode_csum(inode, raw, ei); if (EXT4_INODE_SIZE(inode->i_sb) > EXT4_GOOD_OLD_INODE_SIZE && EXT4_FITS_IN_INODE(raw, ei, i_checksum_hi)) provided |= ((__u32)le16_to_cpu(raw->i_checksum_hi)) << 16; else calculated &= 0xFFFF; return provided == calculated; } void ext4_inode_csum_set(struct inode *inode, struct ext4_inode *raw, struct ext4_inode_info *ei) { __u32 csum; if (EXT4_SB(inode->i_sb)->s_es->s_creator_os != cpu_to_le32(EXT4_OS_LINUX) || !ext4_has_feature_metadata_csum(inode->i_sb)) return; csum = ext4_inode_csum(inode, raw, ei); raw->i_checksum_lo = cpu_to_le16(csum & 0xFFFF); if (EXT4_INODE_SIZE(inode->i_sb) > EXT4_GOOD_OLD_INODE_SIZE && EXT4_FITS_IN_INODE(raw, ei, i_checksum_hi)) raw->i_checksum_hi = cpu_to_le16(csum >> 16); } static inline int ext4_begin_ordered_truncate(struct inode *inode, loff_t new_size) { trace_ext4_begin_ordered_truncate(inode, new_size); /* * If jinode is zero, then we never opened the file for * writing, so there's no need to call * jbd2_journal_begin_ordered_truncate() since there's no * outstanding writes we need to flush. */ if (!EXT4_I(inode)->jinode) return 0; return jbd2_journal_begin_ordered_truncate(EXT4_JOURNAL(inode), EXT4_I(inode)->jinode, new_size); } /* * Test whether an inode is a fast symlink. * A fast symlink has its symlink data stored in ext4_inode_info->i_data. */ int ext4_inode_is_fast_symlink(struct inode *inode) { if (!(EXT4_I(inode)->i_flags & EXT4_EA_INODE_FL)) { int ea_blocks = EXT4_I(inode)->i_file_acl ? EXT4_CLUSTER_SIZE(inode->i_sb) >> 9 : 0; if (ext4_has_inline_data(inode)) return 0; return (S_ISLNK(inode->i_mode) && inode->i_blocks - ea_blocks == 0); } return S_ISLNK(inode->i_mode) && inode->i_size && (inode->i_size < EXT4_N_BLOCKS * 4); } /* * Called at the last iput() if i_nlink is zero. */ void ext4_evict_inode(struct inode *inode) { handle_t *handle; int err; /* * Credits for final inode cleanup and freeing: * sb + inode (ext4_orphan_del()), block bitmap, group descriptor * (xattr block freeing), bitmap, group descriptor (inode freeing) */ int extra_credits = 6; struct ext4_xattr_inode_array *ea_inode_array = NULL; bool freeze_protected = false; trace_ext4_evict_inode(inode); dax_break_layout_final(inode); if (EXT4_I(inode)->i_flags & EXT4_EA_INODE_FL) ext4_evict_ea_inode(inode); if (inode->i_nlink) { truncate_inode_pages_final(&inode->i_data); goto no_delete; } if (is_bad_inode(inode)) goto no_delete; dquot_initialize(inode); if (ext4_should_order_data(inode)) ext4_begin_ordered_truncate(inode, 0); truncate_inode_pages_final(&inode->i_data); /* * For inodes with journalled data, transaction commit could have * dirtied the inode. And for inodes with dioread_nolock, unwritten * extents converting worker could merge extents and also have dirtied * the inode. Flush worker is ignoring it because of I_FREEING flag but * we still need to remove the inode from the writeback lists. */ if (!list_empty_careful(&inode->i_io_list)) inode_io_list_del(inode); /* * Protect us against freezing - iput() caller didn't have to have any * protection against it. When we are in a running transaction though, * we are already protected against freezing and we cannot grab further * protection due to lock ordering constraints. */ if (!ext4_journal_current_handle()) { sb_start_intwrite(inode->i_sb); freeze_protected = true; } if (!IS_NOQUOTA(inode)) extra_credits += EXT4_MAXQUOTAS_DEL_BLOCKS(inode->i_sb); /* * Block bitmap, group descriptor, and inode are accounted in both * ext4_blocks_for_truncate() and extra_credits. So subtract 3. */ handle = ext4_journal_start(inode, EXT4_HT_TRUNCATE, ext4_blocks_for_truncate(inode) + extra_credits - 3); if (IS_ERR(handle)) { ext4_std_error(inode->i_sb, PTR_ERR(handle)); /* * If we're going to skip the normal cleanup, we still need to * make sure that the in-core orphan linked list is properly * cleaned up. */ ext4_orphan_del(NULL, inode); if (freeze_protected) sb_end_intwrite(inode->i_sb); goto no_delete; } if (IS_SYNC(inode)) ext4_handle_sync(handle); /* * Set inode->i_size to 0 before calling ext4_truncate(). We need * special handling of symlinks here because i_size is used to * determine whether ext4_inode_info->i_data contains symlink data or * block mappings. Setting i_size to 0 will remove its fast symlink * status. Erase i_data so that it becomes a valid empty block map. */ if (ext4_inode_is_fast_symlink(inode)) memset(EXT4_I(inode)->i_data, 0, sizeof(EXT4_I(inode)->i_data)); inode->i_size = 0; err = ext4_mark_inode_dirty(handle, inode); if (err) { ext4_warning(inode->i_sb, "couldn't mark inode dirty (err %d)", err); goto stop_handle; } if (inode->i_blocks) { err = ext4_truncate(inode); if (err) { ext4_error_err(inode->i_sb, -err, "couldn't truncate inode %lu (err %d)", inode->i_ino, err); goto stop_handle; } } /* Remove xattr references. */ err = ext4_xattr_delete_inode(handle, inode, &ea_inode_array, extra_credits); if (err) { ext4_warning(inode->i_sb, "xattr delete (err %d)", err); stop_handle: ext4_journal_stop(handle); ext4_orphan_del(NULL, inode); if (freeze_protected) sb_end_intwrite(inode->i_sb); ext4_xattr_inode_array_free(ea_inode_array); goto no_delete; } /* * Kill off the orphan record which ext4_truncate created. * AKPM: I think this can be inside the above `if'. * Note that ext4_orphan_del() has to be able to cope with the * deletion of a non-existent orphan - this is because we don't * know if ext4_truncate() actually created an orphan record. * (Well, we could do this if we need to, but heck - it works) */ ext4_orphan_del(handle, inode); EXT4_I(inode)->i_dtime = (__u32)ktime_get_real_seconds(); /* * One subtle ordering requirement: if anything has gone wrong * (transaction abort, IO errors, whatever), then we can still * do these next steps (the fs will already have been marked as * having errors), but we can't free the inode if the mark_dirty * fails. */ if (ext4_mark_inode_dirty(handle, inode)) /* If that failed, just do the required in-core inode clear. */ ext4_clear_inode(inode); else ext4_free_inode(handle, inode); ext4_journal_stop(handle); if (freeze_protected) sb_end_intwrite(inode->i_sb); ext4_xattr_inode_array_free(ea_inode_array); return; no_delete: /* * Check out some where else accidentally dirty the evicting inode, * which may probably cause inode use-after-free issues later. */ WARN_ON_ONCE(!list_empty_careful(&inode->i_io_list)); if (!list_empty(&EXT4_I(inode)->i_fc_list)) ext4_fc_mark_ineligible(inode->i_sb, EXT4_FC_REASON_NOMEM, NULL); ext4_clear_inode(inode); /* We must guarantee clearing of inode... */ } #ifdef CONFIG_QUOTA qsize_t *ext4_get_reserved_space(struct inode *inode) { return &EXT4_I(inode)->i_reserved_quota; } #endif /* * Called with i_data_sem down, which is important since we can call * ext4_discard_preallocations() from here. */ void ext4_da_update_reserve_space(struct inode *inode, int used, int quota_claim) { struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb); struct ext4_inode_info *ei = EXT4_I(inode); spin_lock(&ei->i_block_reservation_lock); trace_ext4_da_update_reserve_space(inode, used, quota_claim); if (unlikely(used > ei->i_reserved_data_blocks)) { ext4_warning(inode->i_sb, "%s: ino %lu, used %d " "with only %d reserved data blocks", __func__, inode->i_ino, used, ei->i_reserved_data_blocks); WARN_ON(1); used = ei->i_reserved_data_blocks; } /* Update per-inode reservations */ ei->i_reserved_data_blocks -= used; percpu_counter_sub(&sbi->s_dirtyclusters_counter, used); spin_unlock(&ei->i_block_reservation_lock); /* Update quota subsystem for data blocks */ if (quota_claim) dquot_claim_block(inode, EXT4_C2B(sbi, used)); else { /* * We did fallocate with an offset that is already delayed * allocated. So on delayed allocated writeback we should * not re-claim the quota for fallocated blocks. */ dquot_release_reservation_block(inode, EXT4_C2B(sbi, used)); } /* * If we have done all the pending block allocations and if * there aren't any writers on the inode, we can discard the * inode's preallocations. */ if ((ei->i_reserved_data_blocks == 0) && !inode_is_open_for_write(inode)) ext4_discard_preallocations(inode); } static int __check_block_validity(struct inode *inode, const char *func, unsigned int line, struct ext4_map_blocks *map) { journal_t *journal = EXT4_SB(inode->i_sb)->s_journal; if (journal && inode == journal->j_inode) return 0; if (!ext4_inode_block_valid(inode, map->m_pblk, map->m_len)) { ext4_error_inode(inode, func, line, map->m_pblk, "lblock %lu mapped to illegal pblock %llu " "(length %d)", (unsigned long) map->m_lblk, map->m_pblk, map->m_len); return -EFSCORRUPTED; } return 0; } int ext4_issue_zeroout(struct inode *inode, ext4_lblk_t lblk, ext4_fsblk_t pblk, ext4_lblk_t len) { int ret; if (IS_ENCRYPTED(inode) && S_ISREG(inode->i_mode)) return fscrypt_zeroout_range(inode, lblk, pblk, len); ret = sb_issue_zeroout(inode->i_sb, pblk, len, GFP_NOFS); if (ret > 0) ret = 0; return ret; } /* * For generic regular files, when updating the extent tree, Ext4 should * hold the i_rwsem and invalidate_lock exclusively. This ensures * exclusion against concurrent page faults, as well as reads and writes. */ #ifdef CONFIG_EXT4_DEBUG void ext4_check_map_extents_env(struct inode *inode) { if (EXT4_SB(inode->i_sb)->s_mount_state & EXT4_FC_REPLAY) return; if (!S_ISREG(inode->i_mode) || IS_NOQUOTA(inode) || IS_VERITY(inode) || is_special_ino(inode->i_sb, inode->i_ino) || (inode->i_state & (I_FREEING | I_WILL_FREE | I_NEW)) || ext4_test_inode_flag(inode, EXT4_INODE_EA_INODE) || ext4_verity_in_progress(inode)) return; WARN_ON_ONCE(!inode_is_locked(inode) && !rwsem_is_locked(&inode->i_mapping->invalidate_lock)); } #else void ext4_check_map_extents_env(struct inode *inode) {} #endif #define check_block_validity(inode, map) \ __check_block_validity((inode), __func__, __LINE__, (map)) #ifdef ES_AGGRESSIVE_TEST static void ext4_map_blocks_es_recheck(handle_t *handle, struct inode *inode, struct ext4_map_blocks *es_map, struct ext4_map_blocks *map, int flags) { int retval; map->m_flags = 0; /* * There is a race window that the result is not the same. * e.g. xfstests #223 when dioread_nolock enables. The reason * is that we lookup a block mapping in extent status tree with * out taking i_data_sem. So at the time the unwritten extent * could be converted. */ down_read(&EXT4_I(inode)->i_data_sem); if (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)) { retval = ext4_ext_map_blocks(handle, inode, map, 0); } else { retval = ext4_ind_map_blocks(handle, inode, map, 0); } up_read((&EXT4_I(inode)->i_data_sem)); /* * We don't check m_len because extent will be collpased in status * tree. So the m_len might not equal. */ if (es_map->m_lblk != map->m_lblk || es_map->m_flags != map->m_flags || es_map->m_pblk != map->m_pblk) { printk("ES cache assertion failed for inode: %lu " "es_cached ex [%d/%d/%llu/%x] != " "found ex [%d/%d/%llu/%x] retval %d flags %x\n", inode->i_ino, es_map->m_lblk, es_map->m_len, es_map->m_pblk, es_map->m_flags, map->m_lblk, map->m_len, map->m_pblk, map->m_flags, retval, flags); } } #endif /* ES_AGGRESSIVE_TEST */ static int ext4_map_query_blocks_next_in_leaf(handle_t *handle, struct inode *inode, struct ext4_map_blocks *map, unsigned int orig_mlen) { struct ext4_map_blocks map2; unsigned int status, status2; int retval; status = map->m_flags & EXT4_MAP_UNWRITTEN ? EXTENT_STATUS_UNWRITTEN : EXTENT_STATUS_WRITTEN; WARN_ON_ONCE(!(map->m_flags & EXT4_MAP_QUERY_LAST_IN_LEAF)); WARN_ON_ONCE(orig_mlen <= map->m_len); /* Prepare map2 for lookup in next leaf block */ map2.m_lblk = map->m_lblk + map->m_len; map2.m_len = orig_mlen - map->m_len; map2.m_flags = 0; retval = ext4_ext_map_blocks(handle, inode, &map2, 0); if (retval <= 0) { ext4_es_insert_extent(inode, map->m_lblk, map->m_len, map->m_pblk, status, false); return map->m_len; } if (unlikely(retval != map2.m_len)) { ext4_warning(inode->i_sb, "ES len assertion failed for inode " "%lu: retval %d != map->m_len %d", inode->i_ino, retval, map2.m_len); WARN_ON(1); } status2 = map2.m_flags & EXT4_MAP_UNWRITTEN ? EXTENT_STATUS_UNWRITTEN : EXTENT_STATUS_WRITTEN; /* * If map2 is contiguous with map, then let's insert it as a single * extent in es cache and return the combined length of both the maps. */ if (map->m_pblk + map->m_len == map2.m_pblk && status == status2) { ext4_es_insert_extent(inode, map->m_lblk, map->m_len + map2.m_len, map->m_pblk, status, false); map->m_len += map2.m_len; } else { ext4_es_insert_extent(inode, map->m_lblk, map->m_len, map->m_pblk, status, false); } return map->m_len; } static int ext4_map_query_blocks(handle_t *handle, struct inode *inode, struct ext4_map_blocks *map, int flags) { unsigned int status; int retval; unsigned int orig_mlen = map->m_len; flags &= EXT4_EX_QUERY_FILTER; if (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)) retval = ext4_ext_map_blocks(handle, inode, map, flags); else retval = ext4_ind_map_blocks(handle, inode, map, flags); if (retval <= 0) return retval; if (unlikely(retval != map->m_len)) { ext4_warning(inode->i_sb, "ES len assertion failed for inode " "%lu: retval %d != map->m_len %d", inode->i_ino, retval, map->m_len); WARN_ON(1); } /* * No need to query next in leaf: * - if returned extent is not last in leaf or * - if the last in leaf is the full requested range */ if (!(map->m_flags & EXT4_MAP_QUERY_LAST_IN_LEAF) || map->m_len == orig_mlen) { status = map->m_flags & EXT4_MAP_UNWRITTEN ? EXTENT_STATUS_UNWRITTEN : EXTENT_STATUS_WRITTEN; ext4_es_insert_extent(inode, map->m_lblk, map->m_len, map->m_pblk, status, false); return retval; } return ext4_map_query_blocks_next_in_leaf(handle, inode, map, orig_mlen); } static int ext4_map_create_blocks(handle_t *handle, struct inode *inode, struct ext4_map_blocks *map, int flags) { struct extent_status es; unsigned int status; int err, retval = 0; /* * We pass in the magic EXT4_GET_BLOCKS_DELALLOC_RESERVE * indicates that the blocks and quotas has already been * checked when the data was copied into the page cache. */ if (map->m_flags & EXT4_MAP_DELAYED) flags |= EXT4_GET_BLOCKS_DELALLOC_RESERVE; /* * Here we clear m_flags because after allocating an new extent, * it will be set again. */ map->m_flags &= ~EXT4_MAP_FLAGS; /* * We need to check for EXT4 here because migrate could have * changed the inode type in between. */ if (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)) { retval = ext4_ext_map_blocks(handle, inode, map, flags); } else { retval = ext4_ind_map_blocks(handle, inode, map, flags); /* * We allocated new blocks which will result in i_data's * format changing. Force the migrate to fail by clearing * migrate flags. */ if (retval > 0 && map->m_flags & EXT4_MAP_NEW) ext4_clear_inode_state(inode, EXT4_STATE_EXT_MIGRATE); } if (retval <= 0) return retval; if (unlikely(retval != map->m_len)) { ext4_warning(inode->i_sb, "ES len assertion failed for inode %lu: " "retval %d != map->m_len %d", inode->i_ino, retval, map->m_len); WARN_ON(1); } /* * We have to zeroout blocks before inserting them into extent * status tree. Otherwise someone could look them up there and * use them before they are really zeroed. We also have to * unmap metadata before zeroing as otherwise writeback can * overwrite zeros with stale data from block device. */ if (flags & EXT4_GET_BLOCKS_ZERO && map->m_flags & EXT4_MAP_MAPPED && map->m_flags & EXT4_MAP_NEW) { err = ext4_issue_zeroout(inode, map->m_lblk, map->m_pblk, map->m_len); if (err) return err; } /* * If the extent has been zeroed out, we don't need to update * extent status tree. */ if (flags & EXT4_GET_BLOCKS_PRE_IO && ext4_es_lookup_extent(inode, map->m_lblk, NULL, &es)) { if (ext4_es_is_written(&es)) return retval; } status = map->m_flags & EXT4_MAP_UNWRITTEN ? EXTENT_STATUS_UNWRITTEN : EXTENT_STATUS_WRITTEN; ext4_es_insert_extent(inode, map->m_lblk, map->m_len, map->m_pblk, status, flags & EXT4_GET_BLOCKS_DELALLOC_RESERVE); return retval; } /* * The ext4_map_blocks() function tries to look up the requested blocks, * and returns if the blocks are already mapped. * * Otherwise it takes the write lock of the i_data_sem and allocate blocks * and store the allocated blocks in the result buffer head and mark it * mapped. * * If file type is extents based, it will call ext4_ext_map_blocks(), * Otherwise, call with ext4_ind_map_blocks() to handle indirect mapping * based files * * On success, it returns the number of blocks being mapped or allocated. * If flags doesn't contain EXT4_GET_BLOCKS_CREATE the blocks are * pre-allocated and unwritten, the resulting @map is marked as unwritten. * If the flags contain EXT4_GET_BLOCKS_CREATE, it will mark @map as mapped. * * It returns 0 if plain look up failed (blocks have not been allocated), in * that case, @map is returned as unmapped but we still do fill map->m_len to * indicate the length of a hole starting at map->m_lblk. * * It returns the error in case of allocation failure. */ int ext4_map_blocks(handle_t *handle, struct inode *inode, struct ext4_map_blocks *map, int flags) { struct extent_status es; int retval; int ret = 0; unsigned int orig_mlen = map->m_len; #ifdef ES_AGGRESSIVE_TEST struct ext4_map_blocks orig_map; memcpy(&orig_map, map, sizeof(*map)); #endif map->m_flags = 0; ext_debug(inode, "flag 0x%x, max_blocks %u, logical block %lu\n", flags, map->m_len, (unsigned long) map->m_lblk); /* * ext4_map_blocks returns an int, and m_len is an unsigned int */ if (unlikely(map->m_len > INT_MAX)) map->m_len = INT_MAX; /* We can handle the block number less than EXT_MAX_BLOCKS */ if (unlikely(map->m_lblk >= EXT_MAX_BLOCKS)) return -EFSCORRUPTED; /* * Callers from the context of data submission are the only exceptions * for regular files that do not hold the i_rwsem or invalidate_lock. * However, caching unrelated ranges is not permitted. */ if (flags & EXT4_GET_BLOCKS_IO_SUBMIT) WARN_ON_ONCE(!(flags & EXT4_EX_NOCACHE)); else ext4_check_map_extents_env(inode); /* Lookup extent status tree firstly */ if (!(EXT4_SB(inode->i_sb)->s_mount_state & EXT4_FC_REPLAY) && ext4_es_lookup_extent(inode, map->m_lblk, NULL, &es)) { if (ext4_es_is_written(&es) || ext4_es_is_unwritten(&es)) { map->m_pblk = ext4_es_pblock(&es) + map->m_lblk - es.es_lblk; map->m_flags |= ext4_es_is_written(&es) ? EXT4_MAP_MAPPED : EXT4_MAP_UNWRITTEN; retval = es.es_len - (map->m_lblk - es.es_lblk); if (retval > map->m_len) retval = map->m_len; map->m_len = retval; } else if (ext4_es_is_delayed(&es) || ext4_es_is_hole(&es)) { map->m_pblk = 0; map->m_flags |= ext4_es_is_delayed(&es) ? EXT4_MAP_DELAYED : 0; retval = es.es_len - (map->m_lblk - es.es_lblk); if (retval > map->m_len) retval = map->m_len; map->m_len = retval; retval = 0; } else { BUG(); } if (flags & EXT4_GET_BLOCKS_CACHED_NOWAIT) return retval; #ifdef ES_AGGRESSIVE_TEST ext4_map_blocks_es_recheck(handle, inode, map, &orig_map, flags); #endif if (!(flags & EXT4_GET_BLOCKS_QUERY_LAST_IN_LEAF) || orig_mlen == map->m_len) goto found; if (flags & EXT4_GET_BLOCKS_QUERY_LAST_IN_LEAF) map->m_len = orig_mlen; } /* * In the query cache no-wait mode, nothing we can do more if we * cannot find extent in the cache. */ if (flags & EXT4_GET_BLOCKS_CACHED_NOWAIT) return 0; /* * Try to see if we can get the block without requesting a new * file system block. */ down_read(&EXT4_I(inode)->i_data_sem); retval = ext4_map_query_blocks(handle, inode, map, flags); up_read((&EXT4_I(inode)->i_data_sem)); found: if (retval > 0 && map->m_flags & EXT4_MAP_MAPPED) { ret = check_block_validity(inode, map); if (ret != 0) return ret; } /* If it is only a block(s) look up */ if ((flags & EXT4_GET_BLOCKS_CREATE) == 0) return retval; /* * Returns if the blocks have already allocated * * Note that if blocks have been preallocated * ext4_ext_map_blocks() returns with buffer head unmapped */ if (retval > 0 && map->m_flags & EXT4_MAP_MAPPED) /* * If we need to convert extent to unwritten * we continue and do the actual work in * ext4_ext_map_blocks() */ if (!(flags & EXT4_GET_BLOCKS_CONVERT_UNWRITTEN)) return retval; ext4_fc_track_inode(handle, inode); /* * New blocks allocate and/or writing to unwritten extent * will possibly result in updating i_data, so we take * the write lock of i_data_sem, and call get_block() * with create == 1 flag. */ down_write(&EXT4_I(inode)->i_data_sem); retval = ext4_map_create_blocks(handle, inode, map, flags); up_write((&EXT4_I(inode)->i_data_sem)); if (retval > 0 && map->m_flags & EXT4_MAP_MAPPED) { ret = check_block_validity(inode, map); if (ret != 0) return ret; /* * Inodes with freshly allocated blocks where contents will be * visible after transaction commit must be on transaction's * ordered data list. */ if (map->m_flags & EXT4_MAP_NEW && !(map->m_flags & EXT4_MAP_UNWRITTEN) && !(flags & EXT4_GET_BLOCKS_ZERO) && !ext4_is_quota_file(inode) && ext4_should_order_data(inode)) { loff_t start_byte = (loff_t)map->m_lblk << inode->i_blkbits; loff_t length = (loff_t)map->m_len << inode->i_blkbits; if (flags & EXT4_GET_BLOCKS_IO_SUBMIT) ret = ext4_jbd2_inode_add_wait(handle, inode, start_byte, length); else ret = ext4_jbd2_inode_add_write(handle, inode, start_byte, length); if (ret) return ret; } } if (retval > 0 && (map->m_flags & EXT4_MAP_UNWRITTEN || map->m_flags & EXT4_MAP_MAPPED)) ext4_fc_track_range(handle, inode, map->m_lblk, map->m_lblk + map->m_len - 1); if (retval < 0) ext_debug(inode, "failed with err %d\n", retval); return retval; } /* * Update EXT4_MAP_FLAGS in bh->b_state. For buffer heads attached to pages * we have to be careful as someone else may be manipulating b_state as well. */ static void ext4_update_bh_state(struct buffer_head *bh, unsigned long flags) { unsigned long old_state; unsigned long new_state; flags &= EXT4_MAP_FLAGS; /* Dummy buffer_head? Set non-atomically. */ if (!bh->b_folio) { bh->b_state = (bh->b_state & ~EXT4_MAP_FLAGS) | flags; return; } /* * Someone else may be modifying b_state. Be careful! This is ugly but * once we get rid of using bh as a container for mapping information * to pass to / from get_block functions, this can go away. */ old_state = READ_ONCE(bh->b_state); do { new_state = (old_state & ~EXT4_MAP_FLAGS) | flags; } while (unlikely(!try_cmpxchg(&bh->b_state, &old_state, new_state))); } static int _ext4_get_block(struct inode *inode, sector_t iblock, struct buffer_head *bh, int flags) { struct ext4_map_blocks map; int ret = 0; if (ext4_has_inline_data(inode)) return -ERANGE; map.m_lblk = iblock; map.m_len = bh->b_size >> inode->i_blkbits; ret = ext4_map_blocks(ext4_journal_current_handle(), inode, &map, flags); if (ret > 0) { map_bh(bh, inode->i_sb, map.m_pblk); ext4_update_bh_state(bh, map.m_flags); bh->b_size = inode->i_sb->s_blocksize * map.m_len; ret = 0; } else if (ret == 0) { /* hole case, need to fill in bh->b_size */ bh->b_size = inode->i_sb->s_blocksize * map.m_len; } return ret; } int ext4_get_block(struct inode *inode, sector_t iblock, struct buffer_head *bh, int create) { return _ext4_get_block(inode, iblock, bh, create ? EXT4_GET_BLOCKS_CREATE : 0); } /* * Get block function used when preparing for buffered write if we require * creating an unwritten extent if blocks haven't been allocated. The extent * will be converted to written after the IO is complete. */ int ext4_get_block_unwritten(struct inode *inode, sector_t iblock, struct buffer_head *bh_result, int create) { int ret = 0; ext4_debug("ext4_get_block_unwritten: inode %lu, create flag %d\n", inode->i_ino, create); ret = _ext4_get_block(inode, iblock, bh_result, EXT4_GET_BLOCKS_CREATE_UNWRIT_EXT); /* * If the buffer is marked unwritten, mark it as new to make sure it is * zeroed out correctly in case of partial writes. Otherwise, there is * a chance of stale data getting exposed. */ if (ret == 0 && buffer_unwritten(bh_result)) set_buffer_new(bh_result); return ret; } /* Maximum number of blocks we map for direct IO at once. */ #define DIO_MAX_BLOCKS 4096 /* * `handle' can be NULL if create is zero */ struct buffer_head *ext4_getblk(handle_t *handle, struct inode *inode, ext4_lblk_t block, int map_flags) { struct ext4_map_blocks map; struct buffer_head *bh; int create = map_flags & EXT4_GET_BLOCKS_CREATE; bool nowait = map_flags & EXT4_GET_BLOCKS_CACHED_NOWAIT; int err; ASSERT((EXT4_SB(inode->i_sb)->s_mount_state & EXT4_FC_REPLAY) || handle != NULL || create == 0); ASSERT(create == 0 || !nowait); map.m_lblk = block; map.m_len = 1; err = ext4_map_blocks(handle, inode, &map, map_flags); if (err == 0) return create ? ERR_PTR(-ENOSPC) : NULL; if (err < 0) return ERR_PTR(err); if (nowait) return sb_find_get_block(inode->i_sb, map.m_pblk); /* * Since bh could introduce extra ref count such as referred by * journal_head etc. Try to avoid using __GFP_MOVABLE here * as it may fail the migration when journal_head remains. */ bh = getblk_unmovable(inode->i_sb->s_bdev, map.m_pblk, inode->i_sb->s_blocksize); if (unlikely(!bh)) return ERR_PTR(-ENOMEM); if (map.m_flags & EXT4_MAP_NEW) { ASSERT(create != 0); ASSERT((EXT4_SB(inode->i_sb)->s_mount_state & EXT4_FC_REPLAY) || (handle != NULL)); /* * Now that we do not always journal data, we should * keep in mind whether this should always journal the * new buffer as metadata. For now, regular file * writes use ext4_get_block instead, so it's not a * problem. */ lock_buffer(bh); BUFFER_TRACE(bh, "call get_create_access"); err = ext4_journal_get_create_access(handle, inode->i_sb, bh, EXT4_JTR_NONE); if (unlikely(err)) { unlock_buffer(bh); goto errout; } if (!buffer_uptodate(bh)) { memset(bh->b_data, 0, inode->i_sb->s_blocksize); set_buffer_uptodate(bh); } unlock_buffer(bh); BUFFER_TRACE(bh, "call ext4_handle_dirty_metadata"); err = ext4_handle_dirty_metadata(handle, inode, bh); if (unlikely(err)) goto errout; } else BUFFER_TRACE(bh, "not a new buffer"); return bh; errout: brelse(bh); return ERR_PTR(err); } struct buffer_head *ext4_bread(handle_t *handle, struct inode *inode, ext4_lblk_t block, int map_flags) { struct buffer_head *bh; int ret; bh = ext4_getblk(handle, inode, block, map_flags); if (IS_ERR(bh)) return bh; if (!bh || ext4_buffer_uptodate(bh)) return bh; ret = ext4_read_bh_lock(bh, REQ_META | REQ_PRIO, true); if (ret) { put_bh(bh); return ERR_PTR(ret); } return bh; } /* Read a contiguous batch of blocks. */ int ext4_bread_batch(struct inode *inode, ext4_lblk_t block, int bh_count, bool wait, struct buffer_head **bhs) { int i, err; for (i = 0; i < bh_count; i++) { bhs[i] = ext4_getblk(NULL, inode, block + i, 0 /* map_flags */); if (IS_ERR(bhs[i])) { err = PTR_ERR(bhs[i]); bh_count = i; goto out_brelse; } } for (i = 0; i < bh_count; i++) /* Note that NULL bhs[i] is valid because of holes. */ if (bhs[i] && !ext4_buffer_uptodate(bhs[i])) ext4_read_bh_lock(bhs[i], REQ_META | REQ_PRIO, false); if (!wait) return 0; for (i = 0; i < bh_count; i++) if (bhs[i]) wait_on_buffer(bhs[i]); for (i = 0; i < bh_count; i++) { if (bhs[i] && !buffer_uptodate(bhs[i])) { err = -EIO; goto out_brelse; } } return 0; out_brelse: for (i = 0; i < bh_count; i++) { brelse(bhs[i]); bhs[i] = NULL; } return err; } int ext4_walk_page_buffers(handle_t *handle, struct inode *inode, struct buffer_head *head, unsigned from, unsigned to, int *partial, int (*fn)(handle_t *handle, struct inode *inode, struct buffer_head *bh)) { struct buffer_head *bh; unsigned block_start, block_end; unsigned blocksize = head->b_size; int err, ret = 0; struct buffer_head *next; for (bh = head, block_start = 0; ret == 0 && (bh != head || !block_start); block_start = block_end, bh = next) { next = bh->b_this_page; block_end = block_start + blocksize; if (block_end <= from || block_start >= to) { if (partial && !buffer_uptodate(bh)) *partial = 1; continue; } err = (*fn)(handle, inode, bh); if (!ret) ret = err; } return ret; } /* * Helper for handling dirtying of journalled data. We also mark the folio as * dirty so that writeback code knows about this page (and inode) contains * dirty data. ext4_writepages() then commits appropriate transaction to * make data stable. */ static int ext4_dirty_journalled_data(handle_t *handle, struct buffer_head *bh) { struct folio *folio = bh->b_folio; struct inode *inode = folio->mapping->host; /* only regular files have a_ops */ if (S_ISREG(inode->i_mode)) folio_mark_dirty(folio); return ext4_handle_dirty_metadata(handle, NULL, bh); } int do_journal_get_write_access(handle_t *handle, struct inode *inode, struct buffer_head *bh) { if (!buffer_mapped(bh) || buffer_freed(bh)) return 0; BUFFER_TRACE(bh, "get write access"); return ext4_journal_get_write_access(handle, inode->i_sb, bh, EXT4_JTR_NONE); } int ext4_block_write_begin(handle_t *handle, struct folio *folio, loff_t pos, unsigned len, get_block_t *get_block) { unsigned int from = offset_in_folio(folio, pos); unsigned to = from + len; struct inode *inode = folio->mapping->host; unsigned block_start, block_end; sector_t block; int err = 0; unsigned blocksize = inode->i_sb->s_blocksize; unsigned bbits; struct buffer_head *bh, *head, *wait[2]; int nr_wait = 0; int i; bool should_journal_data = ext4_should_journal_data(inode); BUG_ON(!folio_test_locked(folio)); BUG_ON(to > folio_size(folio)); BUG_ON(from > to); head = folio_buffers(folio); if (!head) head = create_empty_buffers(folio, blocksize, 0); bbits = ilog2(blocksize); block = (sector_t)folio->index << (PAGE_SHIFT - bbits); for (bh = head, block_start = 0; bh != head || !block_start; block++, block_start = block_end, bh = bh->b_this_page) { block_end = block_start + blocksize; if (block_end <= from || block_start >= to) { if (folio_test_uptodate(folio)) { set_buffer_uptodate(bh); } continue; } if (buffer_new(bh)) clear_buffer_new(bh); if (!buffer_mapped(bh)) { WARN_ON(bh->b_size != blocksize); err = get_block(inode, block, bh, 1); if (err) break; if (buffer_new(bh)) { /* * We may be zeroing partial buffers or all new * buffers in case of failure. Prepare JBD2 for * that. */ if (should_journal_data) do_journal_get_write_access(handle, inode, bh); if (folio_test_uptodate(folio)) { /* * Unlike __block_write_begin() we leave * dirtying of new uptodate buffers to * ->write_end() time or * folio_zero_new_buffers(). */ set_buffer_uptodate(bh); continue; } if (block_end > to || block_start < from) folio_zero_segments(folio, to, block_end, block_start, from); continue; } } if (folio_test_uptodate(folio)) { set_buffer_uptodate(bh); continue; } if (!buffer_uptodate(bh) && !buffer_delay(bh) && !buffer_unwritten(bh) && (block_start < from || block_end > to)) { ext4_read_bh_lock(bh, 0, false); wait[nr_wait++] = bh; } } /* * If we issued read requests, let them complete. */ for (i = 0; i < nr_wait; i++) { wait_on_buffer(wait[i]); if (!buffer_uptodate(wait[i])) err = -EIO; } if (unlikely(err)) { if (should_journal_data) ext4_journalled_zero_new_buffers(handle, inode, folio, from, to); else folio_zero_new_buffers(folio, from, to); } else if (fscrypt_inode_uses_fs_layer_crypto(inode)) { for (i = 0; i < nr_wait; i++) { int err2; err2 = fscrypt_decrypt_pagecache_blocks(folio, blocksize, bh_offset(wait[i])); if (err2) { clear_buffer_uptodate(wait[i]); err = err2; } } } return err; } /* * To preserve ordering, it is essential that the hole instantiation and * the data write be encapsulated in a single transaction. We cannot * close off a transaction and start a new one between the ext4_get_block() * and the ext4_write_end(). So doing the jbd2_journal_start at the start of * ext4_write_begin() is the right place. */ static int ext4_write_begin(struct file *file, struct address_space *mapping, loff_t pos, unsigned len, struct folio **foliop, void **fsdata) { struct inode *inode = mapping->host; int ret, needed_blocks; handle_t *handle; int retries = 0; struct folio *folio; pgoff_t index; unsigned from, to; fgf_t fgp = FGP_WRITEBEGIN; ret = ext4_emergency_state(inode->i_sb); if (unlikely(ret)) return ret; trace_ext4_write_begin(inode, pos, len); /* * Reserve one block more for addition to orphan list in case * we allocate blocks but write fails for some reason */ needed_blocks = ext4_writepage_trans_blocks(inode) + 1; index = pos >> PAGE_SHIFT; if (ext4_test_inode_state(inode, EXT4_STATE_MAY_INLINE_DATA)) { ret = ext4_try_to_write_inline_data(mapping, inode, pos, len, foliop); if (ret < 0) return ret; if (ret == 1) return 0; } /* * __filemap_get_folio() can take a long time if the * system is thrashing due to memory pressure, or if the folio * is being written back. So grab it first before we start * the transaction handle. This also allows us to allocate * the folio (if needed) without using GFP_NOFS. */ retry_grab: fgp |= fgf_set_order(len); folio = __filemap_get_folio(mapping, index, fgp, mapping_gfp_mask(mapping)); if (IS_ERR(folio)) return PTR_ERR(folio); if (pos + len > folio_pos(folio) + folio_size(folio)) len = folio_pos(folio) + folio_size(folio) - pos; from = offset_in_folio(folio, pos); to = from + len; /* * The same as page allocation, we prealloc buffer heads before * starting the handle. */ if (!folio_buffers(folio)) create_empty_buffers(folio, inode->i_sb->s_blocksize, 0); folio_unlock(folio); retry_journal: handle = ext4_journal_start(inode, EXT4_HT_WRITE_PAGE, needed_blocks); if (IS_ERR(handle)) { folio_put(folio); return PTR_ERR(handle); } folio_lock(folio); if (folio->mapping != mapping) { /* The folio got truncated from under us */ folio_unlock(folio); folio_put(folio); ext4_journal_stop(handle); goto retry_grab; } /* In case writeback began while the folio was unlocked */ folio_wait_stable(folio); if (ext4_should_dioread_nolock(inode)) ret = ext4_block_write_begin(handle, folio, pos, len, ext4_get_block_unwritten); else ret = ext4_block_write_begin(handle, folio, pos, len, ext4_get_block); if (!ret && ext4_should_journal_data(inode)) { ret = ext4_walk_page_buffers(handle, inode, folio_buffers(folio), from, to, NULL, do_journal_get_write_access); } if (ret) { bool extended = (pos + len > inode->i_size) && !ext4_verity_in_progress(inode); folio_unlock(folio); /* * ext4_block_write_begin may have instantiated a few blocks * outside i_size. Trim these off again. Don't need * i_size_read because we hold i_rwsem. * * Add inode to orphan list in case we crash before * truncate finishes */ if (extended && ext4_can_truncate(inode)) ext4_orphan_add(handle, inode); ext4_journal_stop(handle); if (extended) { ext4_truncate_failed_write(inode); /* * If truncate failed early the inode might * still be on the orphan list; we need to * make sure the inode is removed from the * orphan list in that case. */ if (inode->i_nlink) ext4_orphan_del(NULL, inode); } if (ret == -ENOSPC && ext4_should_retry_alloc(inode->i_sb, &retries)) goto retry_journal; folio_put(folio); return ret; } *foliop = folio; return ret; } /* For write_end() in data=journal mode */ static int write_end_fn(handle_t *handle, struct inode *inode, struct buffer_head *bh) { int ret; if (!buffer_mapped(bh) || buffer_freed(bh)) return 0; set_buffer_uptodate(bh); ret = ext4_dirty_journalled_data(handle, bh); clear_buffer_meta(bh); clear_buffer_prio(bh); return ret; } /* * We need to pick up the new inode size which generic_commit_write gave us * `file' can be NULL - eg, when called from page_symlink(). * * ext4 never places buffers on inode->i_mapping->i_private_list. metadata * buffers are managed internally. */ static int ext4_write_end(struct file *file, struct address_space *mapping, loff_t pos, unsigned len, unsigned copied, struct folio *folio, void *fsdata) { handle_t *handle = ext4_journal_current_handle(); struct inode *inode = mapping->host; loff_t old_size = inode->i_size; int ret = 0, ret2; int i_size_changed = 0; bool verity = ext4_verity_in_progress(inode); trace_ext4_write_end(inode, pos, len, copied); if (ext4_has_inline_data(inode) && ext4_test_inode_state(inode, EXT4_STATE_MAY_INLINE_DATA)) return ext4_write_inline_data_end(inode, pos, len, copied, folio); copied = block_write_end(file, mapping, pos, len, copied, folio, fsdata); /* * it's important to update i_size while still holding folio lock: * page writeout could otherwise come in and zero beyond i_size. * * If FS_IOC_ENABLE_VERITY is running on this inode, then Merkle tree * blocks are being written past EOF, so skip the i_size update. */ if (!verity) i_size_changed = ext4_update_inode_size(inode, pos + copied); folio_unlock(folio); folio_put(folio); if (old_size < pos && !verity) { pagecache_isize_extended(inode, old_size, pos); ext4_zero_partial_blocks(handle, inode, old_size, pos - old_size); } /* * Don't mark the inode dirty under folio lock. First, it unnecessarily * makes the holding time of folio lock longer. Second, it forces lock * ordering of folio lock and transaction start for journaling * filesystems. */ if (i_size_changed) ret = ext4_mark_inode_dirty(handle, inode); if (pos + len > inode->i_size && !verity && ext4_can_truncate(inode)) /* if we have allocated more blocks and copied * less. We will have blocks allocated outside * inode->i_size. So truncate them */ ext4_orphan_add(handle, inode); ret2 = ext4_journal_stop(handle); if (!ret) ret = ret2; if (pos + len > inode->i_size && !verity) { ext4_truncate_failed_write(inode); /* * If truncate failed early the inode might still be * on the orphan list; we need to make sure the inode * is removed from the orphan list in that case. */ if (inode->i_nlink) ext4_orphan_del(NULL, inode); } return ret ? ret : copied; } /* * This is a private version of folio_zero_new_buffers() which doesn't * set the buffer to be dirty, since in data=journalled mode we need * to call ext4_dirty_journalled_data() instead. */ static void ext4_journalled_zero_new_buffers(handle_t *handle, struct inode *inode, struct folio *folio, unsigned from, unsigned to) { unsigned int block_start = 0, block_end; struct buffer_head *head, *bh; bh = head = folio_buffers(folio); do { block_end = block_start + bh->b_size; if (buffer_new(bh)) { if (block_end > from && block_start < to) { if (!folio_test_uptodate(folio)) { unsigned start, size; start = max(from, block_start); size = min(to, block_end) - start; folio_zero_range(folio, start, size); } clear_buffer_new(bh); write_end_fn(handle, inode, bh); } } block_start = block_end; bh = bh->b_this_page; } while (bh != head); } static int ext4_journalled_write_end(struct file *file, struct address_space *mapping, loff_t pos, unsigned len, unsigned copied, struct folio *folio, void *fsdata) { handle_t *handle = ext4_journal_current_handle(); struct inode *inode = mapping->host; loff_t old_size = inode->i_size; int ret = 0, ret2; int partial = 0; unsigned from, to; int size_changed = 0; bool verity = ext4_verity_in_progress(inode); trace_ext4_journalled_write_end(inode, pos, len, copied); from = pos & (PAGE_SIZE - 1); to = from + len; BUG_ON(!ext4_handle_valid(handle)); if (ext4_has_inline_data(inode)) return ext4_write_inline_data_end(inode, pos, len, copied, folio); if (unlikely(copied < len) && !folio_test_uptodate(folio)) { copied = 0; ext4_journalled_zero_new_buffers(handle, inode, folio, from, to); } else { if (unlikely(copied < len)) ext4_journalled_zero_new_buffers(handle, inode, folio, from + copied, to); ret = ext4_walk_page_buffers(handle, inode, folio_buffers(folio), from, from + copied, &partial, write_end_fn); if (!partial) folio_mark_uptodate(folio); } if (!verity) size_changed = ext4_update_inode_size(inode, pos + copied); EXT4_I(inode)->i_datasync_tid = handle->h_transaction->t_tid; folio_unlock(folio); folio_put(folio); if (old_size < pos && !verity) { pagecache_isize_extended(inode, old_size, pos); ext4_zero_partial_blocks(handle, inode, old_size, pos - old_size); } if (size_changed) { ret2 = ext4_mark_inode_dirty(handle, inode); if (!ret) ret = ret2; } if (pos + len > inode->i_size && !verity && ext4_can_truncate(inode)) /* if we have allocated more blocks and copied * less. We will have blocks allocated outside * inode->i_size. So truncate them */ ext4_orphan_add(handle, inode); ret2 = ext4_journal_stop(handle); if (!ret) ret = ret2; if (pos + len > inode->i_size && !verity) { ext4_truncate_failed_write(inode); /* * If truncate failed early the inode might still be * on the orphan list; we need to make sure the inode * is removed from the orphan list in that case. */ if (inode->i_nlink) ext4_orphan_del(NULL, inode); } return ret ? ret : copied; } /* * Reserve space for 'nr_resv' clusters */ static int ext4_da_reserve_space(struct inode *inode, int nr_resv) { struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb); struct ext4_inode_info *ei = EXT4_I(inode); int ret; /* * We will charge metadata quota at writeout time; this saves * us from metadata over-estimation, though we may go over by * a small amount in the end. Here we just reserve for data. */ ret = dquot_reserve_block(inode, EXT4_C2B(sbi, nr_resv)); if (ret) return ret; spin_lock(&ei->i_block_reservation_lock); if (ext4_claim_free_clusters(sbi, nr_resv, 0)) { spin_unlock(&ei->i_block_reservation_lock); dquot_release_reservation_block(inode, EXT4_C2B(sbi, nr_resv)); return -ENOSPC; } ei->i_reserved_data_blocks += nr_resv; trace_ext4_da_reserve_space(inode, nr_resv); spin_unlock(&ei->i_block_reservation_lock); return 0; /* success */ } void ext4_da_release_space(struct inode *inode, int to_free) { struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb); struct ext4_inode_info *ei = EXT4_I(inode); if (!to_free) return; /* Nothing to release, exit */ spin_lock(&EXT4_I(inode)->i_block_reservation_lock); trace_ext4_da_release_space(inode, to_free); if (unlikely(to_free > ei->i_reserved_data_blocks)) { /* * if there aren't enough reserved blocks, then the * counter is messed up somewhere. Since this * function is called from invalidate page, it's * harmless to return without any action. */ ext4_warning(inode->i_sb, "ext4_da_release_space: " "ino %lu, to_free %d with only %d reserved " "data blocks", inode->i_ino, to_free, ei->i_reserved_data_blocks); WARN_ON(1); to_free = ei->i_reserved_data_blocks; } ei->i_reserved_data_blocks -= to_free; /* update fs dirty data blocks counter */ percpu_counter_sub(&sbi->s_dirtyclusters_counter, to_free); spin_unlock(&EXT4_I(inode)->i_block_reservation_lock); dquot_release_reservation_block(inode, EXT4_C2B(sbi, to_free)); } /* * Delayed allocation stuff */ struct mpage_da_data { /* These are input fields for ext4_do_writepages() */ struct inode *inode; struct writeback_control *wbc; unsigned int can_map:1; /* Can writepages call map blocks? */ /* These are internal state of ext4_do_writepages() */ pgoff_t first_page; /* The first page to write */ pgoff_t next_page; /* Current page to examine */ pgoff_t last_page; /* Last page to examine */ /* * Extent to map - this can be after first_page because that can be * fully mapped. We somewhat abuse m_flags to store whether the extent * is delalloc or unwritten. */ struct ext4_map_blocks map; struct ext4_io_submit io_submit; /* IO submission data */ unsigned int do_map:1; unsigned int scanned_until_end:1; unsigned int journalled_more_data:1; }; static void mpage_release_unused_pages(struct mpage_da_data *mpd, bool invalidate) { unsigned nr, i; pgoff_t index, end; struct folio_batch fbatch; struct inode *inode = mpd->inode; struct address_space *mapping = inode->i_mapping; /* This is necessary when next_page == 0. */ if (mpd->first_page >= mpd->next_page) return; mpd->scanned_until_end = 0; index = mpd->first_page; end = mpd->next_page - 1; if (invalidate) { ext4_lblk_t start, last; start = index << (PAGE_SHIFT - inode->i_blkbits); last = end << (PAGE_SHIFT - inode->i_blkbits); /* * avoid racing with extent status tree scans made by * ext4_insert_delayed_block() */ down_write(&EXT4_I(inode)->i_data_sem); ext4_es_remove_extent(inode, start, last - start + 1); up_write(&EXT4_I(inode)->i_data_sem); } folio_batch_init(&fbatch); while (index <= end) { nr = filemap_get_folios(mapping, &index, end, &fbatch); if (nr == 0) break; for (i = 0; i < nr; i++) { struct folio *folio = fbatch.folios[i]; if (folio->index < mpd->first_page) continue; if (folio_next_index(folio) - 1 > end) continue; BUG_ON(!folio_test_locked(folio)); BUG_ON(folio_test_writeback(folio)); if (invalidate) { if (folio_mapped(folio)) folio_clear_dirty_for_io(folio); block_invalidate_folio(folio, 0, folio_size(folio)); folio_clear_uptodate(folio); } folio_unlock(folio); } folio_batch_release(&fbatch); } } static void ext4_print_free_blocks(struct inode *inode) { struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb); struct super_block *sb = inode->i_sb; struct ext4_inode_info *ei = EXT4_I(inode); ext4_msg(sb, KERN_CRIT, "Total free blocks count %lld", EXT4_C2B(EXT4_SB(inode->i_sb), ext4_count_free_clusters(sb))); ext4_msg(sb, KERN_CRIT, "Free/Dirty block details"); ext4_msg(sb, KERN_CRIT, "free_blocks=%lld", (long long) EXT4_C2B(EXT4_SB(sb), percpu_counter_sum(&sbi->s_freeclusters_counter))); ext4_msg(sb, KERN_CRIT, "dirty_blocks=%lld", (long long) EXT4_C2B(EXT4_SB(sb), percpu_counter_sum(&sbi->s_dirtyclusters_counter))); ext4_msg(sb, KERN_CRIT, "Block reservation details"); ext4_msg(sb, KERN_CRIT, "i_reserved_data_blocks=%u", ei->i_reserved_data_blocks); return; } /* * Check whether the cluster containing lblk has been allocated or has * delalloc reservation. * * Returns 0 if the cluster doesn't have either, 1 if it has delalloc * reservation, 2 if it's already been allocated, negative error code on * failure. */ static int ext4_clu_alloc_state(struct inode *inode, ext4_lblk_t lblk) { struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb); int ret; /* Has delalloc reservation? */ if (ext4_es_scan_clu(inode, &ext4_es_is_delayed, lblk)) return 1; /* Already been allocated? */ if (ext4_es_scan_clu(inode, &ext4_es_is_mapped, lblk)) return 2; ret = ext4_clu_mapped(inode, EXT4_B2C(sbi, lblk)); if (ret < 0) return ret; if (ret > 0) return 2; return 0; } /* * ext4_insert_delayed_blocks - adds a multiple delayed blocks to the extents * status tree, incrementing the reserved * cluster/block count or making pending * reservations where needed * * @inode - file containing the newly added block * @lblk - start logical block to be added * @len - length of blocks to be added * * Returns 0 on success, negative error code on failure. */ static int ext4_insert_delayed_blocks(struct inode *inode, ext4_lblk_t lblk, ext4_lblk_t len) { struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb); int ret; bool lclu_allocated = false; bool end_allocated = false; ext4_lblk_t resv_clu; ext4_lblk_t end = lblk + len - 1; /* * If the cluster containing lblk or end is shared with a delayed, * written, or unwritten extent in a bigalloc file system, it's * already been accounted for and does not need to be reserved. * A pending reservation must be made for the cluster if it's * shared with a written or unwritten extent and doesn't already * have one. Written and unwritten extents can be purged from the * extents status tree if the system is under memory pressure, so * it's necessary to examine the extent tree if a search of the * extents status tree doesn't get a match. */ if (sbi->s_cluster_ratio == 1) { ret = ext4_da_reserve_space(inode, len); if (ret != 0) /* ENOSPC */ return ret; } else { /* bigalloc */ resv_clu = EXT4_B2C(sbi, end) - EXT4_B2C(sbi, lblk) + 1; ret = ext4_clu_alloc_state(inode, lblk); if (ret < 0) return ret; if (ret > 0) { resv_clu--; lclu_allocated = (ret == 2); } if (EXT4_B2C(sbi, lblk) != EXT4_B2C(sbi, end)) { ret = ext4_clu_alloc_state(inode, end); if (ret < 0) return ret; if (ret > 0) { resv_clu--; end_allocated = (ret == 2); } } if (resv_clu) { ret = ext4_da_reserve_space(inode, resv_clu); if (ret != 0) /* ENOSPC */ return ret; } } ext4_es_insert_delayed_extent(inode, lblk, len, lclu_allocated, end_allocated); return 0; } /* * Looks up the requested blocks and sets the delalloc extent map. * First try to look up for the extent entry that contains the requested * blocks in the extent status tree without i_data_sem, then try to look * up for the ondisk extent mapping with i_data_sem in read mode, * finally hold i_data_sem in write mode, looks up again and add a * delalloc extent entry if it still couldn't find any extent. Pass out * the mapped extent through @map and return 0 on success. */ static int ext4_da_map_blocks(struct inode *inode, struct ext4_map_blocks *map) { struct extent_status es; int retval; #ifdef ES_AGGRESSIVE_TEST struct ext4_map_blocks orig_map; memcpy(&orig_map, map, sizeof(*map)); #endif map->m_flags = 0; ext_debug(inode, "max_blocks %u, logical block %lu\n", map->m_len, (unsigned long) map->m_lblk); ext4_check_map_extents_env(inode); /* Lookup extent status tree firstly */ if (ext4_es_lookup_extent(inode, map->m_lblk, NULL, &es)) { map->m_len = min_t(unsigned int, map->m_len, es.es_len - (map->m_lblk - es.es_lblk)); if (ext4_es_is_hole(&es)) goto add_delayed; found: /* * Delayed extent could be allocated by fallocate. * So we need to check it. */ if (ext4_es_is_delayed(&es)) { map->m_flags |= EXT4_MAP_DELAYED; return 0; } map->m_pblk = ext4_es_pblock(&es) + map->m_lblk - es.es_lblk; if (ext4_es_is_written(&es)) map->m_flags |= EXT4_MAP_MAPPED; else if (ext4_es_is_unwritten(&es)) map->m_flags |= EXT4_MAP_UNWRITTEN; else BUG(); #ifdef ES_AGGRESSIVE_TEST ext4_map_blocks_es_recheck(NULL, inode, map, &orig_map, 0); #endif return 0; } /* * Try to see if we can get the block without requesting a new * file system block. */ down_read(&EXT4_I(inode)->i_data_sem); if (ext4_has_inline_data(inode)) retval = 0; else retval = ext4_map_query_blocks(NULL, inode, map, 0); up_read(&EXT4_I(inode)->i_data_sem); if (retval) return retval < 0 ? retval : 0; add_delayed: down_write(&EXT4_I(inode)->i_data_sem); /* * Page fault path (ext4_page_mkwrite does not take i_rwsem) * and fallocate path (no folio lock) can race. Make sure we * lookup the extent status tree here again while i_data_sem * is held in write mode, before inserting a new da entry in * the extent status tree. */ if (ext4_es_lookup_extent(inode, map->m_lblk, NULL, &es)) { map->m_len = min_t(unsigned int, map->m_len, es.es_len - (map->m_lblk - es.es_lblk)); if (!ext4_es_is_hole(&es)) { up_write(&EXT4_I(inode)->i_data_sem); goto found; } } else if (!ext4_has_inline_data(inode)) { retval = ext4_map_query_blocks(NULL, inode, map, 0); if (retval) { up_write(&EXT4_I(inode)->i_data_sem); return retval < 0 ? retval : 0; } } map->m_flags |= EXT4_MAP_DELAYED; retval = ext4_insert_delayed_blocks(inode, map->m_lblk, map->m_len); up_write(&EXT4_I(inode)->i_data_sem); return retval; } /* * This is a special get_block_t callback which is used by * ext4_da_write_begin(). It will either return mapped block or * reserve space for a single block. * * For delayed buffer_head we have BH_Mapped, BH_New, BH_Delay set. * We also have b_blocknr = -1 and b_bdev initialized properly * * For unwritten buffer_head we have BH_Mapped, BH_New, BH_Unwritten set. * We also have b_blocknr = physicalblock mapping unwritten extent and b_bdev * initialized properly. */ int ext4_da_get_block_prep(struct inode *inode, sector_t iblock, struct buffer_head *bh, int create) { struct ext4_map_blocks map; sector_t invalid_block = ~((sector_t) 0xffff); int ret = 0; BUG_ON(create == 0); BUG_ON(bh->b_size != inode->i_sb->s_blocksize); if (invalid_block < ext4_blocks_count(EXT4_SB(inode->i_sb)->s_es)) invalid_block = ~0; map.m_lblk = iblock; map.m_len = 1; /* * first, we need to know whether the block is allocated already * preallocated blocks are unmapped but should treated * the same as allocated blocks. */ ret = ext4_da_map_blocks(inode, &map); if (ret < 0) return ret; if (map.m_flags & EXT4_MAP_DELAYED) { map_bh(bh, inode->i_sb, invalid_block); set_buffer_new(bh); set_buffer_delay(bh); return 0; } map_bh(bh, inode->i_sb, map.m_pblk); ext4_update_bh_state(bh, map.m_flags); if (buffer_unwritten(bh)) { /* A delayed write to unwritten bh should be marked * new and mapped. Mapped ensures that we don't do * get_block multiple times when we write to the same * offset and new ensures that we do proper zero out * for partial write. */ set_buffer_new(bh); set_buffer_mapped(bh); } return 0; } static void mpage_folio_done(struct mpage_da_data *mpd, struct folio *folio) { mpd->first_page += folio_nr_pages(folio); folio_unlock(folio); } static int mpage_submit_folio(struct mpage_da_data *mpd, struct folio *folio) { size_t len; loff_t size; int err; BUG_ON(folio->index != mpd->first_page); folio_clear_dirty_for_io(folio); /* * We have to be very careful here! Nothing protects writeback path * against i_size changes and the page can be writeably mapped into * page tables. So an application can be growing i_size and writing * data through mmap while writeback runs. folio_clear_dirty_for_io() * write-protects our page in page tables and the page cannot get * written to again until we release folio lock. So only after * folio_clear_dirty_for_io() we are safe to sample i_size for * ext4_bio_write_folio() to zero-out tail of the written page. We rely * on the barrier provided by folio_test_clear_dirty() in * folio_clear_dirty_for_io() to make sure i_size is really sampled only * after page tables are updated. */ size = i_size_read(mpd->inode); len = folio_size(folio); if (folio_pos(folio) + len > size && !ext4_verity_in_progress(mpd->inode)) len = size & (len - 1); err = ext4_bio_write_folio(&mpd->io_submit, folio, len); if (!err) mpd->wbc->nr_to_write -= folio_nr_pages(folio); return err; } #define BH_FLAGS (BIT(BH_Unwritten) | BIT(BH_Delay)) /* * mballoc gives us at most this number of blocks... * XXX: That seems to be only a limitation of ext4_mb_normalize_request(). * The rest of mballoc seems to handle chunks up to full group size. */ #define MAX_WRITEPAGES_EXTENT_LEN 2048 /* * mpage_add_bh_to_extent - try to add bh to extent of blocks to map * * @mpd - extent of blocks * @lblk - logical number of the block in the file * @bh - buffer head we want to add to the extent * * The function is used to collect contig. blocks in the same state. If the * buffer doesn't require mapping for writeback and we haven't started the * extent of buffers to map yet, the function returns 'true' immediately - the * caller can write the buffer right away. Otherwise the function returns true * if the block has been added to the extent, false if the block couldn't be * added. */ static bool mpage_add_bh_to_extent(struct mpage_da_data *mpd, ext4_lblk_t lblk, struct buffer_head *bh) { struct ext4_map_blocks *map = &mpd->map; /* Buffer that doesn't need mapping for writeback? */ if (!buffer_dirty(bh) || !buffer_mapped(bh) || (!buffer_delay(bh) && !buffer_unwritten(bh))) { /* So far no extent to map => we write the buffer right away */ if (map->m_len == 0) return true; return false; } /* First block in the extent? */ if (map->m_len == 0) { /* We cannot map unless handle is started... */ if (!mpd->do_map) return false; map->m_lblk = lblk; map->m_len = 1; map->m_flags = bh->b_state & BH_FLAGS; return true; } /* Don't go larger than mballoc is willing to allocate */ if (map->m_len >= MAX_WRITEPAGES_EXTENT_LEN) return false; /* Can we merge the block to our big extent? */ if (lblk == map->m_lblk + map->m_len && (bh->b_state & BH_FLAGS) == map->m_flags) { map->m_len++; return true; } return false; } /* * mpage_process_page_bufs - submit page buffers for IO or add them to extent * * @mpd - extent of blocks for mapping * @head - the first buffer in the page * @bh - buffer we should start processing from * @lblk - logical number of the block in the file corresponding to @bh * * Walk through page buffers from @bh upto @head (exclusive) and either submit * the page for IO if all buffers in this page were mapped and there's no * accumulated extent of buffers to map or add buffers in the page to the * extent of buffers to map. The function returns 1 if the caller can continue * by processing the next page, 0 if it should stop adding buffers to the * extent to map because we cannot extend it anymore. It can also return value * < 0 in case of error during IO submission. */ static int mpage_process_page_bufs(struct mpage_da_data *mpd, struct buffer_head *head, struct buffer_head *bh, ext4_lblk_t lblk) { struct inode *inode = mpd->inode; int err; ext4_lblk_t blocks = (i_size_read(inode) + i_blocksize(inode) - 1) >> inode->i_blkbits; if (ext4_verity_in_progress(inode)) blocks = EXT_MAX_BLOCKS; do { BUG_ON(buffer_locked(bh)); if (lblk >= blocks || !mpage_add_bh_to_extent(mpd, lblk, bh)) { /* Found extent to map? */ if (mpd->map.m_len) return 0; /* Buffer needs mapping and handle is not started? */ if (!mpd->do_map) return 0; /* Everything mapped so far and we hit EOF */ break; } } while (lblk++, (bh = bh->b_this_page) != head); /* So far everything mapped? Submit the page for IO. */ if (mpd->map.m_len == 0) { err = mpage_submit_folio(mpd, head->b_folio); if (err < 0) return err; mpage_folio_done(mpd, head->b_folio); } if (lblk >= blocks) { mpd->scanned_until_end = 1; return 0; } return 1; } /* * mpage_process_folio - update folio buffers corresponding to changed extent * and may submit fully mapped page for IO * @mpd: description of extent to map, on return next extent to map * @folio: Contains these buffers. * @m_lblk: logical block mapping. * @m_pblk: corresponding physical mapping. * @map_bh: determines on return whether this page requires any further * mapping or not. * * Scan given folio buffers corresponding to changed extent and update buffer * state according to new extent state. * We map delalloc buffers to their physical location, clear unwritten bits. * If the given folio is not fully mapped, we update @mpd to the next extent in * the given folio that needs mapping & return @map_bh as true. */ static int mpage_process_folio(struct mpage_da_data *mpd, struct folio *folio, ext4_lblk_t *m_lblk, ext4_fsblk_t *m_pblk, bool *map_bh) { struct buffer_head *head, *bh; ext4_io_end_t *io_end = mpd->io_submit.io_end; ext4_lblk_t lblk = *m_lblk; ext4_fsblk_t pblock = *m_pblk; int err = 0; int blkbits = mpd->inode->i_blkbits; ssize_t io_end_size = 0; struct ext4_io_end_vec *io_end_vec = ext4_last_io_end_vec(io_end); bh = head = folio_buffers(folio); do { if (lblk < mpd->map.m_lblk) continue; if (lblk >= mpd->map.m_lblk + mpd->map.m_len) { /* * Buffer after end of mapped extent. * Find next buffer in the folio to map. */ mpd->map.m_len = 0; mpd->map.m_flags = 0; io_end_vec->size += io_end_size; err = mpage_process_page_bufs(mpd, head, bh, lblk); if (err > 0) err = 0; if (!err && mpd->map.m_len && mpd->map.m_lblk > lblk) { io_end_vec = ext4_alloc_io_end_vec(io_end); if (IS_ERR(io_end_vec)) { err = PTR_ERR(io_end_vec); goto out; } io_end_vec->offset = (loff_t)mpd->map.m_lblk << blkbits; } *map_bh = true; goto out; } if (buffer_delay(bh)) { clear_buffer_delay(bh); bh->b_blocknr = pblock++; } clear_buffer_unwritten(bh); io_end_size += (1 << blkbits); } while (lblk++, (bh = bh->b_this_page) != head); io_end_vec->size += io_end_size; *map_bh = false; out: *m_lblk = lblk; *m_pblk = pblock; return err; } /* * mpage_map_buffers - update buffers corresponding to changed extent and * submit fully mapped pages for IO * * @mpd - description of extent to map, on return next extent to map * * Scan buffers corresponding to changed extent (we expect corresponding pages * to be already locked) and update buffer state according to new extent state. * We map delalloc buffers to their physical location, clear unwritten bits, * and mark buffers as uninit when we perform writes to unwritten extents * and do extent conversion after IO is finished. If the last page is not fully * mapped, we update @map to the next extent in the last page that needs * mapping. Otherwise we submit the page for IO. */ static int mpage_map_and_submit_buffers(struct mpage_da_data *mpd) { struct folio_batch fbatch; unsigned nr, i; struct inode *inode = mpd->inode; int bpp_bits = PAGE_SHIFT - inode->i_blkbits; pgoff_t start, end; ext4_lblk_t lblk; ext4_fsblk_t pblock; int err; bool map_bh = false; start = mpd->map.m_lblk >> bpp_bits; end = (mpd->map.m_lblk + mpd->map.m_len - 1) >> bpp_bits; pblock = mpd->map.m_pblk; folio_batch_init(&fbatch); while (start <= end) { nr = filemap_get_folios(inode->i_mapping, &start, end, &fbatch); if (nr == 0) break; for (i = 0; i < nr; i++) { struct folio *folio = fbatch.folios[i]; lblk = folio->index << bpp_bits; err = mpage_process_folio(mpd, folio, &lblk, &pblock, &map_bh); /* * If map_bh is true, means page may require further bh * mapping, or maybe the page was submitted for IO. * So we return to call further extent mapping. */ if (err < 0 || map_bh) goto out; /* Page fully mapped - let IO run! */ err = mpage_submit_folio(mpd, folio); if (err < 0) goto out; mpage_folio_done(mpd, folio); } folio_batch_release(&fbatch); } /* Extent fully mapped and matches with page boundary. We are done. */ mpd->map.m_len = 0; mpd->map.m_flags = 0; return 0; out: folio_batch_release(&fbatch); return err; } static int mpage_map_one_extent(handle_t *handle, struct mpage_da_data *mpd) { struct inode *inode = mpd->inode; struct ext4_map_blocks *map = &mpd->map; int get_blocks_flags; int err, dioread_nolock; trace_ext4_da_write_pages_extent(inode, map); /* * Call ext4_map_blocks() to allocate any delayed allocation blocks, or * to convert an unwritten extent to be initialized (in the case * where we have written into one or more preallocated blocks). It is * possible that we're going to need more metadata blocks than * previously reserved. However we must not fail because we're in * writeback and there is nothing we can do about it so it might result * in data loss. So use reserved blocks to allocate metadata if * possible. In addition, do not cache any unrelated extents, as it * only holds the folio lock but does not hold the i_rwsem or * invalidate_lock, which could corrupt the extent status tree. */ get_blocks_flags = EXT4_GET_BLOCKS_CREATE | EXT4_GET_BLOCKS_METADATA_NOFAIL | EXT4_GET_BLOCKS_IO_SUBMIT | EXT4_EX_NOCACHE; dioread_nolock = ext4_should_dioread_nolock(inode); if (dioread_nolock) get_blocks_flags |= EXT4_GET_BLOCKS_IO_CREATE_EXT; err = ext4_map_blocks(handle, inode, map, get_blocks_flags); if (err < 0) return err; if (dioread_nolock && (map->m_flags & EXT4_MAP_UNWRITTEN)) { if (!mpd->io_submit.io_end->handle && ext4_handle_valid(handle)) { mpd->io_submit.io_end->handle = handle->h_rsv_handle; handle->h_rsv_handle = NULL; } ext4_set_io_unwritten_flag(mpd->io_submit.io_end); } BUG_ON(map->m_len == 0); return 0; } /* * mpage_map_and_submit_extent - map extent starting at mpd->lblk of length * mpd->len and submit pages underlying it for IO * * @handle - handle for journal operations * @mpd - extent to map * @give_up_on_write - we set this to true iff there is a fatal error and there * is no hope of writing the data. The caller should discard * dirty pages to avoid infinite loops. * * The function maps extent starting at mpd->lblk of length mpd->len. If it is * delayed, blocks are allocated, if it is unwritten, we may need to convert * them to initialized or split the described range from larger unwritten * extent. Note that we need not map all the described range since allocation * can return less blocks or the range is covered by more unwritten extents. We * cannot map more because we are limited by reserved transaction credits. On * the other hand we always make sure that the last touched page is fully * mapped so that it can be written out (and thus forward progress is * guaranteed). After mapping we submit all mapped pages for IO. */ static int mpage_map_and_submit_extent(handle_t *handle, struct mpage_da_data *mpd, bool *give_up_on_write) { struct inode *inode = mpd->inode; struct ext4_map_blocks *map = &mpd->map; int err; loff_t disksize; int progress = 0; ext4_io_end_t *io_end = mpd->io_submit.io_end; struct ext4_io_end_vec *io_end_vec; io_end_vec = ext4_alloc_io_end_vec(io_end); if (IS_ERR(io_end_vec)) return PTR_ERR(io_end_vec); io_end_vec->offset = ((loff_t)map->m_lblk) << inode->i_blkbits; do { err = mpage_map_one_extent(handle, mpd); if (err < 0) { struct super_block *sb = inode->i_sb; if (ext4_emergency_state(sb)) goto invalidate_dirty_pages; /* * Let the uper layers retry transient errors. * In the case of ENOSPC, if ext4_count_free_blocks() * is non-zero, a commit should free up blocks. */ if ((err == -ENOMEM) || (err == -ENOSPC && ext4_count_free_clusters(sb))) { if (progress) goto update_disksize; return err; } ext4_msg(sb, KERN_CRIT, "Delayed block allocation failed for " "inode %lu at logical offset %llu with" " max blocks %u with error %d", inode->i_ino, (unsigned long long)map->m_lblk, (unsigned)map->m_len, -err); ext4_msg(sb, KERN_CRIT, "This should not happen!! Data will " "be lost\n"); if (err == -ENOSPC) ext4_print_free_blocks(inode); invalidate_dirty_pages: *give_up_on_write = true; return err; } progress = 1; /* * Update buffer state, submit mapped pages, and get us new * extent to map */ err = mpage_map_and_submit_buffers(mpd); if (err < 0) goto update_disksize; } while (map->m_len); update_disksize: /* * Update on-disk size after IO is submitted. Races with * truncate are avoided by checking i_size under i_data_sem. */ disksize = ((loff_t)mpd->first_page) << PAGE_SHIFT; if (disksize > READ_ONCE(EXT4_I(inode)->i_disksize)) { int err2; loff_t i_size; down_write(&EXT4_I(inode)->i_data_sem); i_size = i_size_read(inode); if (disksize > i_size) disksize = i_size; if (disksize > EXT4_I(inode)->i_disksize) EXT4_I(inode)->i_disksize = disksize; up_write(&EXT4_I(inode)->i_data_sem); err2 = ext4_mark_inode_dirty(handle, inode); if (err2) { ext4_error_err(inode->i_sb, -err2, "Failed to mark inode %lu dirty", inode->i_ino); } if (!err) err = err2; } return err; } /* * Calculate the total number of credits to reserve for one writepages * iteration. This is called from ext4_writepages(). We map an extent of * up to MAX_WRITEPAGES_EXTENT_LEN blocks and then we go on and finish mapping * the last partial page. So in total we can map MAX_WRITEPAGES_EXTENT_LEN + * bpp - 1 blocks in bpp different extents. */ static int ext4_da_writepages_trans_blocks(struct inode *inode) { int bpp = ext4_journal_blocks_per_folio(inode); return ext4_meta_trans_blocks(inode, MAX_WRITEPAGES_EXTENT_LEN + bpp - 1, bpp); } static int ext4_journal_folio_buffers(handle_t *handle, struct folio *folio, size_t len) { struct buffer_head *page_bufs = folio_buffers(folio); struct inode *inode = folio->mapping->host; int ret, err; ret = ext4_walk_page_buffers(handle, inode, page_bufs, 0, len, NULL, do_journal_get_write_access); err = ext4_walk_page_buffers(handle, inode, page_bufs, 0, len, NULL, write_end_fn); if (ret == 0) ret = err; err = ext4_jbd2_inode_add_write(handle, inode, folio_pos(folio), len); if (ret == 0) ret = err; EXT4_I(inode)->i_datasync_tid = handle->h_transaction->t_tid; return ret; } static int mpage_journal_page_buffers(handle_t *handle, struct mpage_da_data *mpd, struct folio *folio) { struct inode *inode = mpd->inode; loff_t size = i_size_read(inode); size_t len = folio_size(folio); folio_clear_checked(folio); mpd->wbc->nr_to_write -= folio_nr_pages(folio); if (folio_pos(folio) + len > size && !ext4_verity_in_progress(inode)) len = size & (len - 1); return ext4_journal_folio_buffers(handle, folio, len); } /* * mpage_prepare_extent_to_map - find & lock contiguous range of dirty pages * needing mapping, submit mapped pages * * @mpd - where to look for pages * * Walk dirty pages in the mapping. If they are fully mapped, submit them for * IO immediately. If we cannot map blocks, we submit just already mapped * buffers in the page for IO and keep page dirty. When we can map blocks and * we find a page which isn't mapped we start accumulating extent of buffers * underlying these pages that needs mapping (formed by either delayed or * unwritten buffers). We also lock the pages containing these buffers. The * extent found is returned in @mpd structure (starting at mpd->lblk with * length mpd->len blocks). * * Note that this function can attach bios to one io_end structure which are * neither logically nor physically contiguous. Although it may seem as an * unnecessary complication, it is actually inevitable in blocksize < pagesize * case as we need to track IO to all buffers underlying a page in one io_end. */ static int mpage_prepare_extent_to_map(struct mpage_da_data *mpd) { struct address_space *mapping = mpd->inode->i_mapping; struct folio_batch fbatch; unsigned int nr_folios; pgoff_t index = mpd->first_page; pgoff_t end = mpd->last_page; xa_mark_t tag; int i, err = 0; int blkbits = mpd->inode->i_blkbits; ext4_lblk_t lblk; struct buffer_head *head; handle_t *handle = NULL; int bpp = ext4_journal_blocks_per_folio(mpd->inode); if (mpd->wbc->sync_mode == WB_SYNC_ALL || mpd->wbc->tagged_writepages) tag = PAGECACHE_TAG_TOWRITE; else tag = PAGECACHE_TAG_DIRTY; mpd->map.m_len = 0; mpd->next_page = index; if (ext4_should_journal_data(mpd->inode)) { handle = ext4_journal_start(mpd->inode, EXT4_HT_WRITE_PAGE, bpp); if (IS_ERR(handle)) return PTR_ERR(handle); } folio_batch_init(&fbatch); while (index <= end) { nr_folios = filemap_get_folios_tag(mapping, &index, end, tag, &fbatch); if (nr_folios == 0) break; for (i = 0; i < nr_folios; i++) { struct folio *folio = fbatch.folios[i]; /* * Accumulated enough dirty pages? This doesn't apply * to WB_SYNC_ALL mode. For integrity sync we have to * keep going because someone may be concurrently * dirtying pages, and we might have synced a lot of * newly appeared dirty pages, but have not synced all * of the old dirty pages. */ if (mpd->wbc->sync_mode == WB_SYNC_NONE && mpd->wbc->nr_to_write <= mpd->map.m_len >> (PAGE_SHIFT - blkbits)) goto out; /* If we can't merge this page, we are done. */ if (mpd->map.m_len > 0 && mpd->next_page != folio->index) goto out; if (handle) { err = ext4_journal_ensure_credits(handle, bpp, 0); if (err < 0) goto out; } folio_lock(folio); /* * If the page is no longer dirty, or its mapping no * longer corresponds to inode we are writing (which * means it has been truncated or invalidated), or the * page is already under writeback and we are not doing * a data integrity writeback, skip the page */ if (!folio_test_dirty(folio) || (folio_test_writeback(folio) && (mpd->wbc->sync_mode == WB_SYNC_NONE)) || unlikely(folio->mapping != mapping)) { folio_unlock(folio); continue; } folio_wait_writeback(folio); BUG_ON(folio_test_writeback(folio)); /* * Should never happen but for buggy code in * other subsystems that call * set_page_dirty() without properly warning * the file system first. See [1] for more * information. * * [1] https://lore.kernel.org/linux-mm/20180103100430.GE4911@quack2.suse.cz */ if (!folio_buffers(folio)) { ext4_warning_inode(mpd->inode, "page %lu does not have buffers attached", folio->index); folio_clear_dirty(folio); folio_unlock(folio); continue; } if (mpd->map.m_len == 0) mpd->first_page = folio->index; mpd->next_page = folio_next_index(folio); /* * Writeout when we cannot modify metadata is simple. * Just submit the page. For data=journal mode we * first handle writeout of the page for checkpoint and * only after that handle delayed page dirtying. This * makes sure current data is checkpointed to the final * location before possibly journalling it again which * is desirable when the page is frequently dirtied * through a pin. */ if (!mpd->can_map) { err = mpage_submit_folio(mpd, folio); if (err < 0) goto out; /* Pending dirtying of journalled data? */ if (folio_test_checked(folio)) { err = mpage_journal_page_buffers(handle, mpd, folio); if (err < 0) goto out; mpd->journalled_more_data = 1; } mpage_folio_done(mpd, folio); } else { /* Add all dirty buffers to mpd */ lblk = ((ext4_lblk_t)folio->index) << (PAGE_SHIFT - blkbits); head = folio_buffers(folio); err = mpage_process_page_bufs(mpd, head, head, lblk); if (err <= 0) goto out; err = 0; } } folio_batch_release(&fbatch); cond_resched(); } mpd->scanned_until_end = 1; if (handle) ext4_journal_stop(handle); return 0; out: folio_batch_release(&fbatch); if (handle) ext4_journal_stop(handle); return err; } static int ext4_do_writepages(struct mpage_da_data *mpd) { struct writeback_control *wbc = mpd->wbc; pgoff_t writeback_index = 0; long nr_to_write = wbc->nr_to_write; int range_whole = 0; int cycled = 1; handle_t *handle = NULL; struct inode *inode = mpd->inode; struct address_space *mapping = inode->i_mapping; int needed_blocks, rsv_blocks = 0, ret = 0; struct ext4_sb_info *sbi = EXT4_SB(mapping->host->i_sb); struct blk_plug plug; bool give_up_on_write = false; trace_ext4_writepages(inode, wbc); /* * No pages to write? This is mainly a kludge to avoid starting * a transaction for special inodes like journal inode on last iput() * because that could violate lock ordering on umount */ if (!mapping->nrpages || !mapping_tagged(mapping, PAGECACHE_TAG_DIRTY)) goto out_writepages; /* * If the filesystem has aborted, it is read-only, so return * right away instead of dumping stack traces later on that * will obscure the real source of the problem. We test * fs shutdown state instead of sb->s_flag's SB_RDONLY because * the latter could be true if the filesystem is mounted * read-only, and in that case, ext4_writepages should * *never* be called, so if that ever happens, we would want * the stack trace. */ ret = ext4_emergency_state(mapping->host->i_sb); if (unlikely(ret)) goto out_writepages; /* * If we have inline data and arrive here, it means that * we will soon create the block for the 1st page, so * we'd better clear the inline data here. */ if (ext4_has_inline_data(inode)) { /* Just inode will be modified... */ handle = ext4_journal_start(inode, EXT4_HT_INODE, 1); if (IS_ERR(handle)) { ret = PTR_ERR(handle); goto out_writepages; } BUG_ON(ext4_test_inode_state(inode, EXT4_STATE_MAY_INLINE_DATA)); ext4_destroy_inline_data(handle, inode); ext4_journal_stop(handle); } /* * data=journal mode does not do delalloc so we just need to writeout / * journal already mapped buffers. On the other hand we need to commit * transaction to make data stable. We expect all the data to be * already in the journal (the only exception are DMA pinned pages * dirtied behind our back) so we commit transaction here and run the * writeback loop to checkpoint them. The checkpointing is not actually * necessary to make data persistent *but* quite a few places (extent * shifting operations, fsverity, ...) depend on being able to drop * pagecache pages after calling filemap_write_and_wait() and for that * checkpointing needs to happen. */ if (ext4_should_journal_data(inode)) { mpd->can_map = 0; if (wbc->sync_mode == WB_SYNC_ALL) ext4_fc_commit(sbi->s_journal, EXT4_I(inode)->i_datasync_tid); } mpd->journalled_more_data = 0; if (ext4_should_dioread_nolock(inode)) { /* * We may need to convert up to one extent per block in * the page and we may dirty the inode. */ rsv_blocks = 1 + ext4_chunk_trans_blocks(inode, PAGE_SIZE >> inode->i_blkbits); } if (wbc->range_start == 0 && wbc->range_end == LLONG_MAX) range_whole = 1; if (wbc->range_cyclic) { writeback_index = mapping->writeback_index; if (writeback_index) cycled = 0; mpd->first_page = writeback_index; mpd->last_page = -1; } else { mpd->first_page = wbc->range_start >> PAGE_SHIFT; mpd->last_page = wbc->range_end >> PAGE_SHIFT; } ext4_io_submit_init(&mpd->io_submit, wbc); retry: if (wbc->sync_mode == WB_SYNC_ALL || wbc->tagged_writepages) tag_pages_for_writeback(mapping, mpd->first_page, mpd->last_page); blk_start_plug(&plug); /* * First writeback pages that don't need mapping - we can avoid * starting a transaction unnecessarily and also avoid being blocked * in the block layer on device congestion while having transaction * started. */ mpd->do_map = 0; mpd->scanned_until_end = 0; mpd->io_submit.io_end = ext4_init_io_end(inode, GFP_KERNEL); if (!mpd->io_submit.io_end) { ret = -ENOMEM; goto unplug; } ret = mpage_prepare_extent_to_map(mpd); /* Unlock pages we didn't use */ mpage_release_unused_pages(mpd, false); /* Submit prepared bio */ ext4_io_submit(&mpd->io_submit); ext4_put_io_end_defer(mpd->io_submit.io_end); mpd->io_submit.io_end = NULL; if (ret < 0) goto unplug; while (!mpd->scanned_until_end && wbc->nr_to_write > 0) { /* For each extent of pages we use new io_end */ mpd->io_submit.io_end = ext4_init_io_end(inode, GFP_KERNEL); if (!mpd->io_submit.io_end) { ret = -ENOMEM; break; } WARN_ON_ONCE(!mpd->can_map); /* * We have two constraints: We find one extent to map and we * must always write out whole page (makes a difference when * blocksize < pagesize) so that we don't block on IO when we * try to write out the rest of the page. Journalled mode is * not supported by delalloc. */ BUG_ON(ext4_should_journal_data(inode)); needed_blocks = ext4_da_writepages_trans_blocks(inode); /* start a new transaction */ handle = ext4_journal_start_with_reserve(inode, EXT4_HT_WRITE_PAGE, needed_blocks, rsv_blocks); if (IS_ERR(handle)) { ret = PTR_ERR(handle); ext4_msg(inode->i_sb, KERN_CRIT, "%s: jbd2_start: " "%ld pages, ino %lu; err %d", __func__, wbc->nr_to_write, inode->i_ino, ret); /* Release allocated io_end */ ext4_put_io_end(mpd->io_submit.io_end); mpd->io_submit.io_end = NULL; break; } mpd->do_map = 1; trace_ext4_da_write_pages(inode, mpd->first_page, wbc); ret = mpage_prepare_extent_to_map(mpd); if (!ret && mpd->map.m_len) ret = mpage_map_and_submit_extent(handle, mpd, &give_up_on_write); /* * Caution: If the handle is synchronous, * ext4_journal_stop() can wait for transaction commit * to finish which may depend on writeback of pages to * complete or on page lock to be released. In that * case, we have to wait until after we have * submitted all the IO, released page locks we hold, * and dropped io_end reference (for extent conversion * to be able to complete) before stopping the handle. */ if (!ext4_handle_valid(handle) || handle->h_sync == 0) { ext4_journal_stop(handle); handle = NULL; mpd->do_map = 0; } /* Unlock pages we didn't use */ mpage_release_unused_pages(mpd, give_up_on_write); /* Submit prepared bio */ ext4_io_submit(&mpd->io_submit); /* * Drop our io_end reference we got from init. We have * to be careful and use deferred io_end finishing if * we are still holding the transaction as we can * release the last reference to io_end which may end * up doing unwritten extent conversion. */ if (handle) { ext4_put_io_end_defer(mpd->io_submit.io_end); ext4_journal_stop(handle); } else ext4_put_io_end(mpd->io_submit.io_end); mpd->io_submit.io_end = NULL; if (ret == -ENOSPC && sbi->s_journal) { /* * Commit the transaction which would * free blocks released in the transaction * and try again */ jbd2_journal_force_commit_nested(sbi->s_journal); ret = 0; continue; } /* Fatal error - ENOMEM, EIO... */ if (ret) break; } unplug: blk_finish_plug(&plug); if (!ret && !cycled && wbc->nr_to_write > 0) { cycled = 1; mpd->last_page = writeback_index - 1; mpd->first_page = 0; goto retry; } /* Update index */ if (wbc->range_cyclic || (range_whole && wbc->nr_to_write > 0)) /* * Set the writeback_index so that range_cyclic * mode will write it back later */ mapping->writeback_index = mpd->first_page; out_writepages: trace_ext4_writepages_result(inode, wbc, ret, nr_to_write - wbc->nr_to_write); return ret; } static int ext4_writepages(struct address_space *mapping, struct writeback_control *wbc) { struct super_block *sb = mapping->host->i_sb; struct mpage_da_data mpd = { .inode = mapping->host, .wbc = wbc, .can_map = 1, }; int ret; int alloc_ctx; ret = ext4_emergency_state(sb); if (unlikely(ret)) return ret; alloc_ctx = ext4_writepages_down_read(sb); ret = ext4_do_writepages(&mpd); /* * For data=journal writeback we could have come across pages marked * for delayed dirtying (PageChecked) which were just added to the * running transaction. Try once more to get them to stable storage. */ if (!ret && mpd.journalled_more_data) ret = ext4_do_writepages(&mpd); ext4_writepages_up_read(sb, alloc_ctx); return ret; } int ext4_normal_submit_inode_data_buffers(struct jbd2_inode *jinode) { struct writeback_control wbc = { .sync_mode = WB_SYNC_ALL, .nr_to_write = LONG_MAX, .range_start = jinode->i_dirty_start, .range_end = jinode->i_dirty_end, }; struct mpage_da_data mpd = { .inode = jinode->i_vfs_inode, .wbc = &wbc, .can_map = 0, }; return ext4_do_writepages(&mpd); } static int ext4_dax_writepages(struct address_space *mapping, struct writeback_control *wbc) { int ret; long nr_to_write = wbc->nr_to_write; struct inode *inode = mapping->host; int alloc_ctx; ret = ext4_emergency_state(inode->i_sb); if (unlikely(ret)) return ret; alloc_ctx = ext4_writepages_down_read(inode->i_sb); trace_ext4_writepages(inode, wbc); ret = dax_writeback_mapping_range(mapping, EXT4_SB(inode->i_sb)->s_daxdev, wbc); trace_ext4_writepages_result(inode, wbc, ret, nr_to_write - wbc->nr_to_write); ext4_writepages_up_read(inode->i_sb, alloc_ctx); return ret; } static int ext4_nonda_switch(struct super_block *sb) { s64 free_clusters, dirty_clusters; struct ext4_sb_info *sbi = EXT4_SB(sb); /* * switch to non delalloc mode if we are running low * on free block. The free block accounting via percpu * counters can get slightly wrong with percpu_counter_batch getting * accumulated on each CPU without updating global counters * Delalloc need an accurate free block accounting. So switch * to non delalloc when we are near to error range. */ free_clusters = percpu_counter_read_positive(&sbi->s_freeclusters_counter); dirty_clusters = percpu_counter_read_positive(&sbi->s_dirtyclusters_counter); /* * Start pushing delalloc when 1/2 of free blocks are dirty. */ if (dirty_clusters && (free_clusters < 2 * dirty_clusters)) try_to_writeback_inodes_sb(sb, WB_REASON_FS_FREE_SPACE); if (2 * free_clusters < 3 * dirty_clusters || free_clusters < (dirty_clusters + EXT4_FREECLUSTERS_WATERMARK)) { /* * free block count is less than 150% of dirty blocks * or free blocks is less than watermark */ return 1; } return 0; } static int ext4_da_write_begin(struct file *file, struct address_space *mapping, loff_t pos, unsigned len, struct folio **foliop, void **fsdata) { int ret, retries = 0; struct folio *folio; pgoff_t index; struct inode *inode = mapping->host; fgf_t fgp = FGP_WRITEBEGIN; ret = ext4_emergency_state(inode->i_sb); if (unlikely(ret)) return ret; index = pos >> PAGE_SHIFT; if (ext4_nonda_switch(inode->i_sb) || ext4_verity_in_progress(inode)) { *fsdata = (void *)FALL_BACK_TO_NONDELALLOC; return ext4_write_begin(file, mapping, pos, len, foliop, fsdata); } *fsdata = (void *)0; trace_ext4_da_write_begin(inode, pos, len); if (ext4_test_inode_state(inode, EXT4_STATE_MAY_INLINE_DATA)) { ret = ext4_generic_write_inline_data(mapping, inode, pos, len, foliop, fsdata, true); if (ret < 0) return ret; if (ret == 1) return 0; } retry: fgp |= fgf_set_order(len); folio = __filemap_get_folio(mapping, index, fgp, mapping_gfp_mask(mapping)); if (IS_ERR(folio)) return PTR_ERR(folio); if (pos + len > folio_pos(folio) + folio_size(folio)) len = folio_pos(folio) + folio_size(folio) - pos; ret = ext4_block_write_begin(NULL, folio, pos, len, ext4_da_get_block_prep); if (ret < 0) { folio_unlock(folio); folio_put(folio); /* * block_write_begin may have instantiated a few blocks * outside i_size. Trim these off again. Don't need * i_size_read because we hold inode lock. */ if (pos + len > inode->i_size) ext4_truncate_failed_write(inode); if (ret == -ENOSPC && ext4_should_retry_alloc(inode->i_sb, &retries)) goto retry; return ret; } *foliop = folio; return ret; } /* * Check if we should update i_disksize * when write to the end of file but not require block allocation */ static int ext4_da_should_update_i_disksize(struct folio *folio, unsigned long offset) { struct buffer_head *bh; struct inode *inode = folio->mapping->host; unsigned int idx; int i; bh = folio_buffers(folio); idx = offset >> inode->i_blkbits; for (i = 0; i < idx; i++) bh = bh->b_this_page; if (!buffer_mapped(bh) || (buffer_delay(bh)) || buffer_unwritten(bh)) return 0; return 1; } static int ext4_da_do_write_end(struct address_space *mapping, loff_t pos, unsigned len, unsigned copied, struct folio *folio) { struct inode *inode = mapping->host; loff_t old_size = inode->i_size; bool disksize_changed = false; loff_t new_i_size, zero_len = 0; handle_t *handle; if (unlikely(!folio_buffers(folio))) { folio_unlock(folio); folio_put(folio); return -EIO; } /* * block_write_end() will mark the inode as dirty with I_DIRTY_PAGES * flag, which all that's needed to trigger page writeback. */ copied = block_write_end(NULL, mapping, pos, len, copied, folio, NULL); new_i_size = pos + copied; /* * It's important to update i_size while still holding folio lock, * because folio writeout could otherwise come in and zero beyond * i_size. * * Since we are holding inode lock, we are sure i_disksize <= * i_size. We also know that if i_disksize < i_size, there are * delalloc writes pending in the range up to i_size. If the end of * the current write is <= i_size, there's no need to touch * i_disksize since writeback will push i_disksize up to i_size * eventually. If the end of the current write is > i_size and * inside an allocated block which ext4_da_should_update_i_disksize() * checked, we need to update i_disksize here as certain * ext4_writepages() paths not allocating blocks and update i_disksize. */ if (new_i_size > inode->i_size) { unsigned long end; i_size_write(inode, new_i_size); end = offset_in_folio(folio, new_i_size - 1); if (copied && ext4_da_should_update_i_disksize(folio, end)) { ext4_update_i_disksize(inode, new_i_size); disksize_changed = true; } } folio_unlock(folio); folio_put(folio); if (pos > old_size) { pagecache_isize_extended(inode, old_size, pos); zero_len = pos - old_size; } if (!disksize_changed && !zero_len) return copied; handle = ext4_journal_start(inode, EXT4_HT_INODE, 2); if (IS_ERR(handle)) return PTR_ERR(handle); if (zero_len) ext4_zero_partial_blocks(handle, inode, old_size, zero_len); ext4_mark_inode_dirty(handle, inode); ext4_journal_stop(handle); return copied; } static int ext4_da_write_end(struct file *file, struct address_space *mapping, loff_t pos, unsigned len, unsigned copied, struct folio *folio, void *fsdata) { struct inode *inode = mapping->host; int write_mode = (int)(unsigned long)fsdata; if (write_mode == FALL_BACK_TO_NONDELALLOC) return ext4_write_end(file, mapping, pos, len, copied, folio, fsdata); trace_ext4_da_write_end(inode, pos, len, copied); if (write_mode != CONVERT_INLINE_DATA && ext4_test_inode_state(inode, EXT4_STATE_MAY_INLINE_DATA) && ext4_has_inline_data(inode)) return ext4_write_inline_data_end(inode, pos, len, copied, folio); if (unlikely(copied < len) && !folio_test_uptodate(folio)) copied = 0; return ext4_da_do_write_end(mapping, pos, len, copied, folio); } /* * Force all delayed allocation blocks to be allocated for a given inode. */ int ext4_alloc_da_blocks(struct inode *inode) { trace_ext4_alloc_da_blocks(inode); if (!EXT4_I(inode)->i_reserved_data_blocks) return 0; /* * We do something simple for now. The filemap_flush() will * also start triggering a write of the data blocks, which is * not strictly speaking necessary (and for users of * laptop_mode, not even desirable). However, to do otherwise * would require replicating code paths in: * * ext4_writepages() -> * write_cache_pages() ---> (via passed in callback function) * __mpage_da_writepage() --> * mpage_add_bh_to_extent() * mpage_da_map_blocks() * * The problem is that write_cache_pages(), located in * mm/page-writeback.c, marks pages clean in preparation for * doing I/O, which is not desirable if we're not planning on * doing I/O at all. * * We could call write_cache_pages(), and then redirty all of * the pages by calling redirty_page_for_writepage() but that * would be ugly in the extreme. So instead we would need to * replicate parts of the code in the above functions, * simplifying them because we wouldn't actually intend to * write out the pages, but rather only collect contiguous * logical block extents, call the multi-block allocator, and * then update the buffer heads with the block allocations. * * For now, though, we'll cheat by calling filemap_flush(), * which will map the blocks, and start the I/O, but not * actually wait for the I/O to complete. */ return filemap_flush(inode->i_mapping); } /* * bmap() is special. It gets used by applications such as lilo and by * the swapper to find the on-disk block of a specific piece of data. * * Naturally, this is dangerous if the block concerned is still in the * journal. If somebody makes a swapfile on an ext4 data-journaling * filesystem and enables swap, then they may get a nasty shock when the * data getting swapped to that swapfile suddenly gets overwritten by * the original zero's written out previously to the journal and * awaiting writeback in the kernel's buffer cache. * * So, if we see any bmap calls here on a modified, data-journaled file, * take extra steps to flush any blocks which might be in the cache. */ static sector_t ext4_bmap(struct address_space *mapping, sector_t block) { struct inode *inode = mapping->host; sector_t ret = 0; inode_lock_shared(inode); /* * We can get here for an inline file via the FIBMAP ioctl */ if (ext4_has_inline_data(inode)) goto out; if (mapping_tagged(mapping, PAGECACHE_TAG_DIRTY) && (test_opt(inode->i_sb, DELALLOC) || ext4_should_journal_data(inode))) { /* * With delalloc or journalled data we want to sync the file so * that we can make sure we allocate blocks for file and data * is in place for the user to see it */ filemap_write_and_wait(mapping); } ret = iomap_bmap(mapping, block, &ext4_iomap_ops); out: inode_unlock_shared(inode); return ret; } static int ext4_read_folio(struct file *file, struct folio *folio) { int ret = -EAGAIN; struct inode *inode = folio->mapping->host; trace_ext4_read_folio(inode, folio); if (ext4_has_inline_data(inode)) ret = ext4_readpage_inline(inode, folio); if (ret == -EAGAIN) return ext4_mpage_readpages(inode, NULL, folio); return ret; } static void ext4_readahead(struct readahead_control *rac) { struct inode *inode = rac->mapping->host; /* If the file has inline data, no need to do readahead. */ if (ext4_has_inline_data(inode)) return; ext4_mpage_readpages(inode, rac, NULL); } static void ext4_invalidate_folio(struct folio *folio, size_t offset, size_t length) { trace_ext4_invalidate_folio(folio, offset, length); /* No journalling happens on data buffers when this function is used */ WARN_ON(folio_buffers(folio) && buffer_jbd(folio_buffers(folio))); block_invalidate_folio(folio, offset, length); } static int __ext4_journalled_invalidate_folio(struct folio *folio, size_t offset, size_t length) { journal_t *journal = EXT4_JOURNAL(folio->mapping->host); trace_ext4_journalled_invalidate_folio(folio, offset, length); /* * If it's a full truncate we just forget about the pending dirtying */ if (offset == 0 && length == folio_size(folio)) folio_clear_checked(folio); return jbd2_journal_invalidate_folio(journal, folio, offset, length); } /* Wrapper for aops... */ static void ext4_journalled_invalidate_folio(struct folio *folio, size_t offset, size_t length) { WARN_ON(__ext4_journalled_invalidate_folio(folio, offset, length) < 0); } static bool ext4_release_folio(struct folio *folio, gfp_t wait) { struct inode *inode = folio->mapping->host; journal_t *journal = EXT4_JOURNAL(inode); trace_ext4_release_folio(inode, folio); /* Page has dirty journalled data -> cannot release */ if (folio_test_checked(folio)) return false; if (journal) return jbd2_journal_try_to_free_buffers(journal, folio); else return try_to_free_buffers(folio); } static bool ext4_inode_datasync_dirty(struct inode *inode) { journal_t *journal = EXT4_SB(inode->i_sb)->s_journal; if (journal) { if (jbd2_transaction_committed(journal, EXT4_I(inode)->i_datasync_tid)) return false; if (test_opt2(inode->i_sb, JOURNAL_FAST_COMMIT)) return !list_empty(&EXT4_I(inode)->i_fc_list); return true; } /* Any metadata buffers to write? */ if (!list_empty(&inode->i_mapping->i_private_list)) return true; return inode->i_state & I_DIRTY_DATASYNC; } static void ext4_set_iomap(struct inode *inode, struct iomap *iomap, struct ext4_map_blocks *map, loff_t offset, loff_t length, unsigned int flags) { u8 blkbits = inode->i_blkbits; /* * Writes that span EOF might trigger an I/O size update on completion, * so consider them to be dirty for the purpose of O_DSYNC, even if * there is no other metadata changes being made or are pending. */ iomap->flags = 0; if (ext4_inode_datasync_dirty(inode) || offset + length > i_size_read(inode)) iomap->flags |= IOMAP_F_DIRTY; if (map->m_flags & EXT4_MAP_NEW) iomap->flags |= IOMAP_F_NEW; /* HW-offload atomics are always used */ if (flags & IOMAP_ATOMIC) iomap->flags |= IOMAP_F_ATOMIC_BIO; if (flags & IOMAP_DAX) iomap->dax_dev = EXT4_SB(inode->i_sb)->s_daxdev; else iomap->bdev = inode->i_sb->s_bdev; iomap->offset = (u64) map->m_lblk << blkbits; iomap->length = (u64) map->m_len << blkbits; if ((map->m_flags & EXT4_MAP_MAPPED) && !ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)) iomap->flags |= IOMAP_F_MERGED; /* * Flags passed to ext4_map_blocks() for direct I/O writes can result * in m_flags having both EXT4_MAP_MAPPED and EXT4_MAP_UNWRITTEN bits * set. In order for any allocated unwritten extents to be converted * into written extents correctly within the ->end_io() handler, we * need to ensure that the iomap->type is set appropriately. Hence, the * reason why we need to check whether the EXT4_MAP_UNWRITTEN bit has * been set first. */ if (map->m_flags & EXT4_MAP_UNWRITTEN) { iomap->type = IOMAP_UNWRITTEN; iomap->addr = (u64) map->m_pblk << blkbits; if (flags & IOMAP_DAX) iomap->addr += EXT4_SB(inode->i_sb)->s_dax_part_off; } else if (map->m_flags & EXT4_MAP_MAPPED) { iomap->type = IOMAP_MAPPED; iomap->addr = (u64) map->m_pblk << blkbits; if (flags & IOMAP_DAX) iomap->addr += EXT4_SB(inode->i_sb)->s_dax_part_off; } else if (map->m_flags & EXT4_MAP_DELAYED) { iomap->type = IOMAP_DELALLOC; iomap->addr = IOMAP_NULL_ADDR; } else { iomap->type = IOMAP_HOLE; iomap->addr = IOMAP_NULL_ADDR; } } static int ext4_map_blocks_atomic_write_slow(handle_t *handle, struct inode *inode, struct ext4_map_blocks *map) { ext4_lblk_t m_lblk = map->m_lblk; unsigned int m_len = map->m_len; unsigned int mapped_len = 0, m_flags = 0; ext4_fsblk_t next_pblk; bool check_next_pblk = false; int ret = 0; WARN_ON_ONCE(!ext4_has_feature_bigalloc(inode->i_sb)); /* * This is a slow path in case of mixed mapping. We use * EXT4_GET_BLOCKS_CREATE_ZERO flag here to make sure we get a single * contiguous mapped mapping. This will ensure any unwritten or hole * regions within the requested range is zeroed out and we return * a single contiguous mapped extent. */ m_flags = EXT4_GET_BLOCKS_CREATE_ZERO; do { ret = ext4_map_blocks(handle, inode, map, m_flags); if (ret < 0 && ret != -ENOSPC) goto out_err; /* * This should never happen, but let's return an error code to * avoid an infinite loop in here. */ if (ret == 0) { ret = -EFSCORRUPTED; ext4_warning_inode(inode, "ext4_map_blocks() couldn't allocate blocks m_flags: 0x%x, ret:%d", m_flags, ret); goto out_err; } /* * With bigalloc we should never get ENOSPC nor discontiguous * physical extents. */ if ((check_next_pblk && next_pblk != map->m_pblk) || ret == -ENOSPC) { ext4_warning_inode(inode, "Non-contiguous allocation detected: expected %llu, got %llu, " "or ext4_map_blocks() returned out of space ret: %d", next_pblk, map->m_pblk, ret); ret = -EFSCORRUPTED; goto out_err; } next_pblk = map->m_pblk + map->m_len; check_next_pblk = true; mapped_len += map->m_len; map->m_lblk += map->m_len; map->m_len = m_len - mapped_len; } while (mapped_len < m_len); /* * We might have done some work in above loop, so we need to query the * start of the physical extent, based on the origin m_lblk and m_len. * Let's also ensure we were able to allocate the required range for * mixed mapping case. */ map->m_lblk = m_lblk; map->m_len = m_len; map->m_flags = 0; ret = ext4_map_blocks(handle, inode, map, EXT4_GET_BLOCKS_QUERY_LAST_IN_LEAF); if (ret != m_len) { ext4_warning_inode(inode, "allocation failed for atomic write request m_lblk:%u, m_len:%u, ret:%d\n", m_lblk, m_len, ret); ret = -EINVAL; } return ret; out_err: /* reset map before returning an error */ map->m_lblk = m_lblk; map->m_len = m_len; map->m_flags = 0; return ret; } /* * ext4_map_blocks_atomic: Helper routine to ensure the entire requested * range in @map [lblk, lblk + len) is one single contiguous extent with no * mixed mappings. * * We first use m_flags passed to us by our caller (ext4_iomap_alloc()). * We only call EXT4_GET_BLOCKS_ZERO in the slow path, when the underlying * physical extent for the requested range does not have a single contiguous * mapping type i.e. (Hole, Mapped, or Unwritten) throughout. * In that case we will loop over the requested range to allocate and zero out * the unwritten / holes in between, to get a single mapped extent from * [m_lblk, m_lblk + m_len). Note that this is only possible because we know * this can be called only with bigalloc enabled filesystem where the underlying * cluster is already allocated. This avoids allocating discontiguous extents * in the slow path due to multiple calls to ext4_map_blocks(). * The slow path is mostly non-performance critical path, so it should be ok to * loop using ext4_map_blocks() with appropriate flags to allocate & zero the * underlying short holes/unwritten extents within the requested range. */ static int ext4_map_blocks_atomic_write(handle_t *handle, struct inode *inode, struct ext4_map_blocks *map, int m_flags, bool *force_commit) { ext4_lblk_t m_lblk = map->m_lblk; unsigned int m_len = map->m_len; int ret = 0; WARN_ON_ONCE(m_len > 1 && !ext4_has_feature_bigalloc(inode->i_sb)); ret = ext4_map_blocks(handle, inode, map, m_flags); if (ret < 0 || ret == m_len) goto out; /* * This is a mixed mapping case where we were not able to allocate * a single contiguous extent. In that case let's reset requested * mapping and call the slow path. */ map->m_lblk = m_lblk; map->m_len = m_len; map->m_flags = 0; /* * slow path means we have mixed mapping, that means we will need * to force txn commit. */ *force_commit = true; return ext4_map_blocks_atomic_write_slow(handle, inode, map); out: return ret; } static int ext4_iomap_alloc(struct inode *inode, struct ext4_map_blocks *map, unsigned int flags) { handle_t *handle; u8 blkbits = inode->i_blkbits; int ret, dio_credits, m_flags = 0, retries = 0; bool force_commit = false; /* * Trim the mapping request to the maximum value that we can map at * once for direct I/O. */ if (map->m_len > DIO_MAX_BLOCKS) map->m_len = DIO_MAX_BLOCKS; /* * journal credits estimation for atomic writes. We call * ext4_map_blocks(), to find if there could be a mixed mapping. If yes, * then let's assume the no. of pextents required can be m_len i.e. * every alternate block can be unwritten and hole. */ if (flags & IOMAP_ATOMIC) { unsigned int orig_mlen = map->m_len; ret = ext4_map_blocks(NULL, inode, map, 0); if (ret < 0) return ret; if (map->m_len < orig_mlen) { map->m_len = orig_mlen; dio_credits = ext4_meta_trans_blocks(inode, orig_mlen, map->m_len); } else { dio_credits = ext4_chunk_trans_blocks(inode, map->m_len); } } else { dio_credits = ext4_chunk_trans_blocks(inode, map->m_len); } retry: /* * Either we allocate blocks and then don't get an unwritten extent, so * in that case we have reserved enough credits. Or, the blocks are * already allocated and unwritten. In that case, the extent conversion * fits into the credits as well. */ handle = ext4_journal_start(inode, EXT4_HT_MAP_BLOCKS, dio_credits); if (IS_ERR(handle)) return PTR_ERR(handle); /* * DAX and direct I/O are the only two operations that are currently * supported with IOMAP_WRITE. */ WARN_ON(!(flags & (IOMAP_DAX | IOMAP_DIRECT))); if (flags & IOMAP_DAX) m_flags = EXT4_GET_BLOCKS_CREATE_ZERO; /* * We use i_size instead of i_disksize here because delalloc writeback * can complete at any point during the I/O and subsequently push the * i_disksize out to i_size. This could be beyond where direct I/O is * happening and thus expose allocated blocks to direct I/O reads. */ else if (((loff_t)map->m_lblk << blkbits) >= i_size_read(inode)) m_flags = EXT4_GET_BLOCKS_CREATE; else if (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)) m_flags = EXT4_GET_BLOCKS_IO_CREATE_EXT; if (flags & IOMAP_ATOMIC) ret = ext4_map_blocks_atomic_write(handle, inode, map, m_flags, &force_commit); else ret = ext4_map_blocks(handle, inode, map, m_flags); /* * We cannot fill holes in indirect tree based inodes as that could * expose stale data in the case of a crash. Use the magic error code * to fallback to buffered I/O. */ if (!m_flags && !ret) ret = -ENOTBLK; ext4_journal_stop(handle); if (ret == -ENOSPC && ext4_should_retry_alloc(inode->i_sb, &retries)) goto retry; /* * Force commit the current transaction if the allocation spans a mixed * mapping range. This ensures any pending metadata updates (like * unwritten to written extents conversion) in this range are in * consistent state with the file data blocks, before performing the * actual write I/O. If the commit fails, the whole I/O must be aborted * to prevent any possible torn writes. */ if (ret > 0 && force_commit) { int ret2; ret2 = ext4_force_commit(inode->i_sb); if (ret2) return ret2; } return ret; } static int ext4_iomap_begin(struct inode *inode, loff_t offset, loff_t length, unsigned flags, struct iomap *iomap, struct iomap *srcmap) { int ret; struct ext4_map_blocks map; u8 blkbits = inode->i_blkbits; unsigned int orig_mlen; if ((offset >> blkbits) > EXT4_MAX_LOGICAL_BLOCK) return -EINVAL; if (WARN_ON_ONCE(ext4_has_inline_data(inode))) return -ERANGE; /* * Calculate the first and last logical blocks respectively. */ map.m_lblk = offset >> blkbits; map.m_len = min_t(loff_t, (offset + length - 1) >> blkbits, EXT4_MAX_LOGICAL_BLOCK) - map.m_lblk + 1; orig_mlen = map.m_len; if (flags & IOMAP_WRITE) { /* * We check here if the blocks are already allocated, then we * don't need to start a journal txn and we can directly return * the mapping information. This could boost performance * especially in multi-threaded overwrite requests. */ if (offset + length <= i_size_read(inode)) { ret = ext4_map_blocks(NULL, inode, &map, 0); /* * For atomic writes the entire requested length should * be mapped. */ if (map.m_flags & EXT4_MAP_MAPPED) { if ((!(flags & IOMAP_ATOMIC) && ret > 0) || (flags & IOMAP_ATOMIC && ret >= orig_mlen)) goto out; } map.m_len = orig_mlen; } ret = ext4_iomap_alloc(inode, &map, flags); } else { /* * This can be called for overwrites path from * ext4_iomap_overwrite_begin(). */ ret = ext4_map_blocks(NULL, inode, &map, 0); } if (ret < 0) return ret; out: /* * When inline encryption is enabled, sometimes I/O to an encrypted file * has to be broken up to guarantee DUN contiguity. Handle this by * limiting the length of the mapping returned. */ map.m_len = fscrypt_limit_io_blocks(inode, map.m_lblk, map.m_len); /* * Before returning to iomap, let's ensure the allocated mapping * covers the entire requested length for atomic writes. */ if (flags & IOMAP_ATOMIC) { if (map.m_len < (length >> blkbits)) { WARN_ON_ONCE(1); return -EINVAL; } } ext4_set_iomap(inode, iomap, &map, offset, length, flags); return 0; } static int ext4_iomap_overwrite_begin(struct inode *inode, loff_t offset, loff_t length, unsigned flags, struct iomap *iomap, struct iomap *srcmap) { int ret; /* * Even for writes we don't need to allocate blocks, so just pretend * we are reading to save overhead of starting a transaction. */ flags &= ~IOMAP_WRITE; ret = ext4_iomap_begin(inode, offset, length, flags, iomap, srcmap); WARN_ON_ONCE(!ret && iomap->type != IOMAP_MAPPED); return ret; } static inline bool ext4_want_directio_fallback(unsigned flags, ssize_t written) { /* must be a directio to fall back to buffered */ if ((flags & (IOMAP_WRITE | IOMAP_DIRECT)) != (IOMAP_WRITE | IOMAP_DIRECT)) return false; /* atomic writes are all-or-nothing */ if (flags & IOMAP_ATOMIC) return false; /* can only try again if we wrote nothing */ return written == 0; } static int ext4_iomap_end(struct inode *inode, loff_t offset, loff_t length, ssize_t written, unsigned flags, struct iomap *iomap) { /* * Check to see whether an error occurred while writing out the data to * the allocated blocks. If so, return the magic error code for * non-atomic write so that we fallback to buffered I/O and attempt to * complete the remainder of the I/O. * For non-atomic writes, any blocks that may have been * allocated in preparation for the direct I/O will be reused during * buffered I/O. For atomic write, we never fallback to buffered-io. */ if (ext4_want_directio_fallback(flags, written)) return -ENOTBLK; return 0; } const struct iomap_ops ext4_iomap_ops = { .iomap_begin = ext4_iomap_begin, .iomap_end = ext4_iomap_end, }; const struct iomap_ops ext4_iomap_overwrite_ops = { .iomap_begin = ext4_iomap_overwrite_begin, .iomap_end = ext4_iomap_end, }; static int ext4_iomap_begin_report(struct inode *inode, loff_t offset, loff_t length, unsigned int flags, struct iomap *iomap, struct iomap *srcmap) { int ret; struct ext4_map_blocks map; u8 blkbits = inode->i_blkbits; if ((offset >> blkbits) > EXT4_MAX_LOGICAL_BLOCK) return -EINVAL; if (ext4_has_inline_data(inode)) { ret = ext4_inline_data_iomap(inode, iomap); if (ret != -EAGAIN) { if (ret == 0 && offset >= iomap->length) ret = -ENOENT; return ret; } } /* * Calculate the first and last logical block respectively. */ map.m_lblk = offset >> blkbits; map.m_len = min_t(loff_t, (offset + length - 1) >> blkbits, EXT4_MAX_LOGICAL_BLOCK) - map.m_lblk + 1; /* * Fiemap callers may call for offset beyond s_bitmap_maxbytes. * So handle it here itself instead of querying ext4_map_blocks(). * Since ext4_map_blocks() will warn about it and will return * -EIO error. */ if (!(ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS))) { struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb); if (offset >= sbi->s_bitmap_maxbytes) { map.m_flags = 0; goto set_iomap; } } ret = ext4_map_blocks(NULL, inode, &map, 0); if (ret < 0) return ret; set_iomap: ext4_set_iomap(inode, iomap, &map, offset, length, flags); return 0; } const struct iomap_ops ext4_iomap_report_ops = { .iomap_begin = ext4_iomap_begin_report, }; /* * For data=journal mode, folio should be marked dirty only when it was * writeably mapped. When that happens, it was already attached to the * transaction and marked as jbddirty (we take care of this in * ext4_page_mkwrite()). On transaction commit, we writeprotect page mappings * so we should have nothing to do here, except for the case when someone * had the page pinned and dirtied the page through this pin (e.g. by doing * direct IO to it). In that case we'd need to attach buffers here to the * transaction but we cannot due to lock ordering. We cannot just dirty the * folio and leave attached buffers clean, because the buffers' dirty state is * "definitive". We cannot just set the buffers dirty or jbddirty because all * the journalling code will explode. So what we do is to mark the folio * "pending dirty" and next time ext4_writepages() is called, attach buffers * to the transaction appropriately. */ static bool ext4_journalled_dirty_folio(struct address_space *mapping, struct folio *folio) { WARN_ON_ONCE(!folio_buffers(folio)); if (folio_maybe_dma_pinned(folio)) folio_set_checked(folio); return filemap_dirty_folio(mapping, folio); } static bool ext4_dirty_folio(struct address_space *mapping, struct folio *folio) { WARN_ON_ONCE(!folio_test_locked(folio) && !folio_test_dirty(folio)); WARN_ON_ONCE(!folio_buffers(folio)); return block_dirty_folio(mapping, folio); } static int ext4_iomap_swap_activate(struct swap_info_struct *sis, struct file *file, sector_t *span) { return iomap_swapfile_activate(sis, file, span, &ext4_iomap_report_ops); } static const struct address_space_operations ext4_aops = { .read_folio = ext4_read_folio, .readahead = ext4_readahead, .writepages = ext4_writepages, .write_begin = ext4_write_begin, .write_end = ext4_write_end, .dirty_folio = ext4_dirty_folio, .bmap = ext4_bmap, .invalidate_folio = ext4_invalidate_folio, .release_folio = ext4_release_folio, .migrate_folio = buffer_migrate_folio, .is_partially_uptodate = block_is_partially_uptodate, .error_remove_folio = generic_error_remove_folio, .swap_activate = ext4_iomap_swap_activate, }; static const struct address_space_operations ext4_journalled_aops = { .read_folio = ext4_read_folio, .readahead = ext4_readahead, .writepages = ext4_writepages, .write_begin = ext4_write_begin, .write_end = ext4_journalled_write_end, .dirty_folio = ext4_journalled_dirty_folio, .bmap = ext4_bmap, .invalidate_folio = ext4_journalled_invalidate_folio, .release_folio = ext4_release_folio, .migrate_folio = buffer_migrate_folio_norefs, .is_partially_uptodate = block_is_partially_uptodate, .error_remove_folio = generic_error_remove_folio, .swap_activate = ext4_iomap_swap_activate, }; static const struct address_space_operations ext4_da_aops = { .read_folio = ext4_read_folio, .readahead = ext4_readahead, .writepages = ext4_writepages, .write_begin = ext4_da_write_begin, .write_end = ext4_da_write_end, .dirty_folio = ext4_dirty_folio, .bmap = ext4_bmap, .invalidate_folio = ext4_invalidate_folio, .release_folio = ext4_release_folio, .migrate_folio = buffer_migrate_folio, .is_partially_uptodate = block_is_partially_uptodate, .error_remove_folio = generic_error_remove_folio, .swap_activate = ext4_iomap_swap_activate, }; static const struct address_space_operations ext4_dax_aops = { .writepages = ext4_dax_writepages, .dirty_folio = noop_dirty_folio, .bmap = ext4_bmap, .swap_activate = ext4_iomap_swap_activate, }; void ext4_set_aops(struct inode *inode) { switch (ext4_inode_journal_mode(inode)) { case EXT4_INODE_ORDERED_DATA_MODE: case EXT4_INODE_WRITEBACK_DATA_MODE: break; case EXT4_INODE_JOURNAL_DATA_MODE: inode->i_mapping->a_ops = &ext4_journalled_aops; return; default: BUG(); } if (IS_DAX(inode)) inode->i_mapping->a_ops = &ext4_dax_aops; else if (test_opt(inode->i_sb, DELALLOC)) inode->i_mapping->a_ops = &ext4_da_aops; else inode->i_mapping->a_ops = &ext4_aops; } /* * Here we can't skip an unwritten buffer even though it usually reads zero * because it might have data in pagecache (eg, if called from ext4_zero_range, * ext4_punch_hole, etc) which needs to be properly zeroed out. Otherwise a * racing writeback can come later and flush the stale pagecache to disk. */ static int __ext4_block_zero_page_range(handle_t *handle, struct address_space *mapping, loff_t from, loff_t length) { unsigned int offset, blocksize, pos; ext4_lblk_t iblock; struct inode *inode = mapping->host; struct buffer_head *bh; struct folio *folio; int err = 0; folio = __filemap_get_folio(mapping, from >> PAGE_SHIFT, FGP_LOCK | FGP_ACCESSED | FGP_CREAT, mapping_gfp_constraint(mapping, ~__GFP_FS)); if (IS_ERR(folio)) return PTR_ERR(folio); blocksize = inode->i_sb->s_blocksize; iblock = folio->index << (PAGE_SHIFT - inode->i_sb->s_blocksize_bits); bh = folio_buffers(folio); if (!bh) bh = create_empty_buffers(folio, blocksize, 0); /* Find the buffer that contains "offset" */ offset = offset_in_folio(folio, from); pos = blocksize; while (offset >= pos) { bh = bh->b_this_page; iblock++; pos += blocksize; } if (buffer_freed(bh)) { BUFFER_TRACE(bh, "freed: skip"); goto unlock; } if (!buffer_mapped(bh)) { BUFFER_TRACE(bh, "unmapped"); ext4_get_block(inode, iblock, bh, 0); /* unmapped? It's a hole - nothing to do */ if (!buffer_mapped(bh)) { BUFFER_TRACE(bh, "still unmapped"); goto unlock; } } /* Ok, it's mapped. Make sure it's up-to-date */ if (folio_test_uptodate(folio)) set_buffer_uptodate(bh); if (!buffer_uptodate(bh)) { err = ext4_read_bh_lock(bh, 0, true); if (err) goto unlock; if (fscrypt_inode_uses_fs_layer_crypto(inode)) { /* We expect the key to be set. */ BUG_ON(!fscrypt_has_encryption_key(inode)); err = fscrypt_decrypt_pagecache_blocks(folio, blocksize, bh_offset(bh)); if (err) { clear_buffer_uptodate(bh); goto unlock; } } } if (ext4_should_journal_data(inode)) { BUFFER_TRACE(bh, "get write access"); err = ext4_journal_get_write_access(handle, inode->i_sb, bh, EXT4_JTR_NONE); if (err) goto unlock; } folio_zero_range(folio, offset, length); BUFFER_TRACE(bh, "zeroed end of block"); if (ext4_should_journal_data(inode)) { err = ext4_dirty_journalled_data(handle, bh); } else { err = 0; mark_buffer_dirty(bh); if (ext4_should_order_data(inode)) err = ext4_jbd2_inode_add_write(handle, inode, from, length); } unlock: folio_unlock(folio); folio_put(folio); return err; } /* * ext4_block_zero_page_range() zeros out a mapping of length 'length' * starting from file offset 'from'. The range to be zero'd must * be contained with in one block. If the specified range exceeds * the end of the block it will be shortened to end of the block * that corresponds to 'from' */ static int ext4_block_zero_page_range(handle_t *handle, struct address_space *mapping, loff_t from, loff_t length) { struct inode *inode = mapping->host; unsigned offset = from & (PAGE_SIZE-1); unsigned blocksize = inode->i_sb->s_blocksize; unsigned max = blocksize - (offset & (blocksize - 1)); /* * correct length if it does not fall between * 'from' and the end of the block */ if (length > max || length < 0) length = max; if (IS_DAX(inode)) { return dax_zero_range(inode, from, length, NULL, &ext4_iomap_ops); } return __ext4_block_zero_page_range(handle, mapping, from, length); } /* * ext4_block_truncate_page() zeroes out a mapping from file offset `from' * up to the end of the block which corresponds to `from'. * This required during truncate. We need to physically zero the tail end * of that block so it doesn't yield old data if the file is later grown. */ static int ext4_block_truncate_page(handle_t *handle, struct address_space *mapping, loff_t from) { unsigned offset = from & (PAGE_SIZE-1); unsigned length; unsigned blocksize; struct inode *inode = mapping->host; /* If we are processing an encrypted inode during orphan list handling */ if (IS_ENCRYPTED(inode) && !fscrypt_has_encryption_key(inode)) return 0; blocksize = inode->i_sb->s_blocksize; length = blocksize - (offset & (blocksize - 1)); return ext4_block_zero_page_range(handle, mapping, from, length); } int ext4_zero_partial_blocks(handle_t *handle, struct inode *inode, loff_t lstart, loff_t length) { struct super_block *sb = inode->i_sb; struct address_space *mapping = inode->i_mapping; unsigned partial_start, partial_end; ext4_fsblk_t start, end; loff_t byte_end = (lstart + length - 1); int err = 0; partial_start = lstart & (sb->s_blocksize - 1); partial_end = byte_end & (sb->s_blocksize - 1); start = lstart >> sb->s_blocksize_bits; end = byte_end >> sb->s_blocksize_bits; /* Handle partial zero within the single block */ if (start == end && (partial_start || (partial_end != sb->s_blocksize - 1))) { err = ext4_block_zero_page_range(handle, mapping, lstart, length); return err; } /* Handle partial zero out on the start of the range */ if (partial_start) { err = ext4_block_zero_page_range(handle, mapping, lstart, sb->s_blocksize); if (err) return err; } /* Handle partial zero out on the end of the range */ if (partial_end != sb->s_blocksize - 1) err = ext4_block_zero_page_range(handle, mapping, byte_end - partial_end, partial_end + 1); return err; } int ext4_can_truncate(struct inode *inode) { if (S_ISREG(inode->i_mode)) return 1; if (S_ISDIR(inode->i_mode)) return 1; if (S_ISLNK(inode->i_mode)) return !ext4_inode_is_fast_symlink(inode); return 0; } /* * We have to make sure i_disksize gets properly updated before we truncate * page cache due to hole punching or zero range. Otherwise i_disksize update * can get lost as it may have been postponed to submission of writeback but * that will never happen after we truncate page cache. */ int ext4_update_disksize_before_punch(struct inode *inode, loff_t offset, loff_t len) { handle_t *handle; int ret; loff_t size = i_size_read(inode); WARN_ON(!inode_is_locked(inode)); if (offset > size || offset + len < size) return 0; if (EXT4_I(inode)->i_disksize >= size) return 0; handle = ext4_journal_start(inode, EXT4_HT_MISC, 1); if (IS_ERR(handle)) return PTR_ERR(handle); ext4_update_i_disksize(inode, size); ret = ext4_mark_inode_dirty(handle, inode); ext4_journal_stop(handle); return ret; } static inline void ext4_truncate_folio(struct inode *inode, loff_t start, loff_t end) { unsigned long blocksize = i_blocksize(inode); struct folio *folio; /* Nothing to be done if no complete block needs to be truncated. */ if (round_up(start, blocksize) >= round_down(end, blocksize)) return; folio = filemap_lock_folio(inode->i_mapping, start >> PAGE_SHIFT); if (IS_ERR(folio)) return; if (folio_mkclean(folio)) folio_mark_dirty(folio); folio_unlock(folio); folio_put(folio); } int ext4_truncate_page_cache_block_range(struct inode *inode, loff_t start, loff_t end) { unsigned long blocksize = i_blocksize(inode); int ret; /* * For journalled data we need to write (and checkpoint) pages * before discarding page cache to avoid inconsitent data on disk * in case of crash before freeing or unwritten converting trans * is committed. */ if (ext4_should_journal_data(inode)) { ret = filemap_write_and_wait_range(inode->i_mapping, start, end - 1); if (ret) return ret; goto truncate_pagecache; } /* * If the block size is less than the page size, the file's mapped * blocks within one page could be freed or converted to unwritten. * So it's necessary to remove writable userspace mappings, and then * ext4_page_mkwrite() can be called during subsequent write access * to these partial folios. */ if (!IS_ALIGNED(start | end, PAGE_SIZE) && blocksize < PAGE_SIZE && start < inode->i_size) { loff_t page_boundary = round_up(start, PAGE_SIZE); ext4_truncate_folio(inode, start, min(page_boundary, end)); if (end > page_boundary) ext4_truncate_folio(inode, round_down(end, PAGE_SIZE), end); } truncate_pagecache: truncate_pagecache_range(inode, start, end - 1); return 0; } static void ext4_wait_dax_page(struct inode *inode) { filemap_invalidate_unlock(inode->i_mapping); schedule(); filemap_invalidate_lock(inode->i_mapping); } int ext4_break_layouts(struct inode *inode) { if (WARN_ON_ONCE(!rwsem_is_locked(&inode->i_mapping->invalidate_lock))) return -EINVAL; return dax_break_layout_inode(inode, ext4_wait_dax_page); } /* * ext4_punch_hole: punches a hole in a file by releasing the blocks * associated with the given offset and length * * @inode: File inode * @offset: The offset where the hole will begin * @len: The length of the hole * * Returns: 0 on success or negative on failure */ int ext4_punch_hole(struct file *file, loff_t offset, loff_t length) { struct inode *inode = file_inode(file); struct super_block *sb = inode->i_sb; ext4_lblk_t start_lblk, end_lblk; loff_t max_end = sb->s_maxbytes; loff_t end = offset + length; handle_t *handle; unsigned int credits; int ret; trace_ext4_punch_hole(inode, offset, length, 0); WARN_ON_ONCE(!inode_is_locked(inode)); /* * For indirect-block based inodes, make sure that the hole within * one block before last range. */ if (!ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)) max_end = EXT4_SB(sb)->s_bitmap_maxbytes - sb->s_blocksize; /* No need to punch hole beyond i_size */ if (offset >= inode->i_size || offset >= max_end) return 0; /* * If the hole extends beyond i_size, set the hole to end after * the page that contains i_size. */ if (end > inode->i_size) end = round_up(inode->i_size, PAGE_SIZE); if (end > max_end) end = max_end; length = end - offset; /* * Attach jinode to inode for jbd2 if we do any zeroing of partial * block. */ if (!IS_ALIGNED(offset | end, sb->s_blocksize)) { ret = ext4_inode_attach_jinode(inode); if (ret < 0) return ret; } ret = ext4_update_disksize_before_punch(inode, offset, length); if (ret) return ret; /* Now release the pages and zero block aligned part of pages*/ ret = ext4_truncate_page_cache_block_range(inode, offset, end); if (ret) return ret; if (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)) credits = ext4_writepage_trans_blocks(inode); else credits = ext4_blocks_for_truncate(inode); handle = ext4_journal_start(inode, EXT4_HT_TRUNCATE, credits); if (IS_ERR(handle)) { ret = PTR_ERR(handle); ext4_std_error(sb, ret); return ret; } ret = ext4_zero_partial_blocks(handle, inode, offset, length); if (ret) goto out_handle; /* If there are blocks to remove, do it */ start_lblk = EXT4_B_TO_LBLK(inode, offset); end_lblk = end >> inode->i_blkbits; if (end_lblk > start_lblk) { ext4_lblk_t hole_len = end_lblk - start_lblk; ext4_fc_track_inode(handle, inode); ext4_check_map_extents_env(inode); down_write(&EXT4_I(inode)->i_data_sem); ext4_discard_preallocations(inode); ext4_es_remove_extent(inode, start_lblk, hole_len); if (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)) ret = ext4_ext_remove_space(inode, start_lblk, end_lblk - 1); else ret = ext4_ind_remove_space(handle, inode, start_lblk, end_lblk); if (ret) { up_write(&EXT4_I(inode)->i_data_sem); goto out_handle; } ext4_es_insert_extent(inode, start_lblk, hole_len, ~0, EXTENT_STATUS_HOLE, 0); up_write(&EXT4_I(inode)->i_data_sem); } ext4_fc_track_range(handle, inode, start_lblk, end_lblk); ret = ext4_mark_inode_dirty(handle, inode); if (unlikely(ret)) goto out_handle; ext4_update_inode_fsync_trans(handle, inode, 1); if (IS_SYNC(inode)) ext4_handle_sync(handle); out_handle: ext4_journal_stop(handle); return ret; } int ext4_inode_attach_jinode(struct inode *inode) { struct ext4_inode_info *ei = EXT4_I(inode); struct jbd2_inode *jinode; if (ei->jinode || !EXT4_SB(inode->i_sb)->s_journal) return 0; jinode = jbd2_alloc_inode(GFP_KERNEL); spin_lock(&inode->i_lock); if (!ei->jinode) { if (!jinode) { spin_unlock(&inode->i_lock); return -ENOMEM; } ei->jinode = jinode; jbd2_journal_init_jbd_inode(ei->jinode, inode); jinode = NULL; } spin_unlock(&inode->i_lock); if (unlikely(jinode != NULL)) jbd2_free_inode(jinode); return 0; } /* * ext4_truncate() * * We block out ext4_get_block() block instantiations across the entire * transaction, and VFS/VM ensures that ext4_truncate() cannot run * simultaneously on behalf of the same inode. * * As we work through the truncate and commit bits of it to the journal there * is one core, guiding principle: the file's tree must always be consistent on * disk. We must be able to restart the truncate after a crash. * * The file's tree may be transiently inconsistent in memory (although it * probably isn't), but whenever we close off and commit a journal transaction, * the contents of (the filesystem + the journal) must be consistent and * restartable. It's pretty simple, really: bottom up, right to left (although * left-to-right works OK too). * * Note that at recovery time, journal replay occurs *before* the restart of * truncate against the orphan inode list. * * The committed inode has the new, desired i_size (which is the same as * i_disksize in this case). After a crash, ext4_orphan_cleanup() will see * that this inode's truncate did not complete and it will again call * ext4_truncate() to have another go. So there will be instantiated blocks * to the right of the truncation point in a crashed ext4 filesystem. But * that's fine - as long as they are linked from the inode, the post-crash * ext4_truncate() run will find them and release them. */ int ext4_truncate(struct inode *inode) { struct ext4_inode_info *ei = EXT4_I(inode); unsigned int credits; int err = 0, err2; handle_t *handle; struct address_space *mapping = inode->i_mapping; /* * There is a possibility that we're either freeing the inode * or it's a completely new inode. In those cases we might not * have i_rwsem locked because it's not necessary. */ if (!(inode->i_state & (I_NEW|I_FREEING))) WARN_ON(!inode_is_locked(inode)); trace_ext4_truncate_enter(inode); if (!ext4_can_truncate(inode)) goto out_trace; if (inode->i_size == 0 && !test_opt(inode->i_sb, NO_AUTO_DA_ALLOC)) ext4_set_inode_state(inode, EXT4_STATE_DA_ALLOC_CLOSE); if (ext4_has_inline_data(inode)) { int has_inline = 1; err = ext4_inline_data_truncate(inode, &has_inline); if (err || has_inline) goto out_trace; } /* If we zero-out tail of the page, we have to create jinode for jbd2 */ if (inode->i_size & (inode->i_sb->s_blocksize - 1)) { err = ext4_inode_attach_jinode(inode); if (err) goto out_trace; } if (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)) credits = ext4_writepage_trans_blocks(inode); else credits = ext4_blocks_for_truncate(inode); handle = ext4_journal_start(inode, EXT4_HT_TRUNCATE, credits); if (IS_ERR(handle)) { err = PTR_ERR(handle); goto out_trace; } if (inode->i_size & (inode->i_sb->s_blocksize - 1)) ext4_block_truncate_page(handle, mapping, inode->i_size); /* * We add the inode to the orphan list, so that if this * truncate spans multiple transactions, and we crash, we will * resume the truncate when the filesystem recovers. It also * marks the inode dirty, to catch the new size. * * Implication: the file must always be in a sane, consistent * truncatable state while each transaction commits. */ err = ext4_orphan_add(handle, inode); if (err) goto out_stop; ext4_fc_track_inode(handle, inode); ext4_check_map_extents_env(inode); down_write(&EXT4_I(inode)->i_data_sem); ext4_discard_preallocations(inode); if (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)) err = ext4_ext_truncate(handle, inode); else ext4_ind_truncate(handle, inode); up_write(&ei->i_data_sem); if (err) goto out_stop; if (IS_SYNC(inode)) ext4_handle_sync(handle); out_stop: /* * If this was a simple ftruncate() and the file will remain alive, * then we need to clear up the orphan record which we created above. * However, if this was a real unlink then we were called by * ext4_evict_inode(), and we allow that function to clean up the * orphan info for us. */ if (inode->i_nlink) ext4_orphan_del(handle, inode); inode_set_mtime_to_ts(inode, inode_set_ctime_current(inode)); err2 = ext4_mark_inode_dirty(handle, inode); if (unlikely(err2 && !err)) err = err2; ext4_journal_stop(handle); out_trace: trace_ext4_truncate_exit(inode); return err; } static inline u64 ext4_inode_peek_iversion(const struct inode *inode) { if (unlikely(EXT4_I(inode)->i_flags & EXT4_EA_INODE_FL)) return inode_peek_iversion_raw(inode); else return inode_peek_iversion(inode); } static int ext4_inode_blocks_set(struct ext4_inode *raw_inode, struct ext4_inode_info *ei) { struct inode *inode = &(ei->vfs_inode); u64 i_blocks = READ_ONCE(inode->i_blocks); struct super_block *sb = inode->i_sb; if (i_blocks <= ~0U) { /* * i_blocks can be represented in a 32 bit variable * as multiple of 512 bytes */ raw_inode->i_blocks_lo = cpu_to_le32(i_blocks); raw_inode->i_blocks_high = 0; ext4_clear_inode_flag(inode, EXT4_INODE_HUGE_FILE); return 0; } /* * This should never happen since sb->s_maxbytes should not have * allowed this, sb->s_maxbytes was set according to the huge_file * feature in ext4_fill_super(). */ if (!ext4_has_feature_huge_file(sb)) return -EFSCORRUPTED; if (i_blocks <= 0xffffffffffffULL) { /* * i_blocks can be represented in a 48 bit variable * as multiple of 512 bytes */ raw_inode->i_blocks_lo = cpu_to_le32(i_blocks); raw_inode->i_blocks_high = cpu_to_le16(i_blocks >> 32); ext4_clear_inode_flag(inode, EXT4_INODE_HUGE_FILE); } else { ext4_set_inode_flag(inode, EXT4_INODE_HUGE_FILE); /* i_block is stored in file system block size */ i_blocks = i_blocks >> (inode->i_blkbits - 9); raw_inode->i_blocks_lo = cpu_to_le32(i_blocks); raw_inode->i_blocks_high = cpu_to_le16(i_blocks >> 32); } return 0; } static int ext4_fill_raw_inode(struct inode *inode, struct ext4_inode *raw_inode) { struct ext4_inode_info *ei = EXT4_I(inode); uid_t i_uid; gid_t i_gid; projid_t i_projid; int block; int err; err = ext4_inode_blocks_set(raw_inode, ei); raw_inode->i_mode = cpu_to_le16(inode->i_mode); i_uid = i_uid_read(inode); i_gid = i_gid_read(inode); i_projid = from_kprojid(&init_user_ns, ei->i_projid); if (!(test_opt(inode->i_sb, NO_UID32))) { raw_inode->i_uid_low = cpu_to_le16(low_16_bits(i_uid)); raw_inode->i_gid_low = cpu_to_le16(low_16_bits(i_gid)); /* * Fix up interoperability with old kernels. Otherwise, * old inodes get re-used with the upper 16 bits of the * uid/gid intact. */ if (ei->i_dtime && list_empty(&ei->i_orphan)) { raw_inode->i_uid_high = 0; raw_inode->i_gid_high = 0; } else { raw_inode->i_uid_high = cpu_to_le16(high_16_bits(i_uid)); raw_inode->i_gid_high = cpu_to_le16(high_16_bits(i_gid)); } } else { raw_inode->i_uid_low = cpu_to_le16(fs_high2lowuid(i_uid)); raw_inode->i_gid_low = cpu_to_le16(fs_high2lowgid(i_gid)); raw_inode->i_uid_high = 0; raw_inode->i_gid_high = 0; } raw_inode->i_links_count = cpu_to_le16(inode->i_nlink); EXT4_INODE_SET_CTIME(inode, raw_inode); EXT4_INODE_SET_MTIME(inode, raw_inode); EXT4_INODE_SET_ATIME(inode, raw_inode); EXT4_EINODE_SET_XTIME(i_crtime, ei, raw_inode); raw_inode->i_dtime = cpu_to_le32(ei->i_dtime); raw_inode->i_flags = cpu_to_le32(ei->i_flags & 0xFFFFFFFF); if (likely(!test_opt2(inode->i_sb, HURD_COMPAT))) raw_inode->i_file_acl_high = cpu_to_le16(ei->i_file_acl >> 32); raw_inode->i_file_acl_lo = cpu_to_le32(ei->i_file_acl); ext4_isize_set(raw_inode, ei->i_disksize); raw_inode->i_generation = cpu_to_le32(inode->i_generation); if (S_ISCHR(inode->i_mode) || S_ISBLK(inode->i_mode)) { if (old_valid_dev(inode->i_rdev)) { raw_inode->i_block[0] = cpu_to_le32(old_encode_dev(inode->i_rdev)); raw_inode->i_block[1] = 0; } else { raw_inode->i_block[0] = 0; raw_inode->i_block[1] = cpu_to_le32(new_encode_dev(inode->i_rdev)); raw_inode->i_block[2] = 0; } } else if (!ext4_has_inline_data(inode)) { for (block = 0; block < EXT4_N_BLOCKS; block++) raw_inode->i_block[block] = ei->i_data[block]; } if (likely(!test_opt2(inode->i_sb, HURD_COMPAT))) { u64 ivers = ext4_inode_peek_iversion(inode); raw_inode->i_disk_version = cpu_to_le32(ivers); if (ei->i_extra_isize) { if (EXT4_FITS_IN_INODE(raw_inode, ei, i_version_hi)) raw_inode->i_version_hi = cpu_to_le32(ivers >> 32); raw_inode->i_extra_isize = cpu_to_le16(ei->i_extra_isize); } } if (i_projid != EXT4_DEF_PROJID && !ext4_has_feature_project(inode->i_sb)) err = err ?: -EFSCORRUPTED; if (EXT4_INODE_SIZE(inode->i_sb) > EXT4_GOOD_OLD_INODE_SIZE && EXT4_FITS_IN_INODE(raw_inode, ei, i_projid)) raw_inode->i_projid = cpu_to_le32(i_projid); ext4_inode_csum_set(inode, raw_inode, ei); return err; } /* * ext4_get_inode_loc returns with an extra refcount against the inode's * underlying buffer_head on success. If we pass 'inode' and it does not * have in-inode xattr, we have all inode data in memory that is needed * to recreate the on-disk version of this inode. */ static int __ext4_get_inode_loc(struct super_block *sb, unsigned long ino, struct inode *inode, struct ext4_iloc *iloc, ext4_fsblk_t *ret_block) { struct ext4_group_desc *gdp; struct buffer_head *bh; ext4_fsblk_t block; struct blk_plug plug; int inodes_per_block, inode_offset; iloc->bh = NULL; if (ino < EXT4_ROOT_INO || ino > le32_to_cpu(EXT4_SB(sb)->s_es->s_inodes_count)) return -EFSCORRUPTED; iloc->block_group = (ino - 1) / EXT4_INODES_PER_GROUP(sb); gdp = ext4_get_group_desc(sb, iloc->block_group, NULL); if (!gdp) return -EIO; /* * Figure out the offset within the block group inode table */ inodes_per_block = EXT4_SB(sb)->s_inodes_per_block; inode_offset = ((ino - 1) % EXT4_INODES_PER_GROUP(sb)); iloc->offset = (inode_offset % inodes_per_block) * EXT4_INODE_SIZE(sb); block = ext4_inode_table(sb, gdp); if ((block <= le32_to_cpu(EXT4_SB(sb)->s_es->s_first_data_block)) || (block >= ext4_blocks_count(EXT4_SB(sb)->s_es))) { ext4_error(sb, "Invalid inode table block %llu in " "block_group %u", block, iloc->block_group); return -EFSCORRUPTED; } block += (inode_offset / inodes_per_block); bh = sb_getblk(sb, block); if (unlikely(!bh)) return -ENOMEM; if (ext4_buffer_uptodate(bh)) goto has_buffer; lock_buffer(bh); if (ext4_buffer_uptodate(bh)) { /* Someone brought it uptodate while we waited */ unlock_buffer(bh); goto has_buffer; } /* * If we have all information of the inode in memory and this * is the only valid inode in the block, we need not read the * block. */ if (inode && !ext4_test_inode_state(inode, EXT4_STATE_XATTR)) { struct buffer_head *bitmap_bh; int i, start; start = inode_offset & ~(inodes_per_block - 1); /* Is the inode bitmap in cache? */ bitmap_bh = sb_getblk(sb, ext4_inode_bitmap(sb, gdp)); if (unlikely(!bitmap_bh)) goto make_io; /* * If the inode bitmap isn't in cache then the * optimisation may end up performing two reads instead * of one, so skip it. */ if (!buffer_uptodate(bitmap_bh)) { brelse(bitmap_bh); goto make_io; } for (i = start; i < start + inodes_per_block; i++) { if (i == inode_offset) continue; if (ext4_test_bit(i, bitmap_bh->b_data)) break; } brelse(bitmap_bh); if (i == start + inodes_per_block) { struct ext4_inode *raw_inode = (struct ext4_inode *) (bh->b_data + iloc->offset); /* all other inodes are free, so skip I/O */ memset(bh->b_data, 0, bh->b_size); if (!ext4_test_inode_state(inode, EXT4_STATE_NEW)) ext4_fill_raw_inode(inode, raw_inode); set_buffer_uptodate(bh); unlock_buffer(bh); goto has_buffer; } } make_io: /* * If we need to do any I/O, try to pre-readahead extra * blocks from the inode table. */ blk_start_plug(&plug); if (EXT4_SB(sb)->s_inode_readahead_blks) { ext4_fsblk_t b, end, table; unsigned num; __u32 ra_blks = EXT4_SB(sb)->s_inode_readahead_blks; table = ext4_inode_table(sb, gdp); /* s_inode_readahead_blks is always a power of 2 */ b = block & ~((ext4_fsblk_t) ra_blks - 1); if (table > b) b = table; end = b + ra_blks; num = EXT4_INODES_PER_GROUP(sb); if (ext4_has_group_desc_csum(sb)) num -= ext4_itable_unused_count(sb, gdp); table += num / inodes_per_block; if (end > table) end = table; while (b <= end) ext4_sb_breadahead_unmovable(sb, b++); } /* * There are other valid inodes in the buffer, this inode * has in-inode xattrs, or we don't have this inode in memory. * Read the block from disk. */ trace_ext4_load_inode(sb, ino); ext4_read_bh_nowait(bh, REQ_META | REQ_PRIO, NULL, ext4_simulate_fail(sb, EXT4_SIM_INODE_EIO)); blk_finish_plug(&plug); wait_on_buffer(bh); if (!buffer_uptodate(bh)) { if (ret_block) *ret_block = block; brelse(bh); return -EIO; } has_buffer: iloc->bh = bh; return 0; } static int __ext4_get_inode_loc_noinmem(struct inode *inode, struct ext4_iloc *iloc) { ext4_fsblk_t err_blk = 0; int ret; ret = __ext4_get_inode_loc(inode->i_sb, inode->i_ino, NULL, iloc, &err_blk); if (ret == -EIO) ext4_error_inode_block(inode, err_blk, EIO, "unable to read itable block"); return ret; } int ext4_get_inode_loc(struct inode *inode, struct ext4_iloc *iloc) { ext4_fsblk_t err_blk = 0; int ret; ret = __ext4_get_inode_loc(inode->i_sb, inode->i_ino, inode, iloc, &err_blk); if (ret == -EIO) ext4_error_inode_block(inode, err_blk, EIO, "unable to read itable block"); return ret; } int ext4_get_fc_inode_loc(struct super_block *sb, unsigned long ino, struct ext4_iloc *iloc) { return __ext4_get_inode_loc(sb, ino, NULL, iloc, NULL); } static bool ext4_should_enable_dax(struct inode *inode) { struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb); if (test_opt2(inode->i_sb, DAX_NEVER)) return false; if (!S_ISREG(inode->i_mode)) return false; if (ext4_should_journal_data(inode)) return false; if (ext4_has_inline_data(inode)) return false; if (ext4_test_inode_flag(inode, EXT4_INODE_ENCRYPT)) return false; if (ext4_test_inode_flag(inode, EXT4_INODE_VERITY)) return false; if (!test_bit(EXT4_FLAGS_BDEV_IS_DAX, &sbi->s_ext4_flags)) return false; if (test_opt(inode->i_sb, DAX_ALWAYS)) return true; return ext4_test_inode_flag(inode, EXT4_INODE_DAX); } void ext4_set_inode_flags(struct inode *inode, bool init) { unsigned int flags = EXT4_I(inode)->i_flags; unsigned int new_fl = 0; WARN_ON_ONCE(IS_DAX(inode) && init); if (flags & EXT4_SYNC_FL) new_fl |= S_SYNC; if (flags & EXT4_APPEND_FL) new_fl |= S_APPEND; if (flags & EXT4_IMMUTABLE_FL) new_fl |= S_IMMUTABLE; if (flags & EXT4_NOATIME_FL) new_fl |= S_NOATIME; if (flags & EXT4_DIRSYNC_FL) new_fl |= S_DIRSYNC; /* Because of the way inode_set_flags() works we must preserve S_DAX * here if already set. */ new_fl |= (inode->i_flags & S_DAX); if (init && ext4_should_enable_dax(inode)) new_fl |= S_DAX; if (flags & EXT4_ENCRYPT_FL) new_fl |= S_ENCRYPTED; if (flags & EXT4_CASEFOLD_FL) new_fl |= S_CASEFOLD; if (flags & EXT4_VERITY_FL) new_fl |= S_VERITY; inode_set_flags(inode, new_fl, S_SYNC|S_APPEND|S_IMMUTABLE|S_NOATIME|S_DIRSYNC|S_DAX| S_ENCRYPTED|S_CASEFOLD|S_VERITY); } static blkcnt_t ext4_inode_blocks(struct ext4_inode *raw_inode, struct ext4_inode_info *ei) { blkcnt_t i_blocks ; struct inode *inode = &(ei->vfs_inode); struct super_block *sb = inode->i_sb; if (ext4_has_feature_huge_file(sb)) { /* we are using combined 48 bit field */ i_blocks = ((u64)le16_to_cpu(raw_inode->i_blocks_high)) << 32 | le32_to_cpu(raw_inode->i_blocks_lo); if (ext4_test_inode_flag(inode, EXT4_INODE_HUGE_FILE)) { /* i_blocks represent file system block size */ return i_blocks << (inode->i_blkbits - 9); } else { return i_blocks; } } else { return le32_to_cpu(raw_inode->i_blocks_lo); } } static inline int ext4_iget_extra_inode(struct inode *inode, struct ext4_inode *raw_inode, struct ext4_inode_info *ei) { __le32 *magic = (void *)raw_inode + EXT4_GOOD_OLD_INODE_SIZE + ei->i_extra_isize; if (EXT4_INODE_HAS_XATTR_SPACE(inode) && *magic == cpu_to_le32(EXT4_XATTR_MAGIC)) { int err; err = xattr_check_inode(inode, IHDR(inode, raw_inode), ITAIL(inode, raw_inode)); if (err) return err; ext4_set_inode_state(inode, EXT4_STATE_XATTR); err = ext4_find_inline_data_nolock(inode); if (!err && ext4_has_inline_data(inode)) ext4_set_inode_state(inode, EXT4_STATE_MAY_INLINE_DATA); return err; } else EXT4_I(inode)->i_inline_off = 0; return 0; } int ext4_get_projid(struct inode *inode, kprojid_t *projid) { if (!ext4_has_feature_project(inode->i_sb)) return -EOPNOTSUPP; *projid = EXT4_I(inode)->i_projid; return 0; } /* * ext4 has self-managed i_version for ea inodes, it stores the lower 32bit of * refcount in i_version, so use raw values if inode has EXT4_EA_INODE_FL flag * set. */ static inline void ext4_inode_set_iversion_queried(struct inode *inode, u64 val) { if (unlikely(EXT4_I(inode)->i_flags & EXT4_EA_INODE_FL)) inode_set_iversion_raw(inode, val); else inode_set_iversion_queried(inode, val); } static int check_igot_inode(struct inode *inode, ext4_iget_flags flags, const char *function, unsigned int line) { const char *err_str; if (flags & EXT4_IGET_EA_INODE) { if (!(EXT4_I(inode)->i_flags & EXT4_EA_INODE_FL)) { err_str = "missing EA_INODE flag"; goto error; } if (ext4_test_inode_state(inode, EXT4_STATE_XATTR) || EXT4_I(inode)->i_file_acl) { err_str = "ea_inode with extended attributes"; goto error; } } else { if ((EXT4_I(inode)->i_flags & EXT4_EA_INODE_FL)) { /* * open_by_handle_at() could provide an old inode number * that has since been reused for an ea_inode; this does * not indicate filesystem corruption */ if (flags & EXT4_IGET_HANDLE) return -ESTALE; err_str = "unexpected EA_INODE flag"; goto error; } } if (is_bad_inode(inode) && !(flags & EXT4_IGET_BAD)) { err_str = "unexpected bad inode w/o EXT4_IGET_BAD"; goto error; } return 0; error: ext4_error_inode(inode, function, line, 0, "%s", err_str); return -EFSCORRUPTED; } bool ext4_should_enable_large_folio(struct inode *inode) { struct super_block *sb = inode->i_sb; if (!S_ISREG(inode->i_mode)) return false; if (test_opt(sb, DATA_FLAGS) == EXT4_MOUNT_JOURNAL_DATA || ext4_test_inode_flag(inode, EXT4_INODE_JOURNAL_DATA)) return false; if (ext4_has_feature_verity(sb)) return false; if (ext4_has_feature_encrypt(sb)) return false; return true; } struct inode *__ext4_iget(struct super_block *sb, unsigned long ino, ext4_iget_flags flags, const char *function, unsigned int line) { struct ext4_iloc iloc; struct ext4_inode *raw_inode; struct ext4_inode_info *ei; struct ext4_super_block *es = EXT4_SB(sb)->s_es; struct inode *inode; journal_t *journal = EXT4_SB(sb)->s_journal; long ret; loff_t size; int block; uid_t i_uid; gid_t i_gid; projid_t i_projid; if ((!(flags & EXT4_IGET_SPECIAL) && is_special_ino(sb, ino)) || (ino < EXT4_ROOT_INO) || (ino > le32_to_cpu(es->s_inodes_count))) { if (flags & EXT4_IGET_HANDLE) return ERR_PTR(-ESTALE); __ext4_error(sb, function, line, false, EFSCORRUPTED, 0, "inode #%lu: comm %s: iget: illegal inode #", ino, current->comm); return ERR_PTR(-EFSCORRUPTED); } inode = iget_locked(sb, ino); if (!inode) return ERR_PTR(-ENOMEM); if (!(inode->i_state & I_NEW)) { ret = check_igot_inode(inode, flags, function, line); if (ret) { iput(inode); return ERR_PTR(ret); } return inode; } ei = EXT4_I(inode); iloc.bh = NULL; ret = __ext4_get_inode_loc_noinmem(inode, &iloc); if (ret < 0) goto bad_inode; raw_inode = ext4_raw_inode(&iloc); if ((flags & EXT4_IGET_HANDLE) && (raw_inode->i_links_count == 0) && (raw_inode->i_mode == 0)) { ret = -ESTALE; goto bad_inode; } if (EXT4_INODE_SIZE(inode->i_sb) > EXT4_GOOD_OLD_INODE_SIZE) { ei->i_extra_isize = le16_to_cpu(raw_inode->i_extra_isize); if (EXT4_GOOD_OLD_INODE_SIZE + ei->i_extra_isize > EXT4_INODE_SIZE(inode->i_sb) || (ei->i_extra_isize & 3)) { ext4_error_inode(inode, function, line, 0, "iget: bad extra_isize %u " "(inode size %u)", ei->i_extra_isize, EXT4_INODE_SIZE(inode->i_sb)); ret = -EFSCORRUPTED; goto bad_inode; } } else ei->i_extra_isize = 0; /* Precompute checksum seed for inode metadata */ if (ext4_has_feature_metadata_csum(sb)) { struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb); __u32 csum; __le32 inum = cpu_to_le32(inode->i_ino); __le32 gen = raw_inode->i_generation; csum = ext4_chksum(sbi->s_csum_seed, (__u8 *)&inum, sizeof(inum)); ei->i_csum_seed = ext4_chksum(csum, (__u8 *)&gen, sizeof(gen)); } if ((!ext4_inode_csum_verify(inode, raw_inode, ei) || ext4_simulate_fail(sb, EXT4_SIM_INODE_CRC)) && (!(EXT4_SB(sb)->s_mount_state & EXT4_FC_REPLAY))) { ext4_error_inode_err(inode, function, line, 0, EFSBADCRC, "iget: checksum invalid"); ret = -EFSBADCRC; goto bad_inode; } inode->i_mode = le16_to_cpu(raw_inode->i_mode); i_uid = (uid_t)le16_to_cpu(raw_inode->i_uid_low); i_gid = (gid_t)le16_to_cpu(raw_inode->i_gid_low); if (ext4_has_feature_project(sb) && EXT4_INODE_SIZE(sb) > EXT4_GOOD_OLD_INODE_SIZE && EXT4_FITS_IN_INODE(raw_inode, ei, i_projid)) i_projid = (projid_t)le32_to_cpu(raw_inode->i_projid); else i_projid = EXT4_DEF_PROJID; if (!(test_opt(inode->i_sb, NO_UID32))) { i_uid |= le16_to_cpu(raw_inode->i_uid_high) << 16; i_gid |= le16_to_cpu(raw_inode->i_gid_high) << 16; } i_uid_write(inode, i_uid); i_gid_write(inode, i_gid); ei->i_projid = make_kprojid(&init_user_ns, i_projid); set_nlink(inode, le16_to_cpu(raw_inode->i_links_count)); ext4_clear_state_flags(ei); /* Only relevant on 32-bit archs */ ei->i_inline_off = 0; ei->i_dir_start_lookup = 0; ei->i_dtime = le32_to_cpu(raw_inode->i_dtime); /* We now have enough fields to check if the inode was active or not. * This is needed because nfsd might try to access dead inodes * the test is that same one that e2fsck uses * NeilBrown 1999oct15 */ if (inode->i_nlink == 0) { if ((inode->i_mode == 0 || flags & EXT4_IGET_SPECIAL || !(EXT4_SB(inode->i_sb)->s_mount_state & EXT4_ORPHAN_FS)) && ino != EXT4_BOOT_LOADER_INO) { /* this inode is deleted or unallocated */ if (flags & EXT4_IGET_SPECIAL) { ext4_error_inode(inode, function, line, 0, "iget: special inode unallocated"); ret = -EFSCORRUPTED; } else ret = -ESTALE; goto bad_inode; } /* The only unlinked inodes we let through here have * valid i_mode and are being read by the orphan * recovery code: that's fine, we're about to complete * the process of deleting those. * OR it is the EXT4_BOOT_LOADER_INO which is * not initialized on a new filesystem. */ } ei->i_flags = le32_to_cpu(raw_inode->i_flags); ext4_set_inode_flags(inode, true); inode->i_blocks = ext4_inode_blocks(raw_inode, ei); ei->i_file_acl = le32_to_cpu(raw_inode->i_file_acl_lo); if (ext4_has_feature_64bit(sb)) ei->i_file_acl |= ((__u64)le16_to_cpu(raw_inode->i_file_acl_high)) << 32; inode->i_size = ext4_isize(sb, raw_inode); size = i_size_read(inode); if (size < 0 || size > ext4_get_maxbytes(inode)) { ext4_error_inode(inode, function, line, 0, "iget: bad i_size value: %lld", size); ret = -EFSCORRUPTED; goto bad_inode; } /* * If dir_index is not enabled but there's dir with INDEX flag set, * we'd normally treat htree data as empty space. But with metadata * checksumming that corrupts checksums so forbid that. */ if (!ext4_has_feature_dir_index(sb) && ext4_has_feature_metadata_csum(sb) && ext4_test_inode_flag(inode, EXT4_INODE_INDEX)) { ext4_error_inode(inode, function, line, 0, "iget: Dir with htree data on filesystem without dir_index feature."); ret = -EFSCORRUPTED; goto bad_inode; } ei->i_disksize = inode->i_size; #ifdef CONFIG_QUOTA ei->i_reserved_quota = 0; #endif inode->i_generation = le32_to_cpu(raw_inode->i_generation); ei->i_block_group = iloc.block_group; ei->i_last_alloc_group = ~0; /* * NOTE! The in-memory inode i_data array is in little-endian order * even on big-endian machines: we do NOT byteswap the block numbers! */ for (block = 0; block < EXT4_N_BLOCKS; block++) ei->i_data[block] = raw_inode->i_block[block]; INIT_LIST_HEAD(&ei->i_orphan); ext4_fc_init_inode(&ei->vfs_inode); /* * Set transaction id's of transactions that have to be committed * to finish f[data]sync. We set them to currently running transaction * as we cannot be sure that the inode or some of its metadata isn't * part of the transaction - the inode could have been reclaimed and * now it is reread from disk. */ if (journal) { transaction_t *transaction; tid_t tid; read_lock(&journal->j_state_lock); if (journal->j_running_transaction) transaction = journal->j_running_transaction; else transaction = journal->j_committing_transaction; if (transaction) tid = transaction->t_tid; else tid = journal->j_commit_sequence; read_unlock(&journal->j_state_lock); ei->i_sync_tid = tid; ei->i_datasync_tid = tid; } if (EXT4_INODE_SIZE(inode->i_sb) > EXT4_GOOD_OLD_INODE_SIZE) { if (ei->i_extra_isize == 0) { /* The extra space is currently unused. Use it. */ BUILD_BUG_ON(sizeof(struct ext4_inode) & 3); ei->i_extra_isize = sizeof(struct ext4_inode) - EXT4_GOOD_OLD_INODE_SIZE; } else { ret = ext4_iget_extra_inode(inode, raw_inode, ei); if (ret) goto bad_inode; } } EXT4_INODE_GET_CTIME(inode, raw_inode); EXT4_INODE_GET_ATIME(inode, raw_inode); EXT4_INODE_GET_MTIME(inode, raw_inode); EXT4_EINODE_GET_XTIME(i_crtime, ei, raw_inode); if (likely(!test_opt2(inode->i_sb, HURD_COMPAT))) { u64 ivers = le32_to_cpu(raw_inode->i_disk_version); if (EXT4_INODE_SIZE(inode->i_sb) > EXT4_GOOD_OLD_INODE_SIZE) { if (EXT4_FITS_IN_INODE(raw_inode, ei, i_version_hi)) ivers |= (__u64)(le32_to_cpu(raw_inode->i_version_hi)) << 32; } ext4_inode_set_iversion_queried(inode, ivers); } ret = 0; if (ei->i_file_acl && !ext4_inode_block_valid(inode, ei->i_file_acl, 1)) { ext4_error_inode(inode, function, line, 0, "iget: bad extended attribute block %llu", ei->i_file_acl); ret = -EFSCORRUPTED; goto bad_inode; } else if (!ext4_has_inline_data(inode)) { /* validate the block references in the inode */ if (!(EXT4_SB(sb)->s_mount_state & EXT4_FC_REPLAY) && (S_ISREG(inode->i_mode) || S_ISDIR(inode->i_mode) || (S_ISLNK(inode->i_mode) && !ext4_inode_is_fast_symlink(inode)))) { if (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)) ret = ext4_ext_check_inode(inode); else ret = ext4_ind_check_inode(inode); } } if (ret) goto bad_inode; if (S_ISREG(inode->i_mode)) { inode->i_op = &ext4_file_inode_operations; inode->i_fop = &ext4_file_operations; ext4_set_aops(inode); } else if (S_ISDIR(inode->i_mode)) { inode->i_op = &ext4_dir_inode_operations; inode->i_fop = &ext4_dir_operations; } else if (S_ISLNK(inode->i_mode)) { /* VFS does not allow setting these so must be corruption */ if (IS_APPEND(inode) || IS_IMMUTABLE(inode)) { ext4_error_inode(inode, function, line, 0, "iget: immutable or append flags " "not allowed on symlinks"); ret = -EFSCORRUPTED; goto bad_inode; } if (IS_ENCRYPTED(inode)) { inode->i_op = &ext4_encrypted_symlink_inode_operations; } else if (ext4_inode_is_fast_symlink(inode)) { inode->i_op = &ext4_fast_symlink_inode_operations; if (inode->i_size == 0 || inode->i_size >= sizeof(ei->i_data) || strnlen((char *)ei->i_data, inode->i_size + 1) != inode->i_size) { ext4_error_inode(inode, function, line, 0, "invalid fast symlink length %llu", (unsigned long long)inode->i_size); ret = -EFSCORRUPTED; goto bad_inode; } inode_set_cached_link(inode, (char *)ei->i_data, inode->i_size); } else { inode->i_op = &ext4_symlink_inode_operations; } } else if (S_ISCHR(inode->i_mode) || S_ISBLK(inode->i_mode) || S_ISFIFO(inode->i_mode) || S_ISSOCK(inode->i_mode)) { inode->i_op = &ext4_special_inode_operations; if (raw_inode->i_block[0]) init_special_inode(inode, inode->i_mode, old_decode_dev(le32_to_cpu(raw_inode->i_block[0]))); else init_special_inode(inode, inode->i_mode, new_decode_dev(le32_to_cpu(raw_inode->i_block[1]))); } else if (ino == EXT4_BOOT_LOADER_INO) { make_bad_inode(inode); } else { ret = -EFSCORRUPTED; ext4_error_inode(inode, function, line, 0, "iget: bogus i_mode (%o)", inode->i_mode); goto bad_inode; } if (IS_CASEFOLDED(inode) && !ext4_has_feature_casefold(inode->i_sb)) { ext4_error_inode(inode, function, line, 0, "casefold flag without casefold feature"); ret = -EFSCORRUPTED; goto bad_inode; } if (ext4_should_enable_large_folio(inode)) mapping_set_large_folios(inode->i_mapping); ret = check_igot_inode(inode, flags, function, line); /* * -ESTALE here means there is nothing inherently wrong with the inode, * it's just not an inode we can return for an fhandle lookup. */ if (ret == -ESTALE) { brelse(iloc.bh); unlock_new_inode(inode); iput(inode); return ERR_PTR(-ESTALE); } if (ret) goto bad_inode; brelse(iloc.bh); unlock_new_inode(inode); return inode; bad_inode: brelse(iloc.bh); iget_failed(inode); return ERR_PTR(ret); } static void __ext4_update_other_inode_time(struct super_block *sb, unsigned long orig_ino, unsigned long ino, struct ext4_inode *raw_inode) { struct inode *inode; inode = find_inode_by_ino_rcu(sb, ino); if (!inode) return; if (!inode_is_dirtytime_only(inode)) return; spin_lock(&inode->i_lock); if (inode_is_dirtytime_only(inode)) { struct ext4_inode_info *ei = EXT4_I(inode); inode->i_state &= ~I_DIRTY_TIME; spin_unlock(&inode->i_lock); spin_lock(&ei->i_raw_lock); EXT4_INODE_SET_CTIME(inode, raw_inode); EXT4_INODE_SET_MTIME(inode, raw_inode); EXT4_INODE_SET_ATIME(inode, raw_inode); ext4_inode_csum_set(inode, raw_inode, ei); spin_unlock(&ei->i_raw_lock); trace_ext4_other_inode_update_time(inode, orig_ino); return; } spin_unlock(&inode->i_lock); } /* * Opportunistically update the other time fields for other inodes in * the same inode table block. */ static void ext4_update_other_inodes_time(struct super_block *sb, unsigned long orig_ino, char *buf) { unsigned long ino; int i, inodes_per_block = EXT4_SB(sb)->s_inodes_per_block; int inode_size = EXT4_INODE_SIZE(sb); /* * Calculate the first inode in the inode table block. Inode * numbers are one-based. That is, the first inode in a block * (assuming 4k blocks and 256 byte inodes) is (n*16 + 1). */ ino = ((orig_ino - 1) & ~(inodes_per_block - 1)) + 1; rcu_read_lock(); for (i = 0; i < inodes_per_block; i++, ino++, buf += inode_size) { if (ino == orig_ino) continue; __ext4_update_other_inode_time(sb, orig_ino, ino, (struct ext4_inode *)buf); } rcu_read_unlock(); } /* * Post the struct inode info into an on-disk inode location in the * buffer-cache. This gobbles the caller's reference to the * buffer_head in the inode location struct. * * The caller must have write access to iloc->bh. */ static int ext4_do_update_inode(handle_t *handle, struct inode *inode, struct ext4_iloc *iloc) { struct ext4_inode *raw_inode = ext4_raw_inode(iloc); struct ext4_inode_info *ei = EXT4_I(inode); struct buffer_head *bh = iloc->bh; struct super_block *sb = inode->i_sb; int err; int need_datasync = 0, set_large_file = 0; spin_lock(&ei->i_raw_lock); /* * For fields not tracked in the in-memory inode, initialise them * to zero for new inodes. */ if (ext4_test_inode_state(inode, EXT4_STATE_NEW)) memset(raw_inode, 0, EXT4_SB(inode->i_sb)->s_inode_size); if (READ_ONCE(ei->i_disksize) != ext4_isize(inode->i_sb, raw_inode)) need_datasync = 1; if (ei->i_disksize > 0x7fffffffULL) { if (!ext4_has_feature_large_file(sb) || EXT4_SB(sb)->s_es->s_rev_level == cpu_to_le32(EXT4_GOOD_OLD_REV)) set_large_file = 1; } err = ext4_fill_raw_inode(inode, raw_inode); spin_unlock(&ei->i_raw_lock); if (err) { EXT4_ERROR_INODE(inode, "corrupted inode contents"); goto out_brelse; } if (inode->i_sb->s_flags & SB_LAZYTIME) ext4_update_other_inodes_time(inode->i_sb, inode->i_ino, bh->b_data); BUFFER_TRACE(bh, "call ext4_handle_dirty_metadata"); err = ext4_handle_dirty_metadata(handle, NULL, bh); if (err) goto out_error; ext4_clear_inode_state(inode, EXT4_STATE_NEW); if (set_large_file) { BUFFER_TRACE(EXT4_SB(sb)->s_sbh, "get write access"); err = ext4_journal_get_write_access(handle, sb, EXT4_SB(sb)->s_sbh, EXT4_JTR_NONE); if (err) goto out_error; lock_buffer(EXT4_SB(sb)->s_sbh); ext4_set_feature_large_file(sb); ext4_superblock_csum_set(sb); unlock_buffer(EXT4_SB(sb)->s_sbh); ext4_handle_sync(handle); err = ext4_handle_dirty_metadata(handle, NULL, EXT4_SB(sb)->s_sbh); } ext4_update_inode_fsync_trans(handle, inode, need_datasync); out_error: ext4_std_error(inode->i_sb, err); out_brelse: brelse(bh); return err; } /* * ext4_write_inode() * * We are called from a few places: * * - Within generic_file_aio_write() -> generic_write_sync() for O_SYNC files. * Here, there will be no transaction running. We wait for any running * transaction to commit. * * - Within flush work (sys_sync(), kupdate and such). * We wait on commit, if told to. * * - Within iput_final() -> write_inode_now() * We wait on commit, if told to. * * In all cases it is actually safe for us to return without doing anything, * because the inode has been copied into a raw inode buffer in * ext4_mark_inode_dirty(). This is a correctness thing for WB_SYNC_ALL * writeback. * * Note that we are absolutely dependent upon all inode dirtiers doing the * right thing: they *must* call mark_inode_dirty() after dirtying info in * which we are interested. * * It would be a bug for them to not do this. The code: * * mark_inode_dirty(inode) * stuff(); * inode->i_size = expr; * * is in error because write_inode() could occur while `stuff()' is running, * and the new i_size will be lost. Plus the inode will no longer be on the * superblock's dirty inode list. */ int ext4_write_inode(struct inode *inode, struct writeback_control *wbc) { int err; if (WARN_ON_ONCE(current->flags & PF_MEMALLOC)) return 0; err = ext4_emergency_state(inode->i_sb); if (unlikely(err)) return err; if (EXT4_SB(inode->i_sb)->s_journal) { if (ext4_journal_current_handle()) { ext4_debug("called recursively, non-PF_MEMALLOC!\n"); dump_stack(); return -EIO; } /* * No need to force transaction in WB_SYNC_NONE mode. Also * ext4_sync_fs() will force the commit after everything is * written. */ if (wbc->sync_mode != WB_SYNC_ALL || wbc->for_sync) return 0; err = ext4_fc_commit(EXT4_SB(inode->i_sb)->s_journal, EXT4_I(inode)->i_sync_tid); } else { struct ext4_iloc iloc; err = __ext4_get_inode_loc_noinmem(inode, &iloc); if (err) return err; /* * sync(2) will flush the whole buffer cache. No need to do * it here separately for each inode. */ if (wbc->sync_mode == WB_SYNC_ALL && !wbc->for_sync) sync_dirty_buffer(iloc.bh); if (buffer_req(iloc.bh) && !buffer_uptodate(iloc.bh)) { ext4_error_inode_block(inode, iloc.bh->b_blocknr, EIO, "IO error syncing inode"); err = -EIO; } brelse(iloc.bh); } return err; } /* * In data=journal mode ext4_journalled_invalidate_folio() may fail to invalidate * buffers that are attached to a folio straddling i_size and are undergoing * commit. In that case we have to wait for commit to finish and try again. */ static void ext4_wait_for_tail_page_commit(struct inode *inode) { unsigned offset; journal_t *journal = EXT4_SB(inode->i_sb)->s_journal; tid_t commit_tid; int ret; bool has_transaction; offset = inode->i_size & (PAGE_SIZE - 1); /* * If the folio is fully truncated, we don't need to wait for any commit * (and we even should not as __ext4_journalled_invalidate_folio() may * strip all buffers from the folio but keep the folio dirty which can then * confuse e.g. concurrent ext4_writepages() seeing dirty folio without * buffers). Also we don't need to wait for any commit if all buffers in * the folio remain valid. This is most beneficial for the common case of * blocksize == PAGESIZE. */ if (!offset || offset > (PAGE_SIZE - i_blocksize(inode))) return; while (1) { struct folio *folio = filemap_lock_folio(inode->i_mapping, inode->i_size >> PAGE_SHIFT); if (IS_ERR(folio)) return; ret = __ext4_journalled_invalidate_folio(folio, offset, folio_size(folio) - offset); folio_unlock(folio); folio_put(folio); if (ret != -EBUSY) return; has_transaction = false; read_lock(&journal->j_state_lock); if (journal->j_committing_transaction) { commit_tid = journal->j_committing_transaction->t_tid; has_transaction = true; } read_unlock(&journal->j_state_lock); if (has_transaction) jbd2_log_wait_commit(journal, commit_tid); } } /* * ext4_setattr() * * Called from notify_change. * * We want to trap VFS attempts to truncate the file as soon as * possible. In particular, we want to make sure that when the VFS * shrinks i_size, we put the inode on the orphan list and modify * i_disksize immediately, so that during the subsequent flushing of * dirty pages and freeing of disk blocks, we can guarantee that any * commit will leave the blocks being flushed in an unused state on * disk. (On recovery, the inode will get truncated and the blocks will * be freed, so we have a strong guarantee that no future commit will * leave these blocks visible to the user.) * * Another thing we have to assure is that if we are in ordered mode * and inode is still attached to the committing transaction, we must * we start writeout of all the dirty pages which are being truncated. * This way we are sure that all the data written in the previous * transaction are already on disk (truncate waits for pages under * writeback). * * Called with inode->i_rwsem down. */ int ext4_setattr(struct mnt_idmap *idmap, struct dentry *dentry, struct iattr *attr) { struct inode *inode = d_inode(dentry); int error, rc = 0; int orphan = 0; const unsigned int ia_valid = attr->ia_valid; bool inc_ivers = true; error = ext4_emergency_state(inode->i_sb); if (unlikely(error)) return error; if (unlikely(IS_IMMUTABLE(inode))) return -EPERM; if (unlikely(IS_APPEND(inode) && (ia_valid & (ATTR_MODE | ATTR_UID | ATTR_GID | ATTR_TIMES_SET)))) return -EPERM; error = setattr_prepare(idmap, dentry, attr); if (error) return error; error = fscrypt_prepare_setattr(dentry, attr); if (error) return error; error = fsverity_prepare_setattr(dentry, attr); if (error) return error; if (is_quota_modification(idmap, inode, attr)) { error = dquot_initialize(inode); if (error) return error; } if (i_uid_needs_update(idmap, attr, inode) || i_gid_needs_update(idmap, attr, inode)) { handle_t *handle; /* (user+group)*(old+new) structure, inode write (sb, * inode block, ? - but truncate inode update has it) */ handle = ext4_journal_start(inode, EXT4_HT_QUOTA, (EXT4_MAXQUOTAS_INIT_BLOCKS(inode->i_sb) + EXT4_MAXQUOTAS_DEL_BLOCKS(inode->i_sb)) + 3); if (IS_ERR(handle)) { error = PTR_ERR(handle); goto err_out; } /* dquot_transfer() calls back ext4_get_inode_usage() which * counts xattr inode references. */ down_read(&EXT4_I(inode)->xattr_sem); error = dquot_transfer(idmap, inode, attr); up_read(&EXT4_I(inode)->xattr_sem); if (error) { ext4_journal_stop(handle); return error; } /* Update corresponding info in inode so that everything is in * one transaction */ i_uid_update(idmap, attr, inode); i_gid_update(idmap, attr, inode); error = ext4_mark_inode_dirty(handle, inode); ext4_journal_stop(handle); if (unlikely(error)) { return error; } } if (attr->ia_valid & ATTR_SIZE) { handle_t *handle; loff_t oldsize = inode->i_size; loff_t old_disksize; int shrink = (attr->ia_size < inode->i_size); if (!(ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS))) { struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb); if (attr->ia_size > sbi->s_bitmap_maxbytes) { return -EFBIG; } } if (!S_ISREG(inode->i_mode)) { return -EINVAL; } if (attr->ia_size == inode->i_size) inc_ivers = false; if (shrink) { if (ext4_should_order_data(inode)) { error = ext4_begin_ordered_truncate(inode, attr->ia_size); if (error) goto err_out; } /* * Blocks are going to be removed from the inode. Wait * for dio in flight. */ inode_dio_wait(inode); } filemap_invalidate_lock(inode->i_mapping); rc = ext4_break_layouts(inode); if (rc) { filemap_invalidate_unlock(inode->i_mapping); goto err_out; } if (attr->ia_size != inode->i_size) { /* attach jbd2 jinode for EOF folio tail zeroing */ if (attr->ia_size & (inode->i_sb->s_blocksize - 1) || oldsize & (inode->i_sb->s_blocksize - 1)) { error = ext4_inode_attach_jinode(inode); if (error) goto out_mmap_sem; } handle = ext4_journal_start(inode, EXT4_HT_INODE, 3); if (IS_ERR(handle)) { error = PTR_ERR(handle); goto out_mmap_sem; } if (ext4_handle_valid(handle) && shrink) { error = ext4_orphan_add(handle, inode); orphan = 1; } /* * Update c/mtime and tail zero the EOF folio on * truncate up. ext4_truncate() handles the shrink case * below. */ if (!shrink) { inode_set_mtime_to_ts(inode, inode_set_ctime_current(inode)); if (oldsize & (inode->i_sb->s_blocksize - 1)) ext4_block_truncate_page(handle, inode->i_mapping, oldsize); } if (shrink) ext4_fc_track_range(handle, inode, (attr->ia_size > 0 ? attr->ia_size - 1 : 0) >> inode->i_sb->s_blocksize_bits, EXT_MAX_BLOCKS - 1); else ext4_fc_track_range( handle, inode, (oldsize > 0 ? oldsize - 1 : oldsize) >> inode->i_sb->s_blocksize_bits, (attr->ia_size > 0 ? attr->ia_size - 1 : 0) >> inode->i_sb->s_blocksize_bits); down_write(&EXT4_I(inode)->i_data_sem); old_disksize = EXT4_I(inode)->i_disksize; EXT4_I(inode)->i_disksize = attr->ia_size; /* * We have to update i_size under i_data_sem together * with i_disksize to avoid races with writeback code * running ext4_wb_update_i_disksize(). */ if (!error) i_size_write(inode, attr->ia_size); else EXT4_I(inode)->i_disksize = old_disksize; up_write(&EXT4_I(inode)->i_data_sem); rc = ext4_mark_inode_dirty(handle, inode); if (!error) error = rc; ext4_journal_stop(handle); if (error) goto out_mmap_sem; if (!shrink) { pagecache_isize_extended(inode, oldsize, inode->i_size); } else if (ext4_should_journal_data(inode)) { ext4_wait_for_tail_page_commit(inode); } } /* * Truncate pagecache after we've waited for commit * in data=journal mode to make pages freeable. */ truncate_pagecache(inode, inode->i_size); /* * Call ext4_truncate() even if i_size didn't change to * truncate possible preallocated blocks. */ if (attr->ia_size <= oldsize) { rc = ext4_truncate(inode); if (rc) error = rc; } out_mmap_sem: filemap_invalidate_unlock(inode->i_mapping); } if (!error) { if (inc_ivers) inode_inc_iversion(inode); setattr_copy(idmap, inode, attr); mark_inode_dirty(inode); } /* * If the call to ext4_truncate failed to get a transaction handle at * all, we need to clean up the in-core orphan list manually. */ if (orphan && inode->i_nlink) ext4_orphan_del(NULL, inode); if (!error && (ia_valid & ATTR_MODE)) rc = posix_acl_chmod(idmap, dentry, inode->i_mode); err_out: if (error) ext4_std_error(inode->i_sb, error); if (!error) error = rc; return error; } u32 ext4_dio_alignment(struct inode *inode) { if (fsverity_active(inode)) return 0; if (ext4_should_journal_data(inode)) return 0; if (ext4_has_inline_data(inode)) return 0; if (IS_ENCRYPTED(inode)) { if (!fscrypt_dio_supported(inode)) return 0; return i_blocksize(inode); } return 1; /* use the iomap defaults */ } int ext4_getattr(struct mnt_idmap *idmap, const struct path *path, struct kstat *stat, u32 request_mask, unsigned int query_flags) { struct inode *inode = d_inode(path->dentry); struct ext4_inode *raw_inode; struct ext4_inode_info *ei = EXT4_I(inode); unsigned int flags; if ((request_mask & STATX_BTIME) && EXT4_FITS_IN_INODE(raw_inode, ei, i_crtime)) { stat->result_mask |= STATX_BTIME; stat->btime.tv_sec = ei->i_crtime.tv_sec; stat->btime.tv_nsec = ei->i_crtime.tv_nsec; } /* * Return the DIO alignment restrictions if requested. We only return * this information when requested, since on encrypted files it might * take a fair bit of work to get if the file wasn't opened recently. */ if ((request_mask & STATX_DIOALIGN) && S_ISREG(inode->i_mode)) { u32 dio_align = ext4_dio_alignment(inode); stat->result_mask |= STATX_DIOALIGN; if (dio_align == 1) { struct block_device *bdev = inode->i_sb->s_bdev; /* iomap defaults */ stat->dio_mem_align = bdev_dma_alignment(bdev) + 1; stat->dio_offset_align = bdev_logical_block_size(bdev); } else { stat->dio_mem_align = dio_align; stat->dio_offset_align = dio_align; } } if ((request_mask & STATX_WRITE_ATOMIC) && S_ISREG(inode->i_mode)) { struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb); unsigned int awu_min = 0, awu_max = 0; if (ext4_inode_can_atomic_write(inode)) { awu_min = sbi->s_awu_min; awu_max = sbi->s_awu_max; } generic_fill_statx_atomic_writes(stat, awu_min, awu_max, 0); } flags = ei->i_flags & EXT4_FL_USER_VISIBLE; if (flags & EXT4_APPEND_FL) stat->attributes |= STATX_ATTR_APPEND; if (flags & EXT4_COMPR_FL) stat->attributes |= STATX_ATTR_COMPRESSED; if (flags & EXT4_ENCRYPT_FL) stat->attributes |= STATX_ATTR_ENCRYPTED; if (flags & EXT4_IMMUTABLE_FL) stat->attributes |= STATX_ATTR_IMMUTABLE; if (flags & EXT4_NODUMP_FL) stat->attributes |= STATX_ATTR_NODUMP; if (flags & EXT4_VERITY_FL) stat->attributes |= STATX_ATTR_VERITY; stat->attributes_mask |= (STATX_ATTR_APPEND | STATX_ATTR_COMPRESSED | STATX_ATTR_ENCRYPTED | STATX_ATTR_IMMUTABLE | STATX_ATTR_NODUMP | STATX_ATTR_VERITY); generic_fillattr(idmap, request_mask, inode, stat); return 0; } int ext4_file_getattr(struct mnt_idmap *idmap, const struct path *path, struct kstat *stat, u32 request_mask, unsigned int query_flags) { struct inode *inode = d_inode(path->dentry); u64 delalloc_blocks; ext4_getattr(idmap, path, stat, request_mask, query_flags); /* * If there is inline data in the inode, the inode will normally not * have data blocks allocated (it may have an external xattr block). * Report at least one sector for such files, so tools like tar, rsync, * others don't incorrectly think the file is completely sparse. */ if (unlikely(ext4_has_inline_data(inode))) stat->blocks += (stat->size + 511) >> 9; /* * We can't update i_blocks if the block allocation is delayed * otherwise in the case of system crash before the real block * allocation is done, we will have i_blocks inconsistent with * on-disk file blocks. * We always keep i_blocks updated together with real * allocation. But to not confuse with user, stat * will return the blocks that include the delayed allocation * blocks for this file. */ delalloc_blocks = EXT4_C2B(EXT4_SB(inode->i_sb), EXT4_I(inode)->i_reserved_data_blocks); stat->blocks += delalloc_blocks << (inode->i_sb->s_blocksize_bits - 9); return 0; } static int ext4_index_trans_blocks(struct inode *inode, int lblocks, int pextents) { if (!(ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS))) return ext4_ind_trans_blocks(inode, lblocks); return ext4_ext_index_trans_blocks(inode, pextents); } /* * Account for index blocks, block groups bitmaps and block group * descriptor blocks if modify datablocks and index blocks * worse case, the indexs blocks spread over different block groups * * If datablocks are discontiguous, they are possible to spread over * different block groups too. If they are contiguous, with flexbg, * they could still across block group boundary. * * Also account for superblock, inode, quota and xattr blocks */ int ext4_meta_trans_blocks(struct inode *inode, int lblocks, int pextents) { ext4_group_t groups, ngroups = ext4_get_groups_count(inode->i_sb); int gdpblocks; int idxblocks; int ret; /* * How many index and lead blocks need to touch to map @lblocks * logical blocks to @pextents physical extents? */ idxblocks = ext4_index_trans_blocks(inode, lblocks, pextents); /* * Now let's see how many group bitmaps and group descriptors need * to account */ groups = idxblocks; gdpblocks = groups; if (groups > ngroups) groups = ngroups; if (groups > EXT4_SB(inode->i_sb)->s_gdb_count) gdpblocks = EXT4_SB(inode->i_sb)->s_gdb_count; /* bitmaps and block group descriptor blocks */ ret = idxblocks + groups + gdpblocks; /* Blocks for super block, inode, quota and xattr blocks */ ret += EXT4_META_TRANS_BLOCKS(inode->i_sb); return ret; } /* * Calculate the total number of credits to reserve to fit * the modification of a single pages into a single transaction, * which may include multiple chunks of block allocations. * * This could be called via ext4_write_begin() * * We need to consider the worse case, when * one new block per extent. */ int ext4_writepage_trans_blocks(struct inode *inode) { int bpp = ext4_journal_blocks_per_folio(inode); int ret; ret = ext4_meta_trans_blocks(inode, bpp, bpp); /* Account for data blocks for journalled mode */ if (ext4_should_journal_data(inode)) ret += bpp; return ret; } /* * Calculate the journal credits for a chunk of data modification. * * This is called from DIO, fallocate or whoever calling * ext4_map_blocks() to map/allocate a chunk of contiguous disk blocks. * * journal buffers for data blocks are not included here, as DIO * and fallocate do no need to journal data buffers. */ int ext4_chunk_trans_blocks(struct inode *inode, int nrblocks) { return ext4_meta_trans_blocks(inode, nrblocks, 1); } /* * The caller must have previously called ext4_reserve_inode_write(). * Give this, we know that the caller already has write access to iloc->bh. */ int ext4_mark_iloc_dirty(handle_t *handle, struct inode *inode, struct ext4_iloc *iloc) { int err = 0; err = ext4_emergency_state(inode->i_sb); if (unlikely(err)) { put_bh(iloc->bh); return err; } ext4_fc_track_inode(handle, inode); /* the do_update_inode consumes one bh->b_count */ get_bh(iloc->bh); /* ext4_do_update_inode() does jbd2_journal_dirty_metadata */ err = ext4_do_update_inode(handle, inode, iloc); put_bh(iloc->bh); return err; } /* * On success, We end up with an outstanding reference count against * iloc->bh. This _must_ be cleaned up later. */ int ext4_reserve_inode_write(handle_t *handle, struct inode *inode, struct ext4_iloc *iloc) { int err; err = ext4_emergency_state(inode->i_sb); if (unlikely(err)) return err; err = ext4_get_inode_loc(inode, iloc); if (!err) { BUFFER_TRACE(iloc->bh, "get_write_access"); err = ext4_journal_get_write_access(handle, inode->i_sb, iloc->bh, EXT4_JTR_NONE); if (err) { brelse(iloc->bh); iloc->bh = NULL; } ext4_fc_track_inode(handle, inode); } ext4_std_error(inode->i_sb, err); return err; } static int __ext4_expand_extra_isize(struct inode *inode, unsigned int new_extra_isize, struct ext4_iloc *iloc, handle_t *handle, int *no_expand) { struct ext4_inode *raw_inode; struct ext4_xattr_ibody_header *header; unsigned int inode_size = EXT4_INODE_SIZE(inode->i_sb); struct ext4_inode_info *ei = EXT4_I(inode); int error; /* this was checked at iget time, but double check for good measure */ if ((EXT4_GOOD_OLD_INODE_SIZE + ei->i_extra_isize > inode_size) || (ei->i_extra_isize & 3)) { EXT4_ERROR_INODE(inode, "bad extra_isize %u (inode size %u)", ei->i_extra_isize, EXT4_INODE_SIZE(inode->i_sb)); return -EFSCORRUPTED; } if ((new_extra_isize < ei->i_extra_isize) || (new_extra_isize < 4) || (new_extra_isize > inode_size - EXT4_GOOD_OLD_INODE_SIZE)) return -EINVAL; /* Should never happen */ raw_inode = ext4_raw_inode(iloc); header = IHDR(inode, raw_inode); /* No extended attributes present */ if (!ext4_test_inode_state(inode, EXT4_STATE_XATTR) || header->h_magic != cpu_to_le32(EXT4_XATTR_MAGIC)) { memset((void *)raw_inode + EXT4_GOOD_OLD_INODE_SIZE + EXT4_I(inode)->i_extra_isize, 0, new_extra_isize - EXT4_I(inode)->i_extra_isize); EXT4_I(inode)->i_extra_isize = new_extra_isize; return 0; } /* * We may need to allocate external xattr block so we need quotas * initialized. Here we can be called with various locks held so we * cannot affort to initialize quotas ourselves. So just bail. */ if (dquot_initialize_needed(inode)) return -EAGAIN; /* try to expand with EAs present */ error = ext4_expand_extra_isize_ea(inode, new_extra_isize, raw_inode, handle); if (error) { /* * Inode size expansion failed; don't try again */ *no_expand = 1; } return error; } /* * Expand an inode by new_extra_isize bytes. * Returns 0 on success or negative error number on failure. */ static int ext4_try_to_expand_extra_isize(struct inode *inode, unsigned int new_extra_isize, struct ext4_iloc iloc, handle_t *handle) { int no_expand; int error; if (ext4_test_inode_state(inode, EXT4_STATE_NO_EXPAND)) return -EOVERFLOW; /* * In nojournal mode, we can immediately attempt to expand * the inode. When journaled, we first need to obtain extra * buffer credits since we may write into the EA block * with this same handle. If journal_extend fails, then it will * only result in a minor loss of functionality for that inode. * If this is felt to be critical, then e2fsck should be run to * force a large enough s_min_extra_isize. */ if (ext4_journal_extend(handle, EXT4_DATA_TRANS_BLOCKS(inode->i_sb), 0) != 0) return -ENOSPC; if (ext4_write_trylock_xattr(inode, &no_expand) == 0) return -EBUSY; error = __ext4_expand_extra_isize(inode, new_extra_isize, &iloc, handle, &no_expand); ext4_write_unlock_xattr(inode, &no_expand); return error; } int ext4_expand_extra_isize(struct inode *inode, unsigned int new_extra_isize, struct ext4_iloc *iloc) { handle_t *handle; int no_expand; int error, rc; if (ext4_test_inode_state(inode, EXT4_STATE_NO_EXPAND)) { brelse(iloc->bh); return -EOVERFLOW; } handle = ext4_journal_start(inode, EXT4_HT_INODE, EXT4_DATA_TRANS_BLOCKS(inode->i_sb)); if (IS_ERR(handle)) { error = PTR_ERR(handle); brelse(iloc->bh); return error; } ext4_write_lock_xattr(inode, &no_expand); BUFFER_TRACE(iloc->bh, "get_write_access"); error = ext4_journal_get_write_access(handle, inode->i_sb, iloc->bh, EXT4_JTR_NONE); if (error) { brelse(iloc->bh); goto out_unlock; } error = __ext4_expand_extra_isize(inode, new_extra_isize, iloc, handle, &no_expand); rc = ext4_mark_iloc_dirty(handle, inode, iloc); if (!error) error = rc; out_unlock: ext4_write_unlock_xattr(inode, &no_expand); ext4_journal_stop(handle); return error; } /* * What we do here is to mark the in-core inode as clean with respect to inode * dirtiness (it may still be data-dirty). * This means that the in-core inode may be reaped by prune_icache * without having to perform any I/O. This is a very good thing, * because *any* task may call prune_icache - even ones which * have a transaction open against a different journal. * * Is this cheating? Not really. Sure, we haven't written the * inode out, but prune_icache isn't a user-visible syncing function. * Whenever the user wants stuff synced (sys_sync, sys_msync, sys_fsync) * we start and wait on commits. */ int __ext4_mark_inode_dirty(handle_t *handle, struct inode *inode, const char *func, unsigned int line) { struct ext4_iloc iloc; struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb); int err; might_sleep(); trace_ext4_mark_inode_dirty(inode, _RET_IP_); err = ext4_reserve_inode_write(handle, inode, &iloc); if (err) goto out; if (EXT4_I(inode)->i_extra_isize < sbi->s_want_extra_isize) ext4_try_to_expand_extra_isize(inode, sbi->s_want_extra_isize, iloc, handle); err = ext4_mark_iloc_dirty(handle, inode, &iloc); out: if (unlikely(err)) ext4_error_inode_err(inode, func, line, 0, err, "mark_inode_dirty error"); return err; } /* * ext4_dirty_inode() is called from __mark_inode_dirty() * * We're really interested in the case where a file is being extended. * i_size has been changed by generic_commit_write() and we thus need * to include the updated inode in the current transaction. * * Also, dquot_alloc_block() will always dirty the inode when blocks * are allocated to the file. * * If the inode is marked synchronous, we don't honour that here - doing * so would cause a commit on atime updates, which we don't bother doing. * We handle synchronous inodes at the highest possible level. */ void ext4_dirty_inode(struct inode *inode, int flags) { handle_t *handle; handle = ext4_journal_start(inode, EXT4_HT_INODE, 2); if (IS_ERR(handle)) return; ext4_mark_inode_dirty(handle, inode); ext4_journal_stop(handle); } int ext4_change_inode_journal_flag(struct inode *inode, int val) { journal_t *journal; handle_t *handle; int err; int alloc_ctx; /* * We have to be very careful here: changing a data block's * journaling status dynamically is dangerous. If we write a * data block to the journal, change the status and then delete * that block, we risk forgetting to revoke the old log record * from the journal and so a subsequent replay can corrupt data. * So, first we make sure that the journal is empty and that * nobody is changing anything. */ journal = EXT4_JOURNAL(inode); if (!journal) return 0; if (is_journal_aborted(journal)) return -EROFS; /* Wait for all existing dio workers */ inode_dio_wait(inode); /* * Before flushing the journal and switching inode's aops, we have * to flush all dirty data the inode has. There can be outstanding * delayed allocations, there can be unwritten extents created by * fallocate or buffered writes in dioread_nolock mode covered by * dirty data which can be converted only after flushing the dirty * data (and journalled aops don't know how to handle these cases). */ if (val) { filemap_invalidate_lock(inode->i_mapping); err = filemap_write_and_wait(inode->i_mapping); if (err < 0) { filemap_invalidate_unlock(inode->i_mapping); return err; } } alloc_ctx = ext4_writepages_down_write(inode->i_sb); jbd2_journal_lock_updates(journal); /* * OK, there are no updates running now, and all cached data is * synced to disk. We are now in a completely consistent state * which doesn't have anything in the journal, and we know that * no filesystem updates are running, so it is safe to modify * the inode's in-core data-journaling state flag now. */ if (val) ext4_set_inode_flag(inode, EXT4_INODE_JOURNAL_DATA); else { err = jbd2_journal_flush(journal, 0); if (err < 0) { jbd2_journal_unlock_updates(journal); ext4_writepages_up_write(inode->i_sb, alloc_ctx); return err; } ext4_clear_inode_flag(inode, EXT4_INODE_JOURNAL_DATA); } ext4_set_aops(inode); jbd2_journal_unlock_updates(journal); ext4_writepages_up_write(inode->i_sb, alloc_ctx); if (val) filemap_invalidate_unlock(inode->i_mapping); /* Finally we can mark the inode as dirty. */ handle = ext4_journal_start(inode, EXT4_HT_INODE, 1); if (IS_ERR(handle)) return PTR_ERR(handle); ext4_fc_mark_ineligible(inode->i_sb, EXT4_FC_REASON_JOURNAL_FLAG_CHANGE, handle); err = ext4_mark_inode_dirty(handle, inode); ext4_handle_sync(handle); ext4_journal_stop(handle); ext4_std_error(inode->i_sb, err); return err; } static int ext4_bh_unmapped(handle_t *handle, struct inode *inode, struct buffer_head *bh) { return !buffer_mapped(bh); } vm_fault_t ext4_page_mkwrite(struct vm_fault *vmf) { struct vm_area_struct *vma = vmf->vma; struct folio *folio = page_folio(vmf->page); loff_t size; unsigned long len; int err; vm_fault_t ret; struct file *file = vma->vm_file; struct inode *inode = file_inode(file); struct address_space *mapping = inode->i_mapping; handle_t *handle; get_block_t *get_block; int retries = 0; if (unlikely(IS_IMMUTABLE(inode))) return VM_FAULT_SIGBUS; sb_start_pagefault(inode->i_sb); file_update_time(vma->vm_file); filemap_invalidate_lock_shared(mapping); err = ext4_convert_inline_data(inode); if (err) goto out_ret; /* * On data journalling we skip straight to the transaction handle: * there's no delalloc; page truncated will be checked later; the * early return w/ all buffers mapped (calculates size/len) can't * be used; and there's no dioread_nolock, so only ext4_get_block. */ if (ext4_should_journal_data(inode)) goto retry_alloc; /* Delalloc case is easy... */ if (test_opt(inode->i_sb, DELALLOC) && !ext4_nonda_switch(inode->i_sb)) { do { err = block_page_mkwrite(vma, vmf, ext4_da_get_block_prep); } while (err == -ENOSPC && ext4_should_retry_alloc(inode->i_sb, &retries)); goto out_ret; } folio_lock(folio); size = i_size_read(inode); /* Page got truncated from under us? */ if (folio->mapping != mapping || folio_pos(folio) > size) { folio_unlock(folio); ret = VM_FAULT_NOPAGE; goto out; } len = folio_size(folio); if (folio_pos(folio) + len > size) len = size - folio_pos(folio); /* * Return if we have all the buffers mapped. This avoids the need to do * journal_start/journal_stop which can block and take a long time * * This cannot be done for data journalling, as we have to add the * inode to the transaction's list to writeprotect pages on commit. */ if (folio_buffers(folio)) { if (!ext4_walk_page_buffers(NULL, inode, folio_buffers(folio), 0, len, NULL, ext4_bh_unmapped)) { /* Wait so that we don't change page under IO */ folio_wait_stable(folio); ret = VM_FAULT_LOCKED; goto out; } } folio_unlock(folio); /* OK, we need to fill the hole... */ if (ext4_should_dioread_nolock(inode)) get_block = ext4_get_block_unwritten; else get_block = ext4_get_block; retry_alloc: handle = ext4_journal_start(inode, EXT4_HT_WRITE_PAGE, ext4_writepage_trans_blocks(inode)); if (IS_ERR(handle)) { ret = VM_FAULT_SIGBUS; goto out; } /* * Data journalling can't use block_page_mkwrite() because it * will set_buffer_dirty() before do_journal_get_write_access() * thus might hit warning messages for dirty metadata buffers. */ if (!ext4_should_journal_data(inode)) { err = block_page_mkwrite(vma, vmf, get_block); } else { folio_lock(folio); size = i_size_read(inode); /* Page got truncated from under us? */ if (folio->mapping != mapping || folio_pos(folio) > size) { ret = VM_FAULT_NOPAGE; goto out_error; } len = folio_size(folio); if (folio_pos(folio) + len > size) len = size - folio_pos(folio); err = ext4_block_write_begin(handle, folio, 0, len, ext4_get_block); if (!err) { ret = VM_FAULT_SIGBUS; if (ext4_journal_folio_buffers(handle, folio, len)) goto out_error; } else { folio_unlock(folio); } } ext4_journal_stop(handle); if (err == -ENOSPC && ext4_should_retry_alloc(inode->i_sb, &retries)) goto retry_alloc; out_ret: ret = vmf_fs_error(err); out: filemap_invalidate_unlock_shared(mapping); sb_end_pagefault(inode->i_sb); return ret; out_error: folio_unlock(folio); ext4_journal_stop(handle); goto out; }
6 5 5 5 5 5 5 5 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 /* SPDX-License-Identifier: GPL-2.0 */ #ifndef _ADDRCONF_H #define _ADDRCONF_H #define MAX_RTR_SOLICITATIONS -1 /* unlimited */ #define RTR_SOLICITATION_INTERVAL (4*HZ) #define RTR_SOLICITATION_MAX_INTERVAL (3600*HZ) /* 1 hour */ #define MIN_VALID_LIFETIME (2*3600) /* 2 hours */ #define TEMP_VALID_LIFETIME (7*86400) /* 1 week */ #define TEMP_PREFERRED_LIFETIME (86400) /* 24 hours */ #define REGEN_MIN_ADVANCE (2) /* 2 seconds */ #define REGEN_MAX_RETRY (3) #define MAX_DESYNC_FACTOR (600) #define ADDR_CHECK_FREQUENCY (120*HZ) #define IPV6_MAX_ADDRESSES 16 #define ADDRCONF_TIMER_FUZZ_MINUS (HZ > 50 ? HZ / 50 : 1) #define ADDRCONF_TIMER_FUZZ (HZ / 4) #define ADDRCONF_TIMER_FUZZ_MAX (HZ) #define ADDRCONF_NOTIFY_PRIORITY 0 #include <linux/in.h> #include <linux/in6.h> struct prefix_info { __u8 type; __u8 length; __u8 prefix_len; union __packed { __u8 flags; struct __packed { #if defined(__BIG_ENDIAN_BITFIELD) __u8 onlink : 1, autoconf : 1, routeraddr : 1, preferpd : 1, reserved : 4; #elif defined(__LITTLE_ENDIAN_BITFIELD) __u8 reserved : 4, preferpd : 1, routeraddr : 1, autoconf : 1, onlink : 1; #else #error "Please fix <asm/byteorder.h>" #endif }; }; __be32 valid; __be32 prefered; __be32 reserved2; struct in6_addr prefix; }; /* rfc4861 4.6.2: IPv6 PIO is 32 bytes in size */ static_assert(sizeof(struct prefix_info) == 32); #include <linux/ipv6.h> #include <linux/netdevice.h> #include <net/if_inet6.h> #include <net/ipv6.h> struct in6_validator_info { struct in6_addr i6vi_addr; struct inet6_dev *i6vi_dev; struct netlink_ext_ack *extack; }; struct ifa6_config { const struct in6_addr *pfx; unsigned int plen; u8 ifa_proto; const struct in6_addr *peer_pfx; u32 rt_priority; u32 ifa_flags; u32 preferred_lft; u32 valid_lft; u16 scope; }; enum addr_type_t { UNICAST_ADDR, MULTICAST_ADDR, ANYCAST_ADDR, }; struct inet6_fill_args { u32 portid; u32 seq; int event; unsigned int flags; int netnsid; int ifindex; enum addr_type_t type; bool force_rt_scope_universe; }; int addrconf_init(void); void addrconf_cleanup(void); int addrconf_add_ifaddr(struct net *net, void __user *arg); int addrconf_del_ifaddr(struct net *net, void __user *arg); int addrconf_set_dstaddr(struct net *net, void __user *arg); int ipv6_chk_addr(struct net *net, const struct in6_addr *addr, const struct net_device *dev, int strict); int ipv6_chk_addr_and_flags(struct net *net, const struct in6_addr *addr, const struct net_device *dev, bool skip_dev_check, int strict, u32 banned_flags); #if defined(CONFIG_IPV6_MIP6) || defined(CONFIG_IPV6_MIP6_MODULE) int ipv6_chk_home_addr(struct net *net, const struct in6_addr *addr); #endif int ipv6_chk_rpl_srh_loop(struct net *net, const struct in6_addr *segs, unsigned char nsegs); bool ipv6_chk_custom_prefix(const struct in6_addr *addr, const unsigned int prefix_len, struct net_device *dev); int ipv6_chk_prefix(const struct in6_addr *addr, struct net_device *dev); struct net_device *ipv6_dev_find(struct net *net, const struct in6_addr *addr, struct net_device *dev); struct inet6_ifaddr *ipv6_get_ifaddr(struct net *net, const struct in6_addr *addr, struct net_device *dev, int strict); int ipv6_dev_get_saddr(struct net *net, const struct net_device *dev, const struct in6_addr *daddr, unsigned int srcprefs, struct in6_addr *saddr); int ipv6_get_lladdr(struct net_device *dev, struct in6_addr *addr, u32 banned_flags); bool inet_rcv_saddr_equal(const struct sock *sk, const struct sock *sk2, bool match_wildcard); bool inet_rcv_saddr_any(const struct sock *sk); void addrconf_join_solict(struct net_device *dev, const struct in6_addr *addr); void addrconf_leave_solict(struct inet6_dev *idev, const struct in6_addr *addr); void addrconf_add_linklocal(struct inet6_dev *idev, const struct in6_addr *addr, u32 flags); int addrconf_prefix_rcv_add_addr(struct net *net, struct net_device *dev, const struct prefix_info *pinfo, struct inet6_dev *in6_dev, const struct in6_addr *addr, int addr_type, u32 addr_flags, bool sllao, bool tokenized, __u32 valid_lft, u32 prefered_lft); static inline void addrconf_addr_eui48_base(u8 *eui, const char *const addr) { memcpy(eui, addr, 3); eui[3] = 0xFF; eui[4] = 0xFE; memcpy(eui + 5, addr + 3, 3); } static inline void addrconf_addr_eui48(u8 *eui, const char *const addr) { addrconf_addr_eui48_base(eui, addr); eui[0] ^= 2; } static inline int addrconf_ifid_eui48(u8 *eui, struct net_device *dev) { if (dev->addr_len != ETH_ALEN) return -1; /* * The zSeries OSA network cards can be shared among various * OS instances, but the OSA cards have only one MAC address. * This leads to duplicate address conflicts in conjunction * with IPv6 if more than one instance uses the same card. * * The driver for these cards can deliver a unique 16-bit * identifier for each instance sharing the same card. It is * placed instead of 0xFFFE in the interface identifier. The * "u" bit of the interface identifier is not inverted in this * case. Hence the resulting interface identifier has local * scope according to RFC2373. */ addrconf_addr_eui48_base(eui, dev->dev_addr); if (dev->dev_id) { eui[3] = (dev->dev_id >> 8) & 0xFF; eui[4] = dev->dev_id & 0xFF; } else { eui[0] ^= 2; } return 0; } #define INFINITY_LIFE_TIME 0xFFFFFFFF static inline unsigned long addrconf_timeout_fixup(u32 timeout, unsigned int unit) { if (timeout == INFINITY_LIFE_TIME) return ~0UL; /* * Avoid arithmetic overflow. * Assuming unit is constant and non-zero, this "if" statement * will go away on 64bit archs. */ if (0xfffffffe > LONG_MAX / unit && timeout > LONG_MAX / unit) return LONG_MAX / unit; return timeout; } static inline int addrconf_finite_timeout(unsigned long timeout) { return ~timeout; } /* * IPv6 Address Label subsystem (addrlabel.c) */ int ipv6_addr_label_init(void); void ipv6_addr_label_cleanup(void); int ipv6_addr_label_rtnl_register(void); u32 ipv6_addr_label(struct net *net, const struct in6_addr *addr, int type, int ifindex); /* * multicast prototypes (mcast.c) */ static inline bool ipv6_mc_may_pull(struct sk_buff *skb, unsigned int len) { if (skb_transport_offset(skb) + ipv6_transport_len(skb) < len) return false; return pskb_may_pull(skb, len); } int ipv6_sock_mc_join(struct sock *sk, int ifindex, const struct in6_addr *addr); int ipv6_sock_mc_drop(struct sock *sk, int ifindex, const struct in6_addr *addr); void __ipv6_sock_mc_close(struct sock *sk); void ipv6_sock_mc_close(struct sock *sk); bool inet6_mc_check(const struct sock *sk, const struct in6_addr *mc_addr, const struct in6_addr *src_addr); int ipv6_dev_mc_inc(struct net_device *dev, const struct in6_addr *addr); int __ipv6_dev_mc_dec(struct inet6_dev *idev, const struct in6_addr *addr); int ipv6_dev_mc_dec(struct net_device *dev, const struct in6_addr *addr); void ipv6_mc_up(struct inet6_dev *idev); void ipv6_mc_down(struct inet6_dev *idev); void ipv6_mc_unmap(struct inet6_dev *idev); void ipv6_mc_remap(struct inet6_dev *idev); void ipv6_mc_init_dev(struct inet6_dev *idev); void ipv6_mc_destroy_dev(struct inet6_dev *idev); int ipv6_mc_check_mld(struct sk_buff *skb); void addrconf_dad_failure(struct sk_buff *skb, struct inet6_ifaddr *ifp); bool ipv6_chk_mcast_addr(struct net_device *dev, const struct in6_addr *group, const struct in6_addr *src_addr); void ipv6_mc_dad_complete(struct inet6_dev *idev); /* * identify MLD packets for MLD filter exceptions */ static inline bool ipv6_is_mld(struct sk_buff *skb, int nexthdr, int offset) { struct icmp6hdr *hdr; if (nexthdr != IPPROTO_ICMPV6 || !pskb_network_may_pull(skb, offset + sizeof(struct icmp6hdr))) return false; hdr = (struct icmp6hdr *)(skb_network_header(skb) + offset); switch (hdr->icmp6_type) { case ICMPV6_MGM_QUERY: case ICMPV6_MGM_REPORT: case ICMPV6_MGM_REDUCTION: case ICMPV6_MLD2_REPORT: return true; default: break; } return false; } void addrconf_prefix_rcv(struct net_device *dev, u8 *opt, int len, bool sllao); /* * anycast prototypes (anycast.c) */ int ipv6_sock_ac_join(struct sock *sk, int ifindex, const struct in6_addr *addr); int ipv6_sock_ac_drop(struct sock *sk, int ifindex, const struct in6_addr *addr); void __ipv6_sock_ac_close(struct sock *sk); void ipv6_sock_ac_close(struct sock *sk); int __ipv6_dev_ac_inc(struct inet6_dev *idev, const struct in6_addr *addr); int __ipv6_dev_ac_dec(struct inet6_dev *idev, const struct in6_addr *addr); void ipv6_ac_destroy_dev(struct inet6_dev *idev); bool ipv6_chk_acast_addr(struct net *net, struct net_device *dev, const struct in6_addr *addr); bool ipv6_chk_acast_addr_src(struct net *net, struct net_device *dev, const struct in6_addr *addr); int ipv6_anycast_init(void); void ipv6_anycast_cleanup(void); /* Device notifier */ int register_inet6addr_notifier(struct notifier_block *nb); int unregister_inet6addr_notifier(struct notifier_block *nb); int inet6addr_notifier_call_chain(unsigned long val, void *v); int register_inet6addr_validator_notifier(struct notifier_block *nb); int unregister_inet6addr_validator_notifier(struct notifier_block *nb); int inet6addr_validator_notifier_call_chain(unsigned long val, void *v); void inet6_netconf_notify_devconf(struct net *net, int event, int type, int ifindex, struct ipv6_devconf *devconf); /** * __in6_dev_get - get inet6_dev pointer from netdevice * @dev: network device * * Caller must hold rcu_read_lock or RTNL, because this function * does not take a reference on the inet6_dev. */ static inline struct inet6_dev *__in6_dev_get(const struct net_device *dev) { return rcu_dereference_rtnl(dev->ip6_ptr); } static inline struct inet6_dev *__in6_dev_get_rtnl_net(const struct net_device *dev) { return rtnl_net_dereference(dev_net(dev), dev->ip6_ptr); } /** * __in6_dev_stats_get - get inet6_dev pointer for stats * @dev: network device * @skb: skb for original incoming interface if needed * * Caller must hold rcu_read_lock or RTNL, because this function * does not take a reference on the inet6_dev. */ static inline struct inet6_dev *__in6_dev_stats_get(const struct net_device *dev, const struct sk_buff *skb) { if (netif_is_l3_master(dev)) dev = dev_get_by_index_rcu(dev_net(dev), inet6_iif(skb)); return __in6_dev_get(dev); } /** * __in6_dev_get_safely - get inet6_dev pointer from netdevice * @dev: network device * * This is a safer version of __in6_dev_get */ static inline struct inet6_dev *__in6_dev_get_safely(const struct net_device *dev) { if (likely(dev)) return rcu_dereference_rtnl(dev->ip6_ptr); else return NULL; } /** * in6_dev_get - get inet6_dev pointer from netdevice * @dev: network device * * This version can be used in any context, and takes a reference * on the inet6_dev. Callers must use in6_dev_put() later to * release this reference. */ static inline struct inet6_dev *in6_dev_get(const struct net_device *dev) { struct inet6_dev *idev; rcu_read_lock(); idev = rcu_dereference(dev->ip6_ptr); if (idev) refcount_inc(&idev->refcnt); rcu_read_unlock(); return idev; } static inline struct neigh_parms *__in6_dev_nd_parms_get_rcu(const struct net_device *dev) { struct inet6_dev *idev = __in6_dev_get(dev); return idev ? idev->nd_parms : NULL; } void in6_dev_finish_destroy(struct inet6_dev *idev); static inline void in6_dev_put(struct inet6_dev *idev) { if (refcount_dec_and_test(&idev->refcnt)) in6_dev_finish_destroy(idev); } static inline void in6_dev_put_clear(struct inet6_dev **pidev) { struct inet6_dev *idev = *pidev; if (idev) { in6_dev_put(idev); *pidev = NULL; } } static inline void __in6_dev_put(struct inet6_dev *idev) { refcount_dec(&idev->refcnt); } static inline void in6_dev_hold(struct inet6_dev *idev) { refcount_inc(&idev->refcnt); } /* called with rcu_read_lock held */ static inline bool ip6_ignore_linkdown(const struct net_device *dev) { const struct inet6_dev *idev = __in6_dev_get(dev); if (unlikely(!idev)) return true; return !!READ_ONCE(idev->cnf.ignore_routes_with_linkdown); } void inet6_ifa_finish_destroy(struct inet6_ifaddr *ifp); static inline void in6_ifa_put(struct inet6_ifaddr *ifp) { if (refcount_dec_and_test(&ifp->refcnt)) inet6_ifa_finish_destroy(ifp); } static inline void __in6_ifa_put(struct inet6_ifaddr *ifp) { refcount_dec(&ifp->refcnt); } static inline void in6_ifa_hold(struct inet6_ifaddr *ifp) { refcount_inc(&ifp->refcnt); } static inline bool in6_ifa_hold_safe(struct inet6_ifaddr *ifp) { return refcount_inc_not_zero(&ifp->refcnt); } /* * compute link-local solicited-node multicast address */ static inline void addrconf_addr_solict_mult(const struct in6_addr *addr, struct in6_addr *solicited) { ipv6_addr_set(solicited, htonl(0xFF020000), 0, htonl(0x1), htonl(0xFF000000) | addr->s6_addr32[3]); } static inline bool ipv6_addr_is_ll_all_nodes(const struct in6_addr *addr) { #if defined(CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS) && BITS_PER_LONG == 64 __be64 *p = (__force __be64 *)addr; return ((p[0] ^ cpu_to_be64(0xff02000000000000UL)) | (p[1] ^ cpu_to_be64(1))) == 0UL; #else return ((addr->s6_addr32[0] ^ htonl(0xff020000)) | addr->s6_addr32[1] | addr->s6_addr32[2] | (addr->s6_addr32[3] ^ htonl(0x00000001))) == 0; #endif } static inline bool ipv6_addr_is_ll_all_routers(const struct in6_addr *addr) { #if defined(CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS) && BITS_PER_LONG == 64 __be64 *p = (__force __be64 *)addr; return ((p[0] ^ cpu_to_be64(0xff02000000000000UL)) | (p[1] ^ cpu_to_be64(2))) == 0UL; #else return ((addr->s6_addr32[0] ^ htonl(0xff020000)) | addr->s6_addr32[1] | addr->s6_addr32[2] | (addr->s6_addr32[3] ^ htonl(0x00000002))) == 0; #endif } static inline bool ipv6_addr_is_isatap(const struct in6_addr *addr) { return (addr->s6_addr32[2] | htonl(0x02000000)) == htonl(0x02005EFE); } static inline bool ipv6_addr_is_solict_mult(const struct in6_addr *addr) { #if defined(CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS) && BITS_PER_LONG == 64 __be64 *p = (__force __be64 *)addr; return ((p[0] ^ cpu_to_be64(0xff02000000000000UL)) | ((p[1] ^ cpu_to_be64(0x00000001ff000000UL)) & cpu_to_be64(0xffffffffff000000UL))) == 0UL; #else return ((addr->s6_addr32[0] ^ htonl(0xff020000)) | addr->s6_addr32[1] | (addr->s6_addr32[2] ^ htonl(0x00000001)) | (addr->s6_addr[12] ^ 0xff)) == 0; #endif } static inline bool ipv6_addr_is_all_snoopers(const struct in6_addr *addr) { #if defined(CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS) && BITS_PER_LONG == 64 __be64 *p = (__force __be64 *)addr; return ((p[0] ^ cpu_to_be64(0xff02000000000000UL)) | (p[1] ^ cpu_to_be64(0x6a))) == 0UL; #else return ((addr->s6_addr32[0] ^ htonl(0xff020000)) | addr->s6_addr32[1] | addr->s6_addr32[2] | (addr->s6_addr32[3] ^ htonl(0x0000006a))) == 0; #endif } #ifdef CONFIG_PROC_FS int if6_proc_init(void); void if6_proc_exit(void); #endif int inet6_fill_ifmcaddr(struct sk_buff *skb, const struct ifmcaddr6 *ifmca, struct inet6_fill_args *args); int inet6_fill_ifacaddr(struct sk_buff *skb, const struct ifacaddr6 *ifaca, struct inet6_fill_args *args); #endif
378 378 378 378 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 /* SPDX-License-Identifier: GPL-2.0 */ #ifndef _LINUX_PGALLOC_TRACK_H #define _LINUX_PGALLOC_TRACK_H #if defined(CONFIG_MMU) static inline p4d_t *p4d_alloc_track(struct mm_struct *mm, pgd_t *pgd, unsigned long address, pgtbl_mod_mask *mod_mask) { if (unlikely(pgd_none(*pgd))) { if (__p4d_alloc(mm, pgd, address)) return NULL; *mod_mask |= PGTBL_PGD_MODIFIED; } return p4d_offset(pgd, address); } static inline pud_t *pud_alloc_track(struct mm_struct *mm, p4d_t *p4d, unsigned long address, pgtbl_mod_mask *mod_mask) { if (unlikely(p4d_none(*p4d))) { if (__pud_alloc(mm, p4d, address)) return NULL; *mod_mask |= PGTBL_P4D_MODIFIED; } return pud_offset(p4d, address); } static inline pmd_t *pmd_alloc_track(struct mm_struct *mm, pud_t *pud, unsigned long address, pgtbl_mod_mask *mod_mask) { if (unlikely(pud_none(*pud))) { if (__pmd_alloc(mm, pud, address)) return NULL; *mod_mask |= PGTBL_PUD_MODIFIED; } return pmd_offset(pud, address); } #endif /* CONFIG_MMU */ #define pte_alloc_kernel_track(pmd, address, mask) \ ((unlikely(pmd_none(*(pmd))) && \ (__pte_alloc_kernel(pmd) || ({*(mask)|=PGTBL_PMD_MODIFIED;0;})))?\ NULL: pte_offset_kernel(pmd, address)) #endif /* _LINUX_PGALLOC_TRACK_H */
88 11 88 13 31 16 13 5 266 55 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 /* SPDX-License-Identifier: GPL-2.0 */ #ifndef _LINUX_RCULIST_H #define _LINUX_RCULIST_H #ifdef __KERNEL__ /* * RCU-protected list version */ #include <linux/list.h> #include <linux/rcupdate.h> /* * INIT_LIST_HEAD_RCU - Initialize a list_head visible to RCU readers * @list: list to be initialized * * You should instead use INIT_LIST_HEAD() for normal initialization and * cleanup tasks, when readers have no access to the list being initialized. * However, if the list being initialized is visible to readers, you * need to keep the compiler from being too mischievous. */ static inline void INIT_LIST_HEAD_RCU(struct list_head *list) { WRITE_ONCE(list->next, list); WRITE_ONCE(list->prev, list); } /* * return the ->next pointer of a list_head in an rcu safe * way, we must not access it directly */ #define list_next_rcu(list) (*((struct list_head __rcu **)(&(list)->next))) /* * Return the ->prev pointer of a list_head in an rcu safe way. Don't * access it directly. * * Any list traversed with list_bidir_prev_rcu() must never use * list_del_rcu(). Doing so will poison the ->prev pointer that * list_bidir_prev_rcu() relies on, which will result in segfaults. * To prevent these segfaults, use list_bidir_del_rcu() instead * of list_del_rcu(). */ #define list_bidir_prev_rcu(list) (*((struct list_head __rcu **)(&(list)->prev))) /** * list_tail_rcu - returns the prev pointer of the head of the list * @head: the head of the list * * Note: This should only be used with the list header, and even then * only if list_del() and similar primitives are not also used on the * list header. */ #define list_tail_rcu(head) (*((struct list_head __rcu **)(&(head)->prev))) /* * Check during list traversal that we are within an RCU reader */ #define check_arg_count_one(dummy) #ifdef CONFIG_PROVE_RCU_LIST #define __list_check_rcu(dummy, cond, extra...) \ ({ \ check_arg_count_one(extra); \ RCU_LOCKDEP_WARN(!(cond) && !rcu_read_lock_any_held(), \ "RCU-list traversed in non-reader section!"); \ }) #define __list_check_srcu(cond) \ ({ \ RCU_LOCKDEP_WARN(!(cond), \ "RCU-list traversed without holding the required lock!");\ }) #else #define __list_check_rcu(dummy, cond, extra...) \ ({ check_arg_count_one(extra); }) #define __list_check_srcu(cond) ({ }) #endif /* * Insert a new entry between two known consecutive entries. * * This is only for internal list manipulation where we know * the prev/next entries already! */ static inline void __list_add_rcu(struct list_head *new, struct list_head *prev, struct list_head *next) { if (!__list_add_valid(new, prev, next)) return; new->next = next; new->prev = prev; rcu_assign_pointer(list_next_rcu(prev), new); next->prev = new; } /** * list_add_rcu - add a new entry to rcu-protected list * @new: new entry to be added * @head: list head to add it after * * Insert a new entry after the specified head. * This is good for implementing stacks. * * The caller must take whatever precautions are necessary * (such as holding appropriate locks) to avoid racing * with another list-mutation primitive, such as list_add_rcu() * or list_del_rcu(), running on this same list. * However, it is perfectly legal to run concurrently with * the _rcu list-traversal primitives, such as * list_for_each_entry_rcu(). */ static inline void list_add_rcu(struct list_head *new, struct list_head *head) { __list_add_rcu(new, head, head->next); } /** * list_add_tail_rcu - add a new entry to rcu-protected list * @new: new entry to be added * @head: list head to add it before * * Insert a new entry before the specified head. * This is useful for implementing queues. * * The caller must take whatever precautions are necessary * (such as holding appropriate locks) to avoid racing * with another list-mutation primitive, such as list_add_tail_rcu() * or list_del_rcu(), running on this same list. * However, it is perfectly legal to run concurrently with * the _rcu list-traversal primitives, such as * list_for_each_entry_rcu(). */ static inline void list_add_tail_rcu(struct list_head *new, struct list_head *head) { __list_add_rcu(new, head->prev, head); } /** * list_del_rcu - deletes entry from list without re-initialization * @entry: the element to delete from the list. * * Note: list_empty() on entry does not return true after this, * the entry is in an undefined state. It is useful for RCU based * lockfree traversal. * * In particular, it means that we can not poison the forward * pointers that may still be used for walking the list. * * The caller must take whatever precautions are necessary * (such as holding appropriate locks) to avoid racing * with another list-mutation primitive, such as list_del_rcu() * or list_add_rcu(), running on this same list. * However, it is perfectly legal to run concurrently with * the _rcu list-traversal primitives, such as * list_for_each_entry_rcu(). * * Note that the caller is not permitted to immediately free * the newly deleted entry. Instead, either synchronize_rcu() * or call_rcu() must be used to defer freeing until an RCU * grace period has elapsed. */ static inline void list_del_rcu(struct list_head *entry) { __list_del_entry(entry); entry->prev = LIST_POISON2; } /** * list_bidir_del_rcu - deletes entry from list without re-initialization * @entry: the element to delete from the list. * * In contrast to list_del_rcu() doesn't poison the prev pointer thus * allowing backwards traversal via list_bidir_prev_rcu(). * * Note: list_empty() on entry does not return true after this because * the entry is in a special undefined state that permits RCU-based * lockfree reverse traversal. In particular this means that we can not * poison the forward and backwards pointers that may still be used for * walking the list. * * The caller must take whatever precautions are necessary (such as * holding appropriate locks) to avoid racing with another list-mutation * primitive, such as list_bidir_del_rcu() or list_add_rcu(), running on * this same list. However, it is perfectly legal to run concurrently * with the _rcu list-traversal primitives, such as * list_for_each_entry_rcu(). * * Note that list_del_rcu() and list_bidir_del_rcu() must not be used on * the same list. * * Note that the caller is not permitted to immediately free * the newly deleted entry. Instead, either synchronize_rcu() * or call_rcu() must be used to defer freeing until an RCU * grace period has elapsed. */ static inline void list_bidir_del_rcu(struct list_head *entry) { __list_del_entry(entry); } /** * hlist_del_init_rcu - deletes entry from hash list with re-initialization * @n: the element to delete from the hash list. * * Note: list_unhashed() on the node return true after this. It is * useful for RCU based read lockfree traversal if the writer side * must know if the list entry is still hashed or already unhashed. * * In particular, it means that we can not poison the forward pointers * that may still be used for walking the hash list and we can only * zero the pprev pointer so list_unhashed() will return true after * this. * * The caller must take whatever precautions are necessary (such as * holding appropriate locks) to avoid racing with another * list-mutation primitive, such as hlist_add_head_rcu() or * hlist_del_rcu(), running on this same list. However, it is * perfectly legal to run concurrently with the _rcu list-traversal * primitives, such as hlist_for_each_entry_rcu(). */ static inline void hlist_del_init_rcu(struct hlist_node *n) { if (!hlist_unhashed(n)) { __hlist_del(n); WRITE_ONCE(n->pprev, NULL); } } /** * list_replace_rcu - replace old entry by new one * @old : the element to be replaced * @new : the new element to insert * * The @old entry will be replaced with the @new entry atomically from * the perspective of concurrent readers. It is the caller's responsibility * to synchronize with concurrent updaters, if any. * * Note: @old should not be empty. */ static inline void list_replace_rcu(struct list_head *old, struct list_head *new) { new->next = old->next; new->prev = old->prev; rcu_assign_pointer(list_next_rcu(new->prev), new); new->next->prev = new; old->prev = LIST_POISON2; } /** * __list_splice_init_rcu - join an RCU-protected list into an existing list. * @list: the RCU-protected list to splice * @prev: points to the last element of the existing list * @next: points to the first element of the existing list * @sync: synchronize_rcu, synchronize_rcu_expedited, ... * * The list pointed to by @prev and @next can be RCU-read traversed * concurrently with this function. * * Note that this function blocks. * * Important note: the caller must take whatever action is necessary to prevent * any other updates to the existing list. In principle, it is possible to * modify the list as soon as sync() begins execution. If this sort of thing * becomes necessary, an alternative version based on call_rcu() could be * created. But only if -really- needed -- there is no shortage of RCU API * members. */ static inline void __list_splice_init_rcu(struct list_head *list, struct list_head *prev, struct list_head *next, void (*sync)(void)) { struct list_head *first = list->next; struct list_head *last = list->prev; /* * "first" and "last" tracking list, so initialize it. RCU readers * have access to this list, so we must use INIT_LIST_HEAD_RCU() * instead of INIT_LIST_HEAD(). */ INIT_LIST_HEAD_RCU(list); /* * At this point, the list body still points to the source list. * Wait for any readers to finish using the list before splicing * the list body into the new list. Any new readers will see * an empty list. */ sync(); ASSERT_EXCLUSIVE_ACCESS(*first); ASSERT_EXCLUSIVE_ACCESS(*last); /* * Readers are finished with the source list, so perform splice. * The order is important if the new list is global and accessible * to concurrent RCU readers. Note that RCU readers are not * permitted to traverse the prev pointers without excluding * this function. */ last->next = next; rcu_assign_pointer(list_next_rcu(prev), first); first->prev = prev; next->prev = last; } /** * list_splice_init_rcu - splice an RCU-protected list into an existing list, * designed for stacks. * @list: the RCU-protected list to splice * @head: the place in the existing list to splice the first list into * @sync: synchronize_rcu, synchronize_rcu_expedited, ... */ static inline void list_splice_init_rcu(struct list_head *list, struct list_head *head, void (*sync)(void)) { if (!list_empty(list)) __list_splice_init_rcu(list, head, head->next, sync); } /** * list_splice_tail_init_rcu - splice an RCU-protected list into an existing * list, designed for queues. * @list: the RCU-protected list to splice * @head: the place in the existing list to splice the first list into * @sync: synchronize_rcu, synchronize_rcu_expedited, ... */ static inline void list_splice_tail_init_rcu(struct list_head *list, struct list_head *head, void (*sync)(void)) { if (!list_empty(list)) __list_splice_init_rcu(list, head->prev, head, sync); } /** * list_entry_rcu - get the struct for this entry * @ptr: the &struct list_head pointer. * @type: the type of the struct this is embedded in. * @member: the name of the list_head within the struct. * * This primitive may safely run concurrently with the _rcu list-mutation * primitives such as list_add_rcu() as long as it's guarded by rcu_read_lock(). */ #define list_entry_rcu(ptr, type, member) \ container_of(READ_ONCE(ptr), type, member) /* * Where are list_empty_rcu() and list_first_entry_rcu()? * * They do not exist because they would lead to subtle race conditions: * * if (!list_empty_rcu(mylist)) { * struct foo *bar = list_first_entry_rcu(mylist, struct foo, list_member); * do_something(bar); * } * * The list might be non-empty when list_empty_rcu() checks it, but it * might have become empty by the time that list_first_entry_rcu() rereads * the ->next pointer, which would result in a SEGV. * * When not using RCU, it is OK for list_first_entry() to re-read that * pointer because both functions should be protected by some lock that * blocks writers. * * When using RCU, list_empty() uses READ_ONCE() to fetch the * RCU-protected ->next pointer and then compares it to the address of the * list head. However, it neither dereferences this pointer nor provides * this pointer to its caller. Thus, READ_ONCE() suffices (that is, * rcu_dereference() is not needed), which means that list_empty() can be * used anywhere you would want to use list_empty_rcu(). Just don't * expect anything useful to happen if you do a subsequent lockless * call to list_first_entry_rcu()!!! * * See list_first_or_null_rcu for an alternative. */ /** * list_first_or_null_rcu - get the first element from a list * @ptr: the list head to take the element from. * @type: the type of the struct this is embedded in. * @member: the name of the list_head within the struct. * * Note that if the list is empty, it returns NULL. * * This primitive may safely run concurrently with the _rcu list-mutation * primitives such as list_add_rcu() as long as it's guarded by rcu_read_lock(). */ #define list_first_or_null_rcu(ptr, type, member) \ ({ \ struct list_head *__ptr = (ptr); \ struct list_head *__next = READ_ONCE(__ptr->next); \ likely(__ptr != __next) ? list_entry_rcu(__next, type, member) : NULL; \ }) /** * list_next_or_null_rcu - get the next element from a list * @head: the head for the list. * @ptr: the list head to take the next element from. * @type: the type of the struct this is embedded in. * @member: the name of the list_head within the struct. * * Note that if the ptr is at the end of the list, NULL is returned. * * This primitive may safely run concurrently with the _rcu list-mutation * primitives such as list_add_rcu() as long as it's guarded by rcu_read_lock(). */ #define list_next_or_null_rcu(head, ptr, type, member) \ ({ \ struct list_head *__head = (head); \ struct list_head *__ptr = (ptr); \ struct list_head *__next = READ_ONCE(__ptr->next); \ likely(__next != __head) ? list_entry_rcu(__next, type, \ member) : NULL; \ }) /** * list_for_each_entry_rcu - iterate over rcu list of given type * @pos: the type * to use as a loop cursor. * @head: the head for your list. * @member: the name of the list_head within the struct. * @cond: optional lockdep expression if called from non-RCU protection. * * This list-traversal primitive may safely run concurrently with * the _rcu list-mutation primitives such as list_add_rcu() * as long as the traversal is guarded by rcu_read_lock(). */ #define list_for_each_entry_rcu(pos, head, member, cond...) \ for (__list_check_rcu(dummy, ## cond, 0), \ pos = list_entry_rcu((head)->next, typeof(*pos), member); \ &pos->member != (head); \ pos = list_entry_rcu(pos->member.next, typeof(*pos), member)) /** * list_for_each_entry_srcu - iterate over rcu list of given type * @pos: the type * to use as a loop cursor. * @head: the head for your list. * @member: the name of the list_head within the struct. * @cond: lockdep expression for the lock required to traverse the list. * * This list-traversal primitive may safely run concurrently with * the _rcu list-mutation primitives such as list_add_rcu() * as long as the traversal is guarded by srcu_read_lock(). * The lockdep expression srcu_read_lock_held() can be passed as the * cond argument from read side. */ #define list_for_each_entry_srcu(pos, head, member, cond) \ for (__list_check_srcu(cond), \ pos = list_entry_rcu((head)->next, typeof(*pos), member); \ &pos->member != (head); \ pos = list_entry_rcu(pos->member.next, typeof(*pos), member)) /** * list_entry_lockless - get the struct for this entry * @ptr: the &struct list_head pointer. * @type: the type of the struct this is embedded in. * @member: the name of the list_head within the struct. * * This primitive may safely run concurrently with the _rcu * list-mutation primitives such as list_add_rcu(), but requires some * implicit RCU read-side guarding. One example is running within a special * exception-time environment where preemption is disabled and where lockdep * cannot be invoked. Another example is when items are added to the list, * but never deleted. */ #define list_entry_lockless(ptr, type, member) \ container_of((typeof(ptr))READ_ONCE(ptr), type, member) /** * list_for_each_entry_lockless - iterate over rcu list of given type * @pos: the type * to use as a loop cursor. * @head: the head for your list. * @member: the name of the list_struct within the struct. * * This primitive may safely run concurrently with the _rcu * list-mutation primitives such as list_add_rcu(), but requires some * implicit RCU read-side guarding. One example is running within a special * exception-time environment where preemption is disabled and where lockdep * cannot be invoked. Another example is when items are added to the list, * but never deleted. */ #define list_for_each_entry_lockless(pos, head, member) \ for (pos = list_entry_lockless((head)->next, typeof(*pos), member); \ &pos->member != (head); \ pos = list_entry_lockless(pos->member.next, typeof(*pos), member)) /** * list_for_each_entry_continue_rcu - continue iteration over list of given type * @pos: the type * to use as a loop cursor. * @head: the head for your list. * @member: the name of the list_head within the struct. * * Continue to iterate over list of given type, continuing after * the current position which must have been in the list when the RCU read * lock was taken. * This would typically require either that you obtained the node from a * previous walk of the list in the same RCU read-side critical section, or * that you held some sort of non-RCU reference (such as a reference count) * to keep the node alive *and* in the list. * * This iterator is similar to list_for_each_entry_from_rcu() except * this starts after the given position and that one starts at the given * position. */ #define list_for_each_entry_continue_rcu(pos, head, member) \ for (pos = list_entry_rcu(pos->member.next, typeof(*pos), member); \ &pos->member != (head); \ pos = list_entry_rcu(pos->member.next, typeof(*pos), member)) /** * list_for_each_entry_from_rcu - iterate over a list from current point * @pos: the type * to use as a loop cursor. * @head: the head for your list. * @member: the name of the list_node within the struct. * * Iterate over the tail of a list starting from a given position, * which must have been in the list when the RCU read lock was taken. * This would typically require either that you obtained the node from a * previous walk of the list in the same RCU read-side critical section, or * that you held some sort of non-RCU reference (such as a reference count) * to keep the node alive *and* in the list. * * This iterator is similar to list_for_each_entry_continue_rcu() except * this starts from the given position and that one starts from the position * after the given position. */ #define list_for_each_entry_from_rcu(pos, head, member) \ for (; &(pos)->member != (head); \ pos = list_entry_rcu(pos->member.next, typeof(*(pos)), member)) /** * hlist_del_rcu - deletes entry from hash list without re-initialization * @n: the element to delete from the hash list. * * Note: list_unhashed() on entry does not return true after this, * the entry is in an undefined state. It is useful for RCU based * lockfree traversal. * * In particular, it means that we can not poison the forward * pointers that may still be used for walking the hash list. * * The caller must take whatever precautions are necessary * (such as holding appropriate locks) to avoid racing * with another list-mutation primitive, such as hlist_add_head_rcu() * or hlist_del_rcu(), running on this same list. * However, it is perfectly legal to run concurrently with * the _rcu list-traversal primitives, such as * hlist_for_each_entry(). */ static inline void hlist_del_rcu(struct hlist_node *n) { __hlist_del(n); WRITE_ONCE(n->pprev, LIST_POISON2); } /** * hlist_replace_rcu - replace old entry by new one * @old : the element to be replaced * @new : the new element to insert * * The @old entry will be replaced with the @new entry atomically from * the perspective of concurrent readers. It is the caller's responsibility * to synchronize with concurrent updaters, if any. */ static inline void hlist_replace_rcu(struct hlist_node *old, struct hlist_node *new) { struct hlist_node *next = old->next; new->next = next; WRITE_ONCE(new->pprev, old->pprev); rcu_assign_pointer(*(struct hlist_node __rcu **)new->pprev, new); if (next) WRITE_ONCE(new->next->pprev, &new->next); WRITE_ONCE(old->pprev, LIST_POISON2); } /** * hlists_swap_heads_rcu - swap the lists the hlist heads point to * @left: The hlist head on the left * @right: The hlist head on the right * * The lists start out as [@left ][node1 ... ] and * [@right ][node2 ... ] * The lists end up as [@left ][node2 ... ] * [@right ][node1 ... ] */ static inline void hlists_swap_heads_rcu(struct hlist_head *left, struct hlist_head *right) { struct hlist_node *node1 = left->first; struct hlist_node *node2 = right->first; rcu_assign_pointer(left->first, node2); rcu_assign_pointer(right->first, node1); WRITE_ONCE(node2->pprev, &left->first); WRITE_ONCE(node1->pprev, &right->first); } /* * return the first or the next element in an RCU protected hlist */ #define hlist_first_rcu(head) (*((struct hlist_node __rcu **)(&(head)->first))) #define hlist_next_rcu(node) (*((struct hlist_node __rcu **)(&(node)->next))) #define hlist_pprev_rcu(node) (*((struct hlist_node __rcu **)((node)->pprev))) /** * hlist_add_head_rcu * @n: the element to add to the hash list. * @h: the list to add to. * * Description: * Adds the specified element to the specified hlist, * while permitting racing traversals. * * The caller must take whatever precautions are necessary * (such as holding appropriate locks) to avoid racing * with another list-mutation primitive, such as hlist_add_head_rcu() * or hlist_del_rcu(), running on this same list. * However, it is perfectly legal to run concurrently with * the _rcu list-traversal primitives, such as * hlist_for_each_entry_rcu(), used to prevent memory-consistency * problems on Alpha CPUs. Regardless of the type of CPU, the * list-traversal primitive must be guarded by rcu_read_lock(). */ static inline void hlist_add_head_rcu(struct hlist_node *n, struct hlist_head *h) { struct hlist_node *first = h->first; n->next = first; WRITE_ONCE(n->pprev, &h->first); rcu_assign_pointer(hlist_first_rcu(h), n); if (first) WRITE_ONCE(first->pprev, &n->next); } /** * hlist_add_tail_rcu * @n: the element to add to the hash list. * @h: the list to add to. * * Description: * Adds the specified element to the specified hlist, * while permitting racing traversals. * * The caller must take whatever precautions are necessary * (such as holding appropriate locks) to avoid racing * with another list-mutation primitive, such as hlist_add_head_rcu() * or hlist_del_rcu(), running on this same list. * However, it is perfectly legal to run concurrently with * the _rcu list-traversal primitives, such as * hlist_for_each_entry_rcu(), used to prevent memory-consistency * problems on Alpha CPUs. Regardless of the type of CPU, the * list-traversal primitive must be guarded by rcu_read_lock(). */ static inline void hlist_add_tail_rcu(struct hlist_node *n, struct hlist_head *h) { struct hlist_node *i, *last = NULL; /* Note: write side code, so rcu accessors are not needed. */ for (i = h->first; i; i = i->next) last = i; if (last) { n->next = last->next; WRITE_ONCE(n->pprev, &last->next); rcu_assign_pointer(hlist_next_rcu(last), n); } else { hlist_add_head_rcu(n, h); } } /** * hlist_add_before_rcu * @n: the new element to add to the hash list. * @next: the existing element to add the new element before. * * Description: * Adds the specified element to the specified hlist * before the specified node while permitting racing traversals. * * The caller must take whatever precautions are necessary * (such as holding appropriate locks) to avoid racing * with another list-mutation primitive, such as hlist_add_head_rcu() * or hlist_del_rcu(), running on this same list. * However, it is perfectly legal to run concurrently with * the _rcu list-traversal primitives, such as * hlist_for_each_entry_rcu(), used to prevent memory-consistency * problems on Alpha CPUs. */ static inline void hlist_add_before_rcu(struct hlist_node *n, struct hlist_node *next) { WRITE_ONCE(n->pprev, next->pprev); n->next = next; rcu_assign_pointer(hlist_pprev_rcu(n), n); WRITE_ONCE(next->pprev, &n->next); } /** * hlist_add_behind_rcu * @n: the new element to add to the hash list. * @prev: the existing element to add the new element after. * * Description: * Adds the specified element to the specified hlist * after the specified node while permitting racing traversals. * * The caller must take whatever precautions are necessary * (such as holding appropriate locks) to avoid racing * with another list-mutation primitive, such as hlist_add_head_rcu() * or hlist_del_rcu(), running on this same list. * However, it is perfectly legal to run concurrently with * the _rcu list-traversal primitives, such as * hlist_for_each_entry_rcu(), used to prevent memory-consistency * problems on Alpha CPUs. */ static inline void hlist_add_behind_rcu(struct hlist_node *n, struct hlist_node *prev) { n->next = prev->next; WRITE_ONCE(n->pprev, &prev->next); rcu_assign_pointer(hlist_next_rcu(prev), n); if (n->next) WRITE_ONCE(n->next->pprev, &n->next); } #define __hlist_for_each_rcu(pos, head) \ for (pos = rcu_dereference(hlist_first_rcu(head)); \ pos; \ pos = rcu_dereference(hlist_next_rcu(pos))) /** * hlist_for_each_entry_rcu - iterate over rcu list of given type * @pos: the type * to use as a loop cursor. * @head: the head for your list. * @member: the name of the hlist_node within the struct. * @cond: optional lockdep expression if called from non-RCU protection. * * This list-traversal primitive may safely run concurrently with * the _rcu list-mutation primitives such as hlist_add_head_rcu() * as long as the traversal is guarded by rcu_read_lock(). */ #define hlist_for_each_entry_rcu(pos, head, member, cond...) \ for (__list_check_rcu(dummy, ## cond, 0), \ pos = hlist_entry_safe(rcu_dereference_raw(hlist_first_rcu(head)),\ typeof(*(pos)), member); \ pos; \ pos = hlist_entry_safe(rcu_dereference_raw(hlist_next_rcu(\ &(pos)->member)), typeof(*(pos)), member)) /** * hlist_for_each_entry_srcu - iterate over rcu list of given type * @pos: the type * to use as a loop cursor. * @head: the head for your list. * @member: the name of the hlist_node within the struct. * @cond: lockdep expression for the lock required to traverse the list. * * This list-traversal primitive may safely run concurrently with * the _rcu list-mutation primitives such as hlist_add_head_rcu() * as long as the traversal is guarded by srcu_read_lock(). * The lockdep expression srcu_read_lock_held() can be passed as the * cond argument from read side. */ #define hlist_for_each_entry_srcu(pos, head, member, cond) \ for (__list_check_srcu(cond), \ pos = hlist_entry_safe(rcu_dereference_raw(hlist_first_rcu(head)),\ typeof(*(pos)), member); \ pos; \ pos = hlist_entry_safe(rcu_dereference_raw(hlist_next_rcu(\ &(pos)->member)), typeof(*(pos)), member)) /** * hlist_for_each_entry_rcu_notrace - iterate over rcu list of given type (for tracing) * @pos: the type * to use as a loop cursor. * @head: the head for your list. * @member: the name of the hlist_node within the struct. * * This list-traversal primitive may safely run concurrently with * the _rcu list-mutation primitives such as hlist_add_head_rcu() * as long as the traversal is guarded by rcu_read_lock(). * * This is the same as hlist_for_each_entry_rcu() except that it does * not do any RCU debugging or tracing. */ #define hlist_for_each_entry_rcu_notrace(pos, head, member) \ for (pos = hlist_entry_safe(rcu_dereference_raw_check(hlist_first_rcu(head)),\ typeof(*(pos)), member); \ pos; \ pos = hlist_entry_safe(rcu_dereference_raw_check(hlist_next_rcu(\ &(pos)->member)), typeof(*(pos)), member)) /** * hlist_for_each_entry_rcu_bh - iterate over rcu list of given type * @pos: the type * to use as a loop cursor. * @head: the head for your list. * @member: the name of the hlist_node within the struct. * * This list-traversal primitive may safely run concurrently with * the _rcu list-mutation primitives such as hlist_add_head_rcu() * as long as the traversal is guarded by rcu_read_lock(). */ #define hlist_for_each_entry_rcu_bh(pos, head, member) \ for (pos = hlist_entry_safe(rcu_dereference_bh(hlist_first_rcu(head)),\ typeof(*(pos)), member); \ pos; \ pos = hlist_entry_safe(rcu_dereference_bh(hlist_next_rcu(\ &(pos)->member)), typeof(*(pos)), member)) /** * hlist_for_each_entry_continue_rcu - iterate over a hlist continuing after current point * @pos: the type * to use as a loop cursor. * @member: the name of the hlist_node within the struct. */ #define hlist_for_each_entry_continue_rcu(pos, member) \ for (pos = hlist_entry_safe(rcu_dereference_raw(hlist_next_rcu( \ &(pos)->member)), typeof(*(pos)), member); \ pos; \ pos = hlist_entry_safe(rcu_dereference_raw(hlist_next_rcu( \ &(pos)->member)), typeof(*(pos)), member)) /** * hlist_for_each_entry_continue_rcu_bh - iterate over a hlist continuing after current point * @pos: the type * to use as a loop cursor. * @member: the name of the hlist_node within the struct. */ #define hlist_for_each_entry_continue_rcu_bh(pos, member) \ for (pos = hlist_entry_safe(rcu_dereference_bh(hlist_next_rcu( \ &(pos)->member)), typeof(*(pos)), member); \ pos; \ pos = hlist_entry_safe(rcu_dereference_bh(hlist_next_rcu( \ &(pos)->member)), typeof(*(pos)), member)) /** * hlist_for_each_entry_from_rcu - iterate over a hlist continuing from current point * @pos: the type * to use as a loop cursor. * @member: the name of the hlist_node within the struct. */ #define hlist_for_each_entry_from_rcu(pos, member) \ for (; pos; \ pos = hlist_entry_safe(rcu_dereference_raw(hlist_next_rcu( \ &(pos)->member)), typeof(*(pos)), member)) #endif /* __KERNEL__ */ #endif
1 2 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 #ifndef LLC_PDU_H #define LLC_PDU_H /* * Copyright (c) 1997 by Procom Technology,Inc. * 2001-2003 by Arnaldo Carvalho de Melo <acme@conectiva.com.br> * * This program can be redistributed or modified under the terms of the * GNU General Public License as published by the Free Software Foundation. * This program is distributed without any warranty or implied warranty * of merchantability or fitness for a particular purpose. * * See the GNU General Public License for more details. */ #include <linux/if_ether.h> /* Lengths of frame formats */ #define LLC_PDU_LEN_I 4 /* header and 2 control bytes */ #define LLC_PDU_LEN_S 4 #define LLC_PDU_LEN_U 3 /* header and 1 control byte */ /* header and 1 control byte and XID info */ #define LLC_PDU_LEN_U_XID (LLC_PDU_LEN_U + sizeof(struct llc_xid_info)) /* Known SAP addresses */ #define LLC_GLOBAL_SAP 0xFF #define LLC_NULL_SAP 0x00 /* not network-layer visible */ #define LLC_MGMT_INDIV 0x02 /* station LLC mgmt indiv addr */ #define LLC_MGMT_GRP 0x03 /* station LLC mgmt group addr */ #define LLC_RDE_SAP 0xA6 /* route ... */ /* SAP field bit masks */ #define LLC_ISO_RESERVED_SAP 0x02 #define LLC_SAP_GROUP_DSAP 0x01 #define LLC_SAP_RESP_SSAP 0x01 /* Group/individual DSAP indicator is DSAP field */ #define LLC_PDU_GROUP_DSAP_MASK 0x01 #define LLC_PDU_IS_GROUP_DSAP(pdu) \ ((pdu->dsap & LLC_PDU_GROUP_DSAP_MASK) ? 0 : 1) #define LLC_PDU_IS_INDIV_DSAP(pdu) \ (!(pdu->dsap & LLC_PDU_GROUP_DSAP_MASK) ? 0 : 1) /* Command/response PDU indicator in SSAP field */ #define LLC_PDU_CMD_RSP_MASK 0x01 #define LLC_PDU_CMD 0 #define LLC_PDU_RSP 1 #define LLC_PDU_IS_CMD(pdu) ((pdu->ssap & LLC_PDU_RSP) ? 0 : 1) #define LLC_PDU_IS_RSP(pdu) ((pdu->ssap & LLC_PDU_RSP) ? 1 : 0) /* Get PDU type from 2 lowest-order bits of control field first byte */ #define LLC_PDU_TYPE_I_MASK 0x01 /* 16-bit control field */ #define LLC_PDU_TYPE_S_MASK 0x03 #define LLC_PDU_TYPE_U_MASK 0x03 /* 8-bit control field */ #define LLC_PDU_TYPE_MASK 0x03 #define LLC_PDU_TYPE_I 0 /* first bit */ #define LLC_PDU_TYPE_S 1 /* first two bits */ #define LLC_PDU_TYPE_U 3 /* first two bits */ #define LLC_PDU_TYPE_U_XID 4 /* private type for detecting XID commands */ #define LLC_PDU_TYPE_IS_I(pdu) \ ((!(pdu->ctrl_1 & LLC_PDU_TYPE_I_MASK)) ? 1 : 0) #define LLC_PDU_TYPE_IS_U(pdu) \ (((pdu->ctrl_1 & LLC_PDU_TYPE_U_MASK) == LLC_PDU_TYPE_U) ? 1 : 0) #define LLC_PDU_TYPE_IS_S(pdu) \ (((pdu->ctrl_1 & LLC_PDU_TYPE_S_MASK) == LLC_PDU_TYPE_S) ? 1 : 0) /* U-format PDU control field masks */ #define LLC_U_PF_BIT_MASK 0x10 /* P/F bit mask */ #define LLC_U_PF_IS_1(pdu) ((pdu->ctrl_1 & LLC_U_PF_BIT_MASK) ? 1 : 0) #define LLC_U_PF_IS_0(pdu) ((!(pdu->ctrl_1 & LLC_U_PF_BIT_MASK)) ? 1 : 0) #define LLC_U_PDU_CMD_MASK 0xEC /* cmd/rsp mask */ #define LLC_U_PDU_CMD(pdu) (pdu->ctrl_1 & LLC_U_PDU_CMD_MASK) #define LLC_U_PDU_RSP(pdu) (pdu->ctrl_1 & LLC_U_PDU_CMD_MASK) #define LLC_1_PDU_CMD_UI 0x00 /* Type 1 cmds/rsps */ #define LLC_1_PDU_CMD_XID 0xAC #define LLC_1_PDU_CMD_TEST 0xE0 #define LLC_2_PDU_CMD_SABME 0x6C /* Type 2 cmds/rsps */ #define LLC_2_PDU_CMD_DISC 0x40 #define LLC_2_PDU_RSP_UA 0x60 #define LLC_2_PDU_RSP_DM 0x0C #define LLC_2_PDU_RSP_FRMR 0x84 /* Type 1 operations */ /* XID information field bit masks */ /* LLC format identifier (byte 1) */ #define LLC_XID_FMT_ID 0x81 /* first byte must be this */ /* LLC types/classes identifier (byte 2) */ #define LLC_XID_CLASS_ZEROS_MASK 0xE0 /* these must be zeros */ #define LLC_XID_CLASS_MASK 0x1F /* AND with byte to get below */ #define LLC_XID_NULL_CLASS_1 0x01 /* if NULL LSAP...use these */ #define LLC_XID_NULL_CLASS_2 0x03 #define LLC_XID_NULL_CLASS_3 0x05 #define LLC_XID_NULL_CLASS_4 0x07 #define LLC_XID_NNULL_TYPE_1 0x01 /* if non-NULL LSAP...use these */ #define LLC_XID_NNULL_TYPE_2 0x02 #define LLC_XID_NNULL_TYPE_3 0x04 #define LLC_XID_NNULL_TYPE_1_2 0x03 #define LLC_XID_NNULL_TYPE_1_3 0x05 #define LLC_XID_NNULL_TYPE_2_3 0x06 #define LLC_XID_NNULL_ALL 0x07 /* Sender Receive Window (byte 3) */ #define LLC_XID_RW_MASK 0xFE /* AND with value to get below */ #define LLC_XID_MIN_RW 0x02 /* lowest-order bit always zero */ /* Type 2 operations */ #define LLC_2_SEQ_NBR_MODULO ((u8) 128) /* I-PDU masks ('ctrl' is I-PDU control word) */ #define LLC_I_GET_NS(pdu) (u8)((pdu->ctrl_1 & 0xFE) >> 1) #define LLC_I_GET_NR(pdu) (u8)((pdu->ctrl_2 & 0xFE) >> 1) #define LLC_I_PF_BIT_MASK 0x01 #define LLC_I_PF_IS_0(pdu) ((!(pdu->ctrl_2 & LLC_I_PF_BIT_MASK)) ? 1 : 0) #define LLC_I_PF_IS_1(pdu) ((pdu->ctrl_2 & LLC_I_PF_BIT_MASK) ? 1 : 0) /* S-PDU supervisory commands and responses */ #define LLC_S_PDU_CMD_MASK 0x0C #define LLC_S_PDU_CMD(pdu) (pdu->ctrl_1 & LLC_S_PDU_CMD_MASK) #define LLC_S_PDU_RSP(pdu) (pdu->ctrl_1 & LLC_S_PDU_CMD_MASK) #define LLC_2_PDU_CMD_RR 0x00 /* rx ready cmd */ #define LLC_2_PDU_RSP_RR 0x00 /* rx ready rsp */ #define LLC_2_PDU_CMD_REJ 0x08 /* reject PDU cmd */ #define LLC_2_PDU_RSP_REJ 0x08 /* reject PDU rsp */ #define LLC_2_PDU_CMD_RNR 0x04 /* rx not ready cmd */ #define LLC_2_PDU_RSP_RNR 0x04 /* rx not ready rsp */ #define LLC_S_PF_BIT_MASK 0x01 #define LLC_S_PF_IS_0(pdu) ((!(pdu->ctrl_2 & LLC_S_PF_BIT_MASK)) ? 1 : 0) #define LLC_S_PF_IS_1(pdu) ((pdu->ctrl_2 & LLC_S_PF_BIT_MASK) ? 1 : 0) #define PDU_SUPV_GET_Nr(pdu) ((pdu->ctrl_2 & 0xFE) >> 1) #define PDU_GET_NEXT_Vr(sn) (((sn) + 1) & ~LLC_2_SEQ_NBR_MODULO) /* FRMR information field macros */ #define FRMR_INFO_LENGTH 5 /* 5 bytes of information */ /* * info is pointer to FRMR info field structure; 'rej_ctrl' is byte pointer * (if U-PDU) or word pointer to rejected PDU control field */ #define FRMR_INFO_SET_REJ_CNTRL(info,rej_ctrl) \ info->rej_pdu_ctrl = ((*((u8 *) rej_ctrl) & \ LLC_PDU_TYPE_U) != LLC_PDU_TYPE_U ? \ (u16)*((u16 *) rej_ctrl) : \ (((u16) *((u8 *) rej_ctrl)) & 0x00FF)) /* * Info is pointer to FRMR info field structure; 'vs' is a byte containing * send state variable value in low-order 7 bits (insure the lowest-order * bit remains zero (0)) */ #define FRMR_INFO_SET_Vs(info,vs) (info->curr_ssv = (((u8) vs) << 1)) #define FRMR_INFO_SET_Vr(info,vr) (info->curr_rsv = (((u8) vr) << 1)) /* * Info is pointer to FRMR info field structure; 'cr' is a byte containing * the C/R bit value in the low-order bit */ #define FRMR_INFO_SET_C_R_BIT(info, cr) (info->curr_rsv |= (((u8) cr) & 0x01)) /* * In the remaining five macros, 'info' is pointer to FRMR info field * structure; 'ind' is a byte containing the bit value to set in the * lowest-order bit) */ #define FRMR_INFO_SET_INVALID_PDU_CTRL_IND(info, ind) \ (info->ind_bits = ((info->ind_bits & 0xFE) | (((u8) ind) & 0x01))) #define FRMR_INFO_SET_INVALID_PDU_INFO_IND(info, ind) \ (info->ind_bits = ( (info->ind_bits & 0xFD) | (((u8) ind) & 0x02))) #define FRMR_INFO_SET_PDU_INFO_2LONG_IND(info, ind) \ (info->ind_bits = ( (info->ind_bits & 0xFB) | (((u8) ind) & 0x04))) #define FRMR_INFO_SET_PDU_INVALID_Nr_IND(info, ind) \ (info->ind_bits = ( (info->ind_bits & 0xF7) | (((u8) ind) & 0x08))) #define FRMR_INFO_SET_PDU_INVALID_Ns_IND(info, ind) \ (info->ind_bits = ( (info->ind_bits & 0xEF) | (((u8) ind) & 0x10))) /* Sequence-numbered PDU format (4 bytes in length) */ struct llc_pdu_sn { u8 dsap; u8 ssap; u8 ctrl_1; u8 ctrl_2; } __packed; static inline struct llc_pdu_sn *llc_pdu_sn_hdr(struct sk_buff *skb) { return (struct llc_pdu_sn *)skb_network_header(skb); } /* Un-numbered PDU format (3 bytes in length) */ struct llc_pdu_un { u8 dsap; u8 ssap; u8 ctrl_1; } __packed; static inline struct llc_pdu_un *llc_pdu_un_hdr(struct sk_buff *skb) { return (struct llc_pdu_un *)skb_network_header(skb); } /** * llc_pdu_header_init - initializes pdu header * @skb: input skb that header must be set into it. * @type: type of PDU (U, I or S). * @ssap: source sap. * @dsap: destination sap. * @cr: command/response bit (0 or 1). * * This function sets DSAP, SSAP and command/Response bit in LLC header. */ static inline void llc_pdu_header_init(struct sk_buff *skb, u8 type, u8 ssap, u8 dsap, u8 cr) { int hlen = 4; /* default value for I and S types */ struct llc_pdu_un *pdu; switch (type) { case LLC_PDU_TYPE_U: hlen = 3; break; case LLC_PDU_TYPE_U_XID: hlen = 6; break; } skb_push(skb, hlen); skb_reset_network_header(skb); pdu = llc_pdu_un_hdr(skb); pdu->dsap = dsap; pdu->ssap = ssap; pdu->ssap |= cr; } /** * llc_pdu_decode_sa - extracts, source address (MAC) of input frame * @skb: input skb that source address must be extracted from it. * @sa: pointer to source address (6 byte array). * * This function extracts source address(MAC) of input frame. */ static inline void llc_pdu_decode_sa(struct sk_buff *skb, u8 *sa) { memcpy(sa, eth_hdr(skb)->h_source, ETH_ALEN); } /** * llc_pdu_decode_da - extracts dest address of input frame * @skb: input skb that destination address must be extracted from it * @da: pointer to destination address (6 byte array). * * This function extracts destination address(MAC) of input frame. */ static inline void llc_pdu_decode_da(struct sk_buff *skb, u8 *da) { memcpy(da, eth_hdr(skb)->h_dest, ETH_ALEN); } /** * llc_pdu_decode_ssap - extracts source SAP of input frame * @skb: input skb that source SAP must be extracted from it. * @ssap: source SAP (output argument). * * This function extracts source SAP of input frame. Right bit of SSAP is * command/response bit. */ static inline void llc_pdu_decode_ssap(struct sk_buff *skb, u8 *ssap) { *ssap = llc_pdu_un_hdr(skb)->ssap & 0xFE; } /** * llc_pdu_decode_dsap - extracts dest SAP of input frame * @skb: input skb that destination SAP must be extracted from it. * @dsap: destination SAP (output argument). * * This function extracts destination SAP of input frame. right bit of * DSAP designates individual/group SAP. */ static inline void llc_pdu_decode_dsap(struct sk_buff *skb, u8 *dsap) { *dsap = llc_pdu_un_hdr(skb)->dsap & 0xFE; } /** * llc_pdu_init_as_ui_cmd - sets LLC header as UI PDU * @skb: input skb that header must be set into it. * * This function sets third byte of LLC header as a UI PDU. */ static inline void llc_pdu_init_as_ui_cmd(struct sk_buff *skb) { struct llc_pdu_un *pdu = llc_pdu_un_hdr(skb); pdu->ctrl_1 = LLC_PDU_TYPE_U; pdu->ctrl_1 |= LLC_1_PDU_CMD_UI; } /** * llc_pdu_init_as_test_cmd - sets PDU as TEST * @skb: Address of the skb to build * * Sets a PDU as TEST */ static inline void llc_pdu_init_as_test_cmd(struct sk_buff *skb) { struct llc_pdu_un *pdu = llc_pdu_un_hdr(skb); pdu->ctrl_1 = LLC_PDU_TYPE_U; pdu->ctrl_1 |= LLC_1_PDU_CMD_TEST; pdu->ctrl_1 |= LLC_U_PF_BIT_MASK; } /** * llc_pdu_init_as_test_rsp - build TEST response PDU * @skb: Address of the skb to build * @ev_skb: The received TEST command PDU frame * * Builds a pdu frame as a TEST response. */ static inline void llc_pdu_init_as_test_rsp(struct sk_buff *skb, struct sk_buff *ev_skb) { struct llc_pdu_un *pdu = llc_pdu_un_hdr(skb); pdu->ctrl_1 = LLC_PDU_TYPE_U; pdu->ctrl_1 |= LLC_1_PDU_CMD_TEST; pdu->ctrl_1 |= LLC_U_PF_BIT_MASK; if (ev_skb->protocol == htons(ETH_P_802_2)) { struct llc_pdu_un *ev_pdu = llc_pdu_un_hdr(ev_skb); int dsize; dsize = ntohs(eth_hdr(ev_skb)->h_proto) - 3; memcpy(((u8 *)pdu) + 3, ((u8 *)ev_pdu) + 3, dsize); skb_put(skb, dsize); } } /* LLC Type 1 XID command/response information fields format */ struct llc_xid_info { u8 fmt_id; /* always 0x81 for LLC */ u8 type; /* different if NULL/non-NULL LSAP */ u8 rw; /* sender receive window */ } __packed; /** * llc_pdu_init_as_xid_cmd - sets bytes 3, 4 & 5 of LLC header as XID * @skb: input skb that header must be set into it. * @svcs_supported: The class of the LLC (I or II) * @rx_window: The size of the receive window of the LLC * * This function sets third,fourth,fifth and sixth bytes of LLC header as * a XID PDU. */ static inline void llc_pdu_init_as_xid_cmd(struct sk_buff *skb, u8 svcs_supported, u8 rx_window) { struct llc_xid_info *xid_info; struct llc_pdu_un *pdu = llc_pdu_un_hdr(skb); pdu->ctrl_1 = LLC_PDU_TYPE_U; pdu->ctrl_1 |= LLC_1_PDU_CMD_XID; pdu->ctrl_1 |= LLC_U_PF_BIT_MASK; xid_info = (struct llc_xid_info *)(((u8 *)&pdu->ctrl_1) + 1); xid_info->fmt_id = LLC_XID_FMT_ID; /* 0x81 */ xid_info->type = svcs_supported; xid_info->rw = rx_window << 1; /* size of receive window */ /* no need to push/put since llc_pdu_header_init() has already * pushed 3 + 3 bytes */ } /** * llc_pdu_init_as_xid_rsp - builds XID response PDU * @skb: Address of the skb to build * @svcs_supported: The class of the LLC (I or II) * @rx_window: The size of the receive window of the LLC * * Builds a pdu frame as an XID response. */ static inline void llc_pdu_init_as_xid_rsp(struct sk_buff *skb, u8 svcs_supported, u8 rx_window) { struct llc_xid_info *xid_info; struct llc_pdu_un *pdu = llc_pdu_un_hdr(skb); pdu->ctrl_1 = LLC_PDU_TYPE_U; pdu->ctrl_1 |= LLC_1_PDU_CMD_XID; pdu->ctrl_1 |= LLC_U_PF_BIT_MASK; xid_info = (struct llc_xid_info *)(((u8 *)&pdu->ctrl_1) + 1); xid_info->fmt_id = LLC_XID_FMT_ID; xid_info->type = svcs_supported; xid_info->rw = rx_window << 1; skb_put(skb, sizeof(struct llc_xid_info)); } /* LLC Type 2 FRMR response information field format */ struct llc_frmr_info { u16 rej_pdu_ctrl; /* bits 1-8 if U-PDU */ u8 curr_ssv; /* current send state variable val */ u8 curr_rsv; /* current receive state variable */ u8 ind_bits; /* indicator bits set with macro */ } __packed; void llc_pdu_set_cmd_rsp(struct sk_buff *skb, u8 type); void llc_pdu_set_pf_bit(struct sk_buff *skb, u8 bit_value); void llc_pdu_decode_pf_bit(struct sk_buff *skb, u8 *pf_bit); void llc_pdu_init_as_disc_cmd(struct sk_buff *skb, u8 p_bit); void llc_pdu_init_as_i_cmd(struct sk_buff *skb, u8 p_bit, u8 ns, u8 nr); void llc_pdu_init_as_rej_cmd(struct sk_buff *skb, u8 p_bit, u8 nr); void llc_pdu_init_as_rnr_cmd(struct sk_buff *skb, u8 p_bit, u8 nr); void llc_pdu_init_as_rr_cmd(struct sk_buff *skb, u8 p_bit, u8 nr); void llc_pdu_init_as_sabme_cmd(struct sk_buff *skb, u8 p_bit); void llc_pdu_init_as_dm_rsp(struct sk_buff *skb, u8 f_bit); void llc_pdu_init_as_frmr_rsp(struct sk_buff *skb, struct llc_pdu_sn *prev_pdu, u8 f_bit, u8 vs, u8 vr, u8 vzyxw); void llc_pdu_init_as_rr_rsp(struct sk_buff *skb, u8 f_bit, u8 nr); void llc_pdu_init_as_rej_rsp(struct sk_buff *skb, u8 f_bit, u8 nr); void llc_pdu_init_as_rnr_rsp(struct sk_buff *skb, u8 f_bit, u8 nr); void llc_pdu_init_as_ua_rsp(struct sk_buff *skb, u8 f_bit); #endif /* LLC_PDU_H */
6 6 5 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 // SPDX-License-Identifier: GPL-2.0 OR BSD-3-Clause /* Authors: Bernard Metzler <bmt@zurich.ibm.com> */ /* Copyright (c) 2008-2019, IBM Corporation */ #include <linux/init.h> #include <linux/errno.h> #include <linux/netdevice.h> #include <linux/inetdevice.h> #include <net/net_namespace.h> #include <linux/rtnetlink.h> #include <linux/if_arp.h> #include <linux/list.h> #include <linux/kernel.h> #include <linux/sched.h> #include <linux/module.h> #include <linux/dma-mapping.h> #include <net/addrconf.h> #include <rdma/ib_verbs.h> #include <rdma/ib_user_verbs.h> #include <rdma/rdma_netlink.h> #include <linux/kthread.h> #include "siw.h" #include "siw_verbs.h" MODULE_AUTHOR("Bernard Metzler"); MODULE_DESCRIPTION("Software iWARP Driver"); MODULE_LICENSE("Dual BSD/GPL"); /* transmit from user buffer, if possible */ const bool zcopy_tx = true; /* Restrict usage of GSO, if hardware peer iwarp is unable to process * large packets. try_gso = true lets siw try to use local GSO, * if peer agrees. Not using GSO severly limits siw maximum tx bandwidth. */ const bool try_gso; /* Attach siw also with loopback devices */ const bool loopback_enabled = true; /* We try to negotiate CRC on, if true */ const bool mpa_crc_required; /* MPA CRC on/off enforced */ const bool mpa_crc_strict; /* Control TCP_NODELAY socket option */ const bool siw_tcp_nagle; /* Select MPA version to be used during connection setup */ u_char mpa_version = MPA_REVISION_2; /* Selects MPA P2P mode (additional handshake during connection * setup, if true. */ const bool peer_to_peer; struct task_struct *siw_tx_thread[NR_CPUS]; static int siw_device_register(struct siw_device *sdev, const char *name) { struct ib_device *base_dev = &sdev->base_dev; static int dev_id = 1; int rv; sdev->vendor_part_id = dev_id++; rv = ib_register_device(base_dev, name, NULL); if (rv) { pr_warn("siw: device registration error %d\n", rv); return rv; } siw_dbg(base_dev, "HWaddr=%pM\n", sdev->raw_gid); return 0; } static void siw_device_cleanup(struct ib_device *base_dev) { struct siw_device *sdev = to_siw_dev(base_dev); xa_destroy(&sdev->qp_xa); xa_destroy(&sdev->mem_xa); } static int siw_dev_qualified(struct net_device *netdev) { /* * Additional hardware support can be added here * (e.g. ARPHRD_FDDI, ARPHRD_ATM, ...) - see * <linux/if_arp.h> for type identifiers. */ if (netdev->type == ARPHRD_ETHER || netdev->type == ARPHRD_IEEE802 || netdev->type == ARPHRD_NONE || (netdev->type == ARPHRD_LOOPBACK && loopback_enabled)) return 1; return 0; } static DEFINE_PER_CPU(atomic_t, siw_use_cnt); static struct { struct cpumask **tx_valid_cpus; int num_nodes; } siw_cpu_info; static void siw_destroy_cpulist(int number) { int i = 0; while (i < number) kfree(siw_cpu_info.tx_valid_cpus[i++]); kfree(siw_cpu_info.tx_valid_cpus); siw_cpu_info.tx_valid_cpus = NULL; } static int siw_init_cpulist(void) { int i, num_nodes = nr_node_ids; memset(siw_tx_thread, 0, sizeof(siw_tx_thread)); siw_cpu_info.num_nodes = num_nodes; siw_cpu_info.tx_valid_cpus = kcalloc(num_nodes, sizeof(struct cpumask *), GFP_KERNEL); if (!siw_cpu_info.tx_valid_cpus) { siw_cpu_info.num_nodes = 0; return -ENOMEM; } for (i = 0; i < siw_cpu_info.num_nodes; i++) { siw_cpu_info.tx_valid_cpus[i] = kzalloc(sizeof(struct cpumask), GFP_KERNEL); if (!siw_cpu_info.tx_valid_cpus[i]) goto out_err; cpumask_clear(siw_cpu_info.tx_valid_cpus[i]); } for_each_possible_cpu(i) cpumask_set_cpu(i, siw_cpu_info.tx_valid_cpus[cpu_to_node(i)]); return 0; out_err: siw_cpu_info.num_nodes = 0; siw_destroy_cpulist(i); return -ENOMEM; } /* * Choose CPU with least number of active QP's from NUMA node of * TX interface. */ int siw_get_tx_cpu(struct siw_device *sdev) { const struct cpumask *tx_cpumask; int i, num_cpus, cpu, min_use, node = sdev->numa_node, tx_cpu = -1; if (node < 0) tx_cpumask = cpu_online_mask; else tx_cpumask = siw_cpu_info.tx_valid_cpus[node]; num_cpus = cpumask_weight(tx_cpumask); if (!num_cpus) { /* no CPU on this NUMA node */ tx_cpumask = cpu_online_mask; num_cpus = cpumask_weight(tx_cpumask); } if (!num_cpus) goto out; cpu = cpumask_first(tx_cpumask); for (i = 0, min_use = SIW_MAX_QP; i < num_cpus; i++, cpu = cpumask_next(cpu, tx_cpumask)) { int usage; /* Skip any cores which have no TX thread */ if (!siw_tx_thread[cpu]) continue; usage = atomic_read(&per_cpu(siw_use_cnt, cpu)); if (usage <= min_use) { tx_cpu = cpu; min_use = usage; } } siw_dbg(&sdev->base_dev, "tx cpu %d, node %d, %d qp's\n", tx_cpu, node, min_use); out: if (tx_cpu >= 0) atomic_inc(&per_cpu(siw_use_cnt, tx_cpu)); else pr_warn("siw: no tx cpu found\n"); return tx_cpu; } void siw_put_tx_cpu(int cpu) { atomic_dec(&per_cpu(siw_use_cnt, cpu)); } static struct ib_qp *siw_get_base_qp(struct ib_device *base_dev, int id) { struct siw_qp *qp = siw_qp_id2obj(to_siw_dev(base_dev), id); if (qp) { /* * siw_qp_id2obj() increments object reference count */ siw_qp_put(qp); return &qp->base_qp; } return NULL; } static const struct ib_device_ops siw_device_ops = { .owner = THIS_MODULE, .uverbs_abi_ver = SIW_ABI_VERSION, .driver_id = RDMA_DRIVER_SIW, .alloc_mr = siw_alloc_mr, .alloc_pd = siw_alloc_pd, .alloc_ucontext = siw_alloc_ucontext, .create_cq = siw_create_cq, .create_qp = siw_create_qp, .create_srq = siw_create_srq, .dealloc_driver = siw_device_cleanup, .dealloc_pd = siw_dealloc_pd, .dealloc_ucontext = siw_dealloc_ucontext, .dereg_mr = siw_dereg_mr, .destroy_cq = siw_destroy_cq, .destroy_qp = siw_destroy_qp, .destroy_srq = siw_destroy_srq, .get_dma_mr = siw_get_dma_mr, .get_port_immutable = siw_get_port_immutable, .iw_accept = siw_accept, .iw_add_ref = siw_qp_get_ref, .iw_connect = siw_connect, .iw_create_listen = siw_create_listen, .iw_destroy_listen = siw_destroy_listen, .iw_get_qp = siw_get_base_qp, .iw_reject = siw_reject, .iw_rem_ref = siw_qp_put_ref, .map_mr_sg = siw_map_mr_sg, .mmap = siw_mmap, .mmap_free = siw_mmap_free, .modify_qp = siw_verbs_modify_qp, .modify_srq = siw_modify_srq, .poll_cq = siw_poll_cq, .post_recv = siw_post_receive, .post_send = siw_post_send, .post_srq_recv = siw_post_srq_recv, .query_device = siw_query_device, .query_gid = siw_query_gid, .query_port = siw_query_port, .query_qp = siw_query_qp, .query_srq = siw_query_srq, .req_notify_cq = siw_req_notify_cq, .reg_user_mr = siw_reg_user_mr, INIT_RDMA_OBJ_SIZE(ib_cq, siw_cq, base_cq), INIT_RDMA_OBJ_SIZE(ib_pd, siw_pd, base_pd), INIT_RDMA_OBJ_SIZE(ib_qp, siw_qp, base_qp), INIT_RDMA_OBJ_SIZE(ib_srq, siw_srq, base_srq), INIT_RDMA_OBJ_SIZE(ib_ucontext, siw_ucontext, base_ucontext), }; static struct siw_device *siw_device_create(struct net_device *netdev) { struct siw_device *sdev = NULL; struct ib_device *base_dev; int rv; sdev = ib_alloc_device(siw_device, base_dev); if (!sdev) return NULL; base_dev = &sdev->base_dev; if (netdev->addr_len) { memcpy(sdev->raw_gid, netdev->dev_addr, min_t(unsigned int, netdev->addr_len, ETH_ALEN)); } else { /* * This device does not have a HW address, but * connection mangagement requires a unique gid. */ eth_random_addr(sdev->raw_gid); } addrconf_addr_eui48((u8 *)&base_dev->node_guid, sdev->raw_gid); base_dev->uverbs_cmd_mask |= BIT_ULL(IB_USER_VERBS_CMD_POST_SEND); base_dev->node_type = RDMA_NODE_RNIC; memcpy(base_dev->node_desc, SIW_NODE_DESC_COMMON, sizeof(SIW_NODE_DESC_COMMON)); /* * Current model (one-to-one device association): * One Softiwarp device per net_device or, equivalently, * per physical port. */ base_dev->phys_port_cnt = 1; base_dev->num_comp_vectors = num_possible_cpus(); xa_init_flags(&sdev->qp_xa, XA_FLAGS_ALLOC1); xa_init_flags(&sdev->mem_xa, XA_FLAGS_ALLOC1); ib_set_device_ops(base_dev, &siw_device_ops); rv = ib_device_set_netdev(base_dev, netdev, 1); if (rv) goto error; memcpy(base_dev->iw_ifname, netdev->name, sizeof(base_dev->iw_ifname)); /* Disable TCP port mapping */ base_dev->iw_driver_flags = IW_F_NO_PORT_MAP; sdev->attrs.max_qp = SIW_MAX_QP; sdev->attrs.max_qp_wr = SIW_MAX_QP_WR; sdev->attrs.max_ord = SIW_MAX_ORD_QP; sdev->attrs.max_ird = SIW_MAX_IRD_QP; sdev->attrs.max_sge = SIW_MAX_SGE; sdev->attrs.max_sge_rd = SIW_MAX_SGE_RD; sdev->attrs.max_cq = SIW_MAX_CQ; sdev->attrs.max_cqe = SIW_MAX_CQE; sdev->attrs.max_mr = SIW_MAX_MR; sdev->attrs.max_pd = SIW_MAX_PD; sdev->attrs.max_mw = SIW_MAX_MW; sdev->attrs.max_srq = SIW_MAX_SRQ; sdev->attrs.max_srq_wr = SIW_MAX_SRQ_WR; sdev->attrs.max_srq_sge = SIW_MAX_SGE; INIT_LIST_HEAD(&sdev->cep_list); INIT_LIST_HEAD(&sdev->qp_list); atomic_set(&sdev->num_ctx, 0); atomic_set(&sdev->num_srq, 0); atomic_set(&sdev->num_qp, 0); atomic_set(&sdev->num_cq, 0); atomic_set(&sdev->num_mr, 0); atomic_set(&sdev->num_pd, 0); sdev->numa_node = dev_to_node(&netdev->dev); spin_lock_init(&sdev->lock); return sdev; error: ib_dealloc_device(base_dev); return NULL; } static int siw_netdev_event(struct notifier_block *nb, unsigned long event, void *arg) { struct net_device *netdev = netdev_notifier_info_to_dev(arg); struct ib_device *base_dev; struct siw_device *sdev; dev_dbg(&netdev->dev, "siw: event %lu\n", event); base_dev = ib_device_get_by_netdev(netdev, RDMA_DRIVER_SIW); if (!base_dev) return NOTIFY_OK; sdev = to_siw_dev(base_dev); switch (event) { case NETDEV_REGISTER: /* * Device registration now handled only by * rdma netlink commands. So it shall be impossible * to end up here with a valid siw device. */ siw_dbg(base_dev, "unexpected NETDEV_REGISTER event\n"); break; case NETDEV_UNREGISTER: ib_unregister_device_queued(&sdev->base_dev); break; case NETDEV_CHANGEADDR: siw_port_event(sdev, 1, IB_EVENT_LID_CHANGE); break; /* * All other events are not handled */ default: break; } ib_device_put(&sdev->base_dev); return NOTIFY_OK; } static struct notifier_block siw_netdev_nb = { .notifier_call = siw_netdev_event, }; static int siw_newlink(const char *basedev_name, struct net_device *netdev) { struct ib_device *base_dev; struct siw_device *sdev = NULL; int rv = -ENOMEM; if (!siw_dev_qualified(netdev)) return -EINVAL; base_dev = ib_device_get_by_netdev(netdev, RDMA_DRIVER_SIW); if (base_dev) { ib_device_put(base_dev); return -EEXIST; } sdev = siw_device_create(netdev); if (sdev) { dev_dbg(&netdev->dev, "siw: new device\n"); ib_mark_name_assigned_by_user(&sdev->base_dev); rv = siw_device_register(sdev, basedev_name); if (rv) ib_dealloc_device(&sdev->base_dev); } return rv; } static struct rdma_link_ops siw_link_ops = { .type = "siw", .newlink = siw_newlink, }; /* * siw_init_module - Initialize Softiwarp module and register with netdev * subsystem. */ static __init int siw_init_module(void) { int rv; if (SENDPAGE_THRESH < SIW_MAX_INLINE) { pr_info("siw: sendpage threshold too small: %u\n", (int)SENDPAGE_THRESH); rv = -EINVAL; goto out_error; } rv = siw_init_cpulist(); if (rv) goto out_error; rv = siw_cm_init(); if (rv) goto out_error; if (!siw_create_tx_threads()) { pr_info("siw: Could not start any TX thread\n"); rv = -ENOMEM; goto out_error; } rv = register_netdevice_notifier(&siw_netdev_nb); if (rv) goto out_error; rdma_link_register(&siw_link_ops); pr_info("SoftiWARP attached\n"); return 0; out_error: siw_stop_tx_threads(); pr_info("SoftIWARP attach failed. Error: %d\n", rv); siw_cm_exit(); siw_destroy_cpulist(siw_cpu_info.num_nodes); return rv; } static void __exit siw_exit_module(void) { siw_stop_tx_threads(); unregister_netdevice_notifier(&siw_netdev_nb); rdma_link_unregister(&siw_link_ops); ib_unregister_driver(RDMA_DRIVER_SIW); siw_cm_exit(); siw_destroy_cpulist(siw_cpu_info.num_nodes); pr_info("SoftiWARP detached\n"); } module_init(siw_init_module); module_exit(siw_exit_module); MODULE_ALIAS_RDMA_LINK("siw");
15 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 /* SPDX-License-Identifier: GPL-2.0 */ #ifndef _ASM_GENERIC_BITOPS_LE_H_ #define _ASM_GENERIC_BITOPS_LE_H_ #include <asm/types.h> #include <asm/byteorder.h> #if defined(__LITTLE_ENDIAN) #define BITOP_LE_SWIZZLE 0 #elif defined(__BIG_ENDIAN) #define BITOP_LE_SWIZZLE ((BITS_PER_LONG-1) & ~0x7) #endif static inline int test_bit_le(int nr, const void *addr) { return test_bit(nr ^ BITOP_LE_SWIZZLE, addr); } static inline void set_bit_le(int nr, void *addr) { set_bit(nr ^ BITOP_LE_SWIZZLE, addr); } static inline void clear_bit_le(int nr, void *addr) { clear_bit(nr ^ BITOP_LE_SWIZZLE, addr); } static inline void __set_bit_le(int nr, void *addr) { __set_bit(nr ^ BITOP_LE_SWIZZLE, addr); } static inline void __clear_bit_le(int nr, void *addr) { __clear_bit(nr ^ BITOP_LE_SWIZZLE, addr); } static inline int test_and_set_bit_le(int nr, void *addr) { return test_and_set_bit(nr ^ BITOP_LE_SWIZZLE, addr); } static inline int test_and_clear_bit_le(int nr, void *addr) { return test_and_clear_bit(nr ^ BITOP_LE_SWIZZLE, addr); } static inline int __test_and_set_bit_le(int nr, void *addr) { return __test_and_set_bit(nr ^ BITOP_LE_SWIZZLE, addr); } static inline int __test_and_clear_bit_le(int nr, void *addr) { return __test_and_clear_bit(nr ^ BITOP_LE_SWIZZLE, addr); } #endif /* _ASM_GENERIC_BITOPS_LE_H_ */
499 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 1517 1518 1519 1520 1521 1522 1523 1524 1525 1526 1527 1528 1529 1530 1531 1532 1533 1534 1535 1536 1537 1538 1539 1540 1541 1542 1543 1544 1545 1546 1547 1548 1549 1550 1551 1552 1553 1554 1555 1556 1557 1558 1559 1560 1561 1562 1563 1564 1565 1566 1567 1568 1569 1570 1571 1572 1573 1574 1575 1576 1577 1578 1579 1580 1581 1582 1583 1584 1585 1586 1587 1588 1589 1590 1591 1592 1593 1594 1595 1596 1597 1598 1599 1600 1601 1602 1603 1604 1605 1606 1607 1608 1609 1610 1611 1612 1613 1614 1615 1616 1617 1618 1619 1620 1621 1622 1623 1624 1625 1626 1627 1628 1629 1630 1631 1632 1633 1634 1635 1636 1637 1638 1639 1640 1641 1642 1643 1644 1645 1646 1647 1648 1649 1650 1651 1652 1653 1654 1655 1656 1657 1658 1659 1660 1661 1662 1663 1664 1665 1666 1667 1668 1669 1670 1671 1672 1673 1674 1675 1676 1677 1678 1679 1680 1681 1682 1683 1684 1685 1686 1687 1688 1689 1690 1691 1692 1693 1694 1695 1696 1697 1698 1699 1700 1701 1702 1703 1704 1705 1706 1707 1708 1709 1710 1711 1712 1713 1714 1715 1716 1717 1718 1719 1720 1721 1722 1723 1724 1725 1726 1727 1728 1729 1730 1731 1732 1733 1734 1735 1736 1737 1738 1739 1740 1741 1742 1743 1744 1745 1746 1747 1748 1749 1750 1751 1752 1753 1754 1755 1756 1757 1758 1759 1760 1761 1762 1763 1764 1765 1766 1767 1768 1769 1770 1771 1772 1773 1774 1775 1776 1777 1778 1779 1780 1781 1782 1783 1784 1785 1786 1787 1788 1789 1790 1791 1792 1793 1794 1795 1796 1797 1798 1799 1800 1801 1802 1803 1804 1805 1806 1807 1808 1809 1810 1811 1812 1813 1814 1815 1816 1817 1818 1819 1820 1821 1822 1823 1824 1825 1826 1827 1828 1829 1830 1831 1832 1833 1834 1835 1836 1837 1838 1839 1840 1841 1842 1843 1844 1845 1846 1847 1848 1849 1850 1851 1852 1853 1854 1855 1856 1857 1858 1859 1860 1861 1862 1863 1864 1865 1866 1867 1868 1869 1870 1871 1872 1873 1874 1875 1876 1877 1878 1879 1880 1881 1882 1883 1884 1885 1886 1887 1888 1889 1890 1891 1892 1893 1894 1895 1896 1897 1898 1899 1900 1901 1902 1903 1904 1905 1906 1907 1908 1909 1910 1911 1912 1913 1914 1915 1916 1917 1918 1919 1920 1921 1922 1923 1924 1925 1926 1927 1928 1929 1930 1931 1932 1933 1934 1935 1936 1937 1938 1939 1940 1941 1942 1943 1944 1945 1946 1947 1948 1949 1950 1951 1952 1953 1954 1955 1956 1957 1958 1959 1960 1961 1962 1963 1964 1965 1966 1967 1968 1969 1970 1971 1972 1973 1974 1975 1976 1977 1978 1979 1980 1981 1982 1983 1984 1985 1986 1987 1988 1989 1990 1991 1992 1993 1994 1995 1996 1997 1998 1999 2000 2001 2002 2003 2004 2005 2006 2007 2008 2009 2010 2011 2012 2013 2014 2015 2016 2017 2018 2019 2020 2021 2022 2023 2024 2025 2026 2027 2028 2029 2030 2031 2032 2033 2034 2035 2036 2037 2038 2039 2040 2041 2042 2043 2044 2045 2046 2047 2048 2049 2050 2051 2052 2053 2054 2055 2056 2057 2058 2059 2060 2061 2062 2063 2064 2065 2066 2067 2068 2069 2070 2071 2072 2073 2074 2075 2076 2077 2078 2079 2080 2081 2082 2083 2084 2085 2086 2087 2088 2089 2090 2091 2092 2093 2094 2095 2096 2097 2098 2099 2100 2101 2102 2103 2104 2105 2106 2107 2108 2109 2110 2111 2112 2113 2114 2115 2116 /* * Performance events: * * Copyright (C) 2008-2009, Thomas Gleixner <tglx@linutronix.de> * Copyright (C) 2008-2011, Red Hat, Inc., Ingo Molnar * Copyright (C) 2008-2011, Red Hat, Inc., Peter Zijlstra * * Data type definitions, declarations, prototypes. * * Started by: Thomas Gleixner and Ingo Molnar * * For licencing details see kernel-base/COPYING */ #ifndef _LINUX_PERF_EVENT_H #define _LINUX_PERF_EVENT_H #include <uapi/linux/perf_event.h> #include <uapi/linux/bpf_perf_event.h> /* * Kernel-internal data types and definitions: */ #ifdef CONFIG_PERF_EVENTS # include <asm/perf_event.h> # include <asm/local64.h> #endif #ifdef CONFIG_HAVE_HW_BREAKPOINT # include <linux/rhashtable-types.h> # include <asm/hw_breakpoint.h> #endif #include <linux/list.h> #include <linux/mutex.h> #include <linux/rculist.h> #include <linux/rcupdate.h> #include <linux/spinlock.h> #include <linux/hrtimer.h> #include <linux/fs.h> #include <linux/pid_namespace.h> #include <linux/workqueue.h> #include <linux/ftrace.h> #include <linux/cpu.h> #include <linux/irq_work.h> #include <linux/static_key.h> #include <linux/jump_label_ratelimit.h> #include <linux/atomic.h> #include <linux/sysfs.h> #include <linux/perf_regs.h> #include <linux/cgroup.h> #include <linux/refcount.h> #include <linux/security.h> #include <linux/static_call.h> #include <linux/lockdep.h> #include <asm/local.h> struct perf_callchain_entry { u64 nr; u64 ip[]; /* /proc/sys/kernel/perf_event_max_stack */ }; struct perf_callchain_entry_ctx { struct perf_callchain_entry *entry; u32 max_stack; u32 nr; short contexts; bool contexts_maxed; }; typedef unsigned long (*perf_copy_f)(void *dst, const void *src, unsigned long off, unsigned long len); struct perf_raw_frag { union { struct perf_raw_frag *next; unsigned long pad; }; perf_copy_f copy; void *data; u32 size; } __packed; struct perf_raw_record { struct perf_raw_frag frag; u32 size; }; static __always_inline bool perf_raw_frag_last(const struct perf_raw_frag *frag) { return frag->pad < sizeof(u64); } /* * branch stack layout: * nr: number of taken branches stored in entries[] * hw_idx: The low level index of raw branch records * for the most recent branch. * -1ULL means invalid/unknown. * * Note that nr can vary from sample to sample * branches (to, from) are stored from most recent * to least recent, i.e., entries[0] contains the most * recent branch. * The entries[] is an abstraction of raw branch records, * which may not be stored in age order in HW, e.g. Intel LBR. * The hw_idx is to expose the low level index of raw * branch record for the most recent branch aka entries[0]. * The hw_idx index is between -1 (unknown) and max depth, * which can be retrieved in /sys/devices/cpu/caps/branches. * For the architectures whose raw branch records are * already stored in age order, the hw_idx should be 0. */ struct perf_branch_stack { u64 nr; u64 hw_idx; struct perf_branch_entry entries[]; }; struct task_struct; /* * extra PMU register associated with an event */ struct hw_perf_event_extra { u64 config; /* register value */ unsigned int reg; /* register address or index */ int alloc; /* extra register already allocated */ int idx; /* index in shared_regs->regs[] */ }; /** * hw_perf_event::flag values * * PERF_EVENT_FLAG_ARCH bits are reserved for architecture-specific * usage. */ #define PERF_EVENT_FLAG_ARCH 0x0fffffff #define PERF_EVENT_FLAG_USER_READ_CNT 0x80000000 static_assert((PERF_EVENT_FLAG_USER_READ_CNT & PERF_EVENT_FLAG_ARCH) == 0); /** * struct hw_perf_event - performance event hardware details: */ struct hw_perf_event { #ifdef CONFIG_PERF_EVENTS union { struct { /* hardware */ u64 config; u64 config1; u64 last_tag; u64 dyn_constraint; unsigned long config_base; unsigned long event_base; int event_base_rdpmc; int idx; int last_cpu; int flags; struct hw_perf_event_extra extra_reg; struct hw_perf_event_extra branch_reg; }; struct { /* aux / Intel-PT */ u64 aux_config; /* * For AUX area events, aux_paused cannot be a state * flag because it can be updated asynchronously to * state. */ unsigned int aux_paused; }; struct { /* software */ struct hrtimer hrtimer; }; struct { /* tracepoint */ /* for tp_event->class */ struct list_head tp_list; }; struct { /* amd_power */ u64 pwr_acc; u64 ptsc; }; #ifdef CONFIG_HAVE_HW_BREAKPOINT struct { /* breakpoint */ /* * Crufty hack to avoid the chicken and egg * problem hw_breakpoint has with context * creation and event initalization. */ struct arch_hw_breakpoint info; struct rhlist_head bp_list; }; #endif struct { /* amd_iommu */ u8 iommu_bank; u8 iommu_cntr; u16 padding; u64 conf; u64 conf1; }; }; /* * If the event is a per task event, this will point to the task in * question. See the comment in perf_event_alloc(). */ struct task_struct *target; /* * PMU would store hardware filter configuration * here. */ void *addr_filters; /* Last sync'ed generation of filters */ unsigned long addr_filters_gen; /* * hw_perf_event::state flags; used to track the PERF_EF_* state. */ /* the counter is stopped */ #define PERF_HES_STOPPED 0x01 /* event->count up-to-date */ #define PERF_HES_UPTODATE 0x02 #define PERF_HES_ARCH 0x04 int state; /* * The last observed hardware counter value, updated with a * local64_cmpxchg() such that pmu::read() can be called nested. */ local64_t prev_count; /* * The period to start the next sample with. */ u64 sample_period; union { struct { /* Sampling */ /* * The period we started this sample with. */ u64 last_period; /* * However much is left of the current period; * note that this is a full 64bit value and * allows for generation of periods longer * than hardware might allow. */ local64_t period_left; }; struct { /* Topdown events counting for context switch */ u64 saved_metric; u64 saved_slots; }; }; /* * State for throttling the event, see __perf_event_overflow() and * perf_adjust_freq_unthr_context(). */ u64 interrupts_seq; u64 interrupts; /* * State for freq target events, see __perf_event_overflow() and * perf_adjust_freq_unthr_context(). */ u64 freq_time_stamp; u64 freq_count_stamp; #endif /* CONFIG_PERF_EVENTS */ }; struct perf_event; struct perf_event_pmu_context; /* * Common implementation detail of pmu::{start,commit,cancel}_txn */ /* txn to add/schedule event on PMU */ #define PERF_PMU_TXN_ADD 0x1 /* txn to read event group from PMU */ #define PERF_PMU_TXN_READ 0x2 /** * pmu::capabilities flags */ #define PERF_PMU_CAP_NO_INTERRUPT 0x0001 #define PERF_PMU_CAP_NO_NMI 0x0002 #define PERF_PMU_CAP_AUX_NO_SG 0x0004 #define PERF_PMU_CAP_EXTENDED_REGS 0x0008 #define PERF_PMU_CAP_EXCLUSIVE 0x0010 #define PERF_PMU_CAP_ITRACE 0x0020 #define PERF_PMU_CAP_NO_EXCLUDE 0x0040 #define PERF_PMU_CAP_AUX_OUTPUT 0x0080 #define PERF_PMU_CAP_EXTENDED_HW_TYPE 0x0100 #define PERF_PMU_CAP_AUX_PAUSE 0x0200 #define PERF_PMU_CAP_AUX_PREFER_LARGE 0x0400 /** * pmu::scope */ enum perf_pmu_scope { PERF_PMU_SCOPE_NONE = 0, PERF_PMU_SCOPE_CORE, PERF_PMU_SCOPE_DIE, PERF_PMU_SCOPE_CLUSTER, PERF_PMU_SCOPE_PKG, PERF_PMU_SCOPE_SYS_WIDE, PERF_PMU_MAX_SCOPE, }; struct perf_output_handle; #define PMU_NULL_DEV ((void *)(~0UL)) /** * struct pmu - generic performance monitoring unit */ struct pmu { struct list_head entry; spinlock_t events_lock; struct list_head events; struct module *module; struct device *dev; struct device *parent; const struct attribute_group **attr_groups; const struct attribute_group **attr_update; const char *name; int type; /* * various common per-pmu feature flags */ int capabilities; /* * PMU scope */ unsigned int scope; struct perf_cpu_pmu_context * __percpu *cpu_pmu_context; atomic_t exclusive_cnt; /* < 0: cpu; > 0: tsk */ int task_ctx_nr; int hrtimer_interval_ms; /* number of address filters this PMU can do */ unsigned int nr_addr_filters; /* * Fully disable/enable this PMU, can be used to protect from the PMI * as well as for lazy/batch writing of the MSRs. */ void (*pmu_enable) (struct pmu *pmu); /* optional */ void (*pmu_disable) (struct pmu *pmu); /* optional */ /* * Try and initialize the event for this PMU. * * Returns: * -ENOENT -- @event is not for this PMU * * -ENODEV -- @event is for this PMU but PMU not present * -EBUSY -- @event is for this PMU but PMU temporarily unavailable * -EINVAL -- @event is for this PMU but @event is not valid * -EOPNOTSUPP -- @event is for this PMU, @event is valid, but not supported * -EACCES -- @event is for this PMU, @event is valid, but no privileges * * 0 -- @event is for this PMU and valid * * Other error return values are allowed. */ int (*event_init) (struct perf_event *event); /* * Notification that the event was mapped or unmapped. Called * in the context of the mapping task. */ void (*event_mapped) (struct perf_event *event, struct mm_struct *mm); /* optional */ void (*event_unmapped) (struct perf_event *event, struct mm_struct *mm); /* optional */ /* * Flags for ->add()/->del()/ ->start()/->stop(). There are * matching hw_perf_event::state flags. */ /* start the counter when adding */ #define PERF_EF_START 0x01 /* reload the counter when starting */ #define PERF_EF_RELOAD 0x02 /* update the counter when stopping */ #define PERF_EF_UPDATE 0x04 /* AUX area event, pause tracing */ #define PERF_EF_PAUSE 0x08 /* AUX area event, resume tracing */ #define PERF_EF_RESUME 0x10 /* * Adds/Removes a counter to/from the PMU, can be done inside a * transaction, see the ->*_txn() methods. * * The add/del callbacks will reserve all hardware resources required * to service the event, this includes any counter constraint * scheduling etc. * * Called with IRQs disabled and the PMU disabled on the CPU the event * is on. * * ->add() called without PERF_EF_START should result in the same state * as ->add() followed by ->stop(). * * ->del() must always PERF_EF_UPDATE stop an event. If it calls * ->stop() that must deal with already being stopped without * PERF_EF_UPDATE. */ int (*add) (struct perf_event *event, int flags); void (*del) (struct perf_event *event, int flags); /* * Starts/Stops a counter present on the PMU. * * The PMI handler should stop the counter when perf_event_overflow() * returns !0. ->start() will be used to continue. * * Also used to change the sample period. * * Called with IRQs disabled and the PMU disabled on the CPU the event * is on -- will be called from NMI context with the PMU generates * NMIs. * * ->stop() with PERF_EF_UPDATE will read the counter and update * period/count values like ->read() would. * * ->start() with PERF_EF_RELOAD will reprogram the counter * value, must be preceded by a ->stop() with PERF_EF_UPDATE. * * ->stop() with PERF_EF_PAUSE will stop as simply as possible. Will not * overlap another ->stop() with PERF_EF_PAUSE nor ->start() with * PERF_EF_RESUME. * * ->start() with PERF_EF_RESUME will start as simply as possible but * only if the counter is not otherwise stopped. Will not overlap * another ->start() with PERF_EF_RESUME nor ->stop() with * PERF_EF_PAUSE. * * Notably, PERF_EF_PAUSE/PERF_EF_RESUME *can* be concurrent with other * ->stop()/->start() invocations, just not itself. */ void (*start) (struct perf_event *event, int flags); void (*stop) (struct perf_event *event, int flags); /* * Updates the counter value of the event. * * For sampling capable PMUs this will also update the software period * hw_perf_event::period_left field. */ void (*read) (struct perf_event *event); /* * Group events scheduling is treated as a transaction, add * group events as a whole and perform one schedulability test. * If the test fails, roll back the whole group * * Start the transaction, after this ->add() doesn't need to * do schedulability tests. * * Optional. */ void (*start_txn) (struct pmu *pmu, unsigned int txn_flags); /* * If ->start_txn() disabled the ->add() schedulability test * then ->commit_txn() is required to perform one. On success * the transaction is closed. On error the transaction is kept * open until ->cancel_txn() is called. * * Optional. */ int (*commit_txn) (struct pmu *pmu); /* * Will cancel the transaction, assumes ->del() is called * for each successful ->add() during the transaction. * * Optional. */ void (*cancel_txn) (struct pmu *pmu); /* * Will return the value for perf_event_mmap_page::index for this event, * if no implementation is provided it will default to 0 (see * perf_event_idx_default). */ int (*event_idx) (struct perf_event *event); /*optional */ /* * context-switches callback */ void (*sched_task) (struct perf_event_pmu_context *pmu_ctx, struct task_struct *task, bool sched_in); /* * Kmem cache of PMU specific data */ struct kmem_cache *task_ctx_cache; /* * Set up pmu-private data structures for an AUX area */ void *(*setup_aux) (struct perf_event *event, void **pages, int nr_pages, bool overwrite); /* optional */ /* * Free pmu-private AUX data structures */ void (*free_aux) (void *aux); /* optional */ /* * Take a snapshot of the AUX buffer without touching the event * state, so that preempting ->start()/->stop() callbacks does * not interfere with their logic. Called in PMI context. * * Returns the size of AUX data copied to the output handle. * * Optional. */ long (*snapshot_aux) (struct perf_event *event, struct perf_output_handle *handle, unsigned long size); /* * Validate address range filters: make sure the HW supports the * requested configuration and number of filters; return 0 if the * supplied filters are valid, -errno otherwise. * * Runs in the context of the ioctl()ing process and is not serialized * with the rest of the PMU callbacks. */ int (*addr_filters_validate) (struct list_head *filters); /* optional */ /* * Synchronize address range filter configuration: * translate hw-agnostic filters into hardware configuration in * event::hw::addr_filters. * * Runs as a part of filter sync sequence that is done in ->start() * callback by calling perf_event_addr_filters_sync(). * * May (and should) traverse event::addr_filters::list, for which its * caller provides necessary serialization. */ void (*addr_filters_sync) (struct perf_event *event); /* optional */ /* * Check if event can be used for aux_output purposes for * events of this PMU. * * Runs from perf_event_open(). Should return 0 for "no match" * or non-zero for "match". */ int (*aux_output_match) (struct perf_event *event); /* optional */ /* * Skip programming this PMU on the given CPU. Typically needed for * big.LITTLE things. */ bool (*filter) (struct pmu *pmu, int cpu); /* optional */ /* * Check period value for PERF_EVENT_IOC_PERIOD ioctl. */ int (*check_period) (struct perf_event *event, u64 value); /* optional */ }; enum perf_addr_filter_action_t { PERF_ADDR_FILTER_ACTION_STOP = 0, PERF_ADDR_FILTER_ACTION_START, PERF_ADDR_FILTER_ACTION_FILTER, }; /** * struct perf_addr_filter - address range filter definition * @entry: event's filter list linkage * @path: object file's path for file-based filters * @offset: filter range offset * @size: filter range size (size==0 means single address trigger) * @action: filter/start/stop * * This is a hardware-agnostic filter configuration as specified by the user. */ struct perf_addr_filter { struct list_head entry; struct path path; unsigned long offset; unsigned long size; enum perf_addr_filter_action_t action; }; /** * struct perf_addr_filters_head - container for address range filters * @list: list of filters for this event * @lock: spinlock that serializes accesses to the @list and event's * (and its children's) filter generations. * @nr_file_filters: number of file-based filters * * A child event will use parent's @list (and therefore @lock), so they are * bundled together; see perf_event_addr_filters(). */ struct perf_addr_filters_head { struct list_head list; raw_spinlock_t lock; unsigned int nr_file_filters; }; struct perf_addr_filter_range { unsigned long start; unsigned long size; }; /* * The normal states are: * * ACTIVE --. * ^ | * | | * sched_{in,out}() | * | | * v | * ,---> INACTIVE --+ <-. * | | | * | {dis,en}able() * sched_in() | | * | OFF <--' --+ * | | * `---> ERROR ------' * * That is: * * sched_in: INACTIVE -> {ACTIVE,ERROR} * sched_out: ACTIVE -> INACTIVE * disable: {ACTIVE,INACTIVE} -> OFF * enable: {OFF,ERROR} -> INACTIVE * * Where {OFF,ERROR} are disabled states. * * Then we have the {EXIT,REVOKED,DEAD} states which are various shades of * defunct events: * * - EXIT means task that the even was assigned to died, but child events * still live, and further children can still be created. But the event * itself will never be active again. It can only transition to * {REVOKED,DEAD}; * * - REVOKED means the PMU the event was associated with is gone; all * functionality is stopped but the event is still alive. Can only * transition to DEAD; * * - DEAD event really is DYING tearing down state and freeing bits. * */ enum perf_event_state { PERF_EVENT_STATE_DEAD = -5, PERF_EVENT_STATE_REVOKED = -4, /* pmu gone, must not touch */ PERF_EVENT_STATE_EXIT = -3, /* task died, still inherit */ PERF_EVENT_STATE_ERROR = -2, /* scheduling error, can enable */ PERF_EVENT_STATE_OFF = -1, PERF_EVENT_STATE_INACTIVE = 0, PERF_EVENT_STATE_ACTIVE = 1, }; struct file; struct perf_sample_data; typedef void (*perf_overflow_handler_t)(struct perf_event *, struct perf_sample_data *, struct pt_regs *regs); /* * Event capabilities. For event_caps and groups caps. * * PERF_EV_CAP_SOFTWARE: Is a software event. * PERF_EV_CAP_READ_ACTIVE_PKG: A CPU event (or cgroup event) that can be read * from any CPU in the package where it is active. * PERF_EV_CAP_SIBLING: An event with this flag must be a group sibling and * cannot be a group leader. If an event with this flag is detached from the * group it is scheduled out and moved into an unrecoverable ERROR state. * PERF_EV_CAP_READ_SCOPE: A CPU event that can be read from any CPU of the * PMU scope where it is active. */ #define PERF_EV_CAP_SOFTWARE BIT(0) #define PERF_EV_CAP_READ_ACTIVE_PKG BIT(1) #define PERF_EV_CAP_SIBLING BIT(2) #define PERF_EV_CAP_READ_SCOPE BIT(3) #define SWEVENT_HLIST_BITS 8 #define SWEVENT_HLIST_SIZE (1 << SWEVENT_HLIST_BITS) struct swevent_hlist { struct hlist_head heads[SWEVENT_HLIST_SIZE]; struct rcu_head rcu_head; }; #define PERF_ATTACH_CONTEXT 0x0001 #define PERF_ATTACH_GROUP 0x0002 #define PERF_ATTACH_TASK 0x0004 #define PERF_ATTACH_TASK_DATA 0x0008 #define PERF_ATTACH_GLOBAL_DATA 0x0010 #define PERF_ATTACH_SCHED_CB 0x0020 #define PERF_ATTACH_CHILD 0x0040 #define PERF_ATTACH_EXCLUSIVE 0x0080 #define PERF_ATTACH_CALLCHAIN 0x0100 #define PERF_ATTACH_ITRACE 0x0200 struct bpf_prog; struct perf_cgroup; struct perf_buffer; struct pmu_event_list { raw_spinlock_t lock; struct list_head list; }; /* * event->sibling_list is modified whole holding both ctx->lock and ctx->mutex * as such iteration must hold either lock. However, since ctx->lock is an IRQ * safe lock, and is only held by the CPU doing the modification, having IRQs * disabled is sufficient since it will hold-off the IPIs. */ #ifdef CONFIG_PROVE_LOCKING # define lockdep_assert_event_ctx(event) \ WARN_ON_ONCE(__lockdep_enabled && \ (this_cpu_read(hardirqs_enabled) && \ lockdep_is_held(&(event)->ctx->mutex) != LOCK_STATE_HELD)) #else # define lockdep_assert_event_ctx(event) #endif #define for_each_sibling_event(sibling, event) \ lockdep_assert_event_ctx(event); \ if ((event)->group_leader == (event)) \ list_for_each_entry((sibling), &(event)->sibling_list, sibling_list) /** * struct perf_event - performance event kernel representation: */ struct perf_event { #ifdef CONFIG_PERF_EVENTS /* * entry onto perf_event_context::event_list; * modifications require ctx->lock * RCU safe iterations. */ struct list_head event_entry; /* * Locked for modification by both ctx->mutex and ctx->lock; holding * either sufficies for read. */ struct list_head sibling_list; struct list_head active_list; /* * Node on the pinned or flexible tree located at the event context; */ struct rb_node group_node; u64 group_index; /* * We need storage to track the entries in perf_pmu_migrate_context; we * cannot use the event_entry because of RCU and we want to keep the * group in tact which avoids us using the other two entries. */ struct list_head migrate_entry; struct hlist_node hlist_entry; struct list_head active_entry; int nr_siblings; /* Not serialized. Only written during event initialization. */ int event_caps; /* The cumulative AND of all event_caps for events in this group. */ int group_caps; unsigned int group_generation; struct perf_event *group_leader; /* * event->pmu will always point to pmu in which this event belongs. * Whereas event->pmu_ctx->pmu may point to other pmu when group of * different pmu events is created. */ struct pmu *pmu; void *pmu_private; enum perf_event_state state; unsigned int attach_state; local64_t count; atomic64_t child_count; /* * These are the total time in nanoseconds that the event * has been enabled (i.e. eligible to run, and the task has * been scheduled in, if this is a per-task event) * and running (scheduled onto the CPU), respectively. */ u64 total_time_enabled; u64 total_time_running; u64 tstamp; struct perf_event_attr attr; u16 header_size; u16 id_header_size; u16 read_size; struct hw_perf_event hw; struct perf_event_context *ctx; /* * event->pmu_ctx points to perf_event_pmu_context in which the event * is added. This pmu_ctx can be of other pmu for sw event when that * sw event is part of a group which also contains non-sw events. */ struct perf_event_pmu_context *pmu_ctx; atomic_long_t refcount; /* * These accumulate total time (in nanoseconds) that children * events have been enabled and running, respectively. */ atomic64_t child_total_time_enabled; atomic64_t child_total_time_running; /* * Protect attach/detach and child_list: */ struct mutex child_mutex; struct list_head child_list; struct perf_event *parent; int oncpu; int cpu; struct list_head owner_entry; struct task_struct *owner; /* mmap bits */ struct mutex mmap_mutex; atomic_t mmap_count; struct perf_buffer *rb; struct list_head rb_entry; unsigned long rcu_batches; int rcu_pending; /* poll related */ wait_queue_head_t waitq; struct fasync_struct *fasync; /* delayed work for NMIs and such */ unsigned int pending_wakeup; unsigned int pending_kill; unsigned int pending_disable; unsigned long pending_addr; /* SIGTRAP */ struct irq_work pending_irq; struct irq_work pending_disable_irq; struct callback_head pending_task; unsigned int pending_work; atomic_t event_limit; /* address range filters */ struct perf_addr_filters_head addr_filters; /* vma address array for file-based filders */ struct perf_addr_filter_range *addr_filter_ranges; unsigned long addr_filters_gen; /* for aux_output events */ struct perf_event *aux_event; void (*destroy)(struct perf_event *); struct rcu_head rcu_head; struct pid_namespace *ns; u64 id; atomic64_t lost_samples; u64 (*clock)(void); perf_overflow_handler_t overflow_handler; void *overflow_handler_context; struct bpf_prog *prog; u64 bpf_cookie; #ifdef CONFIG_EVENT_TRACING struct trace_event_call *tp_event; struct event_filter *filter; # ifdef CONFIG_FUNCTION_TRACER struct ftrace_ops ftrace_ops; # endif #endif #ifdef CONFIG_CGROUP_PERF struct perf_cgroup *cgrp; /* cgroup event is attach to */ #endif #ifdef CONFIG_SECURITY void *security; #endif struct list_head sb_list; struct list_head pmu_list; /* * Certain events gets forwarded to another pmu internally by over- * writing kernel copy of event->attr.type without user being aware * of it. event->orig_type contains original 'type' requested by * user. */ u32 orig_type; #endif /* CONFIG_PERF_EVENTS */ }; /* * ,-----------------------[1:n]------------------------. * V V * perf_event_context <-[1:n]-> perf_event_pmu_context <-[1:n]- perf_event * | | * `--[n:1]-> pmu <-[1:n]--' * * * struct perf_event_pmu_context lifetime is refcount based and RCU freed * (similar to perf_event_context). Locking is as if it were a member of * perf_event_context; specifically: * * modification, both: ctx->mutex && ctx->lock * reading, either: ctx->mutex || ctx->lock * * There is one exception to this; namely put_pmu_ctx() isn't always called * with ctx->mutex held; this means that as long as we can guarantee the epc * has events the above rules hold. * * Specificially, sys_perf_event_open()'s group_leader case depends on * ctx->mutex pinning the configuration. Since we hold a reference on * group_leader (through the filedesc) it can't go away, therefore it's * associated pmu_ctx must exist and cannot change due to ctx->mutex. * * perf_event holds a refcount on perf_event_context * perf_event holds a refcount on perf_event_pmu_context */ struct perf_event_pmu_context { struct pmu *pmu; struct perf_event_context *ctx; struct list_head pmu_ctx_entry; struct list_head pinned_active; struct list_head flexible_active; /* Used to identify the per-cpu perf_event_pmu_context */ unsigned int embedded : 1; unsigned int nr_events; unsigned int nr_cgroups; unsigned int nr_freq; atomic_t refcount; /* event <-> epc */ struct rcu_head rcu_head; /* * Set when one or more (plausibly active) event can't be scheduled * due to pmu overcommit or pmu constraints, except tolerant to * events not necessary to be active due to scheduling constraints, * such as cgroups. */ int rotate_necessary; }; static inline bool perf_pmu_ctx_is_active(struct perf_event_pmu_context *epc) { return !list_empty(&epc->flexible_active) || !list_empty(&epc->pinned_active); } struct perf_event_groups { struct rb_root tree; u64 index; }; /** * struct perf_event_context - event context structure * * Used as a container for task events and CPU events as well: */ struct perf_event_context { /* * Protect the states of the events in the list, * nr_active, and the list: */ raw_spinlock_t lock; /* * Protect the list of events. Locking either mutex or lock * is sufficient to ensure the list doesn't change; to change * the list you need to lock both the mutex and the spinlock. */ struct mutex mutex; struct list_head pmu_ctx_list; struct perf_event_groups pinned_groups; struct perf_event_groups flexible_groups; struct list_head event_list; int nr_events; int nr_user; int is_active; int nr_stat; int nr_freq; int rotate_disable; refcount_t refcount; /* event <-> ctx */ struct task_struct *task; /* * Context clock, runs when context enabled. */ u64 time; u64 timestamp; u64 timeoffset; /* * These fields let us detect when two contexts have both * been cloned (inherited) from a common ancestor. */ struct perf_event_context *parent_ctx; u64 parent_gen; u64 generation; int pin_count; #ifdef CONFIG_CGROUP_PERF int nr_cgroups; /* cgroup evts */ #endif struct rcu_head rcu_head; /* * The count of events for which using the switch-out fast path * should be avoided. * * Sum (event->pending_work + events with * (attr->inherit && (attr->sample_type & PERF_SAMPLE_READ))) * * The SIGTRAP is targeted at ctx->task, as such it won't do changing * that until the signal is delivered. */ local_t nr_no_switch_fast; }; /** * struct perf_ctx_data - PMU specific data for a task * @rcu_head: To avoid the race on free PMU specific data * @refcount: To track users * @global: To track system-wide users * @ctx_cache: Kmem cache of PMU specific data * @data: PMU specific data * * Currently, the struct is only used in Intel LBR call stack mode to * save/restore the call stack of a task on context switches. * * The rcu_head is used to prevent the race on free the data. * The data only be allocated when Intel LBR call stack mode is enabled. * The data will be freed when the mode is disabled. * The content of the data will only be accessed in context switch, which * should be protected by rcu_read_lock(). * * Because of the alignment requirement of Intel Arch LBR, the Kmem cache * is used to allocate the PMU specific data. The ctx_cache is to track * the Kmem cache. * * Careful: Struct perf_ctx_data is added as a pointer in struct task_struct. * When system-wide Intel LBR call stack mode is enabled, a buffer with * constant size will be allocated for each task. * Also, system memory consumption can further grow when the size of * struct perf_ctx_data enlarges. */ struct perf_ctx_data { struct rcu_head rcu_head; refcount_t refcount; int global; struct kmem_cache *ctx_cache; void *data; }; struct perf_cpu_pmu_context { struct perf_event_pmu_context epc; struct perf_event_pmu_context *task_epc; struct list_head sched_cb_entry; int sched_cb_usage; int active_oncpu; int exclusive; int pmu_disable_count; raw_spinlock_t hrtimer_lock; struct hrtimer hrtimer; ktime_t hrtimer_interval; unsigned int hrtimer_active; }; /** * struct perf_event_cpu_context - per cpu event context structure */ struct perf_cpu_context { struct perf_event_context ctx; struct perf_event_context *task_ctx; int online; #ifdef CONFIG_CGROUP_PERF struct perf_cgroup *cgrp; #endif /* * Per-CPU storage for iterators used in visit_groups_merge. The default * storage is of size 2 to hold the CPU and any CPU event iterators. */ int heap_size; struct perf_event **heap; struct perf_event *heap_default[2]; }; struct perf_output_handle { struct perf_event *event; struct perf_buffer *rb; unsigned long wakeup; unsigned long size; union { u64 flags; /* perf_output*() */ u64 aux_flags; /* perf_aux_output*() */ struct { u64 skip_read : 1; }; }; union { void *addr; unsigned long head; }; int page; }; struct bpf_perf_event_data_kern { bpf_user_pt_regs_t *regs; struct perf_sample_data *data; struct perf_event *event; }; #ifdef CONFIG_CGROUP_PERF /* * perf_cgroup_info keeps track of time_enabled for a cgroup. * This is a per-cpu dynamically allocated data structure. */ struct perf_cgroup_info { u64 time; u64 timestamp; u64 timeoffset; int active; }; struct perf_cgroup { struct cgroup_subsys_state css; struct perf_cgroup_info __percpu *info; }; /* * Must ensure cgroup is pinned (css_get) before calling * this function. In other words, we cannot call this function * if there is no cgroup event for the current CPU context. */ static inline struct perf_cgroup * perf_cgroup_from_task(struct task_struct *task, struct perf_event_context *ctx) { return container_of(task_css_check(task, perf_event_cgrp_id, ctx ? lockdep_is_held(&ctx->lock) : true), struct perf_cgroup, css); } #endif /* CONFIG_CGROUP_PERF */ #ifdef CONFIG_PERF_EVENTS extern struct perf_event_context *perf_cpu_task_ctx(void); extern void *perf_aux_output_begin(struct perf_output_handle *handle, struct perf_event *event); extern void perf_aux_output_end(struct perf_output_handle *handle, unsigned long size); extern int perf_aux_output_skip(struct perf_output_handle *handle, unsigned long size); extern void *perf_get_aux(struct perf_output_handle *handle); extern void perf_aux_output_flag(struct perf_output_handle *handle, u64 flags); extern void perf_event_itrace_started(struct perf_event *event); extern int perf_pmu_register(struct pmu *pmu, const char *name, int type); extern int perf_pmu_unregister(struct pmu *pmu); extern void __perf_event_task_sched_in(struct task_struct *prev, struct task_struct *task); extern void __perf_event_task_sched_out(struct task_struct *prev, struct task_struct *next); extern int perf_event_init_task(struct task_struct *child, u64 clone_flags); extern void perf_event_exit_task(struct task_struct *child); extern void perf_event_free_task(struct task_struct *task); extern void perf_event_delayed_put(struct task_struct *task); extern struct file *perf_event_get(unsigned int fd); extern const struct perf_event *perf_get_event(struct file *file); extern const struct perf_event_attr *perf_event_attrs(struct perf_event *event); extern void perf_event_print_debug(void); extern void perf_pmu_disable(struct pmu *pmu); extern void perf_pmu_enable(struct pmu *pmu); extern void perf_sched_cb_dec(struct pmu *pmu); extern void perf_sched_cb_inc(struct pmu *pmu); extern int perf_event_task_disable(void); extern int perf_event_task_enable(void); extern void perf_pmu_resched(struct pmu *pmu); extern int perf_event_refresh(struct perf_event *event, int refresh); extern void perf_event_update_userpage(struct perf_event *event); extern int perf_event_release_kernel(struct perf_event *event); extern struct perf_event * perf_event_create_kernel_counter(struct perf_event_attr *attr, int cpu, struct task_struct *task, perf_overflow_handler_t callback, void *context); extern void perf_pmu_migrate_context(struct pmu *pmu, int src_cpu, int dst_cpu); extern int perf_event_read_local(struct perf_event *event, u64 *value, u64 *enabled, u64 *running); extern u64 perf_event_read_value(struct perf_event *event, u64 *enabled, u64 *running); extern struct perf_callchain_entry *perf_callchain(struct perf_event *event, struct pt_regs *regs); static inline bool branch_sample_no_flags(const struct perf_event *event) { return event->attr.branch_sample_type & PERF_SAMPLE_BRANCH_NO_FLAGS; } static inline bool branch_sample_no_cycles(const struct perf_event *event) { return event->attr.branch_sample_type & PERF_SAMPLE_BRANCH_NO_CYCLES; } static inline bool branch_sample_type(const struct perf_event *event) { return event->attr.branch_sample_type & PERF_SAMPLE_BRANCH_TYPE_SAVE; } static inline bool branch_sample_hw_index(const struct perf_event *event) { return event->attr.branch_sample_type & PERF_SAMPLE_BRANCH_HW_INDEX; } static inline bool branch_sample_priv(const struct perf_event *event) { return event->attr.branch_sample_type & PERF_SAMPLE_BRANCH_PRIV_SAVE; } static inline bool branch_sample_counters(const struct perf_event *event) { return event->attr.branch_sample_type & PERF_SAMPLE_BRANCH_COUNTERS; } static inline bool branch_sample_call_stack(const struct perf_event *event) { return event->attr.branch_sample_type & PERF_SAMPLE_BRANCH_CALL_STACK; } struct perf_sample_data { /* * Fields set by perf_sample_data_init() unconditionally, * group so as to minimize the cachelines touched. */ u64 sample_flags; u64 period; u64 dyn_size; /* * Fields commonly set by __perf_event_header__init_id(), * group so as to minimize the cachelines touched. */ u64 type; struct { u32 pid; u32 tid; } tid_entry; u64 time; u64 id; struct { u32 cpu; u32 reserved; } cpu_entry; /* * The other fields, optionally {set,used} by * perf_{prepare,output}_sample(). */ u64 ip; struct perf_callchain_entry *callchain; struct perf_raw_record *raw; struct perf_branch_stack *br_stack; u64 *br_stack_cntr; union perf_sample_weight weight; union perf_mem_data_src data_src; u64 txn; struct perf_regs regs_user; struct perf_regs regs_intr; u64 stack_user_size; u64 stream_id; u64 cgroup; u64 addr; u64 phys_addr; u64 data_page_size; u64 code_page_size; u64 aux_size; } ____cacheline_aligned; /* default value for data source */ #define PERF_MEM_NA (PERF_MEM_S(OP, NA) |\ PERF_MEM_S(LVL, NA) |\ PERF_MEM_S(SNOOP, NA) |\ PERF_MEM_S(LOCK, NA) |\ PERF_MEM_S(TLB, NA) |\ PERF_MEM_S(LVLNUM, NA)) static inline void perf_sample_data_init(struct perf_sample_data *data, u64 addr, u64 period) { /* remaining struct members initialized in perf_prepare_sample() */ data->sample_flags = PERF_SAMPLE_PERIOD; data->period = period; data->dyn_size = 0; if (addr) { data->addr = addr; data->sample_flags |= PERF_SAMPLE_ADDR; } } static inline void perf_sample_save_callchain(struct perf_sample_data *data, struct perf_event *event, struct pt_regs *regs) { int size = 1; if (!(event->attr.sample_type & PERF_SAMPLE_CALLCHAIN)) return; if (WARN_ON_ONCE(data->sample_flags & PERF_SAMPLE_CALLCHAIN)) return; data->callchain = perf_callchain(event, regs); size += data->callchain->nr; data->dyn_size += size * sizeof(u64); data->sample_flags |= PERF_SAMPLE_CALLCHAIN; } static inline void perf_sample_save_raw_data(struct perf_sample_data *data, struct perf_event *event, struct perf_raw_record *raw) { struct perf_raw_frag *frag = &raw->frag; u32 sum = 0; int size; if (!(event->attr.sample_type & PERF_SAMPLE_RAW)) return; if (WARN_ON_ONCE(data->sample_flags & PERF_SAMPLE_RAW)) return; do { sum += frag->size; if (perf_raw_frag_last(frag)) break; frag = frag->next; } while (1); size = round_up(sum + sizeof(u32), sizeof(u64)); raw->size = size - sizeof(u32); frag->pad = raw->size - sum; data->raw = raw; data->dyn_size += size; data->sample_flags |= PERF_SAMPLE_RAW; } static inline bool has_branch_stack(struct perf_event *event) { return event->attr.sample_type & PERF_SAMPLE_BRANCH_STACK; } static inline void perf_sample_save_brstack(struct perf_sample_data *data, struct perf_event *event, struct perf_branch_stack *brs, u64 *brs_cntr) { int size = sizeof(u64); /* nr */ if (!has_branch_stack(event)) return; if (WARN_ON_ONCE(data->sample_flags & PERF_SAMPLE_BRANCH_STACK)) return; if (branch_sample_hw_index(event)) size += sizeof(u64); brs->nr = min_t(u16, event->attr.sample_max_stack, brs->nr); size += brs->nr * sizeof(struct perf_branch_entry); /* * The extension space for counters is appended after the * struct perf_branch_stack. It is used to store the occurrences * of events of each branch. */ if (brs_cntr) size += brs->nr * sizeof(u64); data->br_stack = brs; data->br_stack_cntr = brs_cntr; data->dyn_size += size; data->sample_flags |= PERF_SAMPLE_BRANCH_STACK; } static inline u32 perf_sample_data_size(struct perf_sample_data *data, struct perf_event *event) { u32 size = sizeof(struct perf_event_header); size += event->header_size + event->id_header_size; size += data->dyn_size; return size; } /* * Clear all bitfields in the perf_branch_entry. * The to and from fields are not cleared because they are * systematically modified by caller. */ static inline void perf_clear_branch_entry_bitfields(struct perf_branch_entry *br) { br->mispred = 0; br->predicted = 0; br->in_tx = 0; br->abort = 0; br->cycles = 0; br->type = 0; br->spec = PERF_BR_SPEC_NA; br->reserved = 0; } extern void perf_output_sample(struct perf_output_handle *handle, struct perf_event_header *header, struct perf_sample_data *data, struct perf_event *event); extern void perf_prepare_sample(struct perf_sample_data *data, struct perf_event *event, struct pt_regs *regs); extern void perf_prepare_header(struct perf_event_header *header, struct perf_sample_data *data, struct perf_event *event, struct pt_regs *regs); extern int perf_event_overflow(struct perf_event *event, struct perf_sample_data *data, struct pt_regs *regs); extern void perf_event_output_forward(struct perf_event *event, struct perf_sample_data *data, struct pt_regs *regs); extern void perf_event_output_backward(struct perf_event *event, struct perf_sample_data *data, struct pt_regs *regs); extern int perf_event_output(struct perf_event *event, struct perf_sample_data *data, struct pt_regs *regs); static inline bool is_default_overflow_handler(struct perf_event *event) { perf_overflow_handler_t overflow_handler = event->overflow_handler; if (likely(overflow_handler == perf_event_output_forward)) return true; if (unlikely(overflow_handler == perf_event_output_backward)) return true; return false; } extern void perf_event_header__init_id(struct perf_event_header *header, struct perf_sample_data *data, struct perf_event *event); extern void perf_event__output_id_sample(struct perf_event *event, struct perf_output_handle *handle, struct perf_sample_data *sample); extern void perf_log_lost_samples(struct perf_event *event, u64 lost); static inline bool event_has_any_exclude_flag(struct perf_event *event) { struct perf_event_attr *attr = &event->attr; return attr->exclude_idle || attr->exclude_user || attr->exclude_kernel || attr->exclude_hv || attr->exclude_guest || attr->exclude_host; } static inline bool is_sampling_event(struct perf_event *event) { return event->attr.sample_period != 0; } /* * Return 1 for a software event, 0 for a hardware event */ static inline int is_software_event(struct perf_event *event) { return event->event_caps & PERF_EV_CAP_SOFTWARE; } /* * Return 1 for event in sw context, 0 for event in hw context */ static inline int in_software_context(struct perf_event *event) { return event->pmu_ctx->pmu->task_ctx_nr == perf_sw_context; } static inline int is_exclusive_pmu(struct pmu *pmu) { return pmu->capabilities & PERF_PMU_CAP_EXCLUSIVE; } extern struct static_key perf_swevent_enabled[PERF_COUNT_SW_MAX]; extern void ___perf_sw_event(u32, u64, struct pt_regs *, u64); extern void __perf_sw_event(u32, u64, struct pt_regs *, u64); #ifndef perf_arch_fetch_caller_regs static inline void perf_arch_fetch_caller_regs(struct pt_regs *regs, unsigned long ip) { } #endif /* * When generating a perf sample in-line, instead of from an interrupt / * exception, we lack a pt_regs. This is typically used from software events * like: SW_CONTEXT_SWITCHES, SW_MIGRATIONS and the tie-in with tracepoints. * * We typically don't need a full set, but (for x86) do require: * - ip for PERF_SAMPLE_IP * - cs for user_mode() tests * - sp for PERF_SAMPLE_CALLCHAIN * - eflags for MISC bits and CALLCHAIN (see: perf_hw_regs()) * * NOTE: assumes @regs is otherwise already 0 filled; this is important for * things like PERF_SAMPLE_REGS_INTR. */ static inline void perf_fetch_caller_regs(struct pt_regs *regs) { perf_arch_fetch_caller_regs(regs, CALLER_ADDR0); } static __always_inline void perf_sw_event(u32 event_id, u64 nr, struct pt_regs *regs, u64 addr) { if (static_key_false(&perf_swevent_enabled[event_id])) __perf_sw_event(event_id, nr, regs, addr); } DECLARE_PER_CPU(struct pt_regs, __perf_regs[4]); /* * 'Special' version for the scheduler, it hard assumes no recursion, * which is guaranteed by us not actually scheduling inside other swevents * because those disable preemption. */ static __always_inline void __perf_sw_event_sched(u32 event_id, u64 nr, u64 addr) { struct pt_regs *regs = this_cpu_ptr(&__perf_regs[0]); perf_fetch_caller_regs(regs); ___perf_sw_event(event_id, nr, regs, addr); } extern struct static_key_false perf_sched_events; static __always_inline bool __perf_sw_enabled(int swevt) { return static_key_false(&perf_swevent_enabled[swevt]); } static inline void perf_event_task_migrate(struct task_struct *task) { if (__perf_sw_enabled(PERF_COUNT_SW_CPU_MIGRATIONS)) task->sched_migrated = 1; } static inline void perf_event_task_sched_in(struct task_struct *prev, struct task_struct *task) { if (static_branch_unlikely(&perf_sched_events)) __perf_event_task_sched_in(prev, task); if (__perf_sw_enabled(PERF_COUNT_SW_CPU_MIGRATIONS) && task->sched_migrated) { __perf_sw_event_sched(PERF_COUNT_SW_CPU_MIGRATIONS, 1, 0); task->sched_migrated = 0; } } static inline void perf_event_task_sched_out(struct task_struct *prev, struct task_struct *next) { if (__perf_sw_enabled(PERF_COUNT_SW_CONTEXT_SWITCHES)) __perf_sw_event_sched(PERF_COUNT_SW_CONTEXT_SWITCHES, 1, 0); #ifdef CONFIG_CGROUP_PERF if (__perf_sw_enabled(PERF_COUNT_SW_CGROUP_SWITCHES) && perf_cgroup_from_task(prev, NULL) != perf_cgroup_from_task(next, NULL)) __perf_sw_event_sched(PERF_COUNT_SW_CGROUP_SWITCHES, 1, 0); #endif if (static_branch_unlikely(&perf_sched_events)) __perf_event_task_sched_out(prev, next); } extern void perf_event_mmap(struct vm_area_struct *vma); extern void perf_event_ksymbol(u16 ksym_type, u64 addr, u32 len, bool unregister, const char *sym); extern void perf_event_bpf_event(struct bpf_prog *prog, enum perf_bpf_event_type type, u16 flags); #define PERF_GUEST_ACTIVE 0x01 #define PERF_GUEST_USER 0x02 struct perf_guest_info_callbacks { unsigned int (*state)(void); unsigned long (*get_ip)(void); unsigned int (*handle_intel_pt_intr)(void); }; #ifdef CONFIG_GUEST_PERF_EVENTS extern struct perf_guest_info_callbacks __rcu *perf_guest_cbs; DECLARE_STATIC_CALL(__perf_guest_state, *perf_guest_cbs->state); DECLARE_STATIC_CALL(__perf_guest_get_ip, *perf_guest_cbs->get_ip); DECLARE_STATIC_CALL(__perf_guest_handle_intel_pt_intr, *perf_guest_cbs->handle_intel_pt_intr); static inline unsigned int perf_guest_state(void) { return static_call(__perf_guest_state)(); } static inline unsigned long perf_guest_get_ip(void) { return static_call(__perf_guest_get_ip)(); } static inline unsigned int perf_guest_handle_intel_pt_intr(void) { return static_call(__perf_guest_handle_intel_pt_intr)(); } extern void perf_register_guest_info_callbacks(struct perf_guest_info_callbacks *cbs); extern void perf_unregister_guest_info_callbacks(struct perf_guest_info_callbacks *cbs); #else /* !CONFIG_GUEST_PERF_EVENTS: */ static inline unsigned int perf_guest_state(void) { return 0; } static inline unsigned long perf_guest_get_ip(void) { return 0; } static inline unsigned int perf_guest_handle_intel_pt_intr(void) { return 0; } #endif /* !CONFIG_GUEST_PERF_EVENTS */ extern void perf_event_exec(void); extern void perf_event_comm(struct task_struct *tsk, bool exec); extern void perf_event_namespaces(struct task_struct *tsk); extern void perf_event_fork(struct task_struct *tsk); extern void perf_event_text_poke(const void *addr, const void *old_bytes, size_t old_len, const void *new_bytes, size_t new_len); /* Callchains */ DECLARE_PER_CPU(struct perf_callchain_entry, perf_callchain_entry); extern void perf_callchain_user(struct perf_callchain_entry_ctx *entry, struct pt_regs *regs); extern void perf_callchain_kernel(struct perf_callchain_entry_ctx *entry, struct pt_regs *regs); extern struct perf_callchain_entry * get_perf_callchain(struct pt_regs *regs, u32 init_nr, bool kernel, bool user, u32 max_stack, bool crosstask, bool add_mark); extern int get_callchain_buffers(int max_stack); extern void put_callchain_buffers(void); extern struct perf_callchain_entry *get_callchain_entry(int *rctx); extern void put_callchain_entry(int rctx); extern int sysctl_perf_event_max_stack; extern int sysctl_perf_event_max_contexts_per_stack; static inline int perf_callchain_store_context(struct perf_callchain_entry_ctx *ctx, u64 ip) { if (ctx->contexts < sysctl_perf_event_max_contexts_per_stack) { struct perf_callchain_entry *entry = ctx->entry; entry->ip[entry->nr++] = ip; ++ctx->contexts; return 0; } else { ctx->contexts_maxed = true; return -1; /* no more room, stop walking the stack */ } } static inline int perf_callchain_store(struct perf_callchain_entry_ctx *ctx, u64 ip) { if (ctx->nr < ctx->max_stack && !ctx->contexts_maxed) { struct perf_callchain_entry *entry = ctx->entry; entry->ip[entry->nr++] = ip; ++ctx->nr; return 0; } else { return -1; /* no more room, stop walking the stack */ } } extern int sysctl_perf_event_paranoid; extern int sysctl_perf_event_sample_rate; extern void perf_sample_event_took(u64 sample_len_ns); /* Access to perf_event_open(2) syscall. */ #define PERF_SECURITY_OPEN 0 /* Finer grained perf_event_open(2) access control. */ #define PERF_SECURITY_CPU 1 #define PERF_SECURITY_KERNEL 2 #define PERF_SECURITY_TRACEPOINT 3 static inline int perf_is_paranoid(void) { return sysctl_perf_event_paranoid > -1; } extern int perf_allow_kernel(void); static inline int perf_allow_cpu(void) { if (sysctl_perf_event_paranoid > 0 && !perfmon_capable()) return -EACCES; return security_perf_event_open(PERF_SECURITY_CPU); } static inline int perf_allow_tracepoint(void) { if (sysctl_perf_event_paranoid > -1 && !perfmon_capable()) return -EPERM; return security_perf_event_open(PERF_SECURITY_TRACEPOINT); } extern int perf_exclude_event(struct perf_event *event, struct pt_regs *regs); extern void perf_event_init(void); extern void perf_tp_event(u16 event_type, u64 count, void *record, int entry_size, struct pt_regs *regs, struct hlist_head *head, int rctx, struct task_struct *task); extern void perf_bp_event(struct perf_event *event, void *data); extern unsigned long perf_misc_flags(struct perf_event *event, struct pt_regs *regs); extern unsigned long perf_instruction_pointer(struct perf_event *event, struct pt_regs *regs); #ifndef perf_arch_misc_flags # define perf_arch_misc_flags(regs) \ (user_mode(regs) ? PERF_RECORD_MISC_USER : PERF_RECORD_MISC_KERNEL) # define perf_arch_instruction_pointer(regs) instruction_pointer(regs) #endif #ifndef perf_arch_bpf_user_pt_regs # define perf_arch_bpf_user_pt_regs(regs) regs #endif #ifndef perf_arch_guest_misc_flags static inline unsigned long perf_arch_guest_misc_flags(struct pt_regs *regs) { unsigned long guest_state = perf_guest_state(); if (!(guest_state & PERF_GUEST_ACTIVE)) return 0; if (guest_state & PERF_GUEST_USER) return PERF_RECORD_MISC_GUEST_USER; else return PERF_RECORD_MISC_GUEST_KERNEL; } # define perf_arch_guest_misc_flags(regs) perf_arch_guest_misc_flags(regs) #endif static inline bool needs_branch_stack(struct perf_event *event) { return event->attr.branch_sample_type != 0; } static inline bool has_aux(struct perf_event *event) { return event->pmu && event->pmu->setup_aux; } static inline bool has_aux_action(struct perf_event *event) { return event->attr.aux_sample_size || event->attr.aux_pause || event->attr.aux_resume; } static inline bool is_write_backward(struct perf_event *event) { return !!event->attr.write_backward; } static inline bool has_addr_filter(struct perf_event *event) { return event->pmu->nr_addr_filters; } /* * An inherited event uses parent's filters */ static inline struct perf_addr_filters_head * perf_event_addr_filters(struct perf_event *event) { struct perf_addr_filters_head *ifh = &event->addr_filters; if (event->parent) ifh = &event->parent->addr_filters; return ifh; } static inline struct fasync_struct **perf_event_fasync(struct perf_event *event) { /* Only the parent has fasync state */ if (event->parent) event = event->parent; return &event->fasync; } extern void perf_event_addr_filters_sync(struct perf_event *event); extern void perf_report_aux_output_id(struct perf_event *event, u64 hw_id); extern int perf_output_begin(struct perf_output_handle *handle, struct perf_sample_data *data, struct perf_event *event, unsigned int size); extern int perf_output_begin_forward(struct perf_output_handle *handle, struct perf_sample_data *data, struct perf_event *event, unsigned int size); extern int perf_output_begin_backward(struct perf_output_handle *handle, struct perf_sample_data *data, struct perf_event *event, unsigned int size); extern void perf_output_end(struct perf_output_handle *handle); extern unsigned int perf_output_copy(struct perf_output_handle *handle, const void *buf, unsigned int len); extern unsigned int perf_output_skip(struct perf_output_handle *handle, unsigned int len); extern long perf_output_copy_aux(struct perf_output_handle *aux_handle, struct perf_output_handle *handle, unsigned long from, unsigned long to); extern int perf_swevent_get_recursion_context(void); extern void perf_swevent_put_recursion_context(int rctx); extern u64 perf_swevent_set_period(struct perf_event *event); extern void perf_event_enable(struct perf_event *event); extern void perf_event_disable(struct perf_event *event); extern void perf_event_disable_local(struct perf_event *event); extern void perf_event_disable_inatomic(struct perf_event *event); extern void perf_event_task_tick(void); extern int perf_event_account_interrupt(struct perf_event *event); extern int perf_event_period(struct perf_event *event, u64 value); extern u64 perf_event_pause(struct perf_event *event, bool reset); #else /* !CONFIG_PERF_EVENTS: */ static inline void * perf_aux_output_begin(struct perf_output_handle *handle, struct perf_event *event) { return NULL; } static inline void perf_aux_output_end(struct perf_output_handle *handle, unsigned long size) { } static inline int perf_aux_output_skip(struct perf_output_handle *handle, unsigned long size) { return -EINVAL; } static inline void * perf_get_aux(struct perf_output_handle *handle) { return NULL; } static inline void perf_event_task_migrate(struct task_struct *task) { } static inline void perf_event_task_sched_in(struct task_struct *prev, struct task_struct *task) { } static inline void perf_event_task_sched_out(struct task_struct *prev, struct task_struct *next) { } static inline int perf_event_init_task(struct task_struct *child, u64 clone_flags) { return 0; } static inline void perf_event_exit_task(struct task_struct *child) { } static inline void perf_event_free_task(struct task_struct *task) { } static inline void perf_event_delayed_put(struct task_struct *task) { } static inline struct file *perf_event_get(unsigned int fd) { return ERR_PTR(-EINVAL); } static inline const struct perf_event *perf_get_event(struct file *file) { return ERR_PTR(-EINVAL); } static inline const struct perf_event_attr *perf_event_attrs(struct perf_event *event) { return ERR_PTR(-EINVAL); } static inline int perf_event_read_local(struct perf_event *event, u64 *value, u64 *enabled, u64 *running) { return -EINVAL; } static inline void perf_event_print_debug(void) { } static inline int perf_event_task_disable(void) { return -EINVAL; } static inline int perf_event_task_enable(void) { return -EINVAL; } static inline int perf_event_refresh(struct perf_event *event, int refresh) { return -EINVAL; } static inline void perf_sw_event(u32 event_id, u64 nr, struct pt_regs *regs, u64 addr) { } static inline void perf_bp_event(struct perf_event *event, void *data) { } static inline void perf_event_mmap(struct vm_area_struct *vma) { } typedef int (perf_ksymbol_get_name_f)(char *name, int name_len, void *data); static inline void perf_event_ksymbol(u16 ksym_type, u64 addr, u32 len, bool unregister, const char *sym) { } static inline void perf_event_bpf_event(struct bpf_prog *prog, enum perf_bpf_event_type type, u16 flags) { } static inline void perf_event_exec(void) { } static inline void perf_event_comm(struct task_struct *tsk, bool exec) { } static inline void perf_event_namespaces(struct task_struct *tsk) { } static inline void perf_event_fork(struct task_struct *tsk) { } static inline void perf_event_text_poke(const void *addr, const void *old_bytes, size_t old_len, const void *new_bytes, size_t new_len) { } static inline void perf_event_init(void) { } static inline int perf_swevent_get_recursion_context(void) { return -1; } static inline void perf_swevent_put_recursion_context(int rctx) { } static inline u64 perf_swevent_set_period(struct perf_event *event) { return 0; } static inline void perf_event_enable(struct perf_event *event) { } static inline void perf_event_disable(struct perf_event *event) { } static inline int __perf_event_disable(void *info) { return -1; } static inline void perf_event_task_tick(void) { } static inline int perf_event_release_kernel(struct perf_event *event) { return 0; } static inline int perf_event_period(struct perf_event *event, u64 value) { return -EINVAL; } static inline u64 perf_event_pause(struct perf_event *event, bool reset) { return 0; } static inline int perf_exclude_event(struct perf_event *event, struct pt_regs *regs) { return 0; } #endif /* !CONFIG_PERF_EVENTS */ #if defined(CONFIG_PERF_EVENTS) && defined(CONFIG_CPU_SUP_INTEL) extern void perf_restore_debug_store(void); #else static inline void perf_restore_debug_store(void) { } #endif #define perf_output_put(handle, x) perf_output_copy((handle), &(x), sizeof(x)) struct perf_pmu_events_attr { struct device_attribute attr; u64 id; const char *event_str; }; struct perf_pmu_events_ht_attr { struct device_attribute attr; u64 id; const char *event_str_ht; const char *event_str_noht; }; struct perf_pmu_events_hybrid_attr { struct device_attribute attr; u64 id; const char *event_str; u64 pmu_type; }; struct perf_pmu_format_hybrid_attr { struct device_attribute attr; u64 pmu_type; }; ssize_t perf_event_sysfs_show(struct device *dev, struct device_attribute *attr, char *page); #define PMU_EVENT_ATTR(_name, _var, _id, _show) \ static struct perf_pmu_events_attr _var = { \ .attr = __ATTR(_name, 0444, _show, NULL), \ .id = _id, \ }; #define PMU_EVENT_ATTR_STRING(_name, _var, _str) \ static struct perf_pmu_events_attr _var = { \ .attr = __ATTR(_name, 0444, perf_event_sysfs_show, NULL), \ .id = 0, \ .event_str = _str, \ }; #define PMU_EVENT_ATTR_ID(_name, _show, _id) \ (&((struct perf_pmu_events_attr[]) { \ { .attr = __ATTR(_name, 0444, _show, NULL), \ .id = _id, } \ })[0].attr.attr) #define PMU_FORMAT_ATTR_SHOW(_name, _format) \ static ssize_t \ _name##_show(struct device *dev, \ struct device_attribute *attr, \ char *page) \ { \ BUILD_BUG_ON(sizeof(_format) >= PAGE_SIZE); \ return sprintf(page, _format "\n"); \ } \ #define PMU_FORMAT_ATTR(_name, _format) \ PMU_FORMAT_ATTR_SHOW(_name, _format) \ \ static struct device_attribute format_attr_##_name = __ATTR_RO(_name) /* Performance counter hotplug functions */ #ifdef CONFIG_PERF_EVENTS extern int perf_event_init_cpu(unsigned int cpu); extern int perf_event_exit_cpu(unsigned int cpu); #else # define perf_event_init_cpu NULL # define perf_event_exit_cpu NULL #endif extern void arch_perf_update_userpage(struct perf_event *event, struct perf_event_mmap_page *userpg, u64 now); /* * Snapshot branch stack on software events. * * Branch stack can be very useful in understanding software events. For * example, when a long function, e.g. sys_perf_event_open, returns an * errno, it is not obvious why the function failed. Branch stack could * provide very helpful information in this type of scenarios. * * On software event, it is necessary to stop the hardware branch recorder * fast. Otherwise, the hardware register/buffer will be flushed with * entries of the triggering event. Therefore, static call is used to * stop the hardware recorder. */ /* * cnt is the number of entries allocated for entries. * Return number of entries copied to . */ typedef int (perf_snapshot_branch_stack_t)(struct perf_branch_entry *entries, unsigned int cnt); DECLARE_STATIC_CALL(perf_snapshot_branch_stack, perf_snapshot_branch_stack_t); #ifndef PERF_NEEDS_LOPWR_CB static inline void perf_lopwr_cb(bool mode) { } #endif #endif /* _LINUX_PERF_EVENT_H */
47 47 47 47 672 666 257 656 669 669 541 542 543 655 651 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 // SPDX-License-Identifier: GPL-2.0 /* * mm/pgtable-generic.c * * Generic pgtable methods declared in linux/pgtable.h * * Copyright (C) 2010 Linus Torvalds */ #include <linux/pagemap.h> #include <linux/hugetlb.h> #include <linux/pgtable.h> #include <linux/swap.h> #include <linux/swapops.h> #include <linux/mm_inline.h> #include <asm/pgalloc.h> #include <asm/tlb.h> /* * If a p?d_bad entry is found while walking page tables, report * the error, before resetting entry to p?d_none. Usually (but * very seldom) called out from the p?d_none_or_clear_bad macros. */ void pgd_clear_bad(pgd_t *pgd) { pgd_ERROR(*pgd); pgd_clear(pgd); } #ifndef __PAGETABLE_P4D_FOLDED void p4d_clear_bad(p4d_t *p4d) { p4d_ERROR(*p4d); p4d_clear(p4d); } #endif #ifndef __PAGETABLE_PUD_FOLDED void pud_clear_bad(pud_t *pud) { pud_ERROR(*pud); pud_clear(pud); } #endif /* * Note that the pmd variant below can't be stub'ed out just as for p4d/pud * above. pmd folding is special and typically pmd_* macros refer to upper * level even when folded */ void pmd_clear_bad(pmd_t *pmd) { pmd_ERROR(*pmd); pmd_clear(pmd); } #ifndef __HAVE_ARCH_PTEP_SET_ACCESS_FLAGS /* * Only sets the access flags (dirty, accessed), as well as write * permission. Furthermore, we know it always gets set to a "more * permissive" setting, which allows most architectures to optimize * this. We return whether the PTE actually changed, which in turn * instructs the caller to do things like update__mmu_cache. This * used to be done in the caller, but sparc needs minor faults to * force that call on sun4c so we changed this macro slightly */ int ptep_set_access_flags(struct vm_area_struct *vma, unsigned long address, pte_t *ptep, pte_t entry, int dirty) { int changed = !pte_same(ptep_get(ptep), entry); if (changed) { set_pte_at(vma->vm_mm, address, ptep, entry); flush_tlb_fix_spurious_fault(vma, address, ptep); } return changed; } #endif #ifndef __HAVE_ARCH_PTEP_CLEAR_YOUNG_FLUSH int ptep_clear_flush_young(struct vm_area_struct *vma, unsigned long address, pte_t *ptep) { int young; young = ptep_test_and_clear_young(vma, address, ptep); if (young) flush_tlb_page(vma, address); return young; } #endif #ifndef __HAVE_ARCH_PTEP_CLEAR_FLUSH pte_t ptep_clear_flush(struct vm_area_struct *vma, unsigned long address, pte_t *ptep) { struct mm_struct *mm = (vma)->vm_mm; pte_t pte; pte = ptep_get_and_clear(mm, address, ptep); if (pte_accessible(mm, pte)) flush_tlb_page(vma, address); return pte; } #endif #ifdef CONFIG_TRANSPARENT_HUGEPAGE #ifndef __HAVE_ARCH_PMDP_SET_ACCESS_FLAGS int pmdp_set_access_flags(struct vm_area_struct *vma, unsigned long address, pmd_t *pmdp, pmd_t entry, int dirty) { int changed = !pmd_same(*pmdp, entry); VM_BUG_ON(address & ~HPAGE_PMD_MASK); if (changed) { set_pmd_at(vma->vm_mm, address, pmdp, entry); flush_pmd_tlb_range(vma, address, address + HPAGE_PMD_SIZE); } return changed; } #endif #ifndef __HAVE_ARCH_PMDP_CLEAR_YOUNG_FLUSH int pmdp_clear_flush_young(struct vm_area_struct *vma, unsigned long address, pmd_t *pmdp) { int young; VM_BUG_ON(address & ~HPAGE_PMD_MASK); young = pmdp_test_and_clear_young(vma, address, pmdp); if (young) flush_pmd_tlb_range(vma, address, address + HPAGE_PMD_SIZE); return young; } #endif #ifndef __HAVE_ARCH_PMDP_HUGE_CLEAR_FLUSH pmd_t pmdp_huge_clear_flush(struct vm_area_struct *vma, unsigned long address, pmd_t *pmdp) { pmd_t pmd; VM_BUG_ON(address & ~HPAGE_PMD_MASK); VM_BUG_ON(pmd_present(*pmdp) && !pmd_trans_huge(*pmdp) && !pmd_devmap(*pmdp)); pmd = pmdp_huge_get_and_clear(vma->vm_mm, address, pmdp); flush_pmd_tlb_range(vma, address, address + HPAGE_PMD_SIZE); return pmd; } #ifdef CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD pud_t pudp_huge_clear_flush(struct vm_area_struct *vma, unsigned long address, pud_t *pudp) { pud_t pud; VM_BUG_ON(address & ~HPAGE_PUD_MASK); VM_BUG_ON(!pud_trans_huge(*pudp) && !pud_devmap(*pudp)); pud = pudp_huge_get_and_clear(vma->vm_mm, address, pudp); flush_pud_tlb_range(vma, address, address + HPAGE_PUD_SIZE); return pud; } #endif #endif #ifndef __HAVE_ARCH_PGTABLE_DEPOSIT void pgtable_trans_huge_deposit(struct mm_struct *mm, pmd_t *pmdp, pgtable_t pgtable) { assert_spin_locked(pmd_lockptr(mm, pmdp)); /* FIFO */ if (!pmd_huge_pte(mm, pmdp)) INIT_LIST_HEAD(&pgtable->lru); else list_add(&pgtable->lru, &pmd_huge_pte(mm, pmdp)->lru); pmd_huge_pte(mm, pmdp) = pgtable; } #endif #ifndef __HAVE_ARCH_PGTABLE_WITHDRAW /* no "address" argument so destroys page coloring of some arch */ pgtable_t pgtable_trans_huge_withdraw(struct mm_struct *mm, pmd_t *pmdp) { pgtable_t pgtable; assert_spin_locked(pmd_lockptr(mm, pmdp)); /* FIFO */ pgtable = pmd_huge_pte(mm, pmdp); pmd_huge_pte(mm, pmdp) = list_first_entry_or_null(&pgtable->lru, struct page, lru); if (pmd_huge_pte(mm, pmdp)) list_del(&pgtable->lru); return pgtable; } #endif #ifndef __HAVE_ARCH_PMDP_INVALIDATE pmd_t pmdp_invalidate(struct vm_area_struct *vma, unsigned long address, pmd_t *pmdp) { VM_WARN_ON_ONCE(!pmd_present(*pmdp)); pmd_t old = pmdp_establish(vma, address, pmdp, pmd_mkinvalid(*pmdp)); flush_pmd_tlb_range(vma, address, address + HPAGE_PMD_SIZE); return old; } #endif #ifndef __HAVE_ARCH_PMDP_INVALIDATE_AD pmd_t pmdp_invalidate_ad(struct vm_area_struct *vma, unsigned long address, pmd_t *pmdp) { VM_WARN_ON_ONCE(!pmd_present(*pmdp)); return pmdp_invalidate(vma, address, pmdp); } #endif #ifndef pmdp_collapse_flush pmd_t pmdp_collapse_flush(struct vm_area_struct *vma, unsigned long address, pmd_t *pmdp) { /* * pmd and hugepage pte format are same. So we could * use the same function. */ pmd_t pmd; VM_BUG_ON(address & ~HPAGE_PMD_MASK); VM_BUG_ON(pmd_trans_huge(*pmdp)); pmd = pmdp_huge_get_and_clear(vma->vm_mm, address, pmdp); /* collapse entails shooting down ptes not pmd */ flush_tlb_range(vma, address, address + HPAGE_PMD_SIZE); return pmd; } #endif /* arch define pte_free_defer in asm/pgalloc.h for its own implementation */ #ifndef pte_free_defer static void pte_free_now(struct rcu_head *head) { struct page *page; page = container_of(head, struct page, rcu_head); pte_free(NULL /* mm not passed and not used */, (pgtable_t)page); } void pte_free_defer(struct mm_struct *mm, pgtable_t pgtable) { struct page *page; page = pgtable; call_rcu(&page->rcu_head, pte_free_now); } #endif /* pte_free_defer */ #endif /* CONFIG_TRANSPARENT_HUGEPAGE */ #if defined(CONFIG_GUP_GET_PXX_LOW_HIGH) && \ (defined(CONFIG_SMP) || defined(CONFIG_PREEMPT_RCU)) /* * See the comment above ptep_get_lockless() in include/linux/pgtable.h: * the barriers in pmdp_get_lockless() cannot guarantee that the value in * pmd_high actually belongs with the value in pmd_low; but holding interrupts * off blocks the TLB flush between present updates, which guarantees that a * successful __pte_offset_map() points to a page from matched halves. */ static unsigned long pmdp_get_lockless_start(void) { unsigned long irqflags; local_irq_save(irqflags); return irqflags; } static void pmdp_get_lockless_end(unsigned long irqflags) { local_irq_restore(irqflags); } #else static unsigned long pmdp_get_lockless_start(void) { return 0; } static void pmdp_get_lockless_end(unsigned long irqflags) { } #endif pte_t *___pte_offset_map(pmd_t *pmd, unsigned long addr, pmd_t *pmdvalp) { unsigned long irqflags; pmd_t pmdval; rcu_read_lock(); irqflags = pmdp_get_lockless_start(); pmdval = pmdp_get_lockless(pmd); pmdp_get_lockless_end(irqflags); if (pmdvalp) *pmdvalp = pmdval; if (unlikely(pmd_none(pmdval) || is_pmd_migration_entry(pmdval))) goto nomap; if (unlikely(pmd_trans_huge(pmdval) || pmd_devmap(pmdval))) goto nomap; if (unlikely(pmd_bad(pmdval))) { pmd_clear_bad(pmd); goto nomap; } return __pte_map(&pmdval, addr); nomap: rcu_read_unlock(); return NULL; } pte_t *pte_offset_map_ro_nolock(struct mm_struct *mm, pmd_t *pmd, unsigned long addr, spinlock_t **ptlp) { pmd_t pmdval; pte_t *pte; pte = __pte_offset_map(pmd, addr, &pmdval); if (likely(pte)) *ptlp = pte_lockptr(mm, &pmdval); return pte; } pte_t *pte_offset_map_rw_nolock(struct mm_struct *mm, pmd_t *pmd, unsigned long addr, pmd_t *pmdvalp, spinlock_t **ptlp) { pte_t *pte; VM_WARN_ON_ONCE(!pmdvalp); pte = __pte_offset_map(pmd, addr, pmdvalp); if (likely(pte)) *ptlp = pte_lockptr(mm, pmdvalp); return pte; } /* * pte_offset_map_lock(mm, pmd, addr, ptlp), and its internal implementation * __pte_offset_map_lock() below, is usually called with the pmd pointer for * addr, reached by walking down the mm's pgd, p4d, pud for addr: either while * holding mmap_lock or vma lock for read or for write; or in truncate or rmap * context, while holding file's i_mmap_lock or anon_vma lock for read (or for * write). In a few cases, it may be used with pmd pointing to a pmd_t already * copied to or constructed on the stack. * * When successful, it returns the pte pointer for addr, with its page table * kmapped if necessary (when CONFIG_HIGHPTE), and locked against concurrent * modification by software, with a pointer to that spinlock in ptlp (in some * configs mm->page_table_lock, in SPLIT_PTLOCK configs a spinlock in table's * struct page). pte_unmap_unlock(pte, ptl) to unlock and unmap afterwards. * * But it is unsuccessful, returning NULL with *ptlp unchanged, if there is no * page table at *pmd: if, for example, the page table has just been removed, * or replaced by the huge pmd of a THP. (When successful, *pmd is rechecked * after acquiring the ptlock, and retried internally if it changed: so that a * page table can be safely removed or replaced by THP while holding its lock.) * * pte_offset_map(pmd, addr), and its internal helper __pte_offset_map() above, * just returns the pte pointer for addr, its page table kmapped if necessary; * or NULL if there is no page table at *pmd. It does not attempt to lock the * page table, so cannot normally be used when the page table is to be updated, * or when entries read must be stable. But it does take rcu_read_lock(): so * that even when page table is racily removed, it remains a valid though empty * and disconnected table. Until pte_unmap(pte) unmaps and rcu_read_unlock()s * afterwards. * * pte_offset_map_ro_nolock(mm, pmd, addr, ptlp), above, is like pte_offset_map(); * but when successful, it also outputs a pointer to the spinlock in ptlp - as * pte_offset_map_lock() does, but in this case without locking it. This helps * the caller to avoid a later pte_lockptr(mm, *pmd), which might by that time * act on a changed *pmd: pte_offset_map_ro_nolock() provides the correct spinlock * pointer for the page table that it returns. Even after grabbing the spinlock, * we might be looking either at a page table that is still mapped or one that * was unmapped and is about to get freed. But for R/O access this is sufficient. * So it is only applicable for read-only cases where any modification operations * to the page table are not allowed even if the corresponding spinlock is held * afterwards. * * pte_offset_map_rw_nolock(mm, pmd, addr, pmdvalp, ptlp), above, is like * pte_offset_map_ro_nolock(); but when successful, it also outputs the pdmval. * It is applicable for may-write cases where any modification operations to the * page table may happen after the corresponding spinlock is held afterwards. * But the users should make sure the page table is stable like checking pte_same() * or checking pmd_same() by using the output pmdval before performing the write * operations. * * Note: "RO" / "RW" expresses the intended semantics, not that the *kmap* will * be read-only/read-write protected. * * Note that free_pgtables(), used after unmapping detached vmas, or when * exiting the whole mm, does not take page table lock before freeing a page * table, and may not use RCU at all: "outsiders" like khugepaged should avoid * pte_offset_map() and co once the vma is detached from mm or mm_users is zero. */ pte_t *__pte_offset_map_lock(struct mm_struct *mm, pmd_t *pmd, unsigned long addr, spinlock_t **ptlp) { spinlock_t *ptl; pmd_t pmdval; pte_t *pte; again: pte = __pte_offset_map(pmd, addr, &pmdval); if (unlikely(!pte)) return pte; ptl = pte_lockptr(mm, &pmdval); spin_lock(ptl); if (likely(pmd_same(pmdval, pmdp_get_lockless(pmd)))) { *ptlp = ptl; return pte; } pte_unmap_unlock(pte, ptl); goto again; }
7 278 55 91 897 289 776 109 65 66 21 38 99 278 138 1 292 190 63 148 148 3 193 193 122 122 122 1 1 1 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 1517 1518 1519 1520 1521 1522 1523 1524 1525 1526 1527 1528 1529 1530 1531 1532 1533 1534 1535 1536 1537 1538 1539 1540 1541 1542 1543 1544 1545 1546 1547 1548 1549 1550 1551 1552 1553 1554 1555 1556 1557 1558 1559 1560 1561 1562 1563 1564 1565 1566 1567 1568 1569 1570 1571 1572 1573 1574 1575 1576 1577 1578 1579 1580 1581 1582 1583 1584 1585 1586 1587 1588 1589 1590 1591 1592 1593 1594 1595 1596 1597 1598 1599 1600 1601 1602 1603 1604 1605 1606 1607 1608 1609 1610 1611 1612 1613 1614 1615 1616 1617 1618 1619 1620 1621 1622 1623 1624 1625 1626 1627 1628 1629 1630 1631 1632 1633 1634 1635 1636 1637 1638 1639 1640 1641 1642 1643 1644 1645 1646 1647 1648 1649 1650 1651 1652 1653 1654 1655 1656 1657 1658 1659 1660 1661 1662 1663 1664 1665 1666 1667 1668 1669 1670 1671 1672 1673 1674 1675 1676 1677 1678 1679 1680 1681 1682 1683 1684 1685 1686 1687 1688 1689 1690 1691 1692 1693 1694 1695 1696 1697 1698 1699 1700 1701 1702 1703 1704 1705 1706 1707 1708 1709 1710 1711 1712 1713 1714 1715 1716 1717 1718 1719 1720 1721 1722 1723 1724 1725 1726 1727 1728 1729 1730 1731 1732 1733 1734 1735 1736 1737 1738 1739 1740 1741 1742 1743 1744 1745 1746 1747 1748 1749 1750 1751 1752 1753 1754 1755 1756 1757 1758 1759 1760 1761 1762 1763 1764 1765 1766 1767 1768 1769 1770 1771 1772 1773 1774 1775 1776 1777 1778 1779 1780 1781 1782 1783 1784 1785 1786 1787 1788 1789 1790 1791 1792 1793 1794 1795 1796 1797 1798 1799 1800 1801 1802 1803 1804 1805 1806 1807 1808 1809 1810 1811 1812 1813 1814 1815 1816 1817 1818 1819 1820 1821 1822 1823 1824 1825 1826 1827 1828 1829 1830 1831 1832 1833 1834 1835 1836 1837 1838 1839 1840 1841 1842 1843 1844 1845 1846 1847 1848 1849 1850 1851 1852 1853 1854 1855 1856 1857 1858 1859 1860 1861 1862 1863 1864 1865 1866 1867 1868 1869 1870 1871 1872 1873 1874 1875 1876 1877 1878 1879 1880 1881 1882 1883 1884 1885 1886 1887 1888 1889 1890 1891 1892 1893 1894 1895 1896 1897 1898 1899 1900 1901 1902 1903 1904 1905 1906 1907 1908 1909 1910 1911 1912 1913 1914 1915 /* SPDX-License-Identifier: GPL-2.0+ */ #ifndef _LINUX_XARRAY_H #define _LINUX_XARRAY_H /* * eXtensible Arrays * Copyright (c) 2017 Microsoft Corporation * Author: Matthew Wilcox <willy@infradead.org> * * See Documentation/core-api/xarray.rst for how to use the XArray. */ #include <linux/bitmap.h> #include <linux/bug.h> #include <linux/compiler.h> #include <linux/err.h> #include <linux/gfp.h> #include <linux/kconfig.h> #include <linux/limits.h> #include <linux/lockdep.h> #include <linux/rcupdate.h> #include <linux/sched/mm.h> #include <linux/spinlock.h> #include <linux/types.h> struct list_lru; /* * The bottom two bits of the entry determine how the XArray interprets * the contents: * * 00: Pointer entry * 10: Internal entry * x1: Value entry or tagged pointer * * Attempting to store internal entries in the XArray is a bug. * * Most internal entries are pointers to the next node in the tree. * The following internal entries have a special meaning: * * 0-62: Sibling entries * 256: Retry entry * 257: Zero entry * * Errors are also represented as internal entries, but use the negative * space (-4094 to -2). They're never stored in the slots array; only * returned by the normal API. */ #define BITS_PER_XA_VALUE (BITS_PER_LONG - 1) /** * xa_mk_value() - Create an XArray entry from an integer. * @v: Value to store in XArray. * * Context: Any context. * Return: An entry suitable for storing in the XArray. */ static inline void *xa_mk_value(unsigned long v) { WARN_ON((long)v < 0); return (void *)((v << 1) | 1); } /** * xa_to_value() - Get value stored in an XArray entry. * @entry: XArray entry. * * Context: Any context. * Return: The value stored in the XArray entry. */ static inline unsigned long xa_to_value(const void *entry) { return (unsigned long)entry >> 1; } /** * xa_is_value() - Determine if an entry is a value. * @entry: XArray entry. * * Context: Any context. * Return: True if the entry is a value, false if it is a pointer. */ static inline bool xa_is_value(const void *entry) { return (unsigned long)entry & 1; } /** * xa_tag_pointer() - Create an XArray entry for a tagged pointer. * @p: Plain pointer. * @tag: Tag value (0, 1 or 3). * * If the user of the XArray prefers, they can tag their pointers instead * of storing value entries. Three tags are available (0, 1 and 3). * These are distinct from the xa_mark_t as they are not replicated up * through the array and cannot be searched for. * * Context: Any context. * Return: An XArray entry. */ static inline void *xa_tag_pointer(void *p, unsigned long tag) { return (void *)((unsigned long)p | tag); } /** * xa_untag_pointer() - Turn an XArray entry into a plain pointer. * @entry: XArray entry. * * If you have stored a tagged pointer in the XArray, call this function * to get the untagged version of the pointer. * * Context: Any context. * Return: A pointer. */ static inline void *xa_untag_pointer(void *entry) { return (void *)((unsigned long)entry & ~3UL); } /** * xa_pointer_tag() - Get the tag stored in an XArray entry. * @entry: XArray entry. * * If you have stored a tagged pointer in the XArray, call this function * to get the tag of that pointer. * * Context: Any context. * Return: A tag. */ static inline unsigned int xa_pointer_tag(void *entry) { return (unsigned long)entry & 3UL; } /* * xa_mk_internal() - Create an internal entry. * @v: Value to turn into an internal entry. * * Internal entries are used for a number of purposes. Entries 0-255 are * used for sibling entries (only 0-62 are used by the current code). 256 * is used for the retry entry. 257 is used for the reserved / zero entry. * Negative internal entries are used to represent errnos. Node pointers * are also tagged as internal entries in some situations. * * Context: Any context. * Return: An XArray internal entry corresponding to this value. */ static inline void *xa_mk_internal(unsigned long v) { return (void *)((v << 2) | 2); } /* * xa_to_internal() - Extract the value from an internal entry. * @entry: XArray entry. * * Context: Any context. * Return: The value which was stored in the internal entry. */ static inline unsigned long xa_to_internal(const void *entry) { return (unsigned long)entry >> 2; } /* * xa_is_internal() - Is the entry an internal entry? * @entry: XArray entry. * * Context: Any context. * Return: %true if the entry is an internal entry. */ static inline bool xa_is_internal(const void *entry) { return ((unsigned long)entry & 3) == 2; } #define XA_ZERO_ENTRY xa_mk_internal(257) /** * xa_is_zero() - Is the entry a zero entry? * @entry: Entry retrieved from the XArray * * The normal API will return NULL as the contents of a slot containing * a zero entry. You can only see zero entries by using the advanced API. * * Return: %true if the entry is a zero entry. */ static inline bool xa_is_zero(const void *entry) { return unlikely(entry == XA_ZERO_ENTRY); } /** * xa_is_err() - Report whether an XArray operation returned an error * @entry: Result from calling an XArray function * * If an XArray operation cannot complete an operation, it will return * a special value indicating an error. This function tells you * whether an error occurred; xa_err() tells you which error occurred. * * Context: Any context. * Return: %true if the entry indicates an error. */ static inline bool xa_is_err(const void *entry) { return unlikely(xa_is_internal(entry) && entry >= xa_mk_internal(-MAX_ERRNO)); } /** * xa_err() - Turn an XArray result into an errno. * @entry: Result from calling an XArray function. * * If an XArray operation cannot complete an operation, it will return * a special pointer value which encodes an errno. This function extracts * the errno from the pointer value, or returns 0 if the pointer does not * represent an errno. * * Context: Any context. * Return: A negative errno or 0. */ static inline int xa_err(void *entry) { /* xa_to_internal() would not do sign extension. */ if (xa_is_err(entry)) return (long)entry >> 2; return 0; } /** * struct xa_limit - Represents a range of IDs. * @min: The lowest ID to allocate (inclusive). * @max: The maximum ID to allocate (inclusive). * * This structure is used either directly or via the XA_LIMIT() macro * to communicate the range of IDs that are valid for allocation. * Three common ranges are predefined for you: * * xa_limit_32b - [0 - UINT_MAX] * * xa_limit_31b - [0 - INT_MAX] * * xa_limit_16b - [0 - USHRT_MAX] */ struct xa_limit { u32 max; u32 min; }; #define XA_LIMIT(_min, _max) (struct xa_limit) { .min = _min, .max = _max } #define xa_limit_32b XA_LIMIT(0, UINT_MAX) #define xa_limit_31b XA_LIMIT(0, INT_MAX) #define xa_limit_16b XA_LIMIT(0, USHRT_MAX) typedef unsigned __bitwise xa_mark_t; #define XA_MARK_0 ((__force xa_mark_t)0U) #define XA_MARK_1 ((__force xa_mark_t)1U) #define XA_MARK_2 ((__force xa_mark_t)2U) #define XA_PRESENT ((__force xa_mark_t)8U) #define XA_MARK_MAX XA_MARK_2 #define XA_FREE_MARK XA_MARK_0 enum xa_lock_type { XA_LOCK_IRQ = 1, XA_LOCK_BH = 2, }; /* * Values for xa_flags. The radix tree stores its GFP flags in the xa_flags, * and we remain compatible with that. */ #define XA_FLAGS_LOCK_IRQ ((__force gfp_t)XA_LOCK_IRQ) #define XA_FLAGS_LOCK_BH ((__force gfp_t)XA_LOCK_BH) #define XA_FLAGS_TRACK_FREE ((__force gfp_t)4U) #define XA_FLAGS_ZERO_BUSY ((__force gfp_t)8U) #define XA_FLAGS_ALLOC_WRAPPED ((__force gfp_t)16U) #define XA_FLAGS_ACCOUNT ((__force gfp_t)32U) #define XA_FLAGS_MARK(mark) ((__force gfp_t)((1U << __GFP_BITS_SHIFT) << \ (__force unsigned)(mark))) /* ALLOC is for a normal 0-based alloc. ALLOC1 is for an 1-based alloc */ #define XA_FLAGS_ALLOC (XA_FLAGS_TRACK_FREE | XA_FLAGS_MARK(XA_FREE_MARK)) #define XA_FLAGS_ALLOC1 (XA_FLAGS_TRACK_FREE | XA_FLAGS_ZERO_BUSY) /** * struct xarray - The anchor of the XArray. * @xa_lock: Lock that protects the contents of the XArray. * * To use the xarray, define it statically or embed it in your data structure. * It is a very small data structure, so it does not usually make sense to * allocate it separately and keep a pointer to it in your data structure. * * You may use the xa_lock to protect your own data structures as well. */ /* * If all of the entries in the array are NULL, @xa_head is a NULL pointer. * If the only non-NULL entry in the array is at index 0, @xa_head is that * entry. If any other entry in the array is non-NULL, @xa_head points * to an @xa_node. */ struct xarray { spinlock_t xa_lock; /* private: The rest of the data structure is not to be used directly. */ gfp_t xa_flags; void __rcu * xa_head; }; #define XARRAY_INIT(name, flags) { \ .xa_lock = __SPIN_LOCK_UNLOCKED(name.xa_lock), \ .xa_flags = flags, \ .xa_head = NULL, \ } /** * DEFINE_XARRAY_FLAGS() - Define an XArray with custom flags. * @name: A string that names your XArray. * @flags: XA_FLAG values. * * This is intended for file scope definitions of XArrays. It declares * and initialises an empty XArray with the chosen name and flags. It is * equivalent to calling xa_init_flags() on the array, but it does the * initialisation at compiletime instead of runtime. */ #define DEFINE_XARRAY_FLAGS(name, flags) \ struct xarray name = XARRAY_INIT(name, flags) /** * DEFINE_XARRAY() - Define an XArray. * @name: A string that names your XArray. * * This is intended for file scope definitions of XArrays. It declares * and initialises an empty XArray with the chosen name. It is equivalent * to calling xa_init() on the array, but it does the initialisation at * compiletime instead of runtime. */ #define DEFINE_XARRAY(name) DEFINE_XARRAY_FLAGS(name, 0) /** * DEFINE_XARRAY_ALLOC() - Define an XArray which allocates IDs starting at 0. * @name: A string that names your XArray. * * This is intended for file scope definitions of allocating XArrays. * See also DEFINE_XARRAY(). */ #define DEFINE_XARRAY_ALLOC(name) DEFINE_XARRAY_FLAGS(name, XA_FLAGS_ALLOC) /** * DEFINE_XARRAY_ALLOC1() - Define an XArray which allocates IDs starting at 1. * @name: A string that names your XArray. * * This is intended for file scope definitions of allocating XArrays. * See also DEFINE_XARRAY(). */ #define DEFINE_XARRAY_ALLOC1(name) DEFINE_XARRAY_FLAGS(name, XA_FLAGS_ALLOC1) void *xa_load(struct xarray *, unsigned long index); void *xa_store(struct xarray *, unsigned long index, void *entry, gfp_t); void *xa_erase(struct xarray *, unsigned long index); void *xa_store_range(struct xarray *, unsigned long first, unsigned long last, void *entry, gfp_t); bool xa_get_mark(struct xarray *, unsigned long index, xa_mark_t); void xa_set_mark(struct xarray *, unsigned long index, xa_mark_t); void xa_clear_mark(struct xarray *, unsigned long index, xa_mark_t); void *xa_find(struct xarray *xa, unsigned long *index, unsigned long max, xa_mark_t) __attribute__((nonnull(2))); void *xa_find_after(struct xarray *xa, unsigned long *index, unsigned long max, xa_mark_t) __attribute__((nonnull(2))); unsigned int xa_extract(struct xarray *, void **dst, unsigned long start, unsigned long max, unsigned int n, xa_mark_t); void xa_destroy(struct xarray *); /** * xa_init_flags() - Initialise an empty XArray with flags. * @xa: XArray. * @flags: XA_FLAG values. * * If you need to initialise an XArray with special flags (eg you need * to take the lock from interrupt context), use this function instead * of xa_init(). * * Context: Any context. */ static inline void xa_init_flags(struct xarray *xa, gfp_t flags) { spin_lock_init(&xa->xa_lock); xa->xa_flags = flags; xa->xa_head = NULL; } /** * xa_init() - Initialise an empty XArray. * @xa: XArray. * * An empty XArray is full of NULL entries. * * Context: Any context. */ static inline void xa_init(struct xarray *xa) { xa_init_flags(xa, 0); } /** * xa_empty() - Determine if an array has any present entries. * @xa: XArray. * * Context: Any context. * Return: %true if the array contains only NULL pointers. */ static inline bool xa_empty(const struct xarray *xa) { return xa->xa_head == NULL; } /** * xa_marked() - Inquire whether any entry in this array has a mark set * @xa: Array * @mark: Mark value * * Context: Any context. * Return: %true if any entry has this mark set. */ static inline bool xa_marked(const struct xarray *xa, xa_mark_t mark) { return xa->xa_flags & XA_FLAGS_MARK(mark); } /** * xa_for_each_range() - Iterate over a portion of an XArray. * @xa: XArray. * @index: Index of @entry. * @entry: Entry retrieved from array. * @start: First index to retrieve from array. * @last: Last index to retrieve from array. * * During the iteration, @entry will have the value of the entry stored * in @xa at @index. You may modify @index during the iteration if you * want to skip or reprocess indices. It is safe to modify the array * during the iteration. At the end of the iteration, @entry will be set * to NULL and @index will have a value less than or equal to max. * * xa_for_each_range() is O(n.log(n)) while xas_for_each() is O(n). You have * to handle your own locking with xas_for_each(), and if you have to unlock * after each iteration, it will also end up being O(n.log(n)). * xa_for_each_range() will spin if it hits a retry entry; if you intend to * see retry entries, you should use the xas_for_each() iterator instead. * The xas_for_each() iterator will expand into more inline code than * xa_for_each_range(). * * Context: Any context. Takes and releases the RCU lock. */ #define xa_for_each_range(xa, index, entry, start, last) \ for (index = start, \ entry = xa_find(xa, &index, last, XA_PRESENT); \ entry; \ entry = xa_find_after(xa, &index, last, XA_PRESENT)) /** * xa_for_each_start() - Iterate over a portion of an XArray. * @xa: XArray. * @index: Index of @entry. * @entry: Entry retrieved from array. * @start: First index to retrieve from array. * * During the iteration, @entry will have the value of the entry stored * in @xa at @index. You may modify @index during the iteration if you * want to skip or reprocess indices. It is safe to modify the array * during the iteration. At the end of the iteration, @entry will be set * to NULL and @index will have a value less than or equal to max. * * xa_for_each_start() is O(n.log(n)) while xas_for_each() is O(n). You have * to handle your own locking with xas_for_each(), and if you have to unlock * after each iteration, it will also end up being O(n.log(n)). * xa_for_each_start() will spin if it hits a retry entry; if you intend to * see retry entries, you should use the xas_for_each() iterator instead. * The xas_for_each() iterator will expand into more inline code than * xa_for_each_start(). * * Context: Any context. Takes and releases the RCU lock. */ #define xa_for_each_start(xa, index, entry, start) \ xa_for_each_range(xa, index, entry, start, ULONG_MAX) /** * xa_for_each() - Iterate over present entries in an XArray. * @xa: XArray. * @index: Index of @entry. * @entry: Entry retrieved from array. * * During the iteration, @entry will have the value of the entry stored * in @xa at @index. You may modify @index during the iteration if you want * to skip or reprocess indices. It is safe to modify the array during the * iteration. At the end of the iteration, @entry will be set to NULL and * @index will have a value less than or equal to max. * * xa_for_each() is O(n.log(n)) while xas_for_each() is O(n). You have * to handle your own locking with xas_for_each(), and if you have to unlock * after each iteration, it will also end up being O(n.log(n)). xa_for_each() * will spin if it hits a retry entry; if you intend to see retry entries, * you should use the xas_for_each() iterator instead. The xas_for_each() * iterator will expand into more inline code than xa_for_each(). * * Context: Any context. Takes and releases the RCU lock. */ #define xa_for_each(xa, index, entry) \ xa_for_each_start(xa, index, entry, 0) /** * xa_for_each_marked() - Iterate over marked entries in an XArray. * @xa: XArray. * @index: Index of @entry. * @entry: Entry retrieved from array. * @filter: Selection criterion. * * During the iteration, @entry will have the value of the entry stored * in @xa at @index. The iteration will skip all entries in the array * which do not match @filter. You may modify @index during the iteration * if you want to skip or reprocess indices. It is safe to modify the array * during the iteration. At the end of the iteration, @entry will be set to * NULL and @index will have a value less than or equal to max. * * xa_for_each_marked() is O(n.log(n)) while xas_for_each_marked() is O(n). * You have to handle your own locking with xas_for_each(), and if you have * to unlock after each iteration, it will also end up being O(n.log(n)). * xa_for_each_marked() will spin if it hits a retry entry; if you intend to * see retry entries, you should use the xas_for_each_marked() iterator * instead. The xas_for_each_marked() iterator will expand into more inline * code than xa_for_each_marked(). * * Context: Any context. Takes and releases the RCU lock. */ #define xa_for_each_marked(xa, index, entry, filter) \ for (index = 0, entry = xa_find(xa, &index, ULONG_MAX, filter); \ entry; entry = xa_find_after(xa, &index, ULONG_MAX, filter)) #define xa_trylock(xa) spin_trylock(&(xa)->xa_lock) #define xa_lock(xa) spin_lock(&(xa)->xa_lock) #define xa_unlock(xa) spin_unlock(&(xa)->xa_lock) #define xa_lock_bh(xa) spin_lock_bh(&(xa)->xa_lock) #define xa_unlock_bh(xa) spin_unlock_bh(&(xa)->xa_lock) #define xa_lock_irq(xa) spin_lock_irq(&(xa)->xa_lock) #define xa_unlock_irq(xa) spin_unlock_irq(&(xa)->xa_lock) #define xa_lock_irqsave(xa, flags) \ spin_lock_irqsave(&(xa)->xa_lock, flags) #define xa_unlock_irqrestore(xa, flags) \ spin_unlock_irqrestore(&(xa)->xa_lock, flags) #define xa_lock_nested(xa, subclass) \ spin_lock_nested(&(xa)->xa_lock, subclass) #define xa_lock_bh_nested(xa, subclass) \ spin_lock_bh_nested(&(xa)->xa_lock, subclass) #define xa_lock_irq_nested(xa, subclass) \ spin_lock_irq_nested(&(xa)->xa_lock, subclass) #define xa_lock_irqsave_nested(xa, flags, subclass) \ spin_lock_irqsave_nested(&(xa)->xa_lock, flags, subclass) /* * Versions of the normal API which require the caller to hold the * xa_lock. If the GFP flags allow it, they will drop the lock to * allocate memory, then reacquire it afterwards. These functions * may also re-enable interrupts if the XArray flags indicate the * locking should be interrupt safe. */ void *__xa_erase(struct xarray *, unsigned long index); void *__xa_store(struct xarray *, unsigned long index, void *entry, gfp_t); void *__xa_cmpxchg(struct xarray *, unsigned long index, void *old, void *entry, gfp_t); int __must_check __xa_insert(struct xarray *, unsigned long index, void *entry, gfp_t); int __must_check __xa_alloc(struct xarray *, u32 *id, void *entry, struct xa_limit, gfp_t); int __must_check __xa_alloc_cyclic(struct xarray *, u32 *id, void *entry, struct xa_limit, u32 *next, gfp_t); void __xa_set_mark(struct xarray *, unsigned long index, xa_mark_t); void __xa_clear_mark(struct xarray *, unsigned long index, xa_mark_t); /** * xa_store_bh() - Store this entry in the XArray. * @xa: XArray. * @index: Index into array. * @entry: New entry. * @gfp: Memory allocation flags. * * This function is like calling xa_store() except it disables softirqs * while holding the array lock. * * Context: Any context. Takes and releases the xa_lock while * disabling softirqs. * Return: The old entry at this index or xa_err() if an error happened. */ static inline void *xa_store_bh(struct xarray *xa, unsigned long index, void *entry, gfp_t gfp) { void *curr; might_alloc(gfp); xa_lock_bh(xa); curr = __xa_store(xa, index, entry, gfp); xa_unlock_bh(xa); return curr; } /** * xa_store_irq() - Store this entry in the XArray. * @xa: XArray. * @index: Index into array. * @entry: New entry. * @gfp: Memory allocation flags. * * This function is like calling xa_store() except it disables interrupts * while holding the array lock. * * Context: Process context. Takes and releases the xa_lock while * disabling interrupts. * Return: The old entry at this index or xa_err() if an error happened. */ static inline void *xa_store_irq(struct xarray *xa, unsigned long index, void *entry, gfp_t gfp) { void *curr; might_alloc(gfp); xa_lock_irq(xa); curr = __xa_store(xa, index, entry, gfp); xa_unlock_irq(xa); return curr; } /** * xa_erase_bh() - Erase this entry from the XArray. * @xa: XArray. * @index: Index of entry. * * After this function returns, loading from @index will return %NULL. * If the index is part of a multi-index entry, all indices will be erased * and none of the entries will be part of a multi-index entry. * * Context: Any context. Takes and releases the xa_lock while * disabling softirqs. * Return: The entry which used to be at this index. */ static inline void *xa_erase_bh(struct xarray *xa, unsigned long index) { void *entry; xa_lock_bh(xa); entry = __xa_erase(xa, index); xa_unlock_bh(xa); return entry; } /** * xa_erase_irq() - Erase this entry from the XArray. * @xa: XArray. * @index: Index of entry. * * After this function returns, loading from @index will return %NULL. * If the index is part of a multi-index entry, all indices will be erased * and none of the entries will be part of a multi-index entry. * * Context: Process context. Takes and releases the xa_lock while * disabling interrupts. * Return: The entry which used to be at this index. */ static inline void *xa_erase_irq(struct xarray *xa, unsigned long index) { void *entry; xa_lock_irq(xa); entry = __xa_erase(xa, index); xa_unlock_irq(xa); return entry; } /** * xa_cmpxchg() - Conditionally replace an entry in the XArray. * @xa: XArray. * @index: Index into array. * @old: Old value to test against. * @entry: New value to place in array. * @gfp: Memory allocation flags. * * If the entry at @index is the same as @old, replace it with @entry. * If the return value is equal to @old, then the exchange was successful. * * Context: Any context. Takes and releases the xa_lock. May sleep * if the @gfp flags permit. * Return: The old value at this index or xa_err() if an error happened. */ static inline void *xa_cmpxchg(struct xarray *xa, unsigned long index, void *old, void *entry, gfp_t gfp) { void *curr; might_alloc(gfp); xa_lock(xa); curr = __xa_cmpxchg(xa, index, old, entry, gfp); xa_unlock(xa); return curr; } /** * xa_cmpxchg_bh() - Conditionally replace an entry in the XArray. * @xa: XArray. * @index: Index into array. * @old: Old value to test against. * @entry: New value to place in array. * @gfp: Memory allocation flags. * * This function is like calling xa_cmpxchg() except it disables softirqs * while holding the array lock. * * Context: Any context. Takes and releases the xa_lock while * disabling softirqs. May sleep if the @gfp flags permit. * Return: The old value at this index or xa_err() if an error happened. */ static inline void *xa_cmpxchg_bh(struct xarray *xa, unsigned long index, void *old, void *entry, gfp_t gfp) { void *curr; might_alloc(gfp); xa_lock_bh(xa); curr = __xa_cmpxchg(xa, index, old, entry, gfp); xa_unlock_bh(xa); return curr; } /** * xa_cmpxchg_irq() - Conditionally replace an entry in the XArray. * @xa: XArray. * @index: Index into array. * @old: Old value to test against. * @entry: New value to place in array. * @gfp: Memory allocation flags. * * This function is like calling xa_cmpxchg() except it disables interrupts * while holding the array lock. * * Context: Process context. Takes and releases the xa_lock while * disabling interrupts. May sleep if the @gfp flags permit. * Return: The old value at this index or xa_err() if an error happened. */ static inline void *xa_cmpxchg_irq(struct xarray *xa, unsigned long index, void *old, void *entry, gfp_t gfp) { void *curr; might_alloc(gfp); xa_lock_irq(xa); curr = __xa_cmpxchg(xa, index, old, entry, gfp); xa_unlock_irq(xa); return curr; } /** * xa_insert() - Store this entry in the XArray unless another entry is * already present. * @xa: XArray. * @index: Index into array. * @entry: New entry. * @gfp: Memory allocation flags. * * Inserting a NULL entry will store a reserved entry (like xa_reserve()) * if no entry is present. Inserting will fail if a reserved entry is * present, even though loading from this index will return NULL. * * Context: Any context. Takes and releases the xa_lock. May sleep if * the @gfp flags permit. * Return: 0 if the store succeeded. -EBUSY if another entry was present. * -ENOMEM if memory could not be allocated. */ static inline int __must_check xa_insert(struct xarray *xa, unsigned long index, void *entry, gfp_t gfp) { int err; might_alloc(gfp); xa_lock(xa); err = __xa_insert(xa, index, entry, gfp); xa_unlock(xa); return err; } /** * xa_insert_bh() - Store this entry in the XArray unless another entry is * already present. * @xa: XArray. * @index: Index into array. * @entry: New entry. * @gfp: Memory allocation flags. * * Inserting a NULL entry will store a reserved entry (like xa_reserve()) * if no entry is present. Inserting will fail if a reserved entry is * present, even though loading from this index will return NULL. * * Context: Any context. Takes and releases the xa_lock while * disabling softirqs. May sleep if the @gfp flags permit. * Return: 0 if the store succeeded. -EBUSY if another entry was present. * -ENOMEM if memory could not be allocated. */ static inline int __must_check xa_insert_bh(struct xarray *xa, unsigned long index, void *entry, gfp_t gfp) { int err; might_alloc(gfp); xa_lock_bh(xa); err = __xa_insert(xa, index, entry, gfp); xa_unlock_bh(xa); return err; } /** * xa_insert_irq() - Store this entry in the XArray unless another entry is * already present. * @xa: XArray. * @index: Index into array. * @entry: New entry. * @gfp: Memory allocation flags. * * Inserting a NULL entry will store a reserved entry (like xa_reserve()) * if no entry is present. Inserting will fail if a reserved entry is * present, even though loading from this index will return NULL. * * Context: Process context. Takes and releases the xa_lock while * disabling interrupts. May sleep if the @gfp flags permit. * Return: 0 if the store succeeded. -EBUSY if another entry was present. * -ENOMEM if memory could not be allocated. */ static inline int __must_check xa_insert_irq(struct xarray *xa, unsigned long index, void *entry, gfp_t gfp) { int err; might_alloc(gfp); xa_lock_irq(xa); err = __xa_insert(xa, index, entry, gfp); xa_unlock_irq(xa); return err; } /** * xa_alloc() - Find somewhere to store this entry in the XArray. * @xa: XArray. * @id: Pointer to ID. * @entry: New entry. * @limit: Range of ID to allocate. * @gfp: Memory allocation flags. * * Finds an empty entry in @xa between @limit.min and @limit.max, * stores the index into the @id pointer, then stores the entry at * that index. A concurrent lookup will not see an uninitialised @id. * * Must only be operated on an xarray initialized with flag XA_FLAGS_ALLOC set * in xa_init_flags(). * * Context: Any context. Takes and releases the xa_lock. May sleep if * the @gfp flags permit. * Return: 0 on success, -ENOMEM if memory could not be allocated or * -EBUSY if there are no free entries in @limit. */ static inline __must_check int xa_alloc(struct xarray *xa, u32 *id, void *entry, struct xa_limit limit, gfp_t gfp) { int err; might_alloc(gfp); xa_lock(xa); err = __xa_alloc(xa, id, entry, limit, gfp); xa_unlock(xa); return err; } /** * xa_alloc_bh() - Find somewhere to store this entry in the XArray. * @xa: XArray. * @id: Pointer to ID. * @entry: New entry. * @limit: Range of ID to allocate. * @gfp: Memory allocation flags. * * Finds an empty entry in @xa between @limit.min and @limit.max, * stores the index into the @id pointer, then stores the entry at * that index. A concurrent lookup will not see an uninitialised @id. * * Must only be operated on an xarray initialized with flag XA_FLAGS_ALLOC set * in xa_init_flags(). * * Context: Any context. Takes and releases the xa_lock while * disabling softirqs. May sleep if the @gfp flags permit. * Return: 0 on success, -ENOMEM if memory could not be allocated or * -EBUSY if there are no free entries in @limit. */ static inline int __must_check xa_alloc_bh(struct xarray *xa, u32 *id, void *entry, struct xa_limit limit, gfp_t gfp) { int err; might_alloc(gfp); xa_lock_bh(xa); err = __xa_alloc(xa, id, entry, limit, gfp); xa_unlock_bh(xa); return err; } /** * xa_alloc_irq() - Find somewhere to store this entry in the XArray. * @xa: XArray. * @id: Pointer to ID. * @entry: New entry. * @limit: Range of ID to allocate. * @gfp: Memory allocation flags. * * Finds an empty entry in @xa between @limit.min and @limit.max, * stores the index into the @id pointer, then stores the entry at * that index. A concurrent lookup will not see an uninitialised @id. * * Must only be operated on an xarray initialized with flag XA_FLAGS_ALLOC set * in xa_init_flags(). * * Context: Process context. Takes and releases the xa_lock while * disabling interrupts. May sleep if the @gfp flags permit. * Return: 0 on success, -ENOMEM if memory could not be allocated or * -EBUSY if there are no free entries in @limit. */ static inline int __must_check xa_alloc_irq(struct xarray *xa, u32 *id, void *entry, struct xa_limit limit, gfp_t gfp) { int err; might_alloc(gfp); xa_lock_irq(xa); err = __xa_alloc(xa, id, entry, limit, gfp); xa_unlock_irq(xa); return err; } /** * xa_alloc_cyclic() - Find somewhere to store this entry in the XArray. * @xa: XArray. * @id: Pointer to ID. * @entry: New entry. * @limit: Range of allocated ID. * @next: Pointer to next ID to allocate. * @gfp: Memory allocation flags. * * Finds an empty entry in @xa between @limit.min and @limit.max, * stores the index into the @id pointer, then stores the entry at * that index. A concurrent lookup will not see an uninitialised @id. * The search for an empty entry will start at @next and will wrap * around if necessary. * * Must only be operated on an xarray initialized with flag XA_FLAGS_ALLOC set * in xa_init_flags(). * * Note that callers interested in whether wrapping has occurred should * use __xa_alloc_cyclic() instead. * * Context: Any context. Takes and releases the xa_lock. May sleep if * the @gfp flags permit. * Return: 0 if the allocation succeeded, -ENOMEM if memory could not be * allocated or -EBUSY if there are no free entries in @limit. */ static inline int xa_alloc_cyclic(struct xarray *xa, u32 *id, void *entry, struct xa_limit limit, u32 *next, gfp_t gfp) { int err; might_alloc(gfp); xa_lock(xa); err = __xa_alloc_cyclic(xa, id, entry, limit, next, gfp); xa_unlock(xa); return err < 0 ? err : 0; } /** * xa_alloc_cyclic_bh() - Find somewhere to store this entry in the XArray. * @xa: XArray. * @id: Pointer to ID. * @entry: New entry. * @limit: Range of allocated ID. * @next: Pointer to next ID to allocate. * @gfp: Memory allocation flags. * * Finds an empty entry in @xa between @limit.min and @limit.max, * stores the index into the @id pointer, then stores the entry at * that index. A concurrent lookup will not see an uninitialised @id. * The search for an empty entry will start at @next and will wrap * around if necessary. * * Must only be operated on an xarray initialized with flag XA_FLAGS_ALLOC set * in xa_init_flags(). * * Note that callers interested in whether wrapping has occurred should * use __xa_alloc_cyclic() instead. * * Context: Any context. Takes and releases the xa_lock while * disabling softirqs. May sleep if the @gfp flags permit. * Return: 0 if the allocation succeeded, -ENOMEM if memory could not be * allocated or -EBUSY if there are no free entries in @limit. */ static inline int xa_alloc_cyclic_bh(struct xarray *xa, u32 *id, void *entry, struct xa_limit limit, u32 *next, gfp_t gfp) { int err; might_alloc(gfp); xa_lock_bh(xa); err = __xa_alloc_cyclic(xa, id, entry, limit, next, gfp); xa_unlock_bh(xa); return err < 0 ? err : 0; } /** * xa_alloc_cyclic_irq() - Find somewhere to store this entry in the XArray. * @xa: XArray. * @id: Pointer to ID. * @entry: New entry. * @limit: Range of allocated ID. * @next: Pointer to next ID to allocate. * @gfp: Memory allocation flags. * * Finds an empty entry in @xa between @limit.min and @limit.max, * stores the index into the @id pointer, then stores the entry at * that index. A concurrent lookup will not see an uninitialised @id. * The search for an empty entry will start at @next and will wrap * around if necessary. * * Must only be operated on an xarray initialized with flag XA_FLAGS_ALLOC set * in xa_init_flags(). * * Note that callers interested in whether wrapping has occurred should * use __xa_alloc_cyclic() instead. * * Context: Process context. Takes and releases the xa_lock while * disabling interrupts. May sleep if the @gfp flags permit. * Return: 0 if the allocation succeeded, -ENOMEM if memory could not be * allocated or -EBUSY if there are no free entries in @limit. */ static inline int xa_alloc_cyclic_irq(struct xarray *xa, u32 *id, void *entry, struct xa_limit limit, u32 *next, gfp_t gfp) { int err; might_alloc(gfp); xa_lock_irq(xa); err = __xa_alloc_cyclic(xa, id, entry, limit, next, gfp); xa_unlock_irq(xa); return err < 0 ? err : 0; } /** * xa_reserve() - Reserve this index in the XArray. * @xa: XArray. * @index: Index into array. * @gfp: Memory allocation flags. * * Ensures there is somewhere to store an entry at @index in the array. * If there is already something stored at @index, this function does * nothing. If there was nothing there, the entry is marked as reserved. * Loading from a reserved entry returns a %NULL pointer. * * If you do not use the entry that you have reserved, call xa_release() * or xa_erase() to free any unnecessary memory. * * Context: Any context. Takes and releases the xa_lock. * May sleep if the @gfp flags permit. * Return: 0 if the reservation succeeded or -ENOMEM if it failed. */ static inline __must_check int xa_reserve(struct xarray *xa, unsigned long index, gfp_t gfp) { return xa_err(xa_cmpxchg(xa, index, NULL, XA_ZERO_ENTRY, gfp)); } /** * xa_reserve_bh() - Reserve this index in the XArray. * @xa: XArray. * @index: Index into array. * @gfp: Memory allocation flags. * * A softirq-disabling version of xa_reserve(). * * Context: Any context. Takes and releases the xa_lock while * disabling softirqs. * Return: 0 if the reservation succeeded or -ENOMEM if it failed. */ static inline __must_check int xa_reserve_bh(struct xarray *xa, unsigned long index, gfp_t gfp) { return xa_err(xa_cmpxchg_bh(xa, index, NULL, XA_ZERO_ENTRY, gfp)); } /** * xa_reserve_irq() - Reserve this index in the XArray. * @xa: XArray. * @index: Index into array. * @gfp: Memory allocation flags. * * An interrupt-disabling version of xa_reserve(). * * Context: Process context. Takes and releases the xa_lock while * disabling interrupts. * Return: 0 if the reservation succeeded or -ENOMEM if it failed. */ static inline __must_check int xa_reserve_irq(struct xarray *xa, unsigned long index, gfp_t gfp) { return xa_err(xa_cmpxchg_irq(xa, index, NULL, XA_ZERO_ENTRY, gfp)); } /** * xa_release() - Release a reserved entry. * @xa: XArray. * @index: Index of entry. * * After calling xa_reserve(), you can call this function to release the * reservation. If the entry at @index has been stored to, this function * will do nothing. */ static inline void xa_release(struct xarray *xa, unsigned long index) { xa_cmpxchg(xa, index, XA_ZERO_ENTRY, NULL, 0); } /* Everything below here is the Advanced API. Proceed with caution. */ /* * The xarray is constructed out of a set of 'chunks' of pointers. Choosing * the best chunk size requires some tradeoffs. A power of two recommends * itself so that we can walk the tree based purely on shifts and masks. * Generally, the larger the better; as the number of slots per level of the * tree increases, the less tall the tree needs to be. But that needs to be * balanced against the memory consumption of each node. On a 64-bit system, * xa_node is currently 576 bytes, and we get 7 of them per 4kB page. If we * doubled the number of slots per node, we'd get only 3 nodes per 4kB page. */ #ifndef XA_CHUNK_SHIFT #define XA_CHUNK_SHIFT (IS_ENABLED(CONFIG_BASE_SMALL) ? 4 : 6) #endif #define XA_CHUNK_SIZE (1UL << XA_CHUNK_SHIFT) #define XA_CHUNK_MASK (XA_CHUNK_SIZE - 1) #define XA_MAX_MARKS 3 #define XA_MARK_LONGS BITS_TO_LONGS(XA_CHUNK_SIZE) /* * @count is the count of every non-NULL element in the ->slots array * whether that is a value entry, a retry entry, a user pointer, * a sibling entry or a pointer to the next level of the tree. * @nr_values is the count of every element in ->slots which is * either a value entry or a sibling of a value entry. */ struct xa_node { unsigned char shift; /* Bits remaining in each slot */ unsigned char offset; /* Slot offset in parent */ unsigned char count; /* Total entry count */ unsigned char nr_values; /* Value entry count */ struct xa_node __rcu *parent; /* NULL at top of tree */ struct xarray *array; /* The array we belong to */ union { struct list_head private_list; /* For tree user */ struct rcu_head rcu_head; /* Used when freeing node */ }; void __rcu *slots[XA_CHUNK_SIZE]; union { unsigned long tags[XA_MAX_MARKS][XA_MARK_LONGS]; unsigned long marks[XA_MAX_MARKS][XA_MARK_LONGS]; }; }; void xa_dump(const struct xarray *); void xa_dump_node(const struct xa_node *); #ifdef XA_DEBUG #define XA_BUG_ON(xa, x) do { \ if (x) { \ xa_dump(xa); \ BUG(); \ } \ } while (0) #define XA_NODE_BUG_ON(node, x) do { \ if (x) { \ if (node) xa_dump_node(node); \ BUG(); \ } \ } while (0) #else #define XA_BUG_ON(xa, x) do { } while (0) #define XA_NODE_BUG_ON(node, x) do { } while (0) #endif /* Private */ static inline void *xa_head(const struct xarray *xa) { return rcu_dereference_check(xa->xa_head, lockdep_is_held(&xa->xa_lock)); } /* Private */ static inline void *xa_head_locked(const struct xarray *xa) { return rcu_dereference_protected(xa->xa_head, lockdep_is_held(&xa->xa_lock)); } /* Private */ static inline void *xa_entry(const struct xarray *xa, const struct xa_node *node, unsigned int offset) { XA_NODE_BUG_ON(node, offset >= XA_CHUNK_SIZE); return rcu_dereference_check(node->slots[offset], lockdep_is_held(&xa->xa_lock)); } /* Private */ static inline void *xa_entry_locked(const struct xarray *xa, const struct xa_node *node, unsigned int offset) { XA_NODE_BUG_ON(node, offset >= XA_CHUNK_SIZE); return rcu_dereference_protected(node->slots[offset], lockdep_is_held(&xa->xa_lock)); } /* Private */ static inline struct xa_node *xa_parent(const struct xarray *xa, const struct xa_node *node) { return rcu_dereference_check(node->parent, lockdep_is_held(&xa->xa_lock)); } /* Private */ static inline struct xa_node *xa_parent_locked(const struct xarray *xa, const struct xa_node *node) { return rcu_dereference_protected(node->parent, lockdep_is_held(&xa->xa_lock)); } /* Private */ static inline void *xa_mk_node(const struct xa_node *node) { return (void *)((unsigned long)node | 2); } /* Private */ static inline struct xa_node *xa_to_node(const void *entry) { return (struct xa_node *)((unsigned long)entry - 2); } /* Private */ static inline bool xa_is_node(const void *entry) { return xa_is_internal(entry) && (unsigned long)entry > 4096; } /* Private */ static inline void *xa_mk_sibling(unsigned int offset) { return xa_mk_internal(offset); } /* Private */ static inline unsigned long xa_to_sibling(const void *entry) { return xa_to_internal(entry); } /** * xa_is_sibling() - Is the entry a sibling entry? * @entry: Entry retrieved from the XArray * * Return: %true if the entry is a sibling entry. */ static inline bool xa_is_sibling(const void *entry) { return IS_ENABLED(CONFIG_XARRAY_MULTI) && xa_is_internal(entry) && (entry < xa_mk_sibling(XA_CHUNK_SIZE - 1)); } #define XA_RETRY_ENTRY xa_mk_internal(256) /** * xa_is_retry() - Is the entry a retry entry? * @entry: Entry retrieved from the XArray * * Return: %true if the entry is a retry entry. */ static inline bool xa_is_retry(const void *entry) { return unlikely(entry == XA_RETRY_ENTRY); } /** * xa_is_advanced() - Is the entry only permitted for the advanced API? * @entry: Entry to be stored in the XArray. * * Return: %true if the entry cannot be stored by the normal API. */ static inline bool xa_is_advanced(const void *entry) { return xa_is_internal(entry) && (entry <= XA_RETRY_ENTRY); } /** * typedef xa_update_node_t - A callback function from the XArray. * @node: The node which is being processed * * This function is called every time the XArray updates the count of * present and value entries in a node. It allows advanced users to * maintain the private_list in the node. * * Context: The xa_lock is held and interrupts may be disabled. * Implementations should not drop the xa_lock, nor re-enable * interrupts. */ typedef void (*xa_update_node_t)(struct xa_node *node); void xa_delete_node(struct xa_node *, xa_update_node_t); /* * The xa_state is opaque to its users. It contains various different pieces * of state involved in the current operation on the XArray. It should be * declared on the stack and passed between the various internal routines. * The various elements in it should not be accessed directly, but only * through the provided accessor functions. The below documentation is for * the benefit of those working on the code, not for users of the XArray. * * @xa_node usually points to the xa_node containing the slot we're operating * on (and @xa_offset is the offset in the slots array). If there is a * single entry in the array at index 0, there are no allocated xa_nodes to * point to, and so we store %NULL in @xa_node. @xa_node is set to * the value %XAS_RESTART if the xa_state is not walked to the correct * position in the tree of nodes for this operation. If an error occurs * during an operation, it is set to an %XAS_ERROR value. If we run off the * end of the allocated nodes, it is set to %XAS_BOUNDS. */ struct xa_state { struct xarray *xa; unsigned long xa_index; unsigned char xa_shift; unsigned char xa_sibs; unsigned char xa_offset; unsigned char xa_pad; /* Helps gcc generate better code */ struct xa_node *xa_node; struct xa_node *xa_alloc; xa_update_node_t xa_update; struct list_lru *xa_lru; }; /* * We encode errnos in the xas->xa_node. If an error has happened, we need to * drop the lock to fix it, and once we've done so the xa_state is invalid. */ #define XA_ERROR(errno) ((struct xa_node *)(((unsigned long)errno << 2) | 2UL)) #define XAS_BOUNDS ((struct xa_node *)1UL) #define XAS_RESTART ((struct xa_node *)3UL) #define __XA_STATE(array, index, shift, sibs) { \ .xa = array, \ .xa_index = index, \ .xa_shift = shift, \ .xa_sibs = sibs, \ .xa_offset = 0, \ .xa_pad = 0, \ .xa_node = XAS_RESTART, \ .xa_alloc = NULL, \ .xa_update = NULL, \ .xa_lru = NULL, \ } /** * XA_STATE() - Declare an XArray operation state. * @name: Name of this operation state (usually xas). * @array: Array to operate on. * @index: Initial index of interest. * * Declare and initialise an xa_state on the stack. */ #define XA_STATE(name, array, index) \ struct xa_state name = __XA_STATE(array, index, 0, 0) /** * XA_STATE_ORDER() - Declare an XArray operation state. * @name: Name of this operation state (usually xas). * @array: Array to operate on. * @index: Initial index of interest. * @order: Order of entry. * * Declare and initialise an xa_state on the stack. This variant of * XA_STATE() allows you to specify the 'order' of the element you * want to operate on.` */ #define XA_STATE_ORDER(name, array, index, order) \ struct xa_state name = __XA_STATE(array, \ (index >> order) << order, \ order - (order % XA_CHUNK_SHIFT), \ (1U << (order % XA_CHUNK_SHIFT)) - 1) #define xas_marked(xas, mark) xa_marked((xas)->xa, (mark)) #define xas_trylock(xas) xa_trylock((xas)->xa) #define xas_lock(xas) xa_lock((xas)->xa) #define xas_unlock(xas) xa_unlock((xas)->xa) #define xas_lock_bh(xas) xa_lock_bh((xas)->xa) #define xas_unlock_bh(xas) xa_unlock_bh((xas)->xa) #define xas_lock_irq(xas) xa_lock_irq((xas)->xa) #define xas_unlock_irq(xas) xa_unlock_irq((xas)->xa) #define xas_lock_irqsave(xas, flags) \ xa_lock_irqsave((xas)->xa, flags) #define xas_unlock_irqrestore(xas, flags) \ xa_unlock_irqrestore((xas)->xa, flags) /** * xas_error() - Return an errno stored in the xa_state. * @xas: XArray operation state. * * Return: 0 if no error has been noted. A negative errno if one has. */ static inline int xas_error(const struct xa_state *xas) { return xa_err(xas->xa_node); } /** * xas_set_err() - Note an error in the xa_state. * @xas: XArray operation state. * @err: Negative error number. * * Only call this function with a negative @err; zero or positive errors * will probably not behave the way you think they should. If you want * to clear the error from an xa_state, use xas_reset(). */ static inline void xas_set_err(struct xa_state *xas, long err) { xas->xa_node = XA_ERROR(err); } /** * xas_invalid() - Is the xas in a retry or error state? * @xas: XArray operation state. * * Return: %true if the xas cannot be used for operations. */ static inline bool xas_invalid(const struct xa_state *xas) { return (unsigned long)xas->xa_node & 3; } /** * xas_valid() - Is the xas a valid cursor into the array? * @xas: XArray operation state. * * Return: %true if the xas can be used for operations. */ static inline bool xas_valid(const struct xa_state *xas) { return !xas_invalid(xas); } /** * xas_is_node() - Does the xas point to a node? * @xas: XArray operation state. * * Return: %true if the xas currently references a node. */ static inline bool xas_is_node(const struct xa_state *xas) { return xas_valid(xas) && xas->xa_node; } /* True if the pointer is something other than a node */ static inline bool xas_not_node(struct xa_node *node) { return ((unsigned long)node & 3) || !node; } /* True if the node represents RESTART or an error */ static inline bool xas_frozen(struct xa_node *node) { return (unsigned long)node & 2; } /* True if the node represents head-of-tree, RESTART or BOUNDS */ static inline bool xas_top(struct xa_node *node) { return node <= XAS_RESTART; } /** * xas_reset() - Reset an XArray operation state. * @xas: XArray operation state. * * Resets the error or walk state of the @xas so future walks of the * array will start from the root. Use this if you have dropped the * xarray lock and want to reuse the xa_state. * * Context: Any context. */ static inline void xas_reset(struct xa_state *xas) { xas->xa_node = XAS_RESTART; } /** * xas_retry() - Retry the operation if appropriate. * @xas: XArray operation state. * @entry: Entry from xarray. * * The advanced functions may sometimes return an internal entry, such as * a retry entry or a zero entry. This function sets up the @xas to restart * the walk from the head of the array if needed. * * Context: Any context. * Return: true if the operation needs to be retried. */ static inline bool xas_retry(struct xa_state *xas, const void *entry) { if (xa_is_zero(entry)) return true; if (!xa_is_retry(entry)) return false; xas_reset(xas); return true; } void *xas_load(struct xa_state *); void *xas_store(struct xa_state *, void *entry); void *xas_find(struct xa_state *, unsigned long max); void *xas_find_conflict(struct xa_state *); bool xas_get_mark(const struct xa_state *, xa_mark_t); void xas_set_mark(const struct xa_state *, xa_mark_t); void xas_clear_mark(const struct xa_state *, xa_mark_t); void *xas_find_marked(struct xa_state *, unsigned long max, xa_mark_t); void xas_init_marks(const struct xa_state *); bool xas_nomem(struct xa_state *, gfp_t); void xas_destroy(struct xa_state *); void xas_pause(struct xa_state *); void xas_create_range(struct xa_state *); #ifdef CONFIG_XARRAY_MULTI int xa_get_order(struct xarray *, unsigned long index); int xas_get_order(struct xa_state *xas); void xas_split(struct xa_state *, void *entry, unsigned int order); void xas_split_alloc(struct xa_state *, void *entry, unsigned int order, gfp_t); void xas_try_split(struct xa_state *xas, void *entry, unsigned int order); unsigned int xas_try_split_min_order(unsigned int order); #else static inline int xa_get_order(struct xarray *xa, unsigned long index) { return 0; } static inline int xas_get_order(struct xa_state *xas) { return 0; } static inline void xas_split(struct xa_state *xas, void *entry, unsigned int order) { xas_store(xas, entry); } static inline void xas_split_alloc(struct xa_state *xas, void *entry, unsigned int order, gfp_t gfp) { } static inline void xas_try_split(struct xa_state *xas, void *entry, unsigned int order) { } static inline unsigned int xas_try_split_min_order(unsigned int order) { return 0; } #endif /** * xas_reload() - Refetch an entry from the xarray. * @xas: XArray operation state. * * Use this function to check that a previously loaded entry still has * the same value. This is useful for the lockless pagecache lookup where * we walk the array with only the RCU lock to protect us, lock the page, * then check that the page hasn't moved since we looked it up. * * The caller guarantees that @xas is still valid. If it may be in an * error or restart state, call xas_load() instead. * * Return: The entry at this location in the xarray. */ static inline void *xas_reload(struct xa_state *xas) { struct xa_node *node = xas->xa_node; void *entry; char offset; if (!node) return xa_head(xas->xa); if (IS_ENABLED(CONFIG_XARRAY_MULTI)) { offset = (xas->xa_index >> node->shift) & XA_CHUNK_MASK; entry = xa_entry(xas->xa, node, offset); if (!xa_is_sibling(entry)) return entry; offset = xa_to_sibling(entry); } else { offset = xas->xa_offset; } return xa_entry(xas->xa, node, offset); } /** * xas_set() - Set up XArray operation state for a different index. * @xas: XArray operation state. * @index: New index into the XArray. * * Move the operation state to refer to a different index. This will * have the effect of starting a walk from the top; see xas_next() * to move to an adjacent index. */ static inline void xas_set(struct xa_state *xas, unsigned long index) { xas->xa_index = index; xas->xa_node = XAS_RESTART; } /** * xas_advance() - Skip over sibling entries. * @xas: XArray operation state. * @index: Index of last sibling entry. * * Move the operation state to refer to the last sibling entry. * This is useful for loops that normally want to see sibling * entries but sometimes want to skip them. Use xas_set() if you * want to move to an index which is not part of this entry. */ static inline void xas_advance(struct xa_state *xas, unsigned long index) { unsigned char shift = xas_is_node(xas) ? xas->xa_node->shift : 0; xas->xa_index = index; xas->xa_offset = (index >> shift) & XA_CHUNK_MASK; } /** * xas_set_order() - Set up XArray operation state for a multislot entry. * @xas: XArray operation state. * @index: Target of the operation. * @order: Entry occupies 2^@order indices. */ static inline void xas_set_order(struct xa_state *xas, unsigned long index, unsigned int order) { #ifdef CONFIG_XARRAY_MULTI xas->xa_index = order < BITS_PER_LONG ? (index >> order) << order : 0; xas->xa_shift = order - (order % XA_CHUNK_SHIFT); xas->xa_sibs = (1 << (order % XA_CHUNK_SHIFT)) - 1; xas->xa_node = XAS_RESTART; #else BUG_ON(order > 0); xas_set(xas, index); #endif } /** * xas_set_update() - Set up XArray operation state for a callback. * @xas: XArray operation state. * @update: Function to call when updating a node. * * The XArray can notify a caller after it has updated an xa_node. * This is advanced functionality and is only needed by the page * cache and swap cache. */ static inline void xas_set_update(struct xa_state *xas, xa_update_node_t update) { xas->xa_update = update; } static inline void xas_set_lru(struct xa_state *xas, struct list_lru *lru) { xas->xa_lru = lru; } /** * xas_next_entry() - Advance iterator to next present entry. * @xas: XArray operation state. * @max: Highest index to return. * * xas_next_entry() is an inline function to optimise xarray traversal for * speed. It is equivalent to calling xas_find(), and will call xas_find() * for all the hard cases. * * Return: The next present entry after the one currently referred to by @xas. */ static inline void *xas_next_entry(struct xa_state *xas, unsigned long max) { struct xa_node *node = xas->xa_node; void *entry; if (unlikely(xas_not_node(node) || node->shift || xas->xa_offset != (xas->xa_index & XA_CHUNK_MASK))) return xas_find(xas, max); do { if (unlikely(xas->xa_index >= max)) return xas_find(xas, max); if (unlikely(xas->xa_offset == XA_CHUNK_MASK)) return xas_find(xas, max); entry = xa_entry(xas->xa, node, xas->xa_offset + 1); if (unlikely(xa_is_internal(entry))) return xas_find(xas, max); xas->xa_offset++; xas->xa_index++; } while (!entry); return entry; } /* Private */ static inline unsigned int xas_find_chunk(struct xa_state *xas, bool advance, xa_mark_t mark) { unsigned long *addr = xas->xa_node->marks[(__force unsigned)mark]; unsigned int offset = xas->xa_offset; if (advance) offset++; if (XA_CHUNK_SIZE == BITS_PER_LONG) { if (offset < XA_CHUNK_SIZE) { unsigned long data = *addr & (~0UL << offset); if (data) return __ffs(data); } return XA_CHUNK_SIZE; } return find_next_bit(addr, XA_CHUNK_SIZE, offset); } /** * xas_next_marked() - Advance iterator to next marked entry. * @xas: XArray operation state. * @max: Highest index to return. * @mark: Mark to search for. * * xas_next_marked() is an inline function to optimise xarray traversal for * speed. It is equivalent to calling xas_find_marked(), and will call * xas_find_marked() for all the hard cases. * * Return: The next marked entry after the one currently referred to by @xas. */ static inline void *xas_next_marked(struct xa_state *xas, unsigned long max, xa_mark_t mark) { struct xa_node *node = xas->xa_node; void *entry; unsigned int offset; if (unlikely(xas_not_node(node) || node->shift)) return xas_find_marked(xas, max, mark); offset = xas_find_chunk(xas, true, mark); xas->xa_offset = offset; xas->xa_index = (xas->xa_index & ~XA_CHUNK_MASK) + offset; if (xas->xa_index > max) return NULL; if (offset == XA_CHUNK_SIZE) return xas_find_marked(xas, max, mark); entry = xa_entry(xas->xa, node, offset); if (!entry) return xas_find_marked(xas, max, mark); return entry; } /* * If iterating while holding a lock, drop the lock and reschedule * every %XA_CHECK_SCHED loops. */ enum { XA_CHECK_SCHED = 4096, }; /** * xas_for_each() - Iterate over a range of an XArray. * @xas: XArray operation state. * @entry: Entry retrieved from the array. * @max: Maximum index to retrieve from array. * * The loop body will be executed for each entry present in the xarray * between the current xas position and @max. @entry will be set to * the entry retrieved from the xarray. It is safe to delete entries * from the array in the loop body. You should hold either the RCU lock * or the xa_lock while iterating. If you need to drop the lock, call * xas_pause() first. */ #define xas_for_each(xas, entry, max) \ for (entry = xas_find(xas, max); entry; \ entry = xas_next_entry(xas, max)) /** * xas_for_each_marked() - Iterate over a range of an XArray. * @xas: XArray operation state. * @entry: Entry retrieved from the array. * @max: Maximum index to retrieve from array. * @mark: Mark to search for. * * The loop body will be executed for each marked entry in the xarray * between the current xas position and @max. @entry will be set to * the entry retrieved from the xarray. It is safe to delete entries * from the array in the loop body. You should hold either the RCU lock * or the xa_lock while iterating. If you need to drop the lock, call * xas_pause() first. */ #define xas_for_each_marked(xas, entry, max, mark) \ for (entry = xas_find_marked(xas, max, mark); entry; \ entry = xas_next_marked(xas, max, mark)) /** * xas_for_each_conflict() - Iterate over a range of an XArray. * @xas: XArray operation state. * @entry: Entry retrieved from the array. * * The loop body will be executed for each entry in the XArray that * lies within the range specified by @xas. If the loop terminates * normally, @entry will be %NULL. The user may break out of the loop, * which will leave @entry set to the conflicting entry. The caller * may also call xa_set_err() to exit the loop while setting an error * to record the reason. */ #define xas_for_each_conflict(xas, entry) \ while ((entry = xas_find_conflict(xas))) void *__xas_next(struct xa_state *); void *__xas_prev(struct xa_state *); /** * xas_prev() - Move iterator to previous index. * @xas: XArray operation state. * * If the @xas was in an error state, it will remain in an error state * and this function will return %NULL. If the @xas has never been walked, * it will have the effect of calling xas_load(). Otherwise one will be * subtracted from the index and the state will be walked to the correct * location in the array for the next operation. * * If the iterator was referencing index 0, this function wraps * around to %ULONG_MAX. * * Return: The entry at the new index. This may be %NULL or an internal * entry. */ static inline void *xas_prev(struct xa_state *xas) { struct xa_node *node = xas->xa_node; if (unlikely(xas_not_node(node) || node->shift || xas->xa_offset == 0)) return __xas_prev(xas); xas->xa_index--; xas->xa_offset--; return xa_entry(xas->xa, node, xas->xa_offset); } /** * xas_next() - Move state to next index. * @xas: XArray operation state. * * If the @xas was in an error state, it will remain in an error state * and this function will return %NULL. If the @xas has never been walked, * it will have the effect of calling xas_load(). Otherwise one will be * added to the index and the state will be walked to the correct * location in the array for the next operation. * * If the iterator was referencing index %ULONG_MAX, this function wraps * around to 0. * * Return: The entry at the new index. This may be %NULL or an internal * entry. */ static inline void *xas_next(struct xa_state *xas) { struct xa_node *node = xas->xa_node; if (unlikely(xas_not_node(node) || node->shift || xas->xa_offset == XA_CHUNK_MASK)) return __xas_next(xas); xas->xa_index++; xas->xa_offset++; return xa_entry(xas->xa, node, xas->xa_offset); } #endif /* _LINUX_XARRAY_H */
5 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 /* SPDX-License-Identifier: GPL-2.0 */ /* Copyright (C) 2019 Arm Ltd. */ #ifndef __KVM_ARM_HYPERCALLS_H #define __KVM_ARM_HYPERCALLS_H #include <asm/kvm_emulate.h> int kvm_smccc_call_handler(struct kvm_vcpu *vcpu); static inline u32 smccc_get_function(struct kvm_vcpu *vcpu) { return vcpu_get_reg(vcpu, 0); } static inline unsigned long smccc_get_arg1(struct kvm_vcpu *vcpu) { return vcpu_get_reg(vcpu, 1); } static inline unsigned long smccc_get_arg2(struct kvm_vcpu *vcpu) { return vcpu_get_reg(vcpu, 2); } static inline unsigned long smccc_get_arg3(struct kvm_vcpu *vcpu) { return vcpu_get_reg(vcpu, 3); } static inline void smccc_set_retval(struct kvm_vcpu *vcpu, unsigned long a0, unsigned long a1, unsigned long a2, unsigned long a3) { vcpu_set_reg(vcpu, 0, a0); vcpu_set_reg(vcpu, 1, a1); vcpu_set_reg(vcpu, 2, a2); vcpu_set_reg(vcpu, 3, a3); } struct kvm_one_reg; void kvm_arm_init_hypercalls(struct kvm *kvm); void kvm_arm_teardown_hypercalls(struct kvm *kvm); int kvm_arm_get_fw_num_regs(struct kvm_vcpu *vcpu); int kvm_arm_copy_fw_reg_indices(struct kvm_vcpu *vcpu, u64 __user *uindices); int kvm_arm_get_fw_reg(struct kvm_vcpu *vcpu, const struct kvm_one_reg *reg); int kvm_arm_set_fw_reg(struct kvm_vcpu *vcpu, const struct kvm_one_reg *reg); int kvm_vm_smccc_has_attr(struct kvm *kvm, struct kvm_device_attr *attr); int kvm_vm_smccc_set_attr(struct kvm *kvm, struct kvm_device_attr *attr); #endif
8 8 8 5 17 57 6 51 11 2 127 1 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 /* SPDX-License-Identifier: GPL-2.0-only */ /* * Copyright (C) 2015, 2016 ARM Ltd. */ #ifndef __KVM_ARM_VGIC_NEW_H__ #define __KVM_ARM_VGIC_NEW_H__ #include <linux/irqchip/arm-gic-common.h> #include <asm/kvm_mmu.h> #define PRODUCT_ID_KVM 0x4b /* ASCII code K */ #define IMPLEMENTER_ARM 0x43b #define VGIC_ADDR_UNDEF (-1) #define IS_VGIC_ADDR_UNDEF(_x) ((_x) == VGIC_ADDR_UNDEF) #define INTERRUPT_ID_BITS_SPIS 10 #define INTERRUPT_ID_BITS_ITS 16 #define VGIC_LPI_MAX_INTID ((1 << INTERRUPT_ID_BITS_ITS) - 1) #define VGIC_PRI_BITS 5 #define vgic_irq_is_sgi(intid) ((intid) < VGIC_NR_SGIS) #define VGIC_AFFINITY_0_SHIFT 0 #define VGIC_AFFINITY_0_MASK (0xffUL << VGIC_AFFINITY_0_SHIFT) #define VGIC_AFFINITY_1_SHIFT 8 #define VGIC_AFFINITY_1_MASK (0xffUL << VGIC_AFFINITY_1_SHIFT) #define VGIC_AFFINITY_2_SHIFT 16 #define VGIC_AFFINITY_2_MASK (0xffUL << VGIC_AFFINITY_2_SHIFT) #define VGIC_AFFINITY_3_SHIFT 24 #define VGIC_AFFINITY_3_MASK (0xffUL << VGIC_AFFINITY_3_SHIFT) #define VGIC_AFFINITY_LEVEL(reg, level) \ ((((reg) & VGIC_AFFINITY_## level ##_MASK) \ >> VGIC_AFFINITY_## level ##_SHIFT) << MPIDR_LEVEL_SHIFT(level)) /* * The Userspace encodes the affinity differently from the MPIDR, * Below macro converts vgic userspace format to MPIDR reg format. */ #define VGIC_TO_MPIDR(val) (VGIC_AFFINITY_LEVEL(val, 0) | \ VGIC_AFFINITY_LEVEL(val, 1) | \ VGIC_AFFINITY_LEVEL(val, 2) | \ VGIC_AFFINITY_LEVEL(val, 3)) /* * As per Documentation/virt/kvm/devices/arm-vgic-v3.rst, * below macros are defined for CPUREG encoding. */ #define KVM_REG_ARM_VGIC_SYSREG_OP0_MASK 0x000000000000c000 #define KVM_REG_ARM_VGIC_SYSREG_OP0_SHIFT 14 #define KVM_REG_ARM_VGIC_SYSREG_OP1_MASK 0x0000000000003800 #define KVM_REG_ARM_VGIC_SYSREG_OP1_SHIFT 11 #define KVM_REG_ARM_VGIC_SYSREG_CRN_MASK 0x0000000000000780 #define KVM_REG_ARM_VGIC_SYSREG_CRN_SHIFT 7 #define KVM_REG_ARM_VGIC_SYSREG_CRM_MASK 0x0000000000000078 #define KVM_REG_ARM_VGIC_SYSREG_CRM_SHIFT 3 #define KVM_REG_ARM_VGIC_SYSREG_OP2_MASK 0x0000000000000007 #define KVM_REG_ARM_VGIC_SYSREG_OP2_SHIFT 0 #define KVM_DEV_ARM_VGIC_SYSREG_MASK (KVM_REG_ARM_VGIC_SYSREG_OP0_MASK | \ KVM_REG_ARM_VGIC_SYSREG_OP1_MASK | \ KVM_REG_ARM_VGIC_SYSREG_CRN_MASK | \ KVM_REG_ARM_VGIC_SYSREG_CRM_MASK | \ KVM_REG_ARM_VGIC_SYSREG_OP2_MASK) #define KVM_ICC_SRE_EL2 (ICC_SRE_EL2_ENABLE | ICC_SRE_EL2_SRE | \ ICC_SRE_EL1_DIB | ICC_SRE_EL1_DFB) #define KVM_ICH_VTR_EL2_RES0 (ICH_VTR_EL2_DVIM | \ ICH_VTR_EL2_A3V | \ ICH_VTR_EL2_IDbits) #define KVM_ICH_VTR_EL2_RES1 ICH_VTR_EL2_nV4 static inline u64 kvm_get_guest_vtr_el2(void) { u64 vtr; vtr = kvm_vgic_global_state.ich_vtr_el2; vtr &= ~KVM_ICH_VTR_EL2_RES0; vtr |= KVM_ICH_VTR_EL2_RES1; return vtr; } /* * As per Documentation/virt/kvm/devices/arm-vgic-its.rst, * below macros are defined for ITS table entry encoding. */ #define KVM_ITS_CTE_VALID_SHIFT 63 #define KVM_ITS_CTE_VALID_MASK BIT_ULL(63) #define KVM_ITS_CTE_RDBASE_SHIFT 16 #define KVM_ITS_CTE_ICID_MASK GENMASK_ULL(15, 0) #define KVM_ITS_ITE_NEXT_SHIFT 48 #define KVM_ITS_ITE_PINTID_SHIFT 16 #define KVM_ITS_ITE_PINTID_MASK GENMASK_ULL(47, 16) #define KVM_ITS_ITE_ICID_MASK GENMASK_ULL(15, 0) #define KVM_ITS_DTE_VALID_SHIFT 63 #define KVM_ITS_DTE_VALID_MASK BIT_ULL(63) #define KVM_ITS_DTE_NEXT_SHIFT 49 #define KVM_ITS_DTE_NEXT_MASK GENMASK_ULL(62, 49) #define KVM_ITS_DTE_ITTADDR_SHIFT 5 #define KVM_ITS_DTE_ITTADDR_MASK GENMASK_ULL(48, 5) #define KVM_ITS_DTE_SIZE_MASK GENMASK_ULL(4, 0) #define KVM_ITS_L1E_VALID_MASK BIT_ULL(63) /* we only support 64 kB translation table page size */ #define KVM_ITS_L1E_ADDR_MASK GENMASK_ULL(51, 16) #define KVM_VGIC_V3_RDIST_INDEX_MASK GENMASK_ULL(11, 0) #define KVM_VGIC_V3_RDIST_FLAGS_MASK GENMASK_ULL(15, 12) #define KVM_VGIC_V3_RDIST_FLAGS_SHIFT 12 #define KVM_VGIC_V3_RDIST_BASE_MASK GENMASK_ULL(51, 16) #define KVM_VGIC_V3_RDIST_COUNT_MASK GENMASK_ULL(63, 52) #define KVM_VGIC_V3_RDIST_COUNT_SHIFT 52 #ifdef CONFIG_DEBUG_SPINLOCK #define DEBUG_SPINLOCK_BUG_ON(p) BUG_ON(p) #else #define DEBUG_SPINLOCK_BUG_ON(p) #endif static inline u32 vgic_get_implementation_rev(struct kvm_vcpu *vcpu) { return vcpu->kvm->arch.vgic.implementation_rev; } /* Requires the irq_lock to be held by the caller. */ static inline bool irq_is_pending(struct vgic_irq *irq) { if (irq->config == VGIC_CONFIG_EDGE) return irq->pending_latch; else return irq->pending_latch || irq->line_level; } static inline bool vgic_irq_is_mapped_level(struct vgic_irq *irq) { return irq->config == VGIC_CONFIG_LEVEL && irq->hw; } static inline int vgic_irq_get_lr_count(struct vgic_irq *irq) { /* Account for the active state as an interrupt */ if (vgic_irq_is_sgi(irq->intid) && irq->source) return hweight8(irq->source) + irq->active; return irq_is_pending(irq) || irq->active; } static inline bool vgic_irq_is_multi_sgi(struct vgic_irq *irq) { return vgic_irq_get_lr_count(irq) > 1; } static inline int vgic_write_guest_lock(struct kvm *kvm, gpa_t gpa, const void *data, unsigned long len) { struct vgic_dist *dist = &kvm->arch.vgic; int ret; dist->table_write_in_progress = true; ret = kvm_write_guest_lock(kvm, gpa, data, len); dist->table_write_in_progress = false; return ret; } /* * This struct provides an intermediate representation of the fields contained * in the GICH_VMCR and ICH_VMCR registers, such that code exporting the GIC * state to userspace can generate either GICv2 or GICv3 CPU interface * registers regardless of the hardware backed GIC used. */ struct vgic_vmcr { u32 grpen0; u32 grpen1; u32 ackctl; u32 fiqen; u32 cbpr; u32 eoim; u32 abpr; u32 bpr; u32 pmr; /* Priority mask field in the GICC_PMR and * ICC_PMR_EL1 priority field format */ }; struct vgic_reg_attr { struct kvm_vcpu *vcpu; gpa_t addr; }; struct its_device { struct list_head dev_list; /* the head for the list of ITTEs */ struct list_head itt_head; u32 num_eventid_bits; gpa_t itt_addr; u32 device_id; }; #define COLLECTION_NOT_MAPPED ((u32)~0) struct its_collection { struct list_head coll_list; u32 collection_id; u32 target_addr; }; #define its_is_collection_mapped(coll) ((coll) && \ ((coll)->target_addr != COLLECTION_NOT_MAPPED)) struct its_ite { struct list_head ite_list; struct vgic_irq *irq; struct its_collection *collection; u32 event_id; }; int vgic_v3_parse_attr(struct kvm_device *dev, struct kvm_device_attr *attr, struct vgic_reg_attr *reg_attr); int vgic_v2_parse_attr(struct kvm_device *dev, struct kvm_device_attr *attr, struct vgic_reg_attr *reg_attr); const struct vgic_register_region * vgic_get_mmio_region(struct kvm_vcpu *vcpu, struct vgic_io_device *iodev, gpa_t addr, int len); struct vgic_irq *vgic_get_irq(struct kvm *kvm, u32 intid); struct vgic_irq *vgic_get_vcpu_irq(struct kvm_vcpu *vcpu, u32 intid); void vgic_put_irq(struct kvm *kvm, struct vgic_irq *irq); bool vgic_get_phys_line_level(struct vgic_irq *irq); void vgic_irq_set_phys_pending(struct vgic_irq *irq, bool pending); void vgic_irq_set_phys_active(struct vgic_irq *irq, bool active); bool vgic_queue_irq_unlock(struct kvm *kvm, struct vgic_irq *irq, unsigned long flags) __releases(&irq->irq_lock); void vgic_kick_vcpus(struct kvm *kvm); void vgic_irq_handle_resampling(struct vgic_irq *irq, bool lr_deactivated, bool lr_pending); int vgic_check_iorange(struct kvm *kvm, phys_addr_t ioaddr, phys_addr_t addr, phys_addr_t alignment, phys_addr_t size); void vgic_v2_fold_lr_state(struct kvm_vcpu *vcpu); void vgic_v2_populate_lr(struct kvm_vcpu *vcpu, struct vgic_irq *irq, int lr); void vgic_v2_clear_lr(struct kvm_vcpu *vcpu, int lr); void vgic_v2_set_underflow(struct kvm_vcpu *vcpu); int vgic_v2_has_attr_regs(struct kvm_device *dev, struct kvm_device_attr *attr); int vgic_v2_dist_uaccess(struct kvm_vcpu *vcpu, bool is_write, int offset, u32 *val); int vgic_v2_cpuif_uaccess(struct kvm_vcpu *vcpu, bool is_write, int offset, u32 *val); void vgic_v2_set_vmcr(struct kvm_vcpu *vcpu, struct vgic_vmcr *vmcr); void vgic_v2_get_vmcr(struct kvm_vcpu *vcpu, struct vgic_vmcr *vmcr); void vgic_v2_enable(struct kvm_vcpu *vcpu); int vgic_v2_probe(const struct gic_kvm_info *info); int vgic_v2_map_resources(struct kvm *kvm); int vgic_register_dist_iodev(struct kvm *kvm, gpa_t dist_base_address, enum vgic_type); void vgic_v2_init_lrs(void); void vgic_v2_load(struct kvm_vcpu *vcpu); void vgic_v2_put(struct kvm_vcpu *vcpu); void vgic_v2_save_state(struct kvm_vcpu *vcpu); void vgic_v2_restore_state(struct kvm_vcpu *vcpu); static inline bool vgic_try_get_irq_kref(struct vgic_irq *irq) { if (!irq) return false; if (irq->intid < VGIC_MIN_LPI) return true; return kref_get_unless_zero(&irq->refcount); } static inline void vgic_get_irq_kref(struct vgic_irq *irq) { WARN_ON_ONCE(!vgic_try_get_irq_kref(irq)); } void vgic_v3_fold_lr_state(struct kvm_vcpu *vcpu); void vgic_v3_populate_lr(struct kvm_vcpu *vcpu, struct vgic_irq *irq, int lr); void vgic_v3_clear_lr(struct kvm_vcpu *vcpu, int lr); void vgic_v3_set_underflow(struct kvm_vcpu *vcpu); void vgic_v3_set_vmcr(struct kvm_vcpu *vcpu, struct vgic_vmcr *vmcr); void vgic_v3_get_vmcr(struct kvm_vcpu *vcpu, struct vgic_vmcr *vmcr); void vgic_v3_enable(struct kvm_vcpu *vcpu); int vgic_v3_probe(const struct gic_kvm_info *info); int vgic_v3_map_resources(struct kvm *kvm); int vgic_v3_lpi_sync_pending_status(struct kvm *kvm, struct vgic_irq *irq); int vgic_v3_save_pending_tables(struct kvm *kvm); int vgic_v3_set_redist_base(struct kvm *kvm, u32 index, u64 addr, u32 count); int vgic_register_redist_iodev(struct kvm_vcpu *vcpu); void vgic_unregister_redist_iodev(struct kvm_vcpu *vcpu); bool vgic_v3_check_base(struct kvm *kvm); void vgic_v3_load(struct kvm_vcpu *vcpu); void vgic_v3_put(struct kvm_vcpu *vcpu); bool vgic_has_its(struct kvm *kvm); int kvm_vgic_register_its_device(void); void vgic_enable_lpis(struct kvm_vcpu *vcpu); void vgic_flush_pending_lpis(struct kvm_vcpu *vcpu); int vgic_its_inject_msi(struct kvm *kvm, struct kvm_msi *msi); int vgic_v3_has_attr_regs(struct kvm_device *dev, struct kvm_device_attr *attr); int vgic_v3_dist_uaccess(struct kvm_vcpu *vcpu, bool is_write, int offset, u32 *val); int vgic_v3_redist_uaccess(struct kvm_vcpu *vcpu, bool is_write, int offset, u32 *val); int vgic_v3_cpu_sysregs_uaccess(struct kvm_vcpu *vcpu, struct kvm_device_attr *attr, bool is_write); int vgic_v3_has_cpu_sysregs_attr(struct kvm_vcpu *vcpu, struct kvm_device_attr *attr); const struct sys_reg_desc *vgic_v3_get_sysreg_table(unsigned int *sz); int vgic_v3_line_level_info_uaccess(struct kvm_vcpu *vcpu, bool is_write, u32 intid, u32 *val); int kvm_register_vgic_device(unsigned long type); void vgic_set_vmcr(struct kvm_vcpu *vcpu, struct vgic_vmcr *vmcr); void vgic_get_vmcr(struct kvm_vcpu *vcpu, struct vgic_vmcr *vmcr); int vgic_lazy_init(struct kvm *kvm); int vgic_init(struct kvm *kvm); void vgic_debug_init(struct kvm *kvm); void vgic_debug_destroy(struct kvm *kvm); int vgic_v5_probe(const struct gic_kvm_info *info); static inline int vgic_v3_max_apr_idx(struct kvm_vcpu *vcpu) { struct vgic_cpu *cpu_if = &vcpu->arch.vgic_cpu; /* * num_pri_bits are initialized with HW supported values. * We can rely safely on num_pri_bits even if VM has not * restored ICC_CTLR_EL1 before restoring APnR registers. */ switch (cpu_if->num_pri_bits) { case 7: return 3; case 6: return 1; default: return 0; } } static inline bool vgic_v3_redist_region_full(struct vgic_redist_region *region) { if (!region->count) return false; return (region->free_index >= region->count); } struct vgic_redist_region *vgic_v3_rdist_free_slot(struct list_head *rdregs); static inline size_t vgic_v3_rd_region_size(struct kvm *kvm, struct vgic_redist_region *rdreg) { if (!rdreg->count) return atomic_read(&kvm->online_vcpus) * KVM_VGIC_V3_REDIST_SIZE; else return rdreg->count * KVM_VGIC_V3_REDIST_SIZE; } struct vgic_redist_region *vgic_v3_rdist_region_from_index(struct kvm *kvm, u32 index); void vgic_v3_free_redist_region(struct kvm *kvm, struct vgic_redist_region *rdreg); bool vgic_v3_rdist_overlap(struct kvm *kvm, gpa_t base, size_t size); static inline bool vgic_dist_overlap(struct kvm *kvm, gpa_t base, size_t size) { struct vgic_dist *d = &kvm->arch.vgic; return (base + size > d->vgic_dist_base) && (base < d->vgic_dist_base + KVM_VGIC_V3_DIST_SIZE); } bool vgic_lpis_enabled(struct kvm_vcpu *vcpu); int vgic_its_resolve_lpi(struct kvm *kvm, struct vgic_its *its, u32 devid, u32 eventid, struct vgic_irq **irq); struct vgic_its *vgic_msi_to_its(struct kvm *kvm, struct kvm_msi *msi); int vgic_its_inject_cached_translation(struct kvm *kvm, struct kvm_msi *msi); void vgic_its_invalidate_all_caches(struct kvm *kvm); /* GICv4.1 MMIO interface */ int vgic_its_inv_lpi(struct kvm *kvm, struct vgic_irq *irq); int vgic_its_invall(struct kvm_vcpu *vcpu); bool system_supports_direct_sgis(void); bool vgic_supports_direct_msis(struct kvm *kvm); bool vgic_supports_direct_sgis(struct kvm *kvm); static inline bool vgic_supports_direct_irqs(struct kvm *kvm) { return vgic_supports_direct_msis(kvm) || vgic_supports_direct_sgis(kvm); } int vgic_v4_init(struct kvm *kvm); void vgic_v4_teardown(struct kvm *kvm); void vgic_v4_configure_vsgis(struct kvm *kvm); void vgic_v4_get_vlpi_state(struct vgic_irq *irq, bool *val); int vgic_v4_request_vpe_irq(struct kvm_vcpu *vcpu, int irq); void vcpu_set_ich_hcr(struct kvm_vcpu *vcpu); static inline bool kvm_has_gicv3(struct kvm *kvm) { return kvm_has_feat(kvm, ID_AA64PFR0_EL1, GIC, IMP); } void vgic_v3_sync_nested(struct kvm_vcpu *vcpu); void vgic_v3_load_nested(struct kvm_vcpu *vcpu); void vgic_v3_put_nested(struct kvm_vcpu *vcpu); void vgic_v3_handle_nested_maint_irq(struct kvm_vcpu *vcpu); void vgic_v3_nested_update_mi(struct kvm_vcpu *vcpu); static inline bool vgic_is_v3_compat(struct kvm *kvm) { return cpus_have_final_cap(ARM64_HAS_GICV5_CPUIF) && kvm_vgic_global_state.has_gcie_v3_compat; } static inline bool vgic_is_v3(struct kvm *kvm) { return kvm_vgic_global_state.type == VGIC_V3 || vgic_is_v3_compat(kvm); } int vgic_its_debug_init(struct kvm_device *dev); void vgic_its_debug_destroy(struct kvm_device *dev); #endif
311 311 311 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 1517 1518 1519 1520 1521 1522 1523 1524 1525 1526 1527 1528 1529 1530 1531 1532 1533 1534 1535 1536 1537 1538 1539 1540 1541 1542 1543 1544 1545 1546 1547 1548 1549 1550 1551 1552 1553 1554 1555 1556 1557 1558 1559 1560 1561 1562 1563 1564 1565 1566 1567 1568 1569 1570 1571 1572 1573 1574 1575 1576 1577 1578 1579 1580 1581 1582 1583 1584 1585 1586 1587 1588 1589 1590 1591 1592 1593 1594 1595 1596 1597 1598 1599 1600 1601 1602 1603 1604 1605 1606 1607 1608 1609 1610 1611 1612 1613 1614 1615 1616 1617 1618 1619 1620 1621 1622 1623 1624 1625 1626 1627 1628 1629 1630 1631 1632 1633 1634 1635 1636 1637 1638 1639 1640 1641 1642 1643 1644 1645 1646 1647 1648 1649 1650 1651 1652 1653 1654 1655 1656 1657 1658 1659 1660 1661 1662 1663 1664 1665 1666 1667 1668 1669 1670 1671 1672 1673 1674 1675 1676 1677 1678 1679 1680 1681 1682 1683 1684 1685 1686 1687 1688 1689 1690 1691 1692 1693 1694 1695 1696 1697 1698 1699 1700 1701 1702 1703 1704 1705 1706 1707 1708 1709 1710 1711 1712 1713 1714 1715 1716 1717 1718 1719 1720 1721 1722 1723 1724 1725 1726 1727 1728 1729 1730 1731 1732 1733 1734 1735 1736 1737 1738 1739 1740 1741 1742 1743 1744 1745 1746 1747 1748 1749 1750 1751 1752 1753 1754 1755 1756 1757 1758 1759 1760 1761 1762 1763 1764 1765 1766 1767 1768 1769 1770 1771 1772 1773 1774 1775 1776 1777 1778 1779 1780 1781 1782 1783 1784 1785 1786 1787 1788 1789 1790 1791 1792 1793 1794 1795 1796 1797 1798 1799 1800 1801 1802 1803 1804 1805 1806 1807 1808 1809 1810 1811 1812 1813 1814 1815 1816 1817 1818 1819 1820 1821 1822 1823 1824 1825 1826 1827 1828 1829 1830 1831 1832 1833 1834 1835 1836 1837 1838 1839 1840 1841 1842 1843 1844 1845 1846 1847 1848 1849 1850 1851 1852 1853 1854 1855 1856 1857 1858 1859 1860 1861 1862 1863 1864 1865 1866 1867 1868 1869 1870 1871 1872 1873 1874 1875 1876 1877 1878 1879 1880 1881 1882 1883 1884 1885 1886 1887 1888 1889 1890 1891 1892 1893 1894 1895 1896 1897 1898 1899 1900 1901 1902 1903 1904 1905 1906 1907 1908 1909 1910 1911 1912 1913 1914 1915 1916 1917 1918 1919 1920 1921 1922 1923 1924 1925 1926 1927 1928 1929 1930 1931 1932 1933 1934 1935 1936 1937 1938 1939 1940 1941 1942 1943 1944 1945 1946 1947 1948 1949 1950 1951 1952 1953 1954 1955 1956 1957 1958 1959 1960 1961 1962 1963 1964 1965 1966 1967 1968 1969 1970 1971 1972 1973 1974 1975 1976 1977 1978 1979 1980 1981 1982 1983 1984 1985 1986 1987 1988 1989 1990 1991 1992 1993 1994 1995 1996 1997 1998 1999 2000 2001 2002 2003 2004 2005 2006 2007 2008 2009 2010 2011 2012 2013 2014 2015 2016 2017 2018 2019 2020 2021 2022 2023 2024 2025 2026 2027 2028 2029 2030 2031 2032 2033 2034 2035 2036 2037 2038 2039 2040 2041 2042 2043 2044 2045 2046 2047 2048 2049 2050 2051 2052 2053 2054 2055 2056 2057 2058 2059 2060 2061 2062 2063 2064 2065 2066 2067 2068 2069 2070 2071 2072 2073 2074 2075 2076 2077 2078 2079 2080 2081 2082 2083 2084 2085 2086 2087 2088 2089 2090 2091 2092 2093 2094 2095 2096 2097 2098 2099 2100 2101 2102 2103 2104 2105 2106 2107 2108 2109 2110 2111 2112 2113 2114 2115 2116 2117 2118 2119 2120 2121 2122 2123 2124 2125 2126 2127 2128 2129 2130 2131 2132 2133 2134 2135 2136 2137 2138 2139 2140 2141 2142 2143 2144 2145 2146 2147 2148 2149 2150 2151 2152 2153 2154 2155 2156 2157 2158 2159 2160 2161 2162 2163 2164 2165 2166 2167 2168 2169 2170 2171 2172 2173 2174 2175 2176 2177 2178 2179 2180 2181 2182 2183 2184 2185 2186 2187 2188 2189 2190 2191 2192 2193 2194 2195 2196 2197 2198 2199 2200 2201 2202 2203 2204 2205 2206 2207 2208 2209 2210 2211 2212 2213 2214 2215 2216 2217 2218 2219 2220 2221 2222 2223 2224 2225 2226 2227 2228 2229 2230 2231 2232 2233 2234 2235 2236 2237 2238 2239 2240 2241 2242 2243 2244 2245 2246 2247 2248 2249 2250 2251 2252 2253 2254 2255 2256 2257 2258 2259 2260 2261 2262 2263 2264 2265 2266 2267 2268 2269 2270 2271 2272 2273 2274 2275 2276 2277 2278 2279 2280 2281 2282 2283 2284 2285 2286 2287 2288 2289 2290 2291 2292 2293 2294 2295 2296 2297 2298 2299 2300 2301 2302 2303 2304 2305 2306 2307 2308 2309 2310 2311 2312 2313 2314 2315 2316 2317 2318 2319 2320 2321 2322 2323 2324 2325 2326 2327 2328 2329 2330 2331 2332 2333 2334 2335 2336 2337 2338 2339 2340 2341 2342 2343 2344 2345 2346 2347 2348 2349 2350 2351 2352 2353 2354 2355 2356 2357 2358 2359 2360 2361 2362 2363 2364 2365 2366 2367 2368 2369 2370 2371 2372 2373 2374 2375 2376 2377 2378 2379 2380 2381 2382 2383 2384 2385 2386 2387 2388 2389 2390 2391 2392 2393 2394 2395 2396 2397 2398 2399 2400 2401 2402 2403 2404 2405 2406 2407 2408 2409 2410 2411 2412 2413 2414 2415 2416 2417 2418 2419 2420 2421 2422 2423 2424 2425 2426 2427 2428 2429 2430 2431 2432 2433 2434 2435 2436 2437 2438 2439 2440 2441 2442 2443 2444 2445 2446 2447 2448 2449 2450 2451 2452 2453 2454 2455 2456 2457 2458 2459 2460 2461 2462 2463 2464 2465 2466 2467 2468 2469 2470 2471 2472 2473 2474 2475 2476 2477 2478 2479 2480 2481 2482 2483 2484 2485 2486 2487 2488 2489 2490 2491 2492 2493 2494 2495 2496 2497 2498 2499 2500 2501 2502 2503 2504 2505 2506 2507 2508 2509 2510 2511 2512 2513 2514 2515 2516 2517 2518 2519 2520 2521 2522 2523 2524 2525 2526 2527 2528 2529 2530 2531 2532 2533 2534 2535 2536 2537 2538 2539 2540 2541 2542 2543 2544 2545 2546 2547 2548 2549 2550 2551 2552 2553 2554 2555 2556 2557 2558 2559 2560 2561 2562 2563 2564 2565 2566 2567 2568 2569 2570 2571 2572 2573 2574 2575 2576 2577 2578 2579 2580 2581 2582 2583 2584 2585 2586 2587 2588 2589 2590 2591 2592 2593 2594 2595 2596 2597 2598 2599 2600 2601 2602 2603 2604 2605 2606 2607 2608 2609 2610 2611 2612 2613 2614 2615 2616 2617 2618 2619 2620 2621 2622 2623 2624 2625 2626 2627 2628 2629 2630 2631 2632 2633 2634 2635 2636 2637 2638 2639 2640 2641 2642 2643 2644 2645 2646 2647 2648 2649 2650 2651 2652 2653 2654 2655 2656 2657 2658 2659 2660 2661 2662 2663 2664 2665 2666 2667 2668 2669 2670 2671 2672 2673 2674 2675 2676 2677 2678 2679 2680 2681 2682 2683 2684 2685 2686 2687 2688 2689 2690 2691 2692 2693 2694 2695 2696 2697 2698 2699 2700 2701 2702 2703 2704 2705 2706 2707 2708 2709 2710 2711 2712 2713 2714 2715 2716 2717 2718 2719 2720 2721 2722 2723 2724 2725 2726 2727 2728 2729 2730 2731 2732 2733 2734 2735 2736 2737 2738 2739 2740 2741 2742 2743 2744 2745 2746 2747 2748 2749 2750 // SPDX-License-Identifier: GPL-2.0 /* * Copyright (C) 1992, 1998-2006 Linus Torvalds, Ingo Molnar * Copyright (C) 2005-2006 Thomas Gleixner * * This file contains driver APIs to the irq subsystem. */ #define pr_fmt(fmt) "genirq: " fmt #include <linux/irq.h> #include <linux/kthread.h> #include <linux/module.h> #include <linux/random.h> #include <linux/interrupt.h> #include <linux/irqdomain.h> #include <linux/slab.h> #include <linux/sched.h> #include <linux/sched/rt.h> #include <linux/sched/task.h> #include <linux/sched/isolation.h> #include <uapi/linux/sched/types.h> #include <linux/task_work.h> #include "internals.h" #if defined(CONFIG_IRQ_FORCED_THREADING) && !defined(CONFIG_PREEMPT_RT) DEFINE_STATIC_KEY_FALSE(force_irqthreads_key); static int __init setup_forced_irqthreads(char *arg) { static_branch_enable(&force_irqthreads_key); return 0; } early_param("threadirqs", setup_forced_irqthreads); #endif static int __irq_get_irqchip_state(struct irq_data *d, enum irqchip_irq_state which, bool *state); static void __synchronize_hardirq(struct irq_desc *desc, bool sync_chip) { struct irq_data *irqd = irq_desc_get_irq_data(desc); bool inprogress; do { /* * Wait until we're out of the critical section. This might * give the wrong answer due to the lack of memory barriers. */ while (irqd_irq_inprogress(&desc->irq_data)) cpu_relax(); /* Ok, that indicated we're done: double-check carefully. */ guard(raw_spinlock_irqsave)(&desc->lock); inprogress = irqd_irq_inprogress(&desc->irq_data); /* * If requested and supported, check at the chip whether it * is in flight at the hardware level, i.e. already pending * in a CPU and waiting for service and acknowledge. */ if (!inprogress && sync_chip) { /* * Ignore the return code. inprogress is only updated * when the chip supports it. */ __irq_get_irqchip_state(irqd, IRQCHIP_STATE_ACTIVE, &inprogress); } /* Oops, that failed? */ } while (inprogress); } /** * synchronize_hardirq - wait for pending hard IRQ handlers (on other CPUs) * @irq: interrupt number to wait for * * This function waits for any pending hard IRQ handlers for this interrupt * to complete before returning. If you use this function while holding a * resource the IRQ handler may need you will deadlock. It does not take * associated threaded handlers into account. * * Do not use this for shutdown scenarios where you must be sure that all * parts (hardirq and threaded handler) have completed. * * Returns: false if a threaded handler is active. * * This function may be called - with care - from IRQ context. * * It does not check whether there is an interrupt in flight at the * hardware level, but not serviced yet, as this might deadlock when called * with interrupts disabled and the target CPU of the interrupt is the * current CPU. */ bool synchronize_hardirq(unsigned int irq) { struct irq_desc *desc = irq_to_desc(irq); if (desc) { __synchronize_hardirq(desc, false); return !atomic_read(&desc->threads_active); } return true; } EXPORT_SYMBOL(synchronize_hardirq); static void __synchronize_irq(struct irq_desc *desc) { __synchronize_hardirq(desc, true); /* * We made sure that no hardirq handler is running. Now verify that no * threaded handlers are active. */ wait_event(desc->wait_for_threads, !atomic_read(&desc->threads_active)); } /** * synchronize_irq - wait for pending IRQ handlers (on other CPUs) * @irq: interrupt number to wait for * * This function waits for any pending IRQ handlers for this interrupt to * complete before returning. If you use this function while holding a * resource the IRQ handler may need you will deadlock. * * Can only be called from preemptible code as it might sleep when * an interrupt thread is associated to @irq. * * It optionally makes sure (when the irq chip supports that method) * that the interrupt is not pending in any CPU and waiting for * service. */ void synchronize_irq(unsigned int irq) { struct irq_desc *desc = irq_to_desc(irq); if (desc) __synchronize_irq(desc); } EXPORT_SYMBOL(synchronize_irq); #ifdef CONFIG_SMP cpumask_var_t irq_default_affinity; static bool __irq_can_set_affinity(struct irq_desc *desc) { if (!desc || !irqd_can_balance(&desc->irq_data) || !desc->irq_data.chip || !desc->irq_data.chip->irq_set_affinity) return false; return true; } /** * irq_can_set_affinity - Check if the affinity of a given irq can be set * @irq: Interrupt to check * */ int irq_can_set_affinity(unsigned int irq) { return __irq_can_set_affinity(irq_to_desc(irq)); } /** * irq_can_set_affinity_usr - Check if affinity of a irq can be set from user space * @irq: Interrupt to check * * Like irq_can_set_affinity() above, but additionally checks for the * AFFINITY_MANAGED flag. */ bool irq_can_set_affinity_usr(unsigned int irq) { struct irq_desc *desc = irq_to_desc(irq); return __irq_can_set_affinity(desc) && !irqd_affinity_is_managed(&desc->irq_data); } /** * irq_set_thread_affinity - Notify irq threads to adjust affinity * @desc: irq descriptor which has affinity changed * * Just set IRQTF_AFFINITY and delegate the affinity setting to the * interrupt thread itself. We can not call set_cpus_allowed_ptr() here as * we hold desc->lock and this code can be called from hard interrupt * context. */ static void irq_set_thread_affinity(struct irq_desc *desc) { struct irqaction *action; for_each_action_of_desc(desc, action) { if (action->thread) { set_bit(IRQTF_AFFINITY, &action->thread_flags); wake_up_process(action->thread); } if (action->secondary && action->secondary->thread) { set_bit(IRQTF_AFFINITY, &action->secondary->thread_flags); wake_up_process(action->secondary->thread); } } } #ifdef CONFIG_GENERIC_IRQ_EFFECTIVE_AFF_MASK static void irq_validate_effective_affinity(struct irq_data *data) { const struct cpumask *m = irq_data_get_effective_affinity_mask(data); struct irq_chip *chip = irq_data_get_irq_chip(data); if (!cpumask_empty(m)) return; pr_warn_once("irq_chip %s did not update eff. affinity mask of irq %u\n", chip->name, data->irq); } #else static inline void irq_validate_effective_affinity(struct irq_data *data) { } #endif static DEFINE_PER_CPU(struct cpumask, __tmp_mask); int irq_do_set_affinity(struct irq_data *data, const struct cpumask *mask, bool force) { struct cpumask *tmp_mask = this_cpu_ptr(&__tmp_mask); struct irq_desc *desc = irq_data_to_desc(data); struct irq_chip *chip = irq_data_get_irq_chip(data); const struct cpumask *prog_mask; int ret; if (!chip || !chip->irq_set_affinity) return -EINVAL; /* * If this is a managed interrupt and housekeeping is enabled on * it check whether the requested affinity mask intersects with * a housekeeping CPU. If so, then remove the isolated CPUs from * the mask and just keep the housekeeping CPU(s). This prevents * the affinity setter from routing the interrupt to an isolated * CPU to avoid that I/O submitted from a housekeeping CPU causes * interrupts on an isolated one. * * If the masks do not intersect or include online CPU(s) then * keep the requested mask. The isolated target CPUs are only * receiving interrupts when the I/O operation was submitted * directly from them. * * If all housekeeping CPUs in the affinity mask are offline, the * interrupt will be migrated by the CPU hotplug code once a * housekeeping CPU which belongs to the affinity mask comes * online. */ if (irqd_affinity_is_managed(data) && housekeeping_enabled(HK_TYPE_MANAGED_IRQ)) { const struct cpumask *hk_mask; hk_mask = housekeeping_cpumask(HK_TYPE_MANAGED_IRQ); cpumask_and(tmp_mask, mask, hk_mask); if (!cpumask_intersects(tmp_mask, cpu_online_mask)) prog_mask = mask; else prog_mask = tmp_mask; } else { prog_mask = mask; } /* * Make sure we only provide online CPUs to the irqchip, * unless we are being asked to force the affinity (in which * case we do as we are told). */ cpumask_and(tmp_mask, prog_mask, cpu_online_mask); if (!force && !cpumask_empty(tmp_mask)) ret = chip->irq_set_affinity(data, tmp_mask, force); else if (force) ret = chip->irq_set_affinity(data, mask, force); else ret = -EINVAL; switch (ret) { case IRQ_SET_MASK_OK: case IRQ_SET_MASK_OK_DONE: cpumask_copy(desc->irq_common_data.affinity, mask); fallthrough; case IRQ_SET_MASK_OK_NOCOPY: irq_validate_effective_affinity(data); irq_set_thread_affinity(desc); ret = 0; } return ret; } #ifdef CONFIG_GENERIC_PENDING_IRQ static inline int irq_set_affinity_pending(struct irq_data *data, const struct cpumask *dest) { struct irq_desc *desc = irq_data_to_desc(data); irqd_set_move_pending(data); irq_copy_pending(desc, dest); return 0; } #else static inline int irq_set_affinity_pending(struct irq_data *data, const struct cpumask *dest) { return -EBUSY; } #endif static int irq_try_set_affinity(struct irq_data *data, const struct cpumask *dest, bool force) { int ret = irq_do_set_affinity(data, dest, force); /* * In case that the underlying vector management is busy and the * architecture supports the generic pending mechanism then utilize * this to avoid returning an error to user space. */ if (ret == -EBUSY && !force) ret = irq_set_affinity_pending(data, dest); return ret; } static bool irq_set_affinity_deactivated(struct irq_data *data, const struct cpumask *mask) { struct irq_desc *desc = irq_data_to_desc(data); /* * Handle irq chips which can handle affinity only in activated * state correctly * * If the interrupt is not yet activated, just store the affinity * mask and do not call the chip driver at all. On activation the * driver has to make sure anyway that the interrupt is in a * usable state so startup works. */ if (!IS_ENABLED(CONFIG_IRQ_DOMAIN_HIERARCHY) || irqd_is_activated(data) || !irqd_affinity_on_activate(data)) return false; cpumask_copy(desc->irq_common_data.affinity, mask); irq_data_update_effective_affinity(data, mask); irqd_set(data, IRQD_AFFINITY_SET); return true; } int irq_set_affinity_locked(struct irq_data *data, const struct cpumask *mask, bool force) { struct irq_chip *chip = irq_data_get_irq_chip(data); struct irq_desc *desc = irq_data_to_desc(data); int ret = 0; if (!chip || !chip->irq_set_affinity) return -EINVAL; if (irq_set_affinity_deactivated(data, mask)) return 0; if (irq_can_move_pcntxt(data) && !irqd_is_setaffinity_pending(data)) { ret = irq_try_set_affinity(data, mask, force); } else { irqd_set_move_pending(data); irq_copy_pending(desc, mask); } if (desc->affinity_notify) { kref_get(&desc->affinity_notify->kref); if (!schedule_work(&desc->affinity_notify->work)) { /* Work was already scheduled, drop our extra ref */ kref_put(&desc->affinity_notify->kref, desc->affinity_notify->release); } } irqd_set(data, IRQD_AFFINITY_SET); return ret; } /** * irq_update_affinity_desc - Update affinity management for an interrupt * @irq: The interrupt number to update * @affinity: Pointer to the affinity descriptor * * This interface can be used to configure the affinity management of * interrupts which have been allocated already. * * There are certain limitations on when it may be used - attempts to use it * for when the kernel is configured for generic IRQ reservation mode (in * config GENERIC_IRQ_RESERVATION_MODE) will fail, as it may conflict with * managed/non-managed interrupt accounting. In addition, attempts to use it on * an interrupt which is already started or which has already been configured * as managed will also fail, as these mean invalid init state or double init. */ int irq_update_affinity_desc(unsigned int irq, struct irq_affinity_desc *affinity) { /* * Supporting this with the reservation scheme used by x86 needs * some more thought. Fail it for now. */ if (IS_ENABLED(CONFIG_GENERIC_IRQ_RESERVATION_MODE)) return -EOPNOTSUPP; scoped_irqdesc_get_and_buslock(irq, 0) { struct irq_desc *desc = scoped_irqdesc; bool activated; /* Requires the interrupt to be shut down */ if (irqd_is_started(&desc->irq_data)) return -EBUSY; /* Interrupts which are already managed cannot be modified */ if (irqd_affinity_is_managed(&desc->irq_data)) return -EBUSY; /* * Deactivate the interrupt. That's required to undo * anything an earlier activation has established. */ activated = irqd_is_activated(&desc->irq_data); if (activated) irq_domain_deactivate_irq(&desc->irq_data); if (affinity->is_managed) { irqd_set(&desc->irq_data, IRQD_AFFINITY_MANAGED); irqd_set(&desc->irq_data, IRQD_MANAGED_SHUTDOWN); } cpumask_copy(desc->irq_common_data.affinity, &affinity->mask); /* Restore the activation state */ if (activated) irq_domain_activate_irq(&desc->irq_data, false); return 0; } return -EINVAL; } static int __irq_set_affinity(unsigned int irq, const struct cpumask *mask, bool force) { struct irq_desc *desc = irq_to_desc(irq); if (!desc) return -EINVAL; guard(raw_spinlock_irqsave)(&desc->lock); return irq_set_affinity_locked(irq_desc_get_irq_data(desc), mask, force); } /** * irq_set_affinity - Set the irq affinity of a given irq * @irq: Interrupt to set affinity * @cpumask: cpumask * * Fails if cpumask does not contain an online CPU */ int irq_set_affinity(unsigned int irq, const struct cpumask *cpumask) { return __irq_set_affinity(irq, cpumask, false); } EXPORT_SYMBOL_GPL(irq_set_affinity); /** * irq_force_affinity - Force the irq affinity of a given irq * @irq: Interrupt to set affinity * @cpumask: cpumask * * Same as irq_set_affinity, but without checking the mask against * online cpus. * * Solely for low level cpu hotplug code, where we need to make per * cpu interrupts affine before the cpu becomes online. */ int irq_force_affinity(unsigned int irq, const struct cpumask *cpumask) { return __irq_set_affinity(irq, cpumask, true); } EXPORT_SYMBOL_GPL(irq_force_affinity); int __irq_apply_affinity_hint(unsigned int irq, const struct cpumask *m, bool setaffinity) { int ret = -EINVAL; scoped_irqdesc_get_and_lock(irq, IRQ_GET_DESC_CHECK_GLOBAL) { scoped_irqdesc->affinity_hint = m; ret = 0; } if (!ret && m && setaffinity) __irq_set_affinity(irq, m, false); return ret; } EXPORT_SYMBOL_GPL(__irq_apply_affinity_hint); static void irq_affinity_notify(struct work_struct *work) { struct irq_affinity_notify *notify = container_of(work, struct irq_affinity_notify, work); struct irq_desc *desc = irq_to_desc(notify->irq); cpumask_var_t cpumask; if (!desc || !alloc_cpumask_var(&cpumask, GFP_KERNEL)) goto out; scoped_guard(raw_spinlock_irqsave, &desc->lock) { if (irq_move_pending(&desc->irq_data)) irq_get_pending(cpumask, desc); else cpumask_copy(cpumask, desc->irq_common_data.affinity); } notify->notify(notify, cpumask); free_cpumask_var(cpumask); out: kref_put(&notify->kref, notify->release); } /** * irq_set_affinity_notifier - control notification of IRQ affinity changes * @irq: Interrupt for which to enable/disable notification * @notify: Context for notification, or %NULL to disable * notification. Function pointers must be initialised; * the other fields will be initialised by this function. * * Must be called in process context. Notification may only be enabled * after the IRQ is allocated and must be disabled before the IRQ is freed * using free_irq(). */ int irq_set_affinity_notifier(unsigned int irq, struct irq_affinity_notify *notify) { struct irq_desc *desc = irq_to_desc(irq); struct irq_affinity_notify *old_notify; /* The release function is promised process context */ might_sleep(); if (!desc || irq_is_nmi(desc)) return -EINVAL; /* Complete initialisation of *notify */ if (notify) { notify->irq = irq; kref_init(&notify->kref); INIT_WORK(&notify->work, irq_affinity_notify); } scoped_guard(raw_spinlock_irqsave, &desc->lock) { old_notify = desc->affinity_notify; desc->affinity_notify = notify; } if (old_notify) { if (cancel_work_sync(&old_notify->work)) { /* Pending work had a ref, put that one too */ kref_put(&old_notify->kref, old_notify->release); } kref_put(&old_notify->kref, old_notify->release); } return 0; } EXPORT_SYMBOL_GPL(irq_set_affinity_notifier); #ifndef CONFIG_AUTO_IRQ_AFFINITY /* * Generic version of the affinity autoselector. */ int irq_setup_affinity(struct irq_desc *desc) { struct cpumask *set = irq_default_affinity; int node = irq_desc_get_node(desc); static DEFINE_RAW_SPINLOCK(mask_lock); static struct cpumask mask; /* Excludes PER_CPU and NO_BALANCE interrupts */ if (!__irq_can_set_affinity(desc)) return 0; guard(raw_spinlock)(&mask_lock); /* * Preserve the managed affinity setting and a userspace affinity * setup, but make sure that one of the targets is online. */ if (irqd_affinity_is_managed(&desc->irq_data) || irqd_has_set(&desc->irq_data, IRQD_AFFINITY_SET)) { if (cpumask_intersects(desc->irq_common_data.affinity, cpu_online_mask)) set = desc->irq_common_data.affinity; else irqd_clear(&desc->irq_data, IRQD_AFFINITY_SET); } cpumask_and(&mask, cpu_online_mask, set); if (cpumask_empty(&mask)) cpumask_copy(&mask, cpu_online_mask); if (node != NUMA_NO_NODE) { const struct cpumask *nodemask = cpumask_of_node(node); /* make sure at least one of the cpus in nodemask is online */ if (cpumask_intersects(&mask, nodemask)) cpumask_and(&mask, &mask, nodemask); } return irq_do_set_affinity(&desc->irq_data, &mask, false); } #else /* Wrapper for ALPHA specific affinity selector magic */ int irq_setup_affinity(struct irq_desc *desc) { return irq_select_affinity(irq_desc_get_irq(desc)); } #endif /* CONFIG_AUTO_IRQ_AFFINITY */ #endif /* CONFIG_SMP */ /** * irq_set_vcpu_affinity - Set vcpu affinity for the interrupt * @irq: interrupt number to set affinity * @vcpu_info: vCPU specific data or pointer to a percpu array of vCPU * specific data for percpu_devid interrupts * * This function uses the vCPU specific data to set the vCPU affinity for * an irq. The vCPU specific data is passed from outside, such as KVM. One * example code path is as below: KVM -> IOMMU -> irq_set_vcpu_affinity(). */ int irq_set_vcpu_affinity(unsigned int irq, void *vcpu_info) { scoped_irqdesc_get_and_lock(irq, 0) { struct irq_desc *desc = scoped_irqdesc; struct irq_data *data; struct irq_chip *chip; data = irq_desc_get_irq_data(desc); do { chip = irq_data_get_irq_chip(data); if (chip && chip->irq_set_vcpu_affinity) break; data = irqd_get_parent_data(data); } while (data); if (!data) return -ENOSYS; return chip->irq_set_vcpu_affinity(data, vcpu_info); } return -EINVAL; } EXPORT_SYMBOL_GPL(irq_set_vcpu_affinity); void __disable_irq(struct irq_desc *desc) { if (!desc->depth++) irq_disable(desc); } static int __disable_irq_nosync(unsigned int irq) { scoped_irqdesc_get_and_lock(irq, IRQ_GET_DESC_CHECK_GLOBAL) { __disable_irq(scoped_irqdesc); return 0; } return -EINVAL; } /** * disable_irq_nosync - disable an irq without waiting * @irq: Interrupt to disable * * Disable the selected interrupt line. Disables and Enables are * nested. * Unlike disable_irq(), this function does not ensure existing * instances of the IRQ handler have completed before returning. * * This function may be called from IRQ context. */ void disable_irq_nosync(unsigned int irq) { __disable_irq_nosync(irq); } EXPORT_SYMBOL(disable_irq_nosync); /** * disable_irq - disable an irq and wait for completion * @irq: Interrupt to disable * * Disable the selected interrupt line. Enables and Disables are nested. * * This function waits for any pending IRQ handlers for this interrupt to * complete before returning. If you use this function while holding a * resource the IRQ handler may need you will deadlock. * * Can only be called from preemptible code as it might sleep when an * interrupt thread is associated to @irq. * */ void disable_irq(unsigned int irq) { might_sleep(); if (!__disable_irq_nosync(irq)) synchronize_irq(irq); } EXPORT_SYMBOL(disable_irq); /** * disable_hardirq - disables an irq and waits for hardirq completion * @irq: Interrupt to disable * * Disable the selected interrupt line. Enables and Disables are nested. * * This function waits for any pending hard IRQ handlers for this interrupt * to complete before returning. If you use this function while holding a * resource the hard IRQ handler may need you will deadlock. * * When used to optimistically disable an interrupt from atomic context the * return value must be checked. * * Returns: false if a threaded handler is active. * * This function may be called - with care - from IRQ context. */ bool disable_hardirq(unsigned int irq) { if (!__disable_irq_nosync(irq)) return synchronize_hardirq(irq); return false; } EXPORT_SYMBOL_GPL(disable_hardirq); /** * disable_nmi_nosync - disable an nmi without waiting * @irq: Interrupt to disable * * Disable the selected interrupt line. Disables and enables are nested. * * The interrupt to disable must have been requested through request_nmi. * Unlike disable_nmi(), this function does not ensure existing * instances of the IRQ handler have completed before returning. */ void disable_nmi_nosync(unsigned int irq) { disable_irq_nosync(irq); } void __enable_irq(struct irq_desc *desc) { switch (desc->depth) { case 0: err_out: WARN(1, KERN_WARNING "Unbalanced enable for IRQ %d\n", irq_desc_get_irq(desc)); break; case 1: { if (desc->istate & IRQS_SUSPENDED) goto err_out; /* Prevent probing on this irq: */ irq_settings_set_noprobe(desc); /* * Call irq_startup() not irq_enable() here because the * interrupt might be marked NOAUTOEN so irq_startup() * needs to be invoked when it gets enabled the first time. * This is also required when __enable_irq() is invoked for * a managed and shutdown interrupt from the S3 resume * path. * * If it was already started up, then irq_startup() will * invoke irq_enable() under the hood. */ irq_startup(desc, IRQ_RESEND, IRQ_START_FORCE); break; } default: desc->depth--; } } /** * enable_irq - enable handling of an irq * @irq: Interrupt to enable * * Undoes the effect of one call to disable_irq(). If this matches the * last disable, processing of interrupts on this IRQ line is re-enabled. * * This function may be called from IRQ context only when * desc->irq_data.chip->bus_lock and desc->chip->bus_sync_unlock are NULL ! */ void enable_irq(unsigned int irq) { scoped_irqdesc_get_and_lock(irq, IRQ_GET_DESC_CHECK_GLOBAL) { struct irq_desc *desc = scoped_irqdesc; if (WARN(!desc->irq_data.chip, "enable_irq before setup/request_irq: irq %u\n", irq)) return; __enable_irq(desc); } } EXPORT_SYMBOL(enable_irq); /** * enable_nmi - enable handling of an nmi * @irq: Interrupt to enable * * The interrupt to enable must have been requested through request_nmi. * Undoes the effect of one call to disable_nmi(). If this matches the last * disable, processing of interrupts on this IRQ line is re-enabled. */ void enable_nmi(unsigned int irq) { enable_irq(irq); } static int set_irq_wake_real(unsigned int irq, unsigned int on) { struct irq_desc *desc = irq_to_desc(irq); int ret = -ENXIO; if (irq_desc_get_chip(desc)->flags & IRQCHIP_SKIP_SET_WAKE) return 0; if (desc->irq_data.chip->irq_set_wake) ret = desc->irq_data.chip->irq_set_wake(&desc->irq_data, on); return ret; } /** * irq_set_irq_wake - control irq power management wakeup * @irq: interrupt to control * @on: enable/disable power management wakeup * * Enable/disable power management wakeup mode, which is disabled by * default. Enables and disables must match, just as they match for * non-wakeup mode support. * * Wakeup mode lets this IRQ wake the system from sleep states like * "suspend to RAM". * * Note: irq enable/disable state is completely orthogonal to the * enable/disable state of irq wake. An irq can be disabled with * disable_irq() and still wake the system as long as the irq has wake * enabled. If this does not hold, then the underlying irq chip and the * related driver need to be investigated. */ int irq_set_irq_wake(unsigned int irq, unsigned int on) { scoped_irqdesc_get_and_buslock(irq, IRQ_GET_DESC_CHECK_GLOBAL) { struct irq_desc *desc = scoped_irqdesc; int ret = 0; /* Don't use NMIs as wake up interrupts please */ if (irq_is_nmi(desc)) return -EINVAL; /* * wakeup-capable irqs can be shared between drivers that * don't need to have the same sleep mode behaviors. */ if (on) { if (desc->wake_depth++ == 0) { ret = set_irq_wake_real(irq, on); if (ret) desc->wake_depth = 0; else irqd_set(&desc->irq_data, IRQD_WAKEUP_STATE); } } else { if (desc->wake_depth == 0) { WARN(1, "Unbalanced IRQ %d wake disable\n", irq); } else if (--desc->wake_depth == 0) { ret = set_irq_wake_real(irq, on); if (ret) desc->wake_depth = 1; else irqd_clear(&desc->irq_data, IRQD_WAKEUP_STATE); } } return ret; } return -EINVAL; } EXPORT_SYMBOL(irq_set_irq_wake); /* * Internal function that tells the architecture code whether a * particular irq has been exclusively allocated or is available * for driver use. */ bool can_request_irq(unsigned int irq, unsigned long irqflags) { scoped_irqdesc_get_and_lock(irq, IRQ_GET_DESC_CHECK_GLOBAL) { struct irq_desc *desc = scoped_irqdesc; if (irq_settings_can_request(desc)) { if (!desc->action || irqflags & desc->action->flags & IRQF_SHARED) return true; } } return false; } int __irq_set_trigger(struct irq_desc *desc, unsigned long flags) { struct irq_chip *chip = desc->irq_data.chip; int ret, unmask = 0; if (!chip || !chip->irq_set_type) { /* * IRQF_TRIGGER_* but the PIC does not support multiple * flow-types? */ pr_debug("No set_type function for IRQ %d (%s)\n", irq_desc_get_irq(desc), chip ? (chip->name ? : "unknown") : "unknown"); return 0; } if (chip->flags & IRQCHIP_SET_TYPE_MASKED) { if (!irqd_irq_masked(&desc->irq_data)) mask_irq(desc); if (!irqd_irq_disabled(&desc->irq_data)) unmask = 1; } /* Mask all flags except trigger mode */ flags &= IRQ_TYPE_SENSE_MASK; ret = chip->irq_set_type(&desc->irq_data, flags); switch (ret) { case IRQ_SET_MASK_OK: case IRQ_SET_MASK_OK_DONE: irqd_clear(&desc->irq_data, IRQD_TRIGGER_MASK); irqd_set(&desc->irq_data, flags); fallthrough; case IRQ_SET_MASK_OK_NOCOPY: flags = irqd_get_trigger_type(&desc->irq_data); irq_settings_set_trigger_mask(desc, flags); irqd_clear(&desc->irq_data, IRQD_LEVEL); irq_settings_clr_level(desc); if (flags & IRQ_TYPE_LEVEL_MASK) { irq_settings_set_level(desc); irqd_set(&desc->irq_data, IRQD_LEVEL); } ret = 0; break; default: pr_err("Setting trigger mode %lu for irq %u failed (%pS)\n", flags, irq_desc_get_irq(desc), chip->irq_set_type); } if (unmask) unmask_irq(desc); return ret; } #ifdef CONFIG_HARDIRQS_SW_RESEND int irq_set_parent(int irq, int parent_irq) { scoped_irqdesc_get_and_lock(irq, 0) { scoped_irqdesc->parent_irq = parent_irq; return 0; } return -EINVAL; } EXPORT_SYMBOL_GPL(irq_set_parent); #endif /* * Default primary interrupt handler for threaded interrupts. Is * assigned as primary handler when request_threaded_irq is called * with handler == NULL. Useful for oneshot interrupts. */ static irqreturn_t irq_default_primary_handler(int irq, void *dev_id) { return IRQ_WAKE_THREAD; } /* * Primary handler for nested threaded interrupts. Should never be * called. */ static irqreturn_t irq_nested_primary_handler(int irq, void *dev_id) { WARN(1, "Primary handler called for nested irq %d\n", irq); return IRQ_NONE; } static irqreturn_t irq_forced_secondary_handler(int irq, void *dev_id) { WARN(1, "Secondary action handler called for irq %d\n", irq); return IRQ_NONE; } #ifdef CONFIG_SMP /* * Check whether we need to change the affinity of the interrupt thread. */ static void irq_thread_check_affinity(struct irq_desc *desc, struct irqaction *action) { cpumask_var_t mask; bool valid = false; if (!test_and_clear_bit(IRQTF_AFFINITY, &action->thread_flags)) return; __set_current_state(TASK_RUNNING); /* * In case we are out of memory we set IRQTF_AFFINITY again and * try again next time */ if (!alloc_cpumask_var(&mask, GFP_KERNEL)) { set_bit(IRQTF_AFFINITY, &action->thread_flags); return; } scoped_guard(raw_spinlock_irq, &desc->lock) { /* * This code is triggered unconditionally. Check the affinity * mask pointer. For CPU_MASK_OFFSTACK=n this is optimized out. */ if (cpumask_available(desc->irq_common_data.affinity)) { const struct cpumask *m; m = irq_data_get_effective_affinity_mask(&desc->irq_data); cpumask_copy(mask, m); valid = true; } } if (valid) set_cpus_allowed_ptr(current, mask); free_cpumask_var(mask); } #else static inline void irq_thread_check_affinity(struct irq_desc *desc, struct irqaction *action) { } #endif static int irq_wait_for_interrupt(struct irq_desc *desc, struct irqaction *action) { for (;;) { set_current_state(TASK_INTERRUPTIBLE); irq_thread_check_affinity(desc, action); if (kthread_should_stop()) { /* may need to run one last time */ if (test_and_clear_bit(IRQTF_RUNTHREAD, &action->thread_flags)) { __set_current_state(TASK_RUNNING); return 0; } __set_current_state(TASK_RUNNING); return -1; } if (test_and_clear_bit(IRQTF_RUNTHREAD, &action->thread_flags)) { __set_current_state(TASK_RUNNING); return 0; } schedule(); } } /* * Oneshot interrupts keep the irq line masked until the threaded * handler finished. unmask if the interrupt has not been disabled and * is marked MASKED. */ static void irq_finalize_oneshot(struct irq_desc *desc, struct irqaction *action) { if (!(desc->istate & IRQS_ONESHOT) || action->handler == irq_forced_secondary_handler) return; again: chip_bus_lock(desc); raw_spin_lock_irq(&desc->lock); /* * Implausible though it may be we need to protect us against * the following scenario: * * The thread is faster done than the hard interrupt handler * on the other CPU. If we unmask the irq line then the * interrupt can come in again and masks the line, leaves due * to IRQS_INPROGRESS and the irq line is masked forever. * * This also serializes the state of shared oneshot handlers * versus "desc->threads_oneshot |= action->thread_mask;" in * irq_wake_thread(). See the comment there which explains the * serialization. */ if (unlikely(irqd_irq_inprogress(&desc->irq_data))) { raw_spin_unlock_irq(&desc->lock); chip_bus_sync_unlock(desc); cpu_relax(); goto again; } /* * Now check again, whether the thread should run. Otherwise * we would clear the threads_oneshot bit of this thread which * was just set. */ if (test_bit(IRQTF_RUNTHREAD, &action->thread_flags)) goto out_unlock; desc->threads_oneshot &= ~action->thread_mask; if (!desc->threads_oneshot && !irqd_irq_disabled(&desc->irq_data) && irqd_irq_masked(&desc->irq_data)) unmask_threaded_irq(desc); out_unlock: raw_spin_unlock_irq(&desc->lock); chip_bus_sync_unlock(desc); } /* * Interrupts explicitly requested as threaded interrupts want to be * preemptible - many of them need to sleep and wait for slow busses to * complete. */ static irqreturn_t irq_thread_fn(struct irq_desc *desc, struct irqaction *action) { irqreturn_t ret = action->thread_fn(action->irq, action->dev_id); if (ret == IRQ_HANDLED) atomic_inc(&desc->threads_handled); irq_finalize_oneshot(desc, action); return ret; } /* * Interrupts which are not explicitly requested as threaded * interrupts rely on the implicit bh/preempt disable of the hard irq * context. So we need to disable bh here to avoid deadlocks and other * side effects. */ static irqreturn_t irq_forced_thread_fn(struct irq_desc *desc, struct irqaction *action) { irqreturn_t ret; local_bh_disable(); if (!IS_ENABLED(CONFIG_PREEMPT_RT)) local_irq_disable(); ret = irq_thread_fn(desc, action); if (!IS_ENABLED(CONFIG_PREEMPT_RT)) local_irq_enable(); local_bh_enable(); return ret; } void wake_threads_waitq(struct irq_desc *desc) { if (atomic_dec_and_test(&desc->threads_active)) wake_up(&desc->wait_for_threads); } static void irq_thread_dtor(struct callback_head *unused) { struct task_struct *tsk = current; struct irq_desc *desc; struct irqaction *action; if (WARN_ON_ONCE(!(current->flags & PF_EXITING))) return; action = kthread_data(tsk); pr_err("exiting task \"%s\" (%d) is an active IRQ thread (irq %d)\n", tsk->comm, tsk->pid, action->irq); desc = irq_to_desc(action->irq); /* * If IRQTF_RUNTHREAD is set, we need to decrement * desc->threads_active and wake possible waiters. */ if (test_and_clear_bit(IRQTF_RUNTHREAD, &action->thread_flags)) wake_threads_waitq(desc); /* Prevent a stale desc->threads_oneshot */ irq_finalize_oneshot(desc, action); } static void irq_wake_secondary(struct irq_desc *desc, struct irqaction *action) { struct irqaction *secondary = action->secondary; if (WARN_ON_ONCE(!secondary)) return; guard(raw_spinlock_irq)(&desc->lock); __irq_wake_thread(desc, secondary); } /* * Internal function to notify that a interrupt thread is ready. */ static void irq_thread_set_ready(struct irq_desc *desc, struct irqaction *action) { set_bit(IRQTF_READY, &action->thread_flags); wake_up(&desc->wait_for_threads); } /* * Internal function to wake up a interrupt thread and wait until it is * ready. */ static void wake_up_and_wait_for_irq_thread_ready(struct irq_desc *desc, struct irqaction *action) { if (!action || !action->thread) return; wake_up_process(action->thread); wait_event(desc->wait_for_threads, test_bit(IRQTF_READY, &action->thread_flags)); } /* * Interrupt handler thread */ static int irq_thread(void *data) { struct callback_head on_exit_work; struct irqaction *action = data; struct irq_desc *desc = irq_to_desc(action->irq); irqreturn_t (*handler_fn)(struct irq_desc *desc, struct irqaction *action); irq_thread_set_ready(desc, action); sched_set_fifo(current); if (force_irqthreads() && test_bit(IRQTF_FORCED_THREAD, &action->thread_flags)) handler_fn = irq_forced_thread_fn; else handler_fn = irq_thread_fn; init_task_work(&on_exit_work, irq_thread_dtor); task_work_add(current, &on_exit_work, TWA_NONE); while (!irq_wait_for_interrupt(desc, action)) { irqreturn_t action_ret; action_ret = handler_fn(desc, action); if (action_ret == IRQ_WAKE_THREAD) irq_wake_secondary(desc, action); wake_threads_waitq(desc); } /* * This is the regular exit path. __free_irq() is stopping the * thread via kthread_stop() after calling * synchronize_hardirq(). So neither IRQTF_RUNTHREAD nor the * oneshot mask bit can be set. */ task_work_cancel_func(current, irq_thread_dtor); return 0; } /** * irq_wake_thread - wake the irq thread for the action identified by dev_id * @irq: Interrupt line * @dev_id: Device identity for which the thread should be woken */ void irq_wake_thread(unsigned int irq, void *dev_id) { struct irq_desc *desc = irq_to_desc(irq); struct irqaction *action; if (!desc || WARN_ON(irq_settings_is_per_cpu_devid(desc))) return; guard(raw_spinlock_irqsave)(&desc->lock); for_each_action_of_desc(desc, action) { if (action->dev_id == dev_id) { if (action->thread) __irq_wake_thread(desc, action); break; } } } EXPORT_SYMBOL_GPL(irq_wake_thread); static int irq_setup_forced_threading(struct irqaction *new) { if (!force_irqthreads()) return 0; if (new->flags & (IRQF_NO_THREAD | IRQF_PERCPU | IRQF_ONESHOT)) return 0; /* * No further action required for interrupts which are requested as * threaded interrupts already */ if (new->handler == irq_default_primary_handler) return 0; new->flags |= IRQF_ONESHOT; /* * Handle the case where we have a real primary handler and a * thread handler. We force thread them as well by creating a * secondary action. */ if (new->handler && new->thread_fn) { /* Allocate the secondary action */ new->secondary = kzalloc(sizeof(struct irqaction), GFP_KERNEL); if (!new->secondary) return -ENOMEM; new->secondary->handler = irq_forced_secondary_handler; new->secondary->thread_fn = new->thread_fn; new->secondary->dev_id = new->dev_id; new->secondary->irq = new->irq; new->secondary->name = new->name; } /* Deal with the primary handler */ set_bit(IRQTF_FORCED_THREAD, &new->thread_flags); new->thread_fn = new->handler; new->handler = irq_default_primary_handler; return 0; } static int irq_request_resources(struct irq_desc *desc) { struct irq_data *d = &desc->irq_data; struct irq_chip *c = d->chip; return c->irq_request_resources ? c->irq_request_resources(d) : 0; } static void irq_release_resources(struct irq_desc *desc) { struct irq_data *d = &desc->irq_data; struct irq_chip *c = d->chip; if (c->irq_release_resources) c->irq_release_resources(d); } static bool irq_supports_nmi(struct irq_desc *desc) { struct irq_data *d = irq_desc_get_irq_data(desc); #ifdef CONFIG_IRQ_DOMAIN_HIERARCHY /* Only IRQs directly managed by the root irqchip can be set as NMI */ if (d->parent_data) return false; #endif /* Don't support NMIs for chips behind a slow bus */ if (d->chip->irq_bus_lock || d->chip->irq_bus_sync_unlock) return false; return d->chip->flags & IRQCHIP_SUPPORTS_NMI; } static int irq_nmi_setup(struct irq_desc *desc) { struct irq_data *d = irq_desc_get_irq_data(desc); struct irq_chip *c = d->chip; return c->irq_nmi_setup ? c->irq_nmi_setup(d) : -EINVAL; } static void irq_nmi_teardown(struct irq_desc *desc) { struct irq_data *d = irq_desc_get_irq_data(desc); struct irq_chip *c = d->chip; if (c->irq_nmi_teardown) c->irq_nmi_teardown(d); } static int setup_irq_thread(struct irqaction *new, unsigned int irq, bool secondary) { struct task_struct *t; if (!secondary) { t = kthread_create(irq_thread, new, "irq/%d-%s", irq, new->name); } else { t = kthread_create(irq_thread, new, "irq/%d-s-%s", irq, new->name); } if (IS_ERR(t)) return PTR_ERR(t); /* * We keep the reference to the task struct even if * the thread dies to avoid that the interrupt code * references an already freed task_struct. */ new->thread = get_task_struct(t); /* * Tell the thread to set its affinity. This is * important for shared interrupt handlers as we do * not invoke setup_affinity() for the secondary * handlers as everything is already set up. Even for * interrupts marked with IRQF_NO_BALANCE this is * correct as we want the thread to move to the cpu(s) * on which the requesting code placed the interrupt. */ set_bit(IRQTF_AFFINITY, &new->thread_flags); return 0; } /* * Internal function to register an irqaction - typically used to * allocate special interrupts that are part of the architecture. * * Locking rules: * * desc->request_mutex Provides serialization against a concurrent free_irq() * chip_bus_lock Provides serialization for slow bus operations * desc->lock Provides serialization against hard interrupts * * chip_bus_lock and desc->lock are sufficient for all other management and * interrupt related functions. desc->request_mutex solely serializes * request/free_irq(). */ static int __setup_irq(unsigned int irq, struct irq_desc *desc, struct irqaction *new) { struct irqaction *old, **old_ptr; unsigned long flags, thread_mask = 0; int ret, nested, shared = 0; if (!desc) return -EINVAL; if (desc->irq_data.chip == &no_irq_chip) return -ENOSYS; if (!try_module_get(desc->owner)) return -ENODEV; new->irq = irq; /* * If the trigger type is not specified by the caller, * then use the default for this interrupt. */ if (!(new->flags & IRQF_TRIGGER_MASK)) new->flags |= irqd_get_trigger_type(&desc->irq_data); /* * Check whether the interrupt nests into another interrupt * thread. */ nested = irq_settings_is_nested_thread(desc); if (nested) { if (!new->thread_fn) { ret = -EINVAL; goto out_mput; } /* * Replace the primary handler which was provided from * the driver for non nested interrupt handling by the * dummy function which warns when called. */ new->handler = irq_nested_primary_handler; } else { if (irq_settings_can_thread(desc)) { ret = irq_setup_forced_threading(new); if (ret) goto out_mput; } } /* * Create a handler thread when a thread function is supplied * and the interrupt does not nest into another interrupt * thread. */ if (new->thread_fn && !nested) { ret = setup_irq_thread(new, irq, false); if (ret) goto out_mput; if (new->secondary) { ret = setup_irq_thread(new->secondary, irq, true); if (ret) goto out_thread; } } /* * Drivers are often written to work w/o knowledge about the * underlying irq chip implementation, so a request for a * threaded irq without a primary hard irq context handler * requires the ONESHOT flag to be set. Some irq chips like * MSI based interrupts are per se one shot safe. Check the * chip flags, so we can avoid the unmask dance at the end of * the threaded handler for those. */ if (desc->irq_data.chip->flags & IRQCHIP_ONESHOT_SAFE) new->flags &= ~IRQF_ONESHOT; /* * Protects against a concurrent __free_irq() call which might wait * for synchronize_hardirq() to complete without holding the optional * chip bus lock and desc->lock. Also protects against handing out * a recycled oneshot thread_mask bit while it's still in use by * its previous owner. */ mutex_lock(&desc->request_mutex); /* * Acquire bus lock as the irq_request_resources() callback below * might rely on the serialization or the magic power management * functions which are abusing the irq_bus_lock() callback, */ chip_bus_lock(desc); /* First installed action requests resources. */ if (!desc->action) { ret = irq_request_resources(desc); if (ret) { pr_err("Failed to request resources for %s (irq %d) on irqchip %s\n", new->name, irq, desc->irq_data.chip->name); goto out_bus_unlock; } } /* * The following block of code has to be executed atomically * protected against a concurrent interrupt and any of the other * management calls which are not serialized via * desc->request_mutex or the optional bus lock. */ raw_spin_lock_irqsave(&desc->lock, flags); old_ptr = &desc->action; old = *old_ptr; if (old) { /* * Can't share interrupts unless both agree to and are * the same type (level, edge, polarity). So both flag * fields must have IRQF_SHARED set and the bits which * set the trigger type must match. Also all must * agree on ONESHOT. * Interrupt lines used for NMIs cannot be shared. */ unsigned int oldtype; if (irq_is_nmi(desc)) { pr_err("Invalid attempt to share NMI for %s (irq %d) on irqchip %s.\n", new->name, irq, desc->irq_data.chip->name); ret = -EINVAL; goto out_unlock; } /* * If nobody did set the configuration before, inherit * the one provided by the requester. */ if (irqd_trigger_type_was_set(&desc->irq_data)) { oldtype = irqd_get_trigger_type(&desc->irq_data); } else { oldtype = new->flags & IRQF_TRIGGER_MASK; irqd_set_trigger_type(&desc->irq_data, oldtype); } if (!((old->flags & new->flags) & IRQF_SHARED) || (oldtype != (new->flags & IRQF_TRIGGER_MASK))) goto mismatch; if ((old->flags & IRQF_ONESHOT) && (new->flags & IRQF_COND_ONESHOT)) new->flags |= IRQF_ONESHOT; else if ((old->flags ^ new->flags) & IRQF_ONESHOT) goto mismatch; /* All handlers must agree on per-cpuness */ if ((old->flags & IRQF_PERCPU) != (new->flags & IRQF_PERCPU)) goto mismatch; /* add new interrupt at end of irq queue */ do { /* * Or all existing action->thread_mask bits, * so we can find the next zero bit for this * new action. */ thread_mask |= old->thread_mask; old_ptr = &old->next; old = *old_ptr; } while (old); shared = 1; } /* * Setup the thread mask for this irqaction for ONESHOT. For * !ONESHOT irqs the thread mask is 0 so we can avoid a * conditional in irq_wake_thread(). */ if (new->flags & IRQF_ONESHOT) { /* * Unlikely to have 32 resp 64 irqs sharing one line, * but who knows. */ if (thread_mask == ~0UL) { ret = -EBUSY; goto out_unlock; } /* * The thread_mask for the action is or'ed to * desc->thread_active to indicate that the * IRQF_ONESHOT thread handler has been woken, but not * yet finished. The bit is cleared when a thread * completes. When all threads of a shared interrupt * line have completed desc->threads_active becomes * zero and the interrupt line is unmasked. See * handle.c:irq_wake_thread() for further information. * * If no thread is woken by primary (hard irq context) * interrupt handlers, then desc->threads_active is * also checked for zero to unmask the irq line in the * affected hard irq flow handlers * (handle_[fasteoi|level]_irq). * * The new action gets the first zero bit of * thread_mask assigned. See the loop above which or's * all existing action->thread_mask bits. */ new->thread_mask = 1UL << ffz(thread_mask); } else if (new->handler == irq_default_primary_handler && !(desc->irq_data.chip->flags & IRQCHIP_ONESHOT_SAFE)) { /* * The interrupt was requested with handler = NULL, so * we use the default primary handler for it. But it * does not have the oneshot flag set. In combination * with level interrupts this is deadly, because the * default primary handler just wakes the thread, then * the irq lines is reenabled, but the device still * has the level irq asserted. Rinse and repeat.... * * While this works for edge type interrupts, we play * it safe and reject unconditionally because we can't * say for sure which type this interrupt really * has. The type flags are unreliable as the * underlying chip implementation can override them. */ pr_err("Threaded irq requested with handler=NULL and !ONESHOT for %s (irq %d)\n", new->name, irq); ret = -EINVAL; goto out_unlock; } if (!shared) { /* Setup the type (level, edge polarity) if configured: */ if (new->flags & IRQF_TRIGGER_MASK) { ret = __irq_set_trigger(desc, new->flags & IRQF_TRIGGER_MASK); if (ret) goto out_unlock; } /* * Activate the interrupt. That activation must happen * independently of IRQ_NOAUTOEN. request_irq() can fail * and the callers are supposed to handle * that. enable_irq() of an interrupt requested with * IRQ_NOAUTOEN is not supposed to fail. The activation * keeps it in shutdown mode, it merily associates * resources if necessary and if that's not possible it * fails. Interrupts which are in managed shutdown mode * will simply ignore that activation request. */ ret = irq_activate(desc); if (ret) goto out_unlock; desc->istate &= ~(IRQS_AUTODETECT | IRQS_SPURIOUS_DISABLED | \ IRQS_ONESHOT | IRQS_WAITING); irqd_clear(&desc->irq_data, IRQD_IRQ_INPROGRESS); if (new->flags & IRQF_PERCPU) { irqd_set(&desc->irq_data, IRQD_PER_CPU); irq_settings_set_per_cpu(desc); if (new->flags & IRQF_NO_DEBUG) irq_settings_set_no_debug(desc); } if (noirqdebug) irq_settings_set_no_debug(desc); if (new->flags & IRQF_ONESHOT) desc->istate |= IRQS_ONESHOT; /* Exclude IRQ from balancing if requested */ if (new->flags & IRQF_NOBALANCING) { irq_settings_set_no_balancing(desc); irqd_set(&desc->irq_data, IRQD_NO_BALANCING); } if (!(new->flags & IRQF_NO_AUTOEN) && irq_settings_can_autoenable(desc)) { irq_startup(desc, IRQ_RESEND, IRQ_START_COND); } else { /* * Shared interrupts do not go well with disabling * auto enable. The sharing interrupt might request * it while it's still disabled and then wait for * interrupts forever. */ WARN_ON_ONCE(new->flags & IRQF_SHARED); /* Undo nested disables: */ desc->depth = 1; } } else if (new->flags & IRQF_TRIGGER_MASK) { unsigned int nmsk = new->flags & IRQF_TRIGGER_MASK; unsigned int omsk = irqd_get_trigger_type(&desc->irq_data); if (nmsk != omsk) /* hope the handler works with current trigger mode */ pr_warn("irq %d uses trigger mode %u; requested %u\n", irq, omsk, nmsk); } *old_ptr = new; irq_pm_install_action(desc, new); /* Reset broken irq detection when installing new handler */ desc->irq_count = 0; desc->irqs_unhandled = 0; /* * Check whether we disabled the irq via the spurious handler * before. Reenable it and give it another chance. */ if (shared && (desc->istate & IRQS_SPURIOUS_DISABLED)) { desc->istate &= ~IRQS_SPURIOUS_DISABLED; __enable_irq(desc); } raw_spin_unlock_irqrestore(&desc->lock, flags); chip_bus_sync_unlock(desc); mutex_unlock(&desc->request_mutex); irq_setup_timings(desc, new); wake_up_and_wait_for_irq_thread_ready(desc, new); wake_up_and_wait_for_irq_thread_ready(desc, new->secondary); register_irq_proc(irq, desc); new->dir = NULL; register_handler_proc(irq, new); return 0; mismatch: if (!(new->flags & IRQF_PROBE_SHARED)) { pr_err("Flags mismatch irq %d. %08x (%s) vs. %08x (%s)\n", irq, new->flags, new->name, old->flags, old->name); #ifdef CONFIG_DEBUG_SHIRQ dump_stack(); #endif } ret = -EBUSY; out_unlock: raw_spin_unlock_irqrestore(&desc->lock, flags); if (!desc->action) irq_release_resources(desc); out_bus_unlock: chip_bus_sync_unlock(desc); mutex_unlock(&desc->request_mutex); out_thread: if (new->thread) { struct task_struct *t = new->thread; new->thread = NULL; kthread_stop_put(t); } if (new->secondary && new->secondary->thread) { struct task_struct *t = new->secondary->thread; new->secondary->thread = NULL; kthread_stop_put(t); } out_mput: module_put(desc->owner); return ret; } /* * Internal function to unregister an irqaction - used to free * regular and special interrupts that are part of the architecture. */ static struct irqaction *__free_irq(struct irq_desc *desc, void *dev_id) { unsigned irq = desc->irq_data.irq; struct irqaction *action, **action_ptr; unsigned long flags; WARN(in_interrupt(), "Trying to free IRQ %d from IRQ context!\n", irq); mutex_lock(&desc->request_mutex); chip_bus_lock(desc); raw_spin_lock_irqsave(&desc->lock, flags); /* * There can be multiple actions per IRQ descriptor, find the right * one based on the dev_id: */ action_ptr = &desc->action; for (;;) { action = *action_ptr; if (!action) { WARN(1, "Trying to free already-free IRQ %d\n", irq); raw_spin_unlock_irqrestore(&desc->lock, flags); chip_bus_sync_unlock(desc); mutex_unlock(&desc->request_mutex); return NULL; } if (action->dev_id == dev_id) break; action_ptr = &action->next; } /* Found it - now remove it from the list of entries: */ *action_ptr = action->next; irq_pm_remove_action(desc, action); /* If this was the last handler, shut down the IRQ line: */ if (!desc->action) { irq_settings_clr_disable_unlazy(desc); /* Only shutdown. Deactivate after synchronize_hardirq() */ irq_shutdown(desc); } #ifdef CONFIG_SMP /* make sure affinity_hint is cleaned up */ if (WARN_ON_ONCE(desc->affinity_hint)) desc->affinity_hint = NULL; #endif raw_spin_unlock_irqrestore(&desc->lock, flags); /* * Drop bus_lock here so the changes which were done in the chip * callbacks above are synced out to the irq chips which hang * behind a slow bus (I2C, SPI) before calling synchronize_hardirq(). * * Aside of that the bus_lock can also be taken from the threaded * handler in irq_finalize_oneshot() which results in a deadlock * because kthread_stop() would wait forever for the thread to * complete, which is blocked on the bus lock. * * The still held desc->request_mutex() protects against a * concurrent request_irq() of this irq so the release of resources * and timing data is properly serialized. */ chip_bus_sync_unlock(desc); unregister_handler_proc(irq, action); /* * Make sure it's not being used on another CPU and if the chip * supports it also make sure that there is no (not yet serviced) * interrupt in flight at the hardware level. */ __synchronize_irq(desc); #ifdef CONFIG_DEBUG_SHIRQ /* * It's a shared IRQ -- the driver ought to be prepared for an IRQ * event to happen even now it's being freed, so let's make sure that * is so by doing an extra call to the handler .... * * ( We do this after actually deregistering it, to make sure that a * 'real' IRQ doesn't run in parallel with our fake. ) */ if (action->flags & IRQF_SHARED) { local_irq_save(flags); action->handler(irq, dev_id); local_irq_restore(flags); } #endif /* * The action has already been removed above, but the thread writes * its oneshot mask bit when it completes. Though request_mutex is * held across this which prevents __setup_irq() from handing out * the same bit to a newly requested action. */ if (action->thread) { kthread_stop_put(action->thread); if (action->secondary && action->secondary->thread) kthread_stop_put(action->secondary->thread); } /* Last action releases resources */ if (!desc->action) { /* * Reacquire bus lock as irq_release_resources() might * require it to deallocate resources over the slow bus. */ chip_bus_lock(desc); /* * There is no interrupt on the fly anymore. Deactivate it * completely. */ scoped_guard(raw_spinlock_irqsave, &desc->lock) irq_domain_deactivate_irq(&desc->irq_data); irq_release_resources(desc); chip_bus_sync_unlock(desc); irq_remove_timings(desc); } mutex_unlock(&desc->request_mutex); irq_chip_pm_put(&desc->irq_data); module_put(desc->owner); kfree(action->secondary); return action; } /** * free_irq - free an interrupt allocated with request_irq * @irq: Interrupt line to free * @dev_id: Device identity to free * * Remove an interrupt handler. The handler is removed and if the interrupt * line is no longer in use by any driver it is disabled. On a shared IRQ * the caller must ensure the interrupt is disabled on the card it drives * before calling this function. The function does not return until any * executing interrupts for this IRQ have completed. * * This function must not be called from interrupt context. * * Returns the devname argument passed to request_irq. */ const void *free_irq(unsigned int irq, void *dev_id) { struct irq_desc *desc = irq_to_desc(irq); struct irqaction *action; const char *devname; if (!desc || WARN_ON(irq_settings_is_per_cpu_devid(desc))) return NULL; #ifdef CONFIG_SMP if (WARN_ON(desc->affinity_notify)) desc->affinity_notify = NULL; #endif action = __free_irq(desc, dev_id); if (!action) return NULL; devname = action->name; kfree(action); return devname; } EXPORT_SYMBOL(free_irq); /* This function must be called with desc->lock held */ static const void *__cleanup_nmi(unsigned int irq, struct irq_desc *desc) { const char *devname = NULL; desc->istate &= ~IRQS_NMI; if (!WARN_ON(desc->action == NULL)) { irq_pm_remove_action(desc, desc->action); devname = desc->action->name; unregister_handler_proc(irq, desc->action); kfree(desc->action); desc->action = NULL; } irq_settings_clr_disable_unlazy(desc); irq_shutdown_and_deactivate(desc); irq_release_resources(desc); irq_chip_pm_put(&desc->irq_data); module_put(desc->owner); return devname; } const void *free_nmi(unsigned int irq, void *dev_id) { struct irq_desc *desc = irq_to_desc(irq); if (!desc || WARN_ON(!irq_is_nmi(desc))) return NULL; if (WARN_ON(irq_settings_is_per_cpu_devid(desc))) return NULL; /* NMI still enabled */ if (WARN_ON(desc->depth == 0)) disable_nmi_nosync(irq); guard(raw_spinlock_irqsave)(&desc->lock); irq_nmi_teardown(desc); return __cleanup_nmi(irq, desc); } /** * request_threaded_irq - allocate an interrupt line * @irq: Interrupt line to allocate * @handler: Function to be called when the IRQ occurs. * Primary handler for threaded interrupts. * If handler is NULL and thread_fn != NULL * the default primary handler is installed. * @thread_fn: Function called from the irq handler thread * If NULL, no irq thread is created * @irqflags: Interrupt type flags * @devname: An ascii name for the claiming device * @dev_id: A cookie passed back to the handler function * * This call allocates interrupt resources and enables the interrupt line * and IRQ handling. From the point this call is made your handler function * may be invoked. Since your handler function must clear any interrupt the * board raises, you must take care both to initialise your hardware and to * set up the interrupt handler in the right order. * * If you want to set up a threaded irq handler for your device then you * need to supply @handler and @thread_fn. @handler is still called in hard * interrupt context and has to check whether the interrupt originates from * the device. If yes it needs to disable the interrupt on the device and * return IRQ_WAKE_THREAD which will wake up the handler thread and run * @thread_fn. This split handler design is necessary to support shared * interrupts. * * @dev_id must be globally unique. Normally the address of the device data * structure is used as the cookie. Since the handler receives this value * it makes sense to use it. * * If your interrupt is shared you must pass a non NULL dev_id as this is * required when freeing the interrupt. * * Flags: * * IRQF_SHARED Interrupt is shared * IRQF_TRIGGER_* Specify active edge(s) or level * IRQF_ONESHOT Run thread_fn with interrupt line masked */ int request_threaded_irq(unsigned int irq, irq_handler_t handler, irq_handler_t thread_fn, unsigned long irqflags, const char *devname, void *dev_id) { struct irqaction *action; struct irq_desc *desc; int retval; if (irq == IRQ_NOTCONNECTED) return -ENOTCONN; /* * Sanity-check: shared interrupts must pass in a real dev-ID, * otherwise we'll have trouble later trying to figure out * which interrupt is which (messes up the interrupt freeing * logic etc). * * Also shared interrupts do not go well with disabling auto enable. * The sharing interrupt might request it while it's still disabled * and then wait for interrupts forever. * * Also IRQF_COND_SUSPEND only makes sense for shared interrupts and * it cannot be set along with IRQF_NO_SUSPEND. */ if (((irqflags & IRQF_SHARED) && !dev_id) || ((irqflags & IRQF_SHARED) && (irqflags & IRQF_NO_AUTOEN)) || (!(irqflags & IRQF_SHARED) && (irqflags & IRQF_COND_SUSPEND)) || ((irqflags & IRQF_NO_SUSPEND) && (irqflags & IRQF_COND_SUSPEND))) return -EINVAL; desc = irq_to_desc(irq); if (!desc) return -EINVAL; if (!irq_settings_can_request(desc) || WARN_ON(irq_settings_is_per_cpu_devid(desc))) return -EINVAL; if (!handler) { if (!thread_fn) return -EINVAL; handler = irq_default_primary_handler; } action = kzalloc(sizeof(struct irqaction), GFP_KERNEL); if (!action) return -ENOMEM; action->handler = handler; action->thread_fn = thread_fn; action->flags = irqflags; action->name = devname; action->dev_id = dev_id; retval = irq_chip_pm_get(&desc->irq_data); if (retval < 0) { kfree(action); return retval; } retval = __setup_irq(irq, desc, action); if (retval) { irq_chip_pm_put(&desc->irq_data); kfree(action->secondary); kfree(action); } #ifdef CONFIG_DEBUG_SHIRQ_FIXME if (!retval && (irqflags & IRQF_SHARED)) { /* * It's a shared IRQ -- the driver ought to be prepared for it * to happen immediately, so let's make sure.... * We disable the irq to make sure that a 'real' IRQ doesn't * run in parallel with our fake. */ unsigned long flags; disable_irq(irq); local_irq_save(flags); handler(irq, dev_id); local_irq_restore(flags); enable_irq(irq); } #endif return retval; } EXPORT_SYMBOL(request_threaded_irq); /** * request_any_context_irq - allocate an interrupt line * @irq: Interrupt line to allocate * @handler: Function to be called when the IRQ occurs. * Threaded handler for threaded interrupts. * @flags: Interrupt type flags * @name: An ascii name for the claiming device * @dev_id: A cookie passed back to the handler function * * This call allocates interrupt resources and enables the interrupt line * and IRQ handling. It selects either a hardirq or threaded handling * method depending on the context. * * Returns: On failure, it returns a negative value. On success, it returns either * IRQC_IS_HARDIRQ or IRQC_IS_NESTED. */ int request_any_context_irq(unsigned int irq, irq_handler_t handler, unsigned long flags, const char *name, void *dev_id) { struct irq_desc *desc; int ret; if (irq == IRQ_NOTCONNECTED) return -ENOTCONN; desc = irq_to_desc(irq); if (!desc) return -EINVAL; if (irq_settings_is_nested_thread(desc)) { ret = request_threaded_irq(irq, NULL, handler, flags, name, dev_id); return !ret ? IRQC_IS_NESTED : ret; } ret = request_irq(irq, handler, flags, name, dev_id); return !ret ? IRQC_IS_HARDIRQ : ret; } EXPORT_SYMBOL_GPL(request_any_context_irq); /** * request_nmi - allocate an interrupt line for NMI delivery * @irq: Interrupt line to allocate * @handler: Function to be called when the IRQ occurs. * Threaded handler for threaded interrupts. * @irqflags: Interrupt type flags * @name: An ascii name for the claiming device * @dev_id: A cookie passed back to the handler function * * This call allocates interrupt resources and enables the interrupt line * and IRQ handling. It sets up the IRQ line to be handled as an NMI. * * An interrupt line delivering NMIs cannot be shared and IRQ handling * cannot be threaded. * * Interrupt lines requested for NMI delivering must produce per cpu * interrupts and have auto enabling setting disabled. * * @dev_id must be globally unique. Normally the address of the device data * structure is used as the cookie. Since the handler receives this value * it makes sense to use it. * * If the interrupt line cannot be used to deliver NMIs, function will fail * and return a negative value. */ int request_nmi(unsigned int irq, irq_handler_t handler, unsigned long irqflags, const char *name, void *dev_id) { struct irqaction *action; struct irq_desc *desc; int retval; if (irq == IRQ_NOTCONNECTED) return -ENOTCONN; /* NMI cannot be shared, used for Polling */ if (irqflags & (IRQF_SHARED | IRQF_COND_SUSPEND | IRQF_IRQPOLL)) return -EINVAL; if (!(irqflags & IRQF_PERCPU)) return -EINVAL; if (!handler) return -EINVAL; desc = irq_to_desc(irq); if (!desc || (irq_settings_can_autoenable(desc) && !(irqflags & IRQF_NO_AUTOEN)) || !irq_settings_can_request(desc) || WARN_ON(irq_settings_is_per_cpu_devid(desc)) || !irq_supports_nmi(desc)) return -EINVAL; action = kzalloc(sizeof(struct irqaction), GFP_KERNEL); if (!action) return -ENOMEM; action->handler = handler; action->flags = irqflags | IRQF_NO_THREAD | IRQF_NOBALANCING; action->name = name; action->dev_id = dev_id; retval = irq_chip_pm_get(&desc->irq_data); if (retval < 0) goto err_out; retval = __setup_irq(irq, desc, action); if (retval) goto err_irq_setup; scoped_guard(raw_spinlock_irqsave, &desc->lock) { /* Setup NMI state */ desc->istate |= IRQS_NMI; retval = irq_nmi_setup(desc); if (retval) { __cleanup_nmi(irq, desc); return -EINVAL; } return 0; } err_irq_setup: irq_chip_pm_put(&desc->irq_data); err_out: kfree(action); return retval; } void enable_percpu_irq(unsigned int irq, unsigned int type) { scoped_irqdesc_get_and_lock(irq, IRQ_GET_DESC_CHECK_PERCPU) { struct irq_desc *desc = scoped_irqdesc; /* * If the trigger type is not specified by the caller, then * use the default for this interrupt. */ type &= IRQ_TYPE_SENSE_MASK; if (type == IRQ_TYPE_NONE) type = irqd_get_trigger_type(&desc->irq_data); if (type != IRQ_TYPE_NONE) { if (__irq_set_trigger(desc, type)) { WARN(1, "failed to set type for IRQ%d\n", irq); return; } } irq_percpu_enable(desc, smp_processor_id()); } } EXPORT_SYMBOL_GPL(enable_percpu_irq); void enable_percpu_nmi(unsigned int irq, unsigned int type) { enable_percpu_irq(irq, type); } /** * irq_percpu_is_enabled - Check whether the per cpu irq is enabled * @irq: Linux irq number to check for * * Must be called from a non migratable context. Returns the enable * state of a per cpu interrupt on the current cpu. */ bool irq_percpu_is_enabled(unsigned int irq) { scoped_irqdesc_get_and_lock(irq, IRQ_GET_DESC_CHECK_PERCPU) return cpumask_test_cpu(smp_processor_id(), scoped_irqdesc->percpu_enabled); return false; } EXPORT_SYMBOL_GPL(irq_percpu_is_enabled); void disable_percpu_irq(unsigned int irq) { scoped_irqdesc_get_and_lock(irq, IRQ_GET_DESC_CHECK_PERCPU) irq_percpu_disable(scoped_irqdesc, smp_processor_id()); } EXPORT_SYMBOL_GPL(disable_percpu_irq); void disable_percpu_nmi(unsigned int irq) { disable_percpu_irq(irq); } /* * Internal function to unregister a percpu irqaction. */ static struct irqaction *__free_percpu_irq(unsigned int irq, void __percpu *dev_id) { struct irq_desc *desc = irq_to_desc(irq); struct irqaction *action; WARN(in_interrupt(), "Trying to free IRQ %d from IRQ context!\n", irq); if (!desc) return NULL; scoped_guard(raw_spinlock_irqsave, &desc->lock) { action = desc->action; if (!action || action->percpu_dev_id != dev_id) { WARN(1, "Trying to free already-free IRQ %d\n", irq); return NULL; } if (!cpumask_empty(desc->percpu_enabled)) { WARN(1, "percpu IRQ %d still enabled on CPU%d!\n", irq, cpumask_first(desc->percpu_enabled)); return NULL; } /* Found it - now remove it from the list of entries: */ desc->action = NULL; desc->istate &= ~IRQS_NMI; } unregister_handler_proc(irq, action); irq_chip_pm_put(&desc->irq_data); module_put(desc->owner); return action; } /** * free_percpu_irq - free an interrupt allocated with request_percpu_irq * @irq: Interrupt line to free * @dev_id: Device identity to free * * Remove a percpu interrupt handler. The handler is removed, but the * interrupt line is not disabled. This must be done on each CPU before * calling this function. The function does not return until any executing * interrupts for this IRQ have completed. * * This function must not be called from interrupt context. */ void free_percpu_irq(unsigned int irq, void __percpu *dev_id) { struct irq_desc *desc = irq_to_desc(irq); if (!desc || !irq_settings_is_per_cpu_devid(desc)) return; chip_bus_lock(desc); kfree(__free_percpu_irq(irq, dev_id)); chip_bus_sync_unlock(desc); } EXPORT_SYMBOL_GPL(free_percpu_irq); void free_percpu_nmi(unsigned int irq, void __percpu *dev_id) { struct irq_desc *desc = irq_to_desc(irq); if (!desc || !irq_settings_is_per_cpu_devid(desc)) return; if (WARN_ON(!irq_is_nmi(desc))) return; kfree(__free_percpu_irq(irq, dev_id)); } /** * setup_percpu_irq - setup a per-cpu interrupt * @irq: Interrupt line to setup * @act: irqaction for the interrupt * * Used to statically setup per-cpu interrupts in the early boot process. */ int setup_percpu_irq(unsigned int irq, struct irqaction *act) { struct irq_desc *desc = irq_to_desc(irq); int retval; if (!desc || !irq_settings_is_per_cpu_devid(desc)) return -EINVAL; retval = irq_chip_pm_get(&desc->irq_data); if (retval < 0) return retval; retval = __setup_irq(irq, desc, act); if (retval) irq_chip_pm_put(&desc->irq_data); return retval; } /** * __request_percpu_irq - allocate a percpu interrupt line * @irq: Interrupt line to allocate * @handler: Function to be called when the IRQ occurs. * @flags: Interrupt type flags (IRQF_TIMER only) * @devname: An ascii name for the claiming device * @dev_id: A percpu cookie passed back to the handler function * * This call allocates interrupt resources and enables the interrupt on the * local CPU. If the interrupt is supposed to be enabled on other CPUs, it * has to be done on each CPU using enable_percpu_irq(). * * @dev_id must be globally unique. It is a per-cpu variable, and * the handler gets called with the interrupted CPU's instance of * that variable. */ int __request_percpu_irq(unsigned int irq, irq_handler_t handler, unsigned long flags, const char *devname, void __percpu *dev_id) { struct irqaction *action; struct irq_desc *desc; int retval; if (!dev_id) return -EINVAL; desc = irq_to_desc(irq); if (!desc || !irq_settings_can_request(desc) || !irq_settings_is_per_cpu_devid(desc)) return -EINVAL; if (flags && flags != IRQF_TIMER) return -EINVAL; action = kzalloc(sizeof(struct irqaction), GFP_KERNEL); if (!action) return -ENOMEM; action->handler = handler; action->flags = flags | IRQF_PERCPU | IRQF_NO_SUSPEND; action->name = devname; action->percpu_dev_id = dev_id; retval = irq_chip_pm_get(&desc->irq_data); if (retval < 0) { kfree(action); return retval; } retval = __setup_irq(irq, desc, action); if (retval) { irq_chip_pm_put(&desc->irq_data); kfree(action); } return retval; } EXPORT_SYMBOL_GPL(__request_percpu_irq); /** * request_percpu_nmi - allocate a percpu interrupt line for NMI delivery * @irq: Interrupt line to allocate * @handler: Function to be called when the IRQ occurs. * @name: An ascii name for the claiming device * @dev_id: A percpu cookie passed back to the handler function * * This call allocates interrupt resources for a per CPU NMI. Per CPU NMIs * have to be setup on each CPU by calling prepare_percpu_nmi() before * being enabled on the same CPU by using enable_percpu_nmi(). * * @dev_id must be globally unique. It is a per-cpu variable, and the * handler gets called with the interrupted CPU's instance of that * variable. * * Interrupt lines requested for NMI delivering should have auto enabling * setting disabled. * * If the interrupt line cannot be used to deliver NMIs, function * will fail returning a negative value. */ int request_percpu_nmi(unsigned int irq, irq_handler_t handler, const char *name, void __percpu *dev_id) { struct irqaction *action; struct irq_desc *desc; int retval; if (!handler) return -EINVAL; desc = irq_to_desc(irq); if (!desc || !irq_settings_can_request(desc) || !irq_settings_is_per_cpu_devid(desc) || irq_settings_can_autoenable(desc) || !irq_supports_nmi(desc)) return -EINVAL; /* The line cannot already be NMI */ if (irq_is_nmi(desc)) return -EINVAL; action = kzalloc(sizeof(struct irqaction), GFP_KERNEL); if (!action) return -ENOMEM; action->handler = handler; action->flags = IRQF_PERCPU | IRQF_NO_SUSPEND | IRQF_NO_THREAD | IRQF_NOBALANCING; action->name = name; action->percpu_dev_id = dev_id; retval = irq_chip_pm_get(&desc->irq_data); if (retval < 0) goto err_out; retval = __setup_irq(irq, desc, action); if (retval) goto err_irq_setup; scoped_guard(raw_spinlock_irqsave, &desc->lock) desc->istate |= IRQS_NMI; return 0; err_irq_setup: irq_chip_pm_put(&desc->irq_data); err_out: kfree(action); return retval; } /** * prepare_percpu_nmi - performs CPU local setup for NMI delivery * @irq: Interrupt line to prepare for NMI delivery * * This call prepares an interrupt line to deliver NMI on the current CPU, * before that interrupt line gets enabled with enable_percpu_nmi(). * * As a CPU local operation, this should be called from non-preemptible * context. * * If the interrupt line cannot be used to deliver NMIs, function will fail * returning a negative value. */ int prepare_percpu_nmi(unsigned int irq) { int ret = -EINVAL; WARN_ON(preemptible()); scoped_irqdesc_get_and_lock(irq, IRQ_GET_DESC_CHECK_PERCPU) { if (WARN(!irq_is_nmi(scoped_irqdesc), "prepare_percpu_nmi called for a non-NMI interrupt: irq %u\n", irq)) return -EINVAL; ret = irq_nmi_setup(scoped_irqdesc); if (ret) pr_err("Failed to setup NMI delivery: irq %u\n", irq); } return ret; } /** * teardown_percpu_nmi - undoes NMI setup of IRQ line * @irq: Interrupt line from which CPU local NMI configuration should be removed * * This call undoes the setup done by prepare_percpu_nmi(). * * IRQ line should not be enabled for the current CPU. * As a CPU local operation, this should be called from non-preemptible * context. */ void teardown_percpu_nmi(unsigned int irq) { WARN_ON(preemptible()); scoped_irqdesc_get_and_lock(irq, IRQ_GET_DESC_CHECK_PERCPU) { if (WARN_ON(!irq_is_nmi(scoped_irqdesc))) return; irq_nmi_teardown(scoped_irqdesc); } } static int __irq_get_irqchip_state(struct irq_data *data, enum irqchip_irq_state which, bool *state) { struct irq_chip *chip; int err = -EINVAL; do { chip = irq_data_get_irq_chip(data); if (WARN_ON_ONCE(!chip)) return -ENODEV; if (chip->irq_get_irqchip_state) break; #ifdef CONFIG_IRQ_DOMAIN_HIERARCHY data = data->parent_data; #else data = NULL; #endif } while (data); if (data) err = chip->irq_get_irqchip_state(data, which, state); return err; } /** * irq_get_irqchip_state - returns the irqchip state of a interrupt. * @irq: Interrupt line that is forwarded to a VM * @which: One of IRQCHIP_STATE_* the caller wants to know about * @state: a pointer to a boolean where the state is to be stored * * This call snapshots the internal irqchip state of an interrupt, * returning into @state the bit corresponding to stage @which * * This function should be called with preemption disabled if the interrupt * controller has per-cpu registers. */ int irq_get_irqchip_state(unsigned int irq, enum irqchip_irq_state which, bool *state) { scoped_irqdesc_get_and_buslock(irq, 0) { struct irq_data *data = irq_desc_get_irq_data(scoped_irqdesc); return __irq_get_irqchip_state(data, which, state); } return -EINVAL; } EXPORT_SYMBOL_GPL(irq_get_irqchip_state); /** * irq_set_irqchip_state - set the state of a forwarded interrupt. * @irq: Interrupt line that is forwarded to a VM * @which: State to be restored (one of IRQCHIP_STATE_*) * @val: Value corresponding to @which * * This call sets the internal irqchip state of an interrupt, depending on * the value of @which. * * This function should be called with migration disabled if the interrupt * controller has per-cpu registers. */ int irq_set_irqchip_state(unsigned int irq, enum irqchip_irq_state which, bool val) { scoped_irqdesc_get_and_buslock(irq, 0) { struct irq_data *data = irq_desc_get_irq_data(scoped_irqdesc); struct irq_chip *chip; do { chip = irq_data_get_irq_chip(data); if (WARN_ON_ONCE(!chip)) return -ENODEV; if (chip->irq_set_irqchip_state) break; data = irqd_get_parent_data(data); } while (data); if (data) return chip->irq_set_irqchip_state(data, which, val); } return -EINVAL; } EXPORT_SYMBOL_GPL(irq_set_irqchip_state); /** * irq_has_action - Check whether an interrupt is requested * @irq: The linux irq number * * Returns: A snapshot of the current state */ bool irq_has_action(unsigned int irq) { bool res; rcu_read_lock(); res = irq_desc_has_action(irq_to_desc(irq)); rcu_read_unlock(); return res; } EXPORT_SYMBOL_GPL(irq_has_action); /** * irq_check_status_bit - Check whether bits in the irq descriptor status are set * @irq: The linux irq number * @bitmask: The bitmask to evaluate * * Returns: True if one of the bits in @bitmask is set */ bool irq_check_status_bit(unsigned int irq, unsigned int bitmask) { struct irq_desc *desc; bool res = false; rcu_read_lock(); desc = irq_to_desc(irq); if (desc) res = !!(desc->status_use_accessors & bitmask); rcu_read_unlock(); return res; } EXPORT_SYMBOL_GPL(irq_check_status_bit);
300 298 297 299 280 281 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 /* SPDX-License-Identifier: GPL-2.0 */ #ifndef __ASM_MMAN_H__ #define __ASM_MMAN_H__ #include <uapi/asm/mman.h> #ifndef BUILD_VDSO #include <linux/compiler.h> #include <linux/fs.h> #include <linux/hugetlb.h> #include <linux/shmem_fs.h> #include <linux/types.h> static inline unsigned long arch_calc_vm_prot_bits(unsigned long prot, unsigned long pkey) { unsigned long ret = 0; if (system_supports_bti() && (prot & PROT_BTI)) ret |= VM_ARM64_BTI; if (system_supports_mte() && (prot & PROT_MTE)) ret |= VM_MTE; #ifdef CONFIG_ARCH_HAS_PKEYS if (system_supports_poe()) { ret |= pkey & BIT(0) ? VM_PKEY_BIT0 : 0; ret |= pkey & BIT(1) ? VM_PKEY_BIT1 : 0; ret |= pkey & BIT(2) ? VM_PKEY_BIT2 : 0; } #endif return ret; } #define arch_calc_vm_prot_bits(prot, pkey) arch_calc_vm_prot_bits(prot, pkey) static inline unsigned long arch_calc_vm_flag_bits(struct file *file, unsigned long flags) { /* * Only allow MTE on anonymous mappings as these are guaranteed to be * backed by tags-capable memory. The vm_flags may be overridden by a * filesystem supporting MTE (RAM-based). */ if (system_supports_mte()) { if (flags & (MAP_ANONYMOUS | MAP_HUGETLB)) return VM_MTE_ALLOWED; if (shmem_file(file) || is_file_hugepages(file)) return VM_MTE_ALLOWED; } return 0; } #define arch_calc_vm_flag_bits(file, flags) arch_calc_vm_flag_bits(file, flags) static inline bool arch_validate_prot(unsigned long prot, unsigned long addr __always_unused) { unsigned long supported = PROT_READ | PROT_WRITE | PROT_EXEC | PROT_SEM; if (system_supports_bti()) supported |= PROT_BTI; if (system_supports_mte()) supported |= PROT_MTE; return (prot & ~supported) == 0; } #define arch_validate_prot(prot, addr) arch_validate_prot(prot, addr) static inline bool arch_validate_flags(unsigned long vm_flags) { if (system_supports_mte()) { /* * only allow VM_MTE if VM_MTE_ALLOWED has been set * previously */ if ((vm_flags & VM_MTE) && !(vm_flags & VM_MTE_ALLOWED)) return false; } if (system_supports_gcs() && (vm_flags & VM_SHADOW_STACK)) { /* An executable GCS isn't a good idea. */ if (vm_flags & VM_EXEC) return false; /* The memory management core should prevent this */ VM_WARN_ON(vm_flags & VM_SHARED); } return true; } #define arch_validate_flags(vm_flags) arch_validate_flags(vm_flags) #endif /* !BUILD_VDSO */ #endif /* ! __ASM_MMAN_H__ */
1630 2 1602 56 24 76 1536 78 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 /* SPDX-License-Identifier: GPL-2.0 */ /* * Wrapper functions for accessing the file_struct fd array. */ #ifndef __LINUX_FILE_H #define __LINUX_FILE_H #include <linux/compiler.h> #include <linux/types.h> #include <linux/posix_types.h> #include <linux/errno.h> #include <linux/cleanup.h> #include <linux/err.h> struct file; extern void fput(struct file *); struct file_operations; struct task_struct; struct vfsmount; struct dentry; struct inode; struct path; extern struct file *alloc_file_pseudo(struct inode *, struct vfsmount *, const char *, int flags, const struct file_operations *); extern struct file *alloc_file_pseudo_noaccount(struct inode *, struct vfsmount *, const char *, int flags, const struct file_operations *); extern struct file *alloc_file_clone(struct file *, int flags, const struct file_operations *); /* either a reference to struct file + flags * (cloned vs. borrowed, pos locked), with * flags stored in lower bits of value, * or empty (represented by 0). */ struct fd { unsigned long word; }; #define FDPUT_FPUT 1 #define FDPUT_POS_UNLOCK 2 #define fd_file(f) ((struct file *)((f).word & ~(FDPUT_FPUT|FDPUT_POS_UNLOCK))) static inline bool fd_empty(struct fd f) { return unlikely(!f.word); } #define EMPTY_FD (struct fd){0} static inline struct fd BORROWED_FD(struct file *f) { return (struct fd){(unsigned long)f}; } static inline struct fd CLONED_FD(struct file *f) { return (struct fd){(unsigned long)f | FDPUT_FPUT}; } static inline void fdput(struct fd fd) { if (unlikely(fd.word & FDPUT_FPUT)) fput(fd_file(fd)); } extern struct file *fget(unsigned int fd); extern struct file *fget_raw(unsigned int fd); extern struct file *fget_task(struct task_struct *task, unsigned int fd); extern struct file *fget_task_next(struct task_struct *task, unsigned int *fd); extern void __f_unlock_pos(struct file *); struct fd fdget(unsigned int fd); struct fd fdget_raw(unsigned int fd); struct fd fdget_pos(unsigned int fd); static inline void fdput_pos(struct fd f) { if (f.word & FDPUT_POS_UNLOCK) __f_unlock_pos(fd_file(f)); fdput(f); } DEFINE_CLASS(fd, struct fd, fdput(_T), fdget(fd), int fd) DEFINE_CLASS(fd_raw, struct fd, fdput(_T), fdget_raw(fd), int fd) DEFINE_CLASS(fd_pos, struct fd, fdput_pos(_T), fdget_pos(fd), int fd) extern int f_dupfd(unsigned int from, struct file *file, unsigned flags); extern int replace_fd(unsigned fd, struct file *file, unsigned flags); extern void set_close_on_exec(unsigned int fd, int flag); extern bool get_close_on_exec(unsigned int fd); extern int __get_unused_fd_flags(unsigned flags, unsigned long nofile); extern int get_unused_fd_flags(unsigned flags); extern void put_unused_fd(unsigned int fd); DEFINE_CLASS(get_unused_fd, int, if (_T >= 0) put_unused_fd(_T), get_unused_fd_flags(flags), unsigned flags) DEFINE_FREE(fput, struct file *, if (!IS_ERR_OR_NULL(_T)) fput(_T)) /* * take_fd() will take care to set @fd to -EBADF ensuring that * CLASS(get_unused_fd) won't call put_unused_fd(). This makes it * easier to rely on CLASS(get_unused_fd): * * struct file *f; * * CLASS(get_unused_fd, fd)(O_CLOEXEC); * if (fd < 0) * return fd; * * f = dentry_open(&path, O_RDONLY, current_cred()); * if (IS_ERR(f)) * return PTR_ERR(f); * * fd_install(fd, f); * return take_fd(fd); */ #define take_fd(fd) __get_and_null(fd, -EBADF) extern void fd_install(unsigned int fd, struct file *file); int receive_fd(struct file *file, int __user *ufd, unsigned int o_flags); int receive_fd_replace(int new_fd, struct file *file, unsigned int o_flags); extern void flush_delayed_fput(void); extern void __fput_sync(struct file *); extern unsigned int sysctl_nr_open_min, sysctl_nr_open_max; #endif /* __LINUX_FILE_H */
33 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 /* SPDX-License-Identifier: GPL-2.0-or-later */ /* * include/net/dsa.h - Driver for Distributed Switch Architecture switch chips * Copyright (c) 2008-2009 Marvell Semiconductor */ #ifndef __LINUX_NET_DSA_H #define __LINUX_NET_DSA_H #include <linux/if.h> #include <linux/if_ether.h> #include <linux/list.h> #include <linux/notifier.h> #include <linux/timer.h> #include <linux/workqueue.h> #include <linux/of.h> #include <linux/ethtool.h> #include <linux/net_tstamp.h> #include <linux/phy.h> #include <linux/platform_data/dsa.h> #include <linux/phylink.h> #include <net/devlink.h> #include <net/switchdev.h> struct dsa_8021q_context; struct tc_action; #define DSA_TAG_PROTO_NONE_VALUE 0 #define DSA_TAG_PROTO_BRCM_VALUE 1 #define DSA_TAG_PROTO_BRCM_PREPEND_VALUE 2 #define DSA_TAG_PROTO_DSA_VALUE 3 #define DSA_TAG_PROTO_EDSA_VALUE 4 #define DSA_TAG_PROTO_GSWIP_VALUE 5 #define DSA_TAG_PROTO_KSZ9477_VALUE 6 #define DSA_TAG_PROTO_KSZ9893_VALUE 7 #define DSA_TAG_PROTO_LAN9303_VALUE 8 #define DSA_TAG_PROTO_MTK_VALUE 9 #define DSA_TAG_PROTO_QCA_VALUE 10 #define DSA_TAG_PROTO_TRAILER_VALUE 11 #define DSA_TAG_PROTO_8021Q_VALUE 12 #define DSA_TAG_PROTO_SJA1105_VALUE 13 #define DSA_TAG_PROTO_KSZ8795_VALUE 14 #define DSA_TAG_PROTO_OCELOT_VALUE 15 #define DSA_TAG_PROTO_AR9331_VALUE 16 #define DSA_TAG_PROTO_RTL4_A_VALUE 17 #define DSA_TAG_PROTO_HELLCREEK_VALUE 18 #define DSA_TAG_PROTO_XRS700X_VALUE 19 #define DSA_TAG_PROTO_OCELOT_8021Q_VALUE 20 #define DSA_TAG_PROTO_SEVILLE_VALUE 21 #define DSA_TAG_PROTO_BRCM_LEGACY_VALUE 22 #define DSA_TAG_PROTO_SJA1110_VALUE 23 #define DSA_TAG_PROTO_RTL8_4_VALUE 24 #define DSA_TAG_PROTO_RTL8_4T_VALUE 25 #define DSA_TAG_PROTO_RZN1_A5PSW_VALUE 26 #define DSA_TAG_PROTO_LAN937X_VALUE 27 #define DSA_TAG_PROTO_VSC73XX_8021Q_VALUE 28 enum dsa_tag_protocol { DSA_TAG_PROTO_NONE = DSA_TAG_PROTO_NONE_VALUE, DSA_TAG_PROTO_BRCM = DSA_TAG_PROTO_BRCM_VALUE, DSA_TAG_PROTO_BRCM_LEGACY = DSA_TAG_PROTO_BRCM_LEGACY_VALUE, DSA_TAG_PROTO_BRCM_PREPEND = DSA_TAG_PROTO_BRCM_PREPEND_VALUE, DSA_TAG_PROTO_DSA = DSA_TAG_PROTO_DSA_VALUE, DSA_TAG_PROTO_EDSA = DSA_TAG_PROTO_EDSA_VALUE, DSA_TAG_PROTO_GSWIP = DSA_TAG_PROTO_GSWIP_VALUE, DSA_TAG_PROTO_KSZ9477 = DSA_TAG_PROTO_KSZ9477_VALUE, DSA_TAG_PROTO_KSZ9893 = DSA_TAG_PROTO_KSZ9893_VALUE, DSA_TAG_PROTO_LAN9303 = DSA_TAG_PROTO_LAN9303_VALUE, DSA_TAG_PROTO_MTK = DSA_TAG_PROTO_MTK_VALUE, DSA_TAG_PROTO_QCA = DSA_TAG_PROTO_QCA_VALUE, DSA_TAG_PROTO_TRAILER = DSA_TAG_PROTO_TRAILER_VALUE, DSA_TAG_PROTO_8021Q = DSA_TAG_PROTO_8021Q_VALUE, DSA_TAG_PROTO_SJA1105 = DSA_TAG_PROTO_SJA1105_VALUE, DSA_TAG_PROTO_KSZ8795 = DSA_TAG_PROTO_KSZ8795_VALUE, DSA_TAG_PROTO_OCELOT = DSA_TAG_PROTO_OCELOT_VALUE, DSA_TAG_PROTO_AR9331 = DSA_TAG_PROTO_AR9331_VALUE, DSA_TAG_PROTO_RTL4_A = DSA_TAG_PROTO_RTL4_A_VALUE, DSA_TAG_PROTO_HELLCREEK = DSA_TAG_PROTO_HELLCREEK_VALUE, DSA_TAG_PROTO_XRS700X = DSA_TAG_PROTO_XRS700X_VALUE, DSA_TAG_PROTO_OCELOT_8021Q = DSA_TAG_PROTO_OCELOT_8021Q_VALUE, DSA_TAG_PROTO_SEVILLE = DSA_TAG_PROTO_SEVILLE_VALUE, DSA_TAG_PROTO_SJA1110 = DSA_TAG_PROTO_SJA1110_VALUE, DSA_TAG_PROTO_RTL8_4 = DSA_TAG_PROTO_RTL8_4_VALUE, DSA_TAG_PROTO_RTL8_4T = DSA_TAG_PROTO_RTL8_4T_VALUE, DSA_TAG_PROTO_RZN1_A5PSW = DSA_TAG_PROTO_RZN1_A5PSW_VALUE, DSA_TAG_PROTO_LAN937X = DSA_TAG_PROTO_LAN937X_VALUE, DSA_TAG_PROTO_VSC73XX_8021Q = DSA_TAG_PROTO_VSC73XX_8021Q_VALUE, }; struct dsa_switch; struct dsa_device_ops { struct sk_buff *(*xmit)(struct sk_buff *skb, struct net_device *dev); struct sk_buff *(*rcv)(struct sk_buff *skb, struct net_device *dev); void (*flow_dissect)(const struct sk_buff *skb, __be16 *proto, int *offset); int (*connect)(struct dsa_switch *ds); void (*disconnect)(struct dsa_switch *ds); unsigned int needed_headroom; unsigned int needed_tailroom; const char *name; enum dsa_tag_protocol proto; /* Some tagging protocols either mangle or shift the destination MAC * address, in which case the DSA conduit would drop packets on ingress * if what it understands out of the destination MAC address is not in * its RX filter. */ bool promisc_on_conduit; }; struct dsa_lag { struct net_device *dev; unsigned int id; struct mutex fdb_lock; struct list_head fdbs; refcount_t refcount; }; struct dsa_switch_tree { struct list_head list; /* List of switch ports */ struct list_head ports; /* Notifier chain for switch-wide events */ struct raw_notifier_head nh; /* Tree identifier */ unsigned int index; /* Number of switches attached to this tree */ struct kref refcount; /* Maps offloaded LAG netdevs to a zero-based linear ID for * drivers that need it. */ struct dsa_lag **lags; /* Tagging protocol operations */ const struct dsa_device_ops *tag_ops; /* Default tagging protocol preferred by the switches in this * tree. */ enum dsa_tag_protocol default_proto; /* Has this tree been applied to the hardware? */ bool setup; /* * Configuration data for the platform device that owns * this dsa switch tree instance. */ struct dsa_platform_data *pd; /* List of DSA links composing the routing table */ struct list_head rtable; /* Length of "lags" array */ unsigned int lags_len; /* Track the largest switch index within a tree */ unsigned int last_switch; }; /* LAG IDs are one-based, the dst->lags array is zero-based */ #define dsa_lags_foreach_id(_id, _dst) \ for ((_id) = 1; (_id) <= (_dst)->lags_len; (_id)++) \ if ((_dst)->lags[(_id) - 1]) #define dsa_lag_foreach_port(_dp, _dst, _lag) \ list_for_each_entry((_dp), &(_dst)->ports, list) \ if (dsa_port_offloads_lag((_dp), (_lag))) #define dsa_hsr_foreach_port(_dp, _ds, _hsr) \ list_for_each_entry((_dp), &(_ds)->dst->ports, list) \ if ((_dp)->ds == (_ds) && (_dp)->hsr_dev == (_hsr)) static inline struct dsa_lag *dsa_lag_by_id(struct dsa_switch_tree *dst, unsigned int id) { /* DSA LAG IDs are one-based, dst->lags is zero-based */ return dst->lags[id - 1]; } static inline int dsa_lag_id(struct dsa_switch_tree *dst, struct net_device *lag_dev) { unsigned int id; dsa_lags_foreach_id(id, dst) { struct dsa_lag *lag = dsa_lag_by_id(dst, id); if (lag->dev == lag_dev) return lag->id; } return -ENODEV; } /* TC matchall action types */ enum dsa_port_mall_action_type { DSA_PORT_MALL_MIRROR, DSA_PORT_MALL_POLICER, }; /* TC mirroring entry */ struct dsa_mall_mirror_tc_entry { u8 to_local_port; bool ingress; }; /* TC port policer entry */ struct dsa_mall_policer_tc_entry { u32 burst; u64 rate_bytes_per_sec; }; /* TC matchall entry */ struct dsa_mall_tc_entry { struct list_head list; unsigned long cookie; enum dsa_port_mall_action_type type; union { struct dsa_mall_mirror_tc_entry mirror; struct dsa_mall_policer_tc_entry policer; }; }; struct dsa_bridge { struct net_device *dev; unsigned int num; bool tx_fwd_offload; refcount_t refcount; }; struct dsa_port { /* A CPU port is physically connected to a conduit device. A user port * exposes a network device to user-space, called 'user' here. */ union { struct net_device *conduit; struct net_device *user; }; /* Copy of the tagging protocol operations, for quicker access * in the data path. Valid only for the CPU ports. */ const struct dsa_device_ops *tag_ops; /* Copies for faster access in conduit receive hot path */ struct dsa_switch_tree *dst; struct sk_buff *(*rcv)(struct sk_buff *skb, struct net_device *dev); struct dsa_switch *ds; unsigned int index; enum { DSA_PORT_TYPE_UNUSED = 0, DSA_PORT_TYPE_CPU, DSA_PORT_TYPE_DSA, DSA_PORT_TYPE_USER, } type; const char *name; struct dsa_port *cpu_dp; u8 mac[ETH_ALEN]; u8 stp_state; /* Warning: the following bit fields are not atomic, and updating them * can only be done from code paths where concurrency is not possible * (probe time or under rtnl_lock). */ u8 vlan_filtering:1; /* Managed by DSA on user ports and by drivers on CPU and DSA ports */ u8 learning:1; u8 lag_tx_enabled:1; /* conduit state bits, valid only on CPU ports */ u8 conduit_admin_up:1; u8 conduit_oper_up:1; /* Valid only on user ports */ u8 cpu_port_in_lag:1; u8 setup:1; struct device_node *dn; unsigned int ageing_time; struct dsa_bridge *bridge; struct devlink_port devlink_port; struct phylink *pl; struct phylink_config pl_config; struct dsa_lag *lag; struct net_device *hsr_dev; struct list_head list; /* * Original copy of the conduit netdev ethtool_ops */ const struct ethtool_ops *orig_ethtool_ops; /* List of MAC addresses that must be forwarded on this port. * These are only valid on CPU ports and DSA links. */ struct mutex addr_lists_lock; struct list_head fdbs; struct list_head mdbs; struct mutex vlans_lock; union { /* List of VLANs that CPU and DSA ports are members of. * Access to this is serialized by the sleepable @vlans_lock. */ struct list_head vlans; /* List of VLANs that user ports are members of. * Access to this is serialized by netif_addr_lock_bh(). */ struct list_head user_vlans; }; }; static inline struct dsa_port * dsa_phylink_to_port(struct phylink_config *config) { return container_of(config, struct dsa_port, pl_config); } /* TODO: ideally DSA ports would have a single dp->link_dp member, * and no dst->rtable nor this struct dsa_link would be needed, * but this would require some more complex tree walking, * so keep it stupid at the moment and list them all. */ struct dsa_link { struct dsa_port *dp; struct dsa_port *link_dp; struct list_head list; }; enum dsa_db_type { DSA_DB_PORT, DSA_DB_LAG, DSA_DB_BRIDGE, }; struct dsa_db { enum dsa_db_type type; union { const struct dsa_port *dp; struct dsa_lag lag; struct dsa_bridge bridge; }; }; struct dsa_mac_addr { unsigned char addr[ETH_ALEN]; u16 vid; refcount_t refcount; struct list_head list; struct dsa_db db; }; struct dsa_vlan { u16 vid; refcount_t refcount; struct list_head list; }; struct dsa_switch { struct device *dev; /* * Parent switch tree, and switch index. */ struct dsa_switch_tree *dst; unsigned int index; /* Warning: the following bit fields are not atomic, and updating them * can only be done from code paths where concurrency is not possible * (probe time or under rtnl_lock). */ u32 setup:1; /* Disallow bridge core from requesting different VLAN awareness * settings on ports if not hardware-supported */ u32 vlan_filtering_is_global:1; /* Keep VLAN filtering enabled on ports not offloading any upper */ u32 needs_standalone_vlan_filtering:1; /* Pass .port_vlan_add and .port_vlan_del to drivers even for bridges * that have vlan_filtering=0. All drivers should ideally set this (and * then the option would get removed), but it is unknown whether this * would break things or not. */ u32 configure_vlan_while_not_filtering:1; /* Pop the default_pvid of VLAN-unaware bridge ports from tagged frames. * DEPRECATED: Do NOT set this field in new drivers. Instead look at * the dsa_software_vlan_untag() comments. */ u32 untag_bridge_pvid:1; /* Pop the default_pvid of VLAN-aware bridge ports from tagged frames. * Useful if the switch cannot preserve the VLAN tag as seen on the * wire for user port ingress, and chooses to send all frames as * VLAN-tagged to the CPU, including those which were originally * untagged. */ u32 untag_vlan_aware_bridge_pvid:1; /* Let DSA manage the FDB entries towards the * CPU, based on the software bridge database. */ u32 assisted_learning_on_cpu_port:1; /* In case vlan_filtering_is_global is set, the VLAN awareness state * should be retrieved from here and not from the per-port settings. */ u32 vlan_filtering:1; /* For switches that only have the MRU configurable. To ensure the * configured MTU is not exceeded, normalization of MRU on all bridged * interfaces is needed. */ u32 mtu_enforcement_ingress:1; /* Drivers that isolate the FDBs of multiple bridges must set this * to true to receive the bridge as an argument in .port_fdb_{add,del} * and .port_mdb_{add,del}. Otherwise, the bridge.num will always be * passed as zero. */ u32 fdb_isolation:1; /* Drivers that have global DSCP mapping settings must set this to * true to automatically apply the settings to all ports. */ u32 dscp_prio_mapping_is_global:1; /* Listener for switch fabric events */ struct notifier_block nb; /* * Give the switch driver somewhere to hang its private data * structure. */ void *priv; void *tagger_data; /* * Configuration data for this switch. */ struct dsa_chip_data *cd; /* * The switch operations. */ const struct dsa_switch_ops *ops; /* * Allow a DSA switch driver to override the phylink MAC ops */ const struct phylink_mac_ops *phylink_mac_ops; /* * User mii_bus and devices for the individual ports. */ u32 phys_mii_mask; struct mii_bus *user_mii_bus; /* Ageing Time limits in msecs */ unsigned int ageing_time_min; unsigned int ageing_time_max; /* Storage for drivers using tag_8021q */ struct dsa_8021q_context *tag_8021q_ctx; /* devlink used to represent this switch device */ struct devlink *devlink; /* Number of switch port queues */ unsigned int num_tx_queues; /* Drivers that benefit from having an ID associated with each * offloaded LAG should set this to the maximum number of * supported IDs. DSA will then maintain a mapping of _at * least_ these many IDs, accessible to drivers via * dsa_lag_id(). */ unsigned int num_lag_ids; /* Drivers that support bridge forwarding offload or FDB isolation * should set this to the maximum number of bridges spanning the same * switch tree (or all trees, in the case of cross-tree bridging * support) that can be offloaded. */ unsigned int max_num_bridges; unsigned int num_ports; }; static inline struct dsa_port *dsa_to_port(struct dsa_switch *ds, int p) { struct dsa_switch_tree *dst = ds->dst; struct dsa_port *dp; list_for_each_entry(dp, &dst->ports, list) if (dp->ds == ds && dp->index == p) return dp; return NULL; } static inline bool dsa_port_is_dsa(struct dsa_port *port) { return port->type == DSA_PORT_TYPE_DSA; } static inline bool dsa_port_is_cpu(struct dsa_port *port) { return port->type == DSA_PORT_TYPE_CPU; } static inline bool dsa_port_is_user(struct dsa_port *dp) { return dp->type == DSA_PORT_TYPE_USER; } static inline bool dsa_port_is_unused(struct dsa_port *dp) { return dp->type == DSA_PORT_TYPE_UNUSED; } static inline bool dsa_port_conduit_is_operational(struct dsa_port *dp) { return dsa_port_is_cpu(dp) && dp->conduit_admin_up && dp->conduit_oper_up; } static inline bool dsa_is_unused_port(struct dsa_switch *ds, int p) { return dsa_to_port(ds, p)->type == DSA_PORT_TYPE_UNUSED; } static inline bool dsa_is_cpu_port(struct dsa_switch *ds, int p) { return dsa_to_port(ds, p)->type == DSA_PORT_TYPE_CPU; } static inline bool dsa_is_dsa_port(struct dsa_switch *ds, int p) { return dsa_to_port(ds, p)->type == DSA_PORT_TYPE_DSA; } static inline bool dsa_is_user_port(struct dsa_switch *ds, int p) { return dsa_to_port(ds, p)->type == DSA_PORT_TYPE_USER; } #define dsa_tree_for_each_user_port(_dp, _dst) \ list_for_each_entry((_dp), &(_dst)->ports, list) \ if (dsa_port_is_user((_dp))) #define dsa_tree_for_each_user_port_continue_reverse(_dp, _dst) \ list_for_each_entry_continue_reverse((_dp), &(_dst)->ports, list) \ if (dsa_port_is_user((_dp))) #define dsa_tree_for_each_cpu_port(_dp, _dst) \ list_for_each_entry((_dp), &(_dst)->ports, list) \ if (dsa_port_is_cpu((_dp))) #define dsa_switch_for_each_port(_dp, _ds) \ list_for_each_entry((_dp), &(_ds)->dst->ports, list) \ if ((_dp)->ds == (_ds)) #define dsa_switch_for_each_port_safe(_dp, _next, _ds) \ list_for_each_entry_safe((_dp), (_next), &(_ds)->dst->ports, list) \ if ((_dp)->ds == (_ds)) #define dsa_switch_for_each_port_continue_reverse(_dp, _ds) \ list_for_each_entry_continue_reverse((_dp), &(_ds)->dst->ports, list) \ if ((_dp)->ds == (_ds)) #define dsa_switch_for_each_available_port(_dp, _ds) \ dsa_switch_for_each_port((_dp), (_ds)) \ if (!dsa_port_is_unused((_dp))) #define dsa_switch_for_each_user_port(_dp, _ds) \ dsa_switch_for_each_port((_dp), (_ds)) \ if (dsa_port_is_user((_dp))) #define dsa_switch_for_each_user_port_continue_reverse(_dp, _ds) \ dsa_switch_for_each_port_continue_reverse((_dp), (_ds)) \ if (dsa_port_is_user((_dp))) #define dsa_switch_for_each_cpu_port(_dp, _ds) \ dsa_switch_for_each_port((_dp), (_ds)) \ if (dsa_port_is_cpu((_dp))) #define dsa_switch_for_each_cpu_port_continue_reverse(_dp, _ds) \ dsa_switch_for_each_port_continue_reverse((_dp), (_ds)) \ if (dsa_port_is_cpu((_dp))) static inline u32 dsa_user_ports(struct dsa_switch *ds) { struct dsa_port *dp; u32 mask = 0; dsa_switch_for_each_user_port(dp, ds) mask |= BIT(dp->index); return mask; } static inline u32 dsa_cpu_ports(struct dsa_switch *ds) { struct dsa_port *cpu_dp; u32 mask = 0; dsa_switch_for_each_cpu_port(cpu_dp, ds) mask |= BIT(cpu_dp->index); return mask; } /* Return the local port used to reach an arbitrary switch device */ static inline unsigned int dsa_routing_port(struct dsa_switch *ds, int device) { struct dsa_switch_tree *dst = ds->dst; struct dsa_link *dl; list_for_each_entry(dl, &dst->rtable, list) if (dl->dp->ds == ds && dl->link_dp->ds->index == device) return dl->dp->index; return ds->num_ports; } /* Return the local port used to reach an arbitrary switch port */ static inline unsigned int dsa_towards_port(struct dsa_switch *ds, int device, int port) { if (device == ds->index) return port; else return dsa_routing_port(ds, device); } /* Return the local port used to reach the dedicated CPU port */ static inline unsigned int dsa_upstream_port(struct dsa_switch *ds, int port) { const struct dsa_port *dp = dsa_to_port(ds, port); const struct dsa_port *cpu_dp = dp->cpu_dp; if (!cpu_dp) return port; return dsa_towards_port(ds, cpu_dp->ds->index, cpu_dp->index); } /* Return true if this is the local port used to reach the CPU port */ static inline bool dsa_is_upstream_port(struct dsa_switch *ds, int port) { if (dsa_is_unused_port(ds, port)) return false; return port == dsa_upstream_port(ds, port); } /* Return true if this is a DSA port leading away from the CPU */ static inline bool dsa_is_downstream_port(struct dsa_switch *ds, int port) { return dsa_is_dsa_port(ds, port) && !dsa_is_upstream_port(ds, port); } /* Return the local port used to reach the CPU port */ static inline unsigned int dsa_switch_upstream_port(struct dsa_switch *ds) { struct dsa_port *dp; dsa_switch_for_each_available_port(dp, ds) { return dsa_upstream_port(ds, dp->index); } return ds->num_ports; } /* Return true if @upstream_ds is an upstream switch of @downstream_ds, meaning * that the routing port from @downstream_ds to @upstream_ds is also the port * which @downstream_ds uses to reach its dedicated CPU. */ static inline bool dsa_switch_is_upstream_of(struct dsa_switch *upstream_ds, struct dsa_switch *downstream_ds) { int routing_port; if (upstream_ds == downstream_ds) return true; routing_port = dsa_routing_port(downstream_ds, upstream_ds->index); return dsa_is_upstream_port(downstream_ds, routing_port); } static inline bool dsa_port_is_vlan_filtering(const struct dsa_port *dp) { const struct dsa_switch *ds = dp->ds; if (ds->vlan_filtering_is_global) return ds->vlan_filtering; else return dp->vlan_filtering; } static inline unsigned int dsa_port_lag_id_get(struct dsa_port *dp) { return dp->lag ? dp->lag->id : 0; } static inline struct net_device *dsa_port_lag_dev_get(struct dsa_port *dp) { return dp->lag ? dp->lag->dev : NULL; } static inline bool dsa_port_offloads_lag(struct dsa_port *dp, const struct dsa_lag *lag) { return dsa_port_lag_dev_get(dp) == lag->dev; } static inline struct net_device *dsa_port_to_conduit(const struct dsa_port *dp) { if (dp->cpu_port_in_lag) return dsa_port_lag_dev_get(dp->cpu_dp); return dp->cpu_dp->conduit; } static inline struct net_device *dsa_port_to_bridge_port(const struct dsa_port *dp) { if (!dp->bridge) return NULL; if (dp->lag) return dp->lag->dev; else if (dp->hsr_dev) return dp->hsr_dev; return dp->user; } static inline struct net_device * dsa_port_bridge_dev_get(const struct dsa_port *dp) { return dp->bridge ? dp->bridge->dev : NULL; } static inline unsigned int dsa_port_bridge_num_get(struct dsa_port *dp) { return dp->bridge ? dp->bridge->num : 0; } static inline bool dsa_port_bridge_same(const struct dsa_port *a, const struct dsa_port *b) { struct net_device *br_a = dsa_port_bridge_dev_get(a); struct net_device *br_b = dsa_port_bridge_dev_get(b); /* Standalone ports are not in the same bridge with one another */ return (!br_a || !br_b) ? false : (br_a == br_b); } static inline bool dsa_port_offloads_bridge_port(struct dsa_port *dp, const struct net_device *dev) { return dsa_port_to_bridge_port(dp) == dev; } static inline bool dsa_port_offloads_bridge_dev(struct dsa_port *dp, const struct net_device *bridge_dev) { /* DSA ports connected to a bridge, and event was emitted * for the bridge. */ return dsa_port_bridge_dev_get(dp) == bridge_dev; } static inline bool dsa_port_offloads_bridge(struct dsa_port *dp, const struct dsa_bridge *bridge) { return dsa_port_bridge_dev_get(dp) == bridge->dev; } /* Returns true if any port of this tree offloads the given net_device */ static inline bool dsa_tree_offloads_bridge_port(struct dsa_switch_tree *dst, const struct net_device *dev) { struct dsa_port *dp; list_for_each_entry(dp, &dst->ports, list) if (dsa_port_offloads_bridge_port(dp, dev)) return true; return false; } /* Returns true if any port of this tree offloads the given bridge */ static inline bool dsa_tree_offloads_bridge_dev(struct dsa_switch_tree *dst, const struct net_device *bridge_dev) { struct dsa_port *dp; list_for_each_entry(dp, &dst->ports, list) if (dsa_port_offloads_bridge_dev(dp, bridge_dev)) return true; return false; } static inline bool dsa_port_tree_same(const struct dsa_port *a, const struct dsa_port *b) { return a->ds->dst == b->ds->dst; } typedef int dsa_fdb_dump_cb_t(const unsigned char *addr, u16 vid, bool is_static, void *data); struct dsa_switch_ops { /* * Tagging protocol helpers called for the CPU ports and DSA links. * @get_tag_protocol retrieves the initial tagging protocol and is * mandatory. Switches which can operate using multiple tagging * protocols should implement @change_tag_protocol and report in * @get_tag_protocol the tagger in current use. */ enum dsa_tag_protocol (*get_tag_protocol)(struct dsa_switch *ds, int port, enum dsa_tag_protocol mprot); int (*change_tag_protocol)(struct dsa_switch *ds, enum dsa_tag_protocol proto); /* * Method for switch drivers to connect to the tagging protocol driver * in current use. The switch driver can provide handlers for certain * types of packets for switch management. */ int (*connect_tag_protocol)(struct dsa_switch *ds, enum dsa_tag_protocol proto); int (*port_change_conduit)(struct dsa_switch *ds, int port, struct net_device *conduit, struct netlink_ext_ack *extack); /* Optional switch-wide initialization and destruction methods */ int (*setup)(struct dsa_switch *ds); void (*teardown)(struct dsa_switch *ds); /* Per-port initialization and destruction methods. Mandatory if the * driver registers devlink port regions, optional otherwise. */ int (*port_setup)(struct dsa_switch *ds, int port); void (*port_teardown)(struct dsa_switch *ds, int port); u32 (*get_phy_flags)(struct dsa_switch *ds, int port); /* * Access to the switch's PHY registers. */ int (*phy_read)(struct dsa_switch *ds, int port, int regnum); int (*phy_write)(struct dsa_switch *ds, int port, int regnum, u16 val); /* * PHYLINK integration */ void (*phylink_get_caps)(struct dsa_switch *ds, int port, struct phylink_config *config); void (*phylink_fixed_state)(struct dsa_switch *ds, int port, struct phylink_link_state *state); /* * Port statistics counters. */ void (*get_strings)(struct dsa_switch *ds, int port, u32 stringset, uint8_t *data); void (*get_ethtool_stats)(struct dsa_switch *ds, int port, uint64_t *data); int (*get_sset_count)(struct dsa_switch *ds, int port, int sset); void (*get_ethtool_phy_stats)(struct dsa_switch *ds, int port, uint64_t *data); void (*get_eth_phy_stats)(struct dsa_switch *ds, int port, struct ethtool_eth_phy_stats *phy_stats); void (*get_eth_mac_stats)(struct dsa_switch *ds, int port, struct ethtool_eth_mac_stats *mac_stats); void (*get_eth_ctrl_stats)(struct dsa_switch *ds, int port, struct ethtool_eth_ctrl_stats *ctrl_stats); void (*get_rmon_stats)(struct dsa_switch *ds, int port, struct ethtool_rmon_stats *rmon_stats, const struct ethtool_rmon_hist_range **ranges); void (*get_ts_stats)(struct dsa_switch *ds, int port, struct ethtool_ts_stats *ts_stats); void (*get_stats64)(struct dsa_switch *ds, int port, struct rtnl_link_stats64 *s); void (*get_pause_stats)(struct dsa_switch *ds, int port, struct ethtool_pause_stats *pause_stats); void (*self_test)(struct dsa_switch *ds, int port, struct ethtool_test *etest, u64 *data); /* * ethtool Wake-on-LAN */ void (*get_wol)(struct dsa_switch *ds, int port, struct ethtool_wolinfo *w); int (*set_wol)(struct dsa_switch *ds, int port, struct ethtool_wolinfo *w); /* * ethtool timestamp info */ int (*get_ts_info)(struct dsa_switch *ds, int port, struct kernel_ethtool_ts_info *ts); /* * ethtool MAC merge layer */ int (*get_mm)(struct dsa_switch *ds, int port, struct ethtool_mm_state *state); int (*set_mm)(struct dsa_switch *ds, int port, struct ethtool_mm_cfg *cfg, struct netlink_ext_ack *extack); void (*get_mm_stats)(struct dsa_switch *ds, int port, struct ethtool_mm_stats *stats); /* * DCB ops */ int (*port_get_default_prio)(struct dsa_switch *ds, int port); int (*port_set_default_prio)(struct dsa_switch *ds, int port, u8 prio); int (*port_get_dscp_prio)(struct dsa_switch *ds, int port, u8 dscp); int (*port_add_dscp_prio)(struct dsa_switch *ds, int port, u8 dscp, u8 prio); int (*port_del_dscp_prio)(struct dsa_switch *ds, int port, u8 dscp, u8 prio); int (*port_set_apptrust)(struct dsa_switch *ds, int port, const u8 *sel, int nsel); int (*port_get_apptrust)(struct dsa_switch *ds, int port, u8 *sel, int *nsel); /* * Suspend and resume */ int (*suspend)(struct dsa_switch *ds); int (*resume)(struct dsa_switch *ds); /* * Port enable/disable */ int (*port_enable)(struct dsa_switch *ds, int port, struct phy_device *phy); void (*port_disable)(struct dsa_switch *ds, int port); /* * Notification for MAC address changes on user ports. Drivers can * currently only veto operations. They should not use the method to * program the hardware, since the operation is not rolled back in case * of other errors. */ int (*port_set_mac_address)(struct dsa_switch *ds, int port, const unsigned char *addr); /* * Compatibility between device trees defining multiple CPU ports and * drivers which are not OK to use by default the numerically smallest * CPU port of a switch for its local ports. This can return NULL, * meaning "don't know/don't care". */ struct dsa_port *(*preferred_default_local_cpu_port)(struct dsa_switch *ds); /* * Port's MAC EEE settings */ bool (*support_eee)(struct dsa_switch *ds, int port); int (*set_mac_eee)(struct dsa_switch *ds, int port, struct ethtool_keee *e); /* EEPROM access */ int (*get_eeprom_len)(struct dsa_switch *ds); int (*get_eeprom)(struct dsa_switch *ds, struct ethtool_eeprom *eeprom, u8 *data); int (*set_eeprom)(struct dsa_switch *ds, struct ethtool_eeprom *eeprom, u8 *data); /* * Register access. */ int (*get_regs_len)(struct dsa_switch *ds, int port); void (*get_regs)(struct dsa_switch *ds, int port, struct ethtool_regs *regs, void *p); /* * Upper device tracking. */ int (*port_prechangeupper)(struct dsa_switch *ds, int port, struct netdev_notifier_changeupper_info *info); /* * Bridge integration */ int (*set_ageing_time)(struct dsa_switch *ds, unsigned int msecs); int (*port_bridge_join)(struct dsa_switch *ds, int port, struct dsa_bridge bridge, bool *tx_fwd_offload, struct netlink_ext_ack *extack); void (*port_bridge_leave)(struct dsa_switch *ds, int port, struct dsa_bridge bridge); void (*port_stp_state_set)(struct dsa_switch *ds, int port, u8 state); int (*port_mst_state_set)(struct dsa_switch *ds, int port, const struct switchdev_mst_state *state); void (*port_fast_age)(struct dsa_switch *ds, int port); int (*port_vlan_fast_age)(struct dsa_switch *ds, int port, u16 vid); int (*port_pre_bridge_flags)(struct dsa_switch *ds, int port, struct switchdev_brport_flags flags, struct netlink_ext_ack *extack); int (*port_bridge_flags)(struct dsa_switch *ds, int port, struct switchdev_brport_flags flags, struct netlink_ext_ack *extack); void (*port_set_host_flood)(struct dsa_switch *ds, int port, bool uc, bool mc); /* * VLAN support */ int (*port_vlan_filtering)(struct dsa_switch *ds, int port, bool vlan_filtering, struct netlink_ext_ack *extack); int (*port_vlan_add)(struct dsa_switch *ds, int port, const struct switchdev_obj_port_vlan *vlan, struct netlink_ext_ack *extack); int (*port_vlan_del)(struct dsa_switch *ds, int port, const struct switchdev_obj_port_vlan *vlan); int (*vlan_msti_set)(struct dsa_switch *ds, struct dsa_bridge bridge, const struct switchdev_vlan_msti *msti); /* * Forwarding database */ int (*port_fdb_add)(struct dsa_switch *ds, int port, const unsigned char *addr, u16 vid, struct dsa_db db); int (*port_fdb_del)(struct dsa_switch *ds, int port, const unsigned char *addr, u16 vid, struct dsa_db db); int (*port_fdb_dump)(struct dsa_switch *ds, int port, dsa_fdb_dump_cb_t *cb, void *data); int (*lag_fdb_add)(struct dsa_switch *ds, struct dsa_lag lag, const unsigned char *addr, u16 vid, struct dsa_db db); int (*lag_fdb_del)(struct dsa_switch *ds, struct dsa_lag lag, const unsigned char *addr, u16 vid, struct dsa_db db); /* * Multicast database */ int (*port_mdb_add)(struct dsa_switch *ds, int port, const struct switchdev_obj_port_mdb *mdb, struct dsa_db db); int (*port_mdb_del)(struct dsa_switch *ds, int port, const struct switchdev_obj_port_mdb *mdb, struct dsa_db db); /* * RXNFC */ int (*get_rxnfc)(struct dsa_switch *ds, int port, struct ethtool_rxnfc *nfc, u32 *rule_locs); int (*set_rxnfc)(struct dsa_switch *ds, int port, struct ethtool_rxnfc *nfc); /* * TC integration */ int (*cls_flower_add)(struct dsa_switch *ds, int port, struct flow_cls_offload *cls, bool ingress); int (*cls_flower_del)(struct dsa_switch *ds, int port, struct flow_cls_offload *cls, bool ingress); int (*cls_flower_stats)(struct dsa_switch *ds, int port, struct flow_cls_offload *cls, bool ingress); int (*port_mirror_add)(struct dsa_switch *ds, int port, struct dsa_mall_mirror_tc_entry *mirror, bool ingress, struct netlink_ext_ack *extack); void (*port_mirror_del)(struct dsa_switch *ds, int port, struct dsa_mall_mirror_tc_entry *mirror); int (*port_policer_add)(struct dsa_switch *ds, int port, struct dsa_mall_policer_tc_entry *policer); void (*port_policer_del)(struct dsa_switch *ds, int port); int (*port_setup_tc)(struct dsa_switch *ds, int port, enum tc_setup_type type, void *type_data); /* * Cross-chip operations */ int (*crosschip_bridge_join)(struct dsa_switch *ds, int tree_index, int sw_index, int port, struct dsa_bridge bridge, struct netlink_ext_ack *extack); void (*crosschip_bridge_leave)(struct dsa_switch *ds, int tree_index, int sw_index, int port, struct dsa_bridge bridge); int (*crosschip_lag_change)(struct dsa_switch *ds, int sw_index, int port); int (*crosschip_lag_join)(struct dsa_switch *ds, int sw_index, int port, struct dsa_lag lag, struct netdev_lag_upper_info *info, struct netlink_ext_ack *extack); int (*crosschip_lag_leave)(struct dsa_switch *ds, int sw_index, int port, struct dsa_lag lag); /* * PTP functionality */ int (*port_hwtstamp_get)(struct dsa_switch *ds, int port, struct kernel_hwtstamp_config *config); int (*port_hwtstamp_set)(struct dsa_switch *ds, int port, struct kernel_hwtstamp_config *config, struct netlink_ext_ack *extack); void (*port_txtstamp)(struct dsa_switch *ds, int port, struct sk_buff *skb); bool (*port_rxtstamp)(struct dsa_switch *ds, int port, struct sk_buff *skb, unsigned int type); /* Devlink parameters, etc */ int (*devlink_param_get)(struct dsa_switch *ds, u32 id, struct devlink_param_gset_ctx *ctx); int (*devlink_param_set)(struct dsa_switch *ds, u32 id, struct devlink_param_gset_ctx *ctx); int (*devlink_info_get)(struct dsa_switch *ds, struct devlink_info_req *req, struct netlink_ext_ack *extack); int (*devlink_sb_pool_get)(struct dsa_switch *ds, unsigned int sb_index, u16 pool_index, struct devlink_sb_pool_info *pool_info); int (*devlink_sb_pool_set)(struct dsa_switch *ds, unsigned int sb_index, u16 pool_index, u32 size, enum devlink_sb_threshold_type threshold_type, struct netlink_ext_ack *extack); int (*devlink_sb_port_pool_get)(struct dsa_switch *ds, int port, unsigned int sb_index, u16 pool_index, u32 *p_threshold); int (*devlink_sb_port_pool_set)(struct dsa_switch *ds, int port, unsigned int sb_index, u16 pool_index, u32 threshold, struct netlink_ext_ack *extack); int (*devlink_sb_tc_pool_bind_get)(struct dsa_switch *ds, int port, unsigned int sb_index, u16 tc_index, enum devlink_sb_pool_type pool_type, u16 *p_pool_index, u32 *p_threshold); int (*devlink_sb_tc_pool_bind_set)(struct dsa_switch *ds, int port, unsigned int sb_index, u16 tc_index, enum devlink_sb_pool_type pool_type, u16 pool_index, u32 threshold, struct netlink_ext_ack *extack); int (*devlink_sb_occ_snapshot)(struct dsa_switch *ds, unsigned int sb_index); int (*devlink_sb_occ_max_clear)(struct dsa_switch *ds, unsigned int sb_index); int (*devlink_sb_occ_port_pool_get)(struct dsa_switch *ds, int port, unsigned int sb_index, u16 pool_index, u32 *p_cur, u32 *p_max); int (*devlink_sb_occ_tc_port_bind_get)(struct dsa_switch *ds, int port, unsigned int sb_index, u16 tc_index, enum devlink_sb_pool_type pool_type, u32 *p_cur, u32 *p_max); /* * MTU change functionality. Switches can also adjust their MRU through * this method. By MTU, one understands the SDU (L2 payload) length. * If the switch needs to account for the DSA tag on the CPU port, this * method needs to do so privately. */ int (*port_change_mtu)(struct dsa_switch *ds, int port, int new_mtu); int (*port_max_mtu)(struct dsa_switch *ds, int port); /* * LAG integration */ int (*port_lag_change)(struct dsa_switch *ds, int port); int (*port_lag_join)(struct dsa_switch *ds, int port, struct dsa_lag lag, struct netdev_lag_upper_info *info, struct netlink_ext_ack *extack); int (*port_lag_leave)(struct dsa_switch *ds, int port, struct dsa_lag lag); /* * HSR integration */ int (*port_hsr_join)(struct dsa_switch *ds, int port, struct net_device *hsr, struct netlink_ext_ack *extack); int (*port_hsr_leave)(struct dsa_switch *ds, int port, struct net_device *hsr); /* * MRP integration */ int (*port_mrp_add)(struct dsa_switch *ds, int port, const struct switchdev_obj_mrp *mrp); int (*port_mrp_del)(struct dsa_switch *ds, int port, const struct switchdev_obj_mrp *mrp); int (*port_mrp_add_ring_role)(struct dsa_switch *ds, int port, const struct switchdev_obj_ring_role_mrp *mrp); int (*port_mrp_del_ring_role)(struct dsa_switch *ds, int port, const struct switchdev_obj_ring_role_mrp *mrp); /* * tag_8021q operations */ int (*tag_8021q_vlan_add)(struct dsa_switch *ds, int port, u16 vid, u16 flags); int (*tag_8021q_vlan_del)(struct dsa_switch *ds, int port, u16 vid); /* * DSA conduit tracking operations */ void (*conduit_state_change)(struct dsa_switch *ds, const struct net_device *conduit, bool operational); }; #define DSA_DEVLINK_PARAM_DRIVER(_id, _name, _type, _cmodes) \ DEVLINK_PARAM_DRIVER(_id, _name, _type, _cmodes, \ dsa_devlink_param_get, dsa_devlink_param_set, NULL) int dsa_devlink_param_get(struct devlink *dl, u32 id, struct devlink_param_gset_ctx *ctx); int dsa_devlink_param_set(struct devlink *dl, u32 id, struct devlink_param_gset_ctx *ctx, struct netlink_ext_ack *extack); int dsa_devlink_params_register(struct dsa_switch *ds, const struct devlink_param *params, size_t params_count); void dsa_devlink_params_unregister(struct dsa_switch *ds, const struct devlink_param *params, size_t params_count); int dsa_devlink_resource_register(struct dsa_switch *ds, const char *resource_name, u64 resource_size, u64 resource_id, u64 parent_resource_id, const struct devlink_resource_size_params *size_params); void dsa_devlink_resources_unregister(struct dsa_switch *ds); void dsa_devlink_resource_occ_get_register(struct dsa_switch *ds, u64 resource_id, devlink_resource_occ_get_t *occ_get, void *occ_get_priv); void dsa_devlink_resource_occ_get_unregister(struct dsa_switch *ds, u64 resource_id); struct devlink_region * dsa_devlink_region_create(struct dsa_switch *ds, const struct devlink_region_ops *ops, u32 region_max_snapshots, u64 region_size); struct devlink_region * dsa_devlink_port_region_create(struct dsa_switch *ds, int port, const struct devlink_port_region_ops *ops, u32 region_max_snapshots, u64 region_size); void dsa_devlink_region_destroy(struct devlink_region *region); struct dsa_port *dsa_port_from_netdev(struct net_device *netdev); struct dsa_devlink_priv { struct dsa_switch *ds; }; static inline struct dsa_switch *dsa_devlink_to_ds(struct devlink *dl) { struct dsa_devlink_priv *dl_priv = devlink_priv(dl); return dl_priv->ds; } static inline struct dsa_switch *dsa_devlink_port_to_ds(struct devlink_port *port) { struct devlink *dl = port->devlink; struct dsa_devlink_priv *dl_priv = devlink_priv(dl); return dl_priv->ds; } static inline int dsa_devlink_port_to_port(struct devlink_port *port) { return port->index; } struct dsa_switch_driver { struct list_head list; const struct dsa_switch_ops *ops; }; bool dsa_fdb_present_in_other_db(struct dsa_switch *ds, int port, const unsigned char *addr, u16 vid, struct dsa_db db); bool dsa_mdb_present_in_other_db(struct dsa_switch *ds, int port, const struct switchdev_obj_port_mdb *mdb, struct dsa_db db); /* Keep inline for faster access in hot path */ static inline bool netdev_uses_dsa(const struct net_device *dev) { #if IS_ENABLED(CONFIG_NET_DSA) return dev->dsa_ptr && dev->dsa_ptr->rcv; #endif return false; } /* All DSA tags that push the EtherType to the right (basically all except tail * tags, which don't break dissection) can be treated the same from the * perspective of the flow dissector. * * We need to return: * - offset: the (B - A) difference between: * A. the position of the real EtherType and * B. the current skb->data (aka ETH_HLEN bytes into the frame, aka 2 bytes * after the normal EtherType was supposed to be) * The offset in bytes is exactly equal to the tagger overhead (and half of * that, in __be16 shorts). * * - proto: the value of the real EtherType. */ static inline void dsa_tag_generic_flow_dissect(const struct sk_buff *skb, __be16 *proto, int *offset) { #if IS_ENABLED(CONFIG_NET_DSA) const struct dsa_device_ops *ops = skb->dev->dsa_ptr->tag_ops; int tag_len = ops->needed_headroom; *offset = tag_len; *proto = ((__be16 *)skb->data)[(tag_len / 2) - 1]; #endif } void dsa_unregister_switch(struct dsa_switch *ds); int dsa_register_switch(struct dsa_switch *ds); void dsa_switch_shutdown(struct dsa_switch *ds); struct dsa_switch *dsa_switch_find(int tree_index, int sw_index); void dsa_flush_workqueue(void); #ifdef CONFIG_PM_SLEEP int dsa_switch_suspend(struct dsa_switch *ds); int dsa_switch_resume(struct dsa_switch *ds); #else static inline int dsa_switch_suspend(struct dsa_switch *ds) { return 0; } static inline int dsa_switch_resume(struct dsa_switch *ds) { return 0; } #endif /* CONFIG_PM_SLEEP */ #if IS_ENABLED(CONFIG_NET_DSA) bool dsa_user_dev_check(const struct net_device *dev); #else static inline bool dsa_user_dev_check(const struct net_device *dev) { return false; } #endif netdev_tx_t dsa_enqueue_skb(struct sk_buff *skb, struct net_device *dev); void dsa_port_phylink_mac_change(struct dsa_switch *ds, int port, bool up); bool dsa_supports_eee(struct dsa_switch *ds, int port); #endif
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 /* SPDX-License-Identifier: GPL-2.0 */ #ifndef __FS_NOTIFY_FSNOTIFY_H_ #define __FS_NOTIFY_FSNOTIFY_H_ #include <linux/list.h> #include <linux/fsnotify.h> #include <linux/srcu.h> #include <linux/types.h> #include "../mount.h" /* * fsnotify_connp_t is what we embed in objects which connector can be attached * to. */ typedef struct fsnotify_mark_connector __rcu *fsnotify_connp_t; static inline struct inode *fsnotify_conn_inode( struct fsnotify_mark_connector *conn) { return conn->obj; } static inline struct mount *fsnotify_conn_mount( struct fsnotify_mark_connector *conn) { return real_mount(conn->obj); } static inline struct super_block *fsnotify_conn_sb( struct fsnotify_mark_connector *conn) { return conn->obj; } static inline struct mnt_namespace *fsnotify_conn_mntns( struct fsnotify_mark_connector *conn) { return conn->obj; } static inline struct super_block *fsnotify_object_sb(void *obj, enum fsnotify_obj_type obj_type) { switch (obj_type) { case FSNOTIFY_OBJ_TYPE_INODE: return ((struct inode *)obj)->i_sb; case FSNOTIFY_OBJ_TYPE_VFSMOUNT: return ((struct vfsmount *)obj)->mnt_sb; case FSNOTIFY_OBJ_TYPE_SB: return (struct super_block *)obj; default: return NULL; } } static inline struct super_block *fsnotify_connector_sb( struct fsnotify_mark_connector *conn) { return fsnotify_object_sb(conn->obj, conn->type); } static inline fsnotify_connp_t *fsnotify_sb_marks(struct super_block *sb) { struct fsnotify_sb_info *sbinfo = fsnotify_sb_info(sb); return sbinfo ? &sbinfo->sb_marks : NULL; } /* destroy all events sitting in this groups notification queue */ extern void fsnotify_flush_notify(struct fsnotify_group *group); /* protects reads of inode and vfsmount marks list */ extern struct srcu_struct fsnotify_mark_srcu; /* compare two groups for sorting of marks lists */ extern int fsnotify_compare_groups(struct fsnotify_group *a, struct fsnotify_group *b); /* Destroy all marks attached to an object via connector */ extern void fsnotify_destroy_marks(fsnotify_connp_t *connp); /* run the list of all marks associated with inode and destroy them */ static inline void fsnotify_clear_marks_by_inode(struct inode *inode) { fsnotify_destroy_marks(&inode->i_fsnotify_marks); } /* run the list of all marks associated with vfsmount and destroy them */ static inline void fsnotify_clear_marks_by_mount(struct vfsmount *mnt) { fsnotify_destroy_marks(&real_mount(mnt)->mnt_fsnotify_marks); } /* run the list of all marks associated with sb and destroy them */ static inline void fsnotify_clear_marks_by_sb(struct super_block *sb) { fsnotify_destroy_marks(fsnotify_sb_marks(sb)); } static inline void fsnotify_clear_marks_by_mntns(struct mnt_namespace *mntns) { fsnotify_destroy_marks(&mntns->n_fsnotify_marks); } /* * update the dentry->d_flags of all of inode's children to indicate if inode cares * about events that happen to its children. */ extern void fsnotify_set_children_dentry_flags(struct inode *inode); extern struct kmem_cache *fsnotify_mark_connector_cachep; #endif /* __FS_NOTIFY_FSNOTIFY_H_ */
3 1 2 2 1 6 1 5 5 5 1 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 1517 1518 1519 1520 1521 // SPDX-License-Identifier: GPL-2.0-or-later /* linux/net/ipv4/arp.c * * Copyright (C) 1994 by Florian La Roche * * This module implements the Address Resolution Protocol ARP (RFC 826), * which is used to convert IP addresses (or in the future maybe other * high-level addresses) into a low-level hardware address (like an Ethernet * address). * * Fixes: * Alan Cox : Removed the Ethernet assumptions in * Florian's code * Alan Cox : Fixed some small errors in the ARP * logic * Alan Cox : Allow >4K in /proc * Alan Cox : Make ARP add its own protocol entry * Ross Martin : Rewrote arp_rcv() and arp_get_info() * Stephen Henson : Add AX25 support to arp_get_info() * Alan Cox : Drop data when a device is downed. * Alan Cox : Use init_timer(). * Alan Cox : Double lock fixes. * Martin Seine : Move the arphdr structure * to if_arp.h for compatibility. * with BSD based programs. * Andrew Tridgell : Added ARP netmask code and * re-arranged proxy handling. * Alan Cox : Changed to use notifiers. * Niibe Yutaka : Reply for this device or proxies only. * Alan Cox : Don't proxy across hardware types! * Jonathan Naylor : Added support for NET/ROM. * Mike Shaver : RFC1122 checks. * Jonathan Naylor : Only lookup the hardware address for * the correct hardware type. * Germano Caronni : Assorted subtle races. * Craig Schlenter : Don't modify permanent entry * during arp_rcv. * Russ Nelson : Tidied up a few bits. * Alexey Kuznetsov: Major changes to caching and behaviour, * eg intelligent arp probing and * generation * of host down events. * Alan Cox : Missing unlock in device events. * Eckes : ARP ioctl control errors. * Alexey Kuznetsov: Arp free fix. * Manuel Rodriguez: Gratuitous ARP. * Jonathan Layes : Added arpd support through kerneld * message queue (960314) * Mike Shaver : /proc/sys/net/ipv4/arp_* support * Mike McLagan : Routing by source * Stuart Cheshire : Metricom and grat arp fixes * *** FOR 2.1 clean this up *** * Lawrence V. Stefani: (08/12/96) Added FDDI support. * Alan Cox : Took the AP1000 nasty FDDI hack and * folded into the mainstream FDDI code. * Ack spit, Linus how did you allow that * one in... * Jes Sorensen : Make FDDI work again in 2.1.x and * clean up the APFDDI & gen. FDDI bits. * Alexey Kuznetsov: new arp state machine; * now it is in net/core/neighbour.c. * Krzysztof Halasa: Added Frame Relay ARP support. * Arnaldo C. Melo : convert /proc/net/arp to seq_file * Shmulik Hen: Split arp_send to arp_create and * arp_xmit so intermediate drivers like * bonding can change the skb before * sending (e.g. insert 8021q tag). * Harald Welte : convert to make use of jenkins hash * Jesper D. Brouer: Proxy ARP PVLAN RFC 3069 support. */ #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt #include <linux/module.h> #include <linux/types.h> #include <linux/string.h> #include <linux/kernel.h> #include <linux/capability.h> #include <linux/socket.h> #include <linux/sockios.h> #include <linux/errno.h> #include <linux/in.h> #include <linux/mm.h> #include <linux/inet.h> #include <linux/inetdevice.h> #include <linux/netdevice.h> #include <linux/etherdevice.h> #include <linux/fddidevice.h> #include <linux/if_arp.h> #include <linux/skbuff.h> #include <linux/proc_fs.h> #include <linux/seq_file.h> #include <linux/stat.h> #include <linux/init.h> #include <linux/net.h> #include <linux/rcupdate.h> #include <linux/slab.h> #ifdef CONFIG_SYSCTL #include <linux/sysctl.h> #endif #include <net/net_namespace.h> #include <net/ip.h> #include <net/icmp.h> #include <net/route.h> #include <net/protocol.h> #include <net/tcp.h> #include <net/sock.h> #include <net/arp.h> #include <net/ax25.h> #include <net/netrom.h> #include <net/dst_metadata.h> #include <net/ip_tunnels.h> #include <linux/uaccess.h> #include <linux/netfilter_arp.h> /* * Interface to generic neighbour cache. */ static u32 arp_hash(const void *pkey, const struct net_device *dev, __u32 *hash_rnd); static bool arp_key_eq(const struct neighbour *n, const void *pkey); static int arp_constructor(struct neighbour *neigh); static void arp_solicit(struct neighbour *neigh, struct sk_buff *skb); static void arp_error_report(struct neighbour *neigh, struct sk_buff *skb); static void parp_redo(struct sk_buff *skb); static int arp_is_multicast(const void *pkey); static const struct neigh_ops arp_generic_ops = { .family = AF_INET, .solicit = arp_solicit, .error_report = arp_error_report, .output = neigh_resolve_output, .connected_output = neigh_connected_output, }; static const struct neigh_ops arp_hh_ops = { .family = AF_INET, .solicit = arp_solicit, .error_report = arp_error_report, .output = neigh_resolve_output, .connected_output = neigh_resolve_output, }; static const struct neigh_ops arp_direct_ops = { .family = AF_INET, .output = neigh_direct_output, .connected_output = neigh_direct_output, }; struct neigh_table arp_tbl = { .family = AF_INET, .key_len = 4, .protocol = cpu_to_be16(ETH_P_IP), .hash = arp_hash, .key_eq = arp_key_eq, .constructor = arp_constructor, .proxy_redo = parp_redo, .is_multicast = arp_is_multicast, .id = "arp_cache", .parms = { .tbl = &arp_tbl, .reachable_time = 30 * HZ, .data = { [NEIGH_VAR_MCAST_PROBES] = 3, [NEIGH_VAR_UCAST_PROBES] = 3, [NEIGH_VAR_RETRANS_TIME] = 1 * HZ, [NEIGH_VAR_BASE_REACHABLE_TIME] = 30 * HZ, [NEIGH_VAR_DELAY_PROBE_TIME] = 5 * HZ, [NEIGH_VAR_INTERVAL_PROBE_TIME_MS] = 5 * HZ, [NEIGH_VAR_GC_STALETIME] = 60 * HZ, [NEIGH_VAR_QUEUE_LEN_BYTES] = SK_WMEM_MAX, [NEIGH_VAR_PROXY_QLEN] = 64, [NEIGH_VAR_ANYCAST_DELAY] = 1 * HZ, [NEIGH_VAR_PROXY_DELAY] = (8 * HZ) / 10, [NEIGH_VAR_LOCKTIME] = 1 * HZ, }, }, .gc_interval = 30 * HZ, .gc_thresh1 = 128, .gc_thresh2 = 512, .gc_thresh3 = 1024, }; EXPORT_SYMBOL(arp_tbl); int arp_mc_map(__be32 addr, u8 *haddr, struct net_device *dev, int dir) { switch (dev->type) { case ARPHRD_ETHER: case ARPHRD_FDDI: case ARPHRD_IEEE802: ip_eth_mc_map(addr, haddr); return 0; case ARPHRD_INFINIBAND: ip_ib_mc_map(addr, dev->broadcast, haddr); return 0; case ARPHRD_IPGRE: ip_ipgre_mc_map(addr, dev->broadcast, haddr); return 0; default: if (dir) { memcpy(haddr, dev->broadcast, dev->addr_len); return 0; } } return -EINVAL; } static u32 arp_hash(const void *pkey, const struct net_device *dev, __u32 *hash_rnd) { return arp_hashfn(pkey, dev, hash_rnd); } static bool arp_key_eq(const struct neighbour *neigh, const void *pkey) { return neigh_key_eq32(neigh, pkey); } static int arp_constructor(struct neighbour *neigh) { __be32 addr; struct net_device *dev = neigh->dev; struct in_device *in_dev; struct neigh_parms *parms; u32 inaddr_any = INADDR_ANY; if (dev->flags & (IFF_LOOPBACK | IFF_POINTOPOINT)) memcpy(neigh->primary_key, &inaddr_any, arp_tbl.key_len); addr = *(__be32 *)neigh->primary_key; rcu_read_lock(); in_dev = __in_dev_get_rcu(dev); if (!in_dev) { rcu_read_unlock(); return -EINVAL; } neigh->type = inet_addr_type_dev_table(dev_net(dev), dev, addr); parms = in_dev->arp_parms; __neigh_parms_put(neigh->parms); neigh->parms = neigh_parms_clone(parms); rcu_read_unlock(); if (!dev->header_ops) { neigh->nud_state = NUD_NOARP; neigh->ops = &arp_direct_ops; neigh->output = neigh_direct_output; } else { /* Good devices (checked by reading texts, but only Ethernet is tested) ARPHRD_ETHER: (ethernet, apfddi) ARPHRD_FDDI: (fddi) ARPHRD_IEEE802: (tr) ARPHRD_METRICOM: (strip) ARPHRD_ARCNET: etc. etc. etc. ARPHRD_IPDDP will also work, if author repairs it. I did not it, because this driver does not work even in old paradigm. */ if (neigh->type == RTN_MULTICAST) { neigh->nud_state = NUD_NOARP; arp_mc_map(addr, neigh->ha, dev, 1); } else if (dev->flags & (IFF_NOARP | IFF_LOOPBACK)) { neigh->nud_state = NUD_NOARP; memcpy(neigh->ha, dev->dev_addr, dev->addr_len); } else if (neigh->type == RTN_BROADCAST || (dev->flags & IFF_POINTOPOINT)) { neigh->nud_state = NUD_NOARP; memcpy(neigh->ha, dev->broadcast, dev->addr_len); } if (dev->header_ops->cache) neigh->ops = &arp_hh_ops; else neigh->ops = &arp_generic_ops; if (neigh->nud_state & NUD_VALID) neigh->output = neigh->ops->connected_output; else neigh->output = neigh->ops->output; } return 0; } static void arp_error_report(struct neighbour *neigh, struct sk_buff *skb) { dst_link_failure(skb); kfree_skb_reason(skb, SKB_DROP_REASON_NEIGH_FAILED); } /* Create and send an arp packet. */ static void arp_send_dst(int type, int ptype, __be32 dest_ip, struct net_device *dev, __be32 src_ip, const unsigned char *dest_hw, const unsigned char *src_hw, const unsigned char *target_hw, struct dst_entry *dst) { struct sk_buff *skb; /* arp on this interface. */ if (dev->flags & IFF_NOARP) return; skb = arp_create(type, ptype, dest_ip, dev, src_ip, dest_hw, src_hw, target_hw); if (!skb) return; skb_dst_set(skb, dst_clone(dst)); arp_xmit(skb); } void arp_send(int type, int ptype, __be32 dest_ip, struct net_device *dev, __be32 src_ip, const unsigned char *dest_hw, const unsigned char *src_hw, const unsigned char *target_hw) { arp_send_dst(type, ptype, dest_ip, dev, src_ip, dest_hw, src_hw, target_hw, NULL); } EXPORT_SYMBOL(arp_send); static void arp_solicit(struct neighbour *neigh, struct sk_buff *skb) { __be32 saddr = 0; u8 dst_ha[MAX_ADDR_LEN], *dst_hw = NULL; struct net_device *dev = neigh->dev; __be32 target = *(__be32 *)neigh->primary_key; int probes = atomic_read(&neigh->probes); struct in_device *in_dev; struct dst_entry *dst = NULL; rcu_read_lock(); in_dev = __in_dev_get_rcu(dev); if (!in_dev) { rcu_read_unlock(); return; } switch (IN_DEV_ARP_ANNOUNCE(in_dev)) { default: case 0: /* By default announce any local IP */ if (skb && inet_addr_type_dev_table(dev_net(dev), dev, ip_hdr(skb)->saddr) == RTN_LOCAL) saddr = ip_hdr(skb)->saddr; break; case 1: /* Restrict announcements of saddr in same subnet */ if (!skb) break; saddr = ip_hdr(skb)->saddr; if (inet_addr_type_dev_table(dev_net(dev), dev, saddr) == RTN_LOCAL) { /* saddr should be known to target */ if (inet_addr_onlink(in_dev, target, saddr)) break; } saddr = 0; break; case 2: /* Avoid secondary IPs, get a primary/preferred one */ break; } rcu_read_unlock(); if (!saddr) saddr = inet_select_addr(dev, target, RT_SCOPE_LINK); probes -= NEIGH_VAR(neigh->parms, UCAST_PROBES); if (probes < 0) { if (!(READ_ONCE(neigh->nud_state) & NUD_VALID)) pr_debug("trying to ucast probe in NUD_INVALID\n"); neigh_ha_snapshot(dst_ha, neigh, dev); dst_hw = dst_ha; } else { probes -= NEIGH_VAR(neigh->parms, APP_PROBES); if (probes < 0) { neigh_app_ns(neigh); return; } } if (skb && !(dev->priv_flags & IFF_XMIT_DST_RELEASE)) dst = skb_dst(skb); arp_send_dst(ARPOP_REQUEST, ETH_P_ARP, target, dev, saddr, dst_hw, dev->dev_addr, NULL, dst); } static int arp_ignore(struct in_device *in_dev, __be32 sip, __be32 tip) { struct net *net = dev_net(in_dev->dev); int scope; switch (IN_DEV_ARP_IGNORE(in_dev)) { case 0: /* Reply, the tip is already validated */ return 0; case 1: /* Reply only if tip is configured on the incoming interface */ sip = 0; scope = RT_SCOPE_HOST; break; case 2: /* * Reply only if tip is configured on the incoming interface * and is in same subnet as sip */ scope = RT_SCOPE_HOST; break; case 3: /* Do not reply for scope host addresses */ sip = 0; scope = RT_SCOPE_LINK; in_dev = NULL; break; case 4: /* Reserved */ case 5: case 6: case 7: return 0; case 8: /* Do not reply */ return 1; default: return 0; } return !inet_confirm_addr(net, in_dev, sip, tip, scope); } static int arp_accept(struct in_device *in_dev, __be32 sip) { struct net *net = dev_net(in_dev->dev); int scope = RT_SCOPE_LINK; switch (IN_DEV_ARP_ACCEPT(in_dev)) { case 0: /* Don't create new entries from garp */ return 0; case 1: /* Create new entries from garp */ return 1; case 2: /* Create a neighbor in the arp table only if sip * is in the same subnet as an address configured * on the interface that received the garp message */ return !!inet_confirm_addr(net, in_dev, sip, 0, scope); default: return 0; } } static int arp_filter(__be32 sip, __be32 tip, struct net_device *dev) { struct rtable *rt; int flag = 0; /*unsigned long now; */ struct net *net = dev_net(dev); rt = ip_route_output(net, sip, tip, 0, l3mdev_master_ifindex_rcu(dev), RT_SCOPE_UNIVERSE); if (IS_ERR(rt)) return 1; if (rt->dst.dev != dev) { __NET_INC_STATS(net, LINUX_MIB_ARPFILTER); flag = 1; } ip_rt_put(rt); return flag; } /* * Check if we can use proxy ARP for this path */ static inline int arp_fwd_proxy(struct in_device *in_dev, struct net_device *dev, struct rtable *rt) { struct in_device *out_dev; int imi, omi = -1; if (rt->dst.dev == dev) return 0; if (!IN_DEV_PROXY_ARP(in_dev)) return 0; imi = IN_DEV_MEDIUM_ID(in_dev); if (imi == 0) return 1; if (imi == -1) return 0; /* place to check for proxy_arp for routes */ out_dev = __in_dev_get_rcu(rt->dst.dev); if (out_dev) omi = IN_DEV_MEDIUM_ID(out_dev); return omi != imi && omi != -1; } /* * Check for RFC3069 proxy arp private VLAN (allow to send back to same dev) * * RFC3069 supports proxy arp replies back to the same interface. This * is done to support (ethernet) switch features, like RFC 3069, where * the individual ports are not allowed to communicate with each * other, BUT they are allowed to talk to the upstream router. As * described in RFC 3069, it is possible to allow these hosts to * communicate through the upstream router, by proxy_arp'ing. * * RFC 3069: "VLAN Aggregation for Efficient IP Address Allocation" * * This technology is known by different names: * In RFC 3069 it is called VLAN Aggregation. * Cisco and Allied Telesyn call it Private VLAN. * Hewlett-Packard call it Source-Port filtering or port-isolation. * Ericsson call it MAC-Forced Forwarding (RFC Draft). * */ static inline int arp_fwd_pvlan(struct in_device *in_dev, struct net_device *dev, struct rtable *rt, __be32 sip, __be32 tip) { /* Private VLAN is only concerned about the same ethernet segment */ if (rt->dst.dev != dev) return 0; /* Don't reply on self probes (often done by windowz boxes)*/ if (sip == tip) return 0; if (IN_DEV_PROXY_ARP_PVLAN(in_dev)) return 1; else return 0; } /* * Interface to link layer: send routine and receive handler. */ /* * Create an arp packet. If dest_hw is not set, we create a broadcast * message. */ struct sk_buff *arp_create(int type, int ptype, __be32 dest_ip, struct net_device *dev, __be32 src_ip, const unsigned char *dest_hw, const unsigned char *src_hw, const unsigned char *target_hw) { struct sk_buff *skb; struct arphdr *arp; unsigned char *arp_ptr; int hlen = LL_RESERVED_SPACE(dev); int tlen = dev->needed_tailroom; /* * Allocate a buffer */ skb = alloc_skb(arp_hdr_len(dev) + hlen + tlen, GFP_ATOMIC); if (!skb) return NULL; skb_reserve(skb, hlen); skb_reset_network_header(skb); arp = skb_put(skb, arp_hdr_len(dev)); skb->dev = dev; skb->protocol = htons(ETH_P_ARP); if (!src_hw) src_hw = dev->dev_addr; if (!dest_hw) dest_hw = dev->broadcast; /* * Fill the device header for the ARP frame */ if (dev_hard_header(skb, dev, ptype, dest_hw, src_hw, skb->len) < 0) goto out; /* * Fill out the arp protocol part. * * The arp hardware type should match the device type, except for FDDI, * which (according to RFC 1390) should always equal 1 (Ethernet). */ /* * Exceptions everywhere. AX.25 uses the AX.25 PID value not the * DIX code for the protocol. Make these device structure fields. */ switch (dev->type) { default: arp->ar_hrd = htons(dev->type); arp->ar_pro = htons(ETH_P_IP); break; #if IS_ENABLED(CONFIG_AX25) case ARPHRD_AX25: arp->ar_hrd = htons(ARPHRD_AX25); arp->ar_pro = htons(AX25_P_IP); break; #if IS_ENABLED(CONFIG_NETROM) case ARPHRD_NETROM: arp->ar_hrd = htons(ARPHRD_NETROM); arp->ar_pro = htons(AX25_P_IP); break; #endif #endif #if IS_ENABLED(CONFIG_FDDI) case ARPHRD_FDDI: arp->ar_hrd = htons(ARPHRD_ETHER); arp->ar_pro = htons(ETH_P_IP); break; #endif } arp->ar_hln = dev->addr_len; arp->ar_pln = 4; arp->ar_op = htons(type); arp_ptr = (unsigned char *)(arp + 1); memcpy(arp_ptr, src_hw, dev->addr_len); arp_ptr += dev->addr_len; memcpy(arp_ptr, &src_ip, 4); arp_ptr += 4; switch (dev->type) { #if IS_ENABLED(CONFIG_FIREWIRE_NET) case ARPHRD_IEEE1394: break; #endif default: if (target_hw) memcpy(arp_ptr, target_hw, dev->addr_len); else memset(arp_ptr, 0, dev->addr_len); arp_ptr += dev->addr_len; } memcpy(arp_ptr, &dest_ip, 4); return skb; out: kfree_skb(skb); return NULL; } EXPORT_SYMBOL(arp_create); static int arp_xmit_finish(struct net *net, struct sock *sk, struct sk_buff *skb) { return dev_queue_xmit(skb); } /* * Send an arp packet. */ void arp_xmit(struct sk_buff *skb) { rcu_read_lock(); /* Send it off, maybe filter it using firewalling first. */ NF_HOOK(NFPROTO_ARP, NF_ARP_OUT, dev_net_rcu(skb->dev), NULL, skb, NULL, skb->dev, arp_xmit_finish); rcu_read_unlock(); } EXPORT_SYMBOL(arp_xmit); static bool arp_is_garp(struct net *net, struct net_device *dev, int *addr_type, __be16 ar_op, __be32 sip, __be32 tip, unsigned char *sha, unsigned char *tha) { bool is_garp = tip == sip; /* Gratuitous ARP _replies_ also require target hwaddr to be * the same as source. */ if (is_garp && ar_op == htons(ARPOP_REPLY)) is_garp = /* IPv4 over IEEE 1394 doesn't provide target * hardware address field in its ARP payload. */ tha && !memcmp(tha, sha, dev->addr_len); if (is_garp) { *addr_type = inet_addr_type_dev_table(net, dev, sip); if (*addr_type != RTN_UNICAST) is_garp = false; } return is_garp; } /* * Process an arp request. */ static int arp_process(struct net *net, struct sock *sk, struct sk_buff *skb) { struct net_device *dev = skb->dev; struct in_device *in_dev = __in_dev_get_rcu(dev); struct arphdr *arp; unsigned char *arp_ptr; struct rtable *rt; unsigned char *sha; unsigned char *tha = NULL; __be32 sip, tip; u16 dev_type = dev->type; int addr_type; struct neighbour *n; struct dst_entry *reply_dst = NULL; bool is_garp = false; /* arp_rcv below verifies the ARP header and verifies the device * is ARP'able. */ if (!in_dev) goto out_free_skb; arp = arp_hdr(skb); switch (dev_type) { default: if (arp->ar_pro != htons(ETH_P_IP) || htons(dev_type) != arp->ar_hrd) goto out_free_skb; break; case ARPHRD_ETHER: case ARPHRD_FDDI: case ARPHRD_IEEE802: /* * ETHERNET, and Fibre Channel (which are IEEE 802 * devices, according to RFC 2625) devices will accept ARP * hardware types of either 1 (Ethernet) or 6 (IEEE 802.2). * This is the case also of FDDI, where the RFC 1390 says that * FDDI devices should accept ARP hardware of (1) Ethernet, * however, to be more robust, we'll accept both 1 (Ethernet) * or 6 (IEEE 802.2) */ if ((arp->ar_hrd != htons(ARPHRD_ETHER) && arp->ar_hrd != htons(ARPHRD_IEEE802)) || arp->ar_pro != htons(ETH_P_IP)) goto out_free_skb; break; case ARPHRD_AX25: if (arp->ar_pro != htons(AX25_P_IP) || arp->ar_hrd != htons(ARPHRD_AX25)) goto out_free_skb; break; case ARPHRD_NETROM: if (arp->ar_pro != htons(AX25_P_IP) || arp->ar_hrd != htons(ARPHRD_NETROM)) goto out_free_skb; break; } /* Understand only these message types */ if (arp->ar_op != htons(ARPOP_REPLY) && arp->ar_op != htons(ARPOP_REQUEST)) goto out_free_skb; /* * Extract fields */ arp_ptr = (unsigned char *)(arp + 1); sha = arp_ptr; arp_ptr += dev->addr_len; memcpy(&sip, arp_ptr, 4); arp_ptr += 4; switch (dev_type) { #if IS_ENABLED(CONFIG_FIREWIRE_NET) case ARPHRD_IEEE1394: break; #endif default: tha = arp_ptr; arp_ptr += dev->addr_len; } memcpy(&tip, arp_ptr, 4); /* * Check for bad requests for 127.x.x.x and requests for multicast * addresses. If this is one such, delete it. */ if (ipv4_is_multicast(tip) || (!IN_DEV_ROUTE_LOCALNET(in_dev) && ipv4_is_loopback(tip))) goto out_free_skb; /* * For some 802.11 wireless deployments (and possibly other networks), * there will be an ARP proxy and gratuitous ARP frames are attacks * and thus should not be accepted. */ if (sip == tip && IN_DEV_ORCONF(in_dev, DROP_GRATUITOUS_ARP)) goto out_free_skb; /* * Special case: We must set Frame Relay source Q.922 address */ if (dev_type == ARPHRD_DLCI) sha = dev->broadcast; /* * Process entry. The idea here is we want to send a reply if it is a * request for us or if it is a request for someone else that we hold * a proxy for. We want to add an entry to our cache if it is a reply * to us or if it is a request for our address. * (The assumption for this last is that if someone is requesting our * address, they are probably intending to talk to us, so it saves time * if we cache their address. Their address is also probably not in * our cache, since ours is not in their cache.) * * Putting this another way, we only care about replies if they are to * us, in which case we add them to the cache. For requests, we care * about those for us and those for our proxies. We reply to both, * and in the case of requests for us we add the requester to the arp * cache. */ if (arp->ar_op == htons(ARPOP_REQUEST) && skb_metadata_dst(skb)) reply_dst = (struct dst_entry *) iptunnel_metadata_reply(skb_metadata_dst(skb), GFP_ATOMIC); /* Special case: IPv4 duplicate address detection packet (RFC2131) */ if (sip == 0) { if (arp->ar_op == htons(ARPOP_REQUEST) && inet_addr_type_dev_table(net, dev, tip) == RTN_LOCAL && !arp_ignore(in_dev, sip, tip)) arp_send_dst(ARPOP_REPLY, ETH_P_ARP, sip, dev, tip, sha, dev->dev_addr, sha, reply_dst); goto out_consume_skb; } if (arp->ar_op == htons(ARPOP_REQUEST) && ip_route_input_noref(skb, tip, sip, 0, dev) == 0) { rt = skb_rtable(skb); addr_type = rt->rt_type; if (addr_type == RTN_LOCAL) { int dont_send; dont_send = arp_ignore(in_dev, sip, tip); if (!dont_send && IN_DEV_ARPFILTER(in_dev)) dont_send = arp_filter(sip, tip, dev); if (!dont_send) { n = neigh_event_ns(&arp_tbl, sha, &sip, dev); if (n) { arp_send_dst(ARPOP_REPLY, ETH_P_ARP, sip, dev, tip, sha, dev->dev_addr, sha, reply_dst); neigh_release(n); } } goto out_consume_skb; } else if (IN_DEV_FORWARD(in_dev)) { if (addr_type == RTN_UNICAST && (arp_fwd_proxy(in_dev, dev, rt) || arp_fwd_pvlan(in_dev, dev, rt, sip, tip) || (rt->dst.dev != dev && pneigh_lookup(&arp_tbl, net, &tip, dev, 0)))) { n = neigh_event_ns(&arp_tbl, sha, &sip, dev); if (n) neigh_release(n); if (NEIGH_CB(skb)->flags & LOCALLY_ENQUEUED || skb->pkt_type == PACKET_HOST || NEIGH_VAR(in_dev->arp_parms, PROXY_DELAY) == 0) { arp_send_dst(ARPOP_REPLY, ETH_P_ARP, sip, dev, tip, sha, dev->dev_addr, sha, reply_dst); } else { pneigh_enqueue(&arp_tbl, in_dev->arp_parms, skb); goto out_free_dst; } goto out_consume_skb; } } } /* Update our ARP tables */ n = __neigh_lookup(&arp_tbl, &sip, dev, 0); addr_type = -1; if (n || arp_accept(in_dev, sip)) { is_garp = arp_is_garp(net, dev, &addr_type, arp->ar_op, sip, tip, sha, tha); } if (arp_accept(in_dev, sip)) { /* Unsolicited ARP is not accepted by default. It is possible, that this option should be enabled for some devices (strip is candidate) */ if (!n && (is_garp || (arp->ar_op == htons(ARPOP_REPLY) && (addr_type == RTN_UNICAST || (addr_type < 0 && /* postpone calculation to as late as possible */ inet_addr_type_dev_table(net, dev, sip) == RTN_UNICAST))))) n = __neigh_lookup(&arp_tbl, &sip, dev, 1); } if (n) { int state = NUD_REACHABLE; int override; /* If several different ARP replies follows back-to-back, use the FIRST one. It is possible, if several proxy agents are active. Taking the first reply prevents arp trashing and chooses the fastest router. */ override = time_after(jiffies, n->updated + NEIGH_VAR(n->parms, LOCKTIME)) || is_garp; /* Broadcast replies and request packets do not assert neighbour reachability. */ if (arp->ar_op != htons(ARPOP_REPLY) || skb->pkt_type != PACKET_HOST) state = NUD_STALE; neigh_update(n, sha, state, override ? NEIGH_UPDATE_F_OVERRIDE : 0, 0); neigh_release(n); } out_consume_skb: consume_skb(skb); out_free_dst: dst_release(reply_dst); return NET_RX_SUCCESS; out_free_skb: kfree_skb(skb); return NET_RX_DROP; } static void parp_redo(struct sk_buff *skb) { arp_process(dev_net(skb->dev), NULL, skb); } static int arp_is_multicast(const void *pkey) { return ipv4_is_multicast(*((__be32 *)pkey)); } /* * Receive an arp request from the device layer. */ static int arp_rcv(struct sk_buff *skb, struct net_device *dev, struct packet_type *pt, struct net_device *orig_dev) { const struct arphdr *arp; /* do not tweak dropwatch on an ARP we will ignore */ if (dev->flags & IFF_NOARP || skb->pkt_type == PACKET_OTHERHOST || skb->pkt_type == PACKET_LOOPBACK) goto consumeskb; skb = skb_share_check(skb, GFP_ATOMIC); if (!skb) goto out_of_mem; /* ARP header, plus 2 device addresses, plus 2 IP addresses. */ if (!pskb_may_pull(skb, arp_hdr_len(dev))) goto freeskb; arp = arp_hdr(skb); if (arp->ar_hln != dev->addr_len || arp->ar_pln != 4) goto freeskb; memset(NEIGH_CB(skb), 0, sizeof(struct neighbour_cb)); return NF_HOOK(NFPROTO_ARP, NF_ARP_IN, dev_net(dev), NULL, skb, dev, NULL, arp_process); consumeskb: consume_skb(skb); return NET_RX_SUCCESS; freeskb: kfree_skb(skb); out_of_mem: return NET_RX_DROP; } /* * User level interface (ioctl) */ static struct net_device *arp_req_dev_by_name(struct net *net, struct arpreq *r, bool getarp) { struct net_device *dev; if (getarp) dev = dev_get_by_name_rcu(net, r->arp_dev); else dev = __dev_get_by_name(net, r->arp_dev); if (!dev) return ERR_PTR(-ENODEV); /* Mmmm... It is wrong... ARPHRD_NETROM == 0 */ if (!r->arp_ha.sa_family) r->arp_ha.sa_family = dev->type; if ((r->arp_flags & ATF_COM) && r->arp_ha.sa_family != dev->type) return ERR_PTR(-EINVAL); return dev; } static struct net_device *arp_req_dev(struct net *net, struct arpreq *r) { struct net_device *dev; struct rtable *rt; __be32 ip; if (r->arp_dev[0]) return arp_req_dev_by_name(net, r, false); if (r->arp_flags & ATF_PUBL) return NULL; ip = ((struct sockaddr_in *)&r->arp_pa)->sin_addr.s_addr; rt = ip_route_output(net, ip, 0, 0, 0, RT_SCOPE_LINK); if (IS_ERR(rt)) return ERR_CAST(rt); dev = rt->dst.dev; ip_rt_put(rt); if (!dev) return ERR_PTR(-EINVAL); return dev; } /* * Set (create) an ARP cache entry. */ static int arp_req_set_proxy(struct net *net, struct net_device *dev, int on) { if (!dev) { IPV4_DEVCONF_ALL(net, PROXY_ARP) = on; return 0; } if (__in_dev_get_rtnl_net(dev)) { IN_DEV_CONF_SET(__in_dev_get_rtnl_net(dev), PROXY_ARP, on); return 0; } return -ENXIO; } static int arp_req_set_public(struct net *net, struct arpreq *r, struct net_device *dev) { __be32 mask = ((struct sockaddr_in *)&r->arp_netmask)->sin_addr.s_addr; if (!dev && (r->arp_flags & ATF_COM)) { dev = dev_getbyhwaddr(net, r->arp_ha.sa_family, r->arp_ha.sa_data); if (!dev) return -ENODEV; } if (mask) { __be32 ip = ((struct sockaddr_in *)&r->arp_pa)->sin_addr.s_addr; if (!pneigh_lookup(&arp_tbl, net, &ip, dev, 1)) return -ENOBUFS; return 0; } return arp_req_set_proxy(net, dev, 1); } static int arp_req_set(struct net *net, struct arpreq *r) { struct neighbour *neigh; struct net_device *dev; __be32 ip; int err; dev = arp_req_dev(net, r); if (IS_ERR(dev)) return PTR_ERR(dev); if (r->arp_flags & ATF_PUBL) return arp_req_set_public(net, r, dev); switch (dev->type) { #if IS_ENABLED(CONFIG_FDDI) case ARPHRD_FDDI: /* * According to RFC 1390, FDDI devices should accept ARP * hardware types of 1 (Ethernet). However, to be more * robust, we'll accept hardware types of either 1 (Ethernet) * or 6 (IEEE 802.2). */ if (r->arp_ha.sa_family != ARPHRD_FDDI && r->arp_ha.sa_family != ARPHRD_ETHER && r->arp_ha.sa_family != ARPHRD_IEEE802) return -EINVAL; break; #endif default: if (r->arp_ha.sa_family != dev->type) return -EINVAL; break; } ip = ((struct sockaddr_in *)&r->arp_pa)->sin_addr.s_addr; neigh = __neigh_lookup_errno(&arp_tbl, &ip, dev); err = PTR_ERR(neigh); if (!IS_ERR(neigh)) { unsigned int state = NUD_STALE; if (r->arp_flags & ATF_PERM) { r->arp_flags |= ATF_COM; state = NUD_PERMANENT; } err = neigh_update(neigh, (r->arp_flags & ATF_COM) ? r->arp_ha.sa_data : NULL, state, NEIGH_UPDATE_F_OVERRIDE | NEIGH_UPDATE_F_ADMIN, 0); neigh_release(neigh); } return err; } static unsigned int arp_state_to_flags(struct neighbour *neigh) { if (neigh->nud_state&NUD_PERMANENT) return ATF_PERM | ATF_COM; else if (neigh->nud_state&NUD_VALID) return ATF_COM; else return 0; } /* * Get an ARP cache entry. */ static int arp_req_get(struct net *net, struct arpreq *r) { __be32 ip = ((struct sockaddr_in *) &r->arp_pa)->sin_addr.s_addr; struct neighbour *neigh; struct net_device *dev; if (!r->arp_dev[0]) return -ENODEV; dev = arp_req_dev_by_name(net, r, true); if (IS_ERR(dev)) return PTR_ERR(dev); neigh = neigh_lookup(&arp_tbl, &ip, dev); if (!neigh) return -ENXIO; if (READ_ONCE(neigh->nud_state) & NUD_NOARP) { neigh_release(neigh); return -ENXIO; } read_lock_bh(&neigh->lock); memcpy(r->arp_ha.sa_data, neigh->ha, min(dev->addr_len, sizeof(r->arp_ha.sa_data_min))); r->arp_flags = arp_state_to_flags(neigh); read_unlock_bh(&neigh->lock); neigh_release(neigh); r->arp_ha.sa_family = dev->type; netdev_copy_name(dev, r->arp_dev); return 0; } int arp_invalidate(struct net_device *dev, __be32 ip, bool force) { struct neighbour *neigh = neigh_lookup(&arp_tbl, &ip, dev); int err = -ENXIO; struct neigh_table *tbl = &arp_tbl; if (neigh) { if ((READ_ONCE(neigh->nud_state) & NUD_VALID) && !force) { neigh_release(neigh); return 0; } if (READ_ONCE(neigh->nud_state) & ~NUD_NOARP) err = neigh_update(neigh, NULL, NUD_FAILED, NEIGH_UPDATE_F_OVERRIDE| NEIGH_UPDATE_F_ADMIN, 0); write_lock_bh(&tbl->lock); neigh_release(neigh); neigh_remove_one(neigh); write_unlock_bh(&tbl->lock); } return err; } static int arp_req_delete_public(struct net *net, struct arpreq *r, struct net_device *dev) { __be32 mask = ((struct sockaddr_in *)&r->arp_netmask)->sin_addr.s_addr; if (mask) { __be32 ip = ((struct sockaddr_in *)&r->arp_pa)->sin_addr.s_addr; return pneigh_delete(&arp_tbl, net, &ip, dev); } return arp_req_set_proxy(net, dev, 0); } static int arp_req_delete(struct net *net, struct arpreq *r) { struct net_device *dev; __be32 ip; dev = arp_req_dev(net, r); if (IS_ERR(dev)) return PTR_ERR(dev); if (r->arp_flags & ATF_PUBL) return arp_req_delete_public(net, r, dev); ip = ((struct sockaddr_in *)&r->arp_pa)->sin_addr.s_addr; return arp_invalidate(dev, ip, true); } /* * Handle an ARP layer I/O control request. */ int arp_ioctl(struct net *net, unsigned int cmd, void __user *arg) { struct arpreq r; __be32 *netmask; int err; switch (cmd) { case SIOCDARP: case SIOCSARP: if (!ns_capable(net->user_ns, CAP_NET_ADMIN)) return -EPERM; fallthrough; case SIOCGARP: err = copy_from_user(&r, arg, sizeof(struct arpreq)); if (err) return -EFAULT; break; default: return -EINVAL; } if (r.arp_pa.sa_family != AF_INET) return -EPFNOSUPPORT; if (!(r.arp_flags & ATF_PUBL) && (r.arp_flags & (ATF_NETMASK | ATF_DONTPUB))) return -EINVAL; netmask = &((struct sockaddr_in *)&r.arp_netmask)->sin_addr.s_addr; if (!(r.arp_flags & ATF_NETMASK)) *netmask = htonl(0xFFFFFFFFUL); else if (*netmask && *netmask != htonl(0xFFFFFFFFUL)) return -EINVAL; switch (cmd) { case SIOCDARP: rtnl_net_lock(net); err = arp_req_delete(net, &r); rtnl_net_unlock(net); break; case SIOCSARP: rtnl_net_lock(net); err = arp_req_set(net, &r); rtnl_net_unlock(net); break; case SIOCGARP: rcu_read_lock(); err = arp_req_get(net, &r); rcu_read_unlock(); if (!err && copy_to_user(arg, &r, sizeof(r))) err = -EFAULT; break; } return err; } static int arp_netdev_event(struct notifier_block *this, unsigned long event, void *ptr) { struct net_device *dev = netdev_notifier_info_to_dev(ptr); struct netdev_notifier_change_info *change_info; struct in_device *in_dev; bool evict_nocarrier; switch (event) { case NETDEV_CHANGEADDR: neigh_changeaddr(&arp_tbl, dev); rt_cache_flush(dev_net(dev)); break; case NETDEV_CHANGE: change_info = ptr; if (change_info->flags_changed & IFF_NOARP) neigh_changeaddr(&arp_tbl, dev); in_dev = __in_dev_get_rtnl(dev); if (!in_dev) evict_nocarrier = true; else evict_nocarrier = IN_DEV_ARP_EVICT_NOCARRIER(in_dev); if (evict_nocarrier && !netif_carrier_ok(dev)) neigh_carrier_down(&arp_tbl, dev); break; default: break; } return NOTIFY_DONE; } static struct notifier_block arp_netdev_notifier = { .notifier_call = arp_netdev_event, }; /* Note, that it is not on notifier chain. It is necessary, that this routine was called after route cache will be flushed. */ void arp_ifdown(struct net_device *dev) { neigh_ifdown(&arp_tbl, dev); } /* * Called once on startup. */ static struct packet_type arp_packet_type __read_mostly = { .type = cpu_to_be16(ETH_P_ARP), .func = arp_rcv, }; #ifdef CONFIG_PROC_FS #if IS_ENABLED(CONFIG_AX25) /* * ax25 -> ASCII conversion */ static void ax2asc2(ax25_address *a, char *buf) { char c, *s; int n; for (n = 0, s = buf; n < 6; n++) { c = (a->ax25_call[n] >> 1) & 0x7F; if (c != ' ') *s++ = c; } *s++ = '-'; n = (a->ax25_call[6] >> 1) & 0x0F; if (n > 9) { *s++ = '1'; n -= 10; } *s++ = n + '0'; *s++ = '\0'; if (*buf == '\0' || *buf == '-') { buf[0] = '*'; buf[1] = '\0'; } } #endif /* CONFIG_AX25 */ #define HBUFFERLEN 30 static void arp_format_neigh_entry(struct seq_file *seq, struct neighbour *n) { char hbuffer[HBUFFERLEN]; int k, j; char tbuf[16]; struct net_device *dev = n->dev; int hatype = dev->type; read_lock(&n->lock); /* Convert hardware address to XX:XX:XX:XX ... form. */ #if IS_ENABLED(CONFIG_AX25) if (hatype == ARPHRD_AX25 || hatype == ARPHRD_NETROM) ax2asc2((ax25_address *)n->ha, hbuffer); else { #endif for (k = 0, j = 0; k < HBUFFERLEN - 3 && j < dev->addr_len; j++) { hbuffer[k++] = hex_asc_hi(n->ha[j]); hbuffer[k++] = hex_asc_lo(n->ha[j]); hbuffer[k++] = ':'; } if (k != 0) --k; hbuffer[k] = 0; #if IS_ENABLED(CONFIG_AX25) } #endif sprintf(tbuf, "%pI4", n->primary_key); seq_printf(seq, "%-16s 0x%-10x0x%-10x%-17s * %s\n", tbuf, hatype, arp_state_to_flags(n), hbuffer, dev->name); read_unlock(&n->lock); } static void arp_format_pneigh_entry(struct seq_file *seq, struct pneigh_entry *n) { struct net_device *dev = n->dev; int hatype = dev ? dev->type : 0; char tbuf[16]; sprintf(tbuf, "%pI4", n->key); seq_printf(seq, "%-16s 0x%-10x0x%-10x%s * %s\n", tbuf, hatype, ATF_PUBL | ATF_PERM, "00:00:00:00:00:00", dev ? dev->name : "*"); } static int arp_seq_show(struct seq_file *seq, void *v) { if (v == SEQ_START_TOKEN) { seq_puts(seq, "IP address HW type Flags " "HW address Mask Device\n"); } else { struct neigh_seq_state *state = seq->private; if (state->flags & NEIGH_SEQ_IS_PNEIGH) arp_format_pneigh_entry(seq, v); else arp_format_neigh_entry(seq, v); } return 0; } static void *arp_seq_start(struct seq_file *seq, loff_t *pos) { /* Don't want to confuse "arp -a" w/ magic entries, * so we tell the generic iterator to skip NUD_NOARP. */ return neigh_seq_start(seq, pos, &arp_tbl, NEIGH_SEQ_SKIP_NOARP); } static const struct seq_operations arp_seq_ops = { .start = arp_seq_start, .next = neigh_seq_next, .stop = neigh_seq_stop, .show = arp_seq_show, }; #endif /* CONFIG_PROC_FS */ static int __net_init arp_net_init(struct net *net) { if (!proc_create_net("arp", 0444, net->proc_net, &arp_seq_ops, sizeof(struct neigh_seq_state))) return -ENOMEM; return 0; } static void __net_exit arp_net_exit(struct net *net) { remove_proc_entry("arp", net->proc_net); } static struct pernet_operations arp_net_ops = { .init = arp_net_init, .exit = arp_net_exit, }; void __init arp_init(void) { neigh_table_init(NEIGH_ARP_TABLE, &arp_tbl); dev_add_pack(&arp_packet_type); register_pernet_subsys(&arp_net_ops); #ifdef CONFIG_SYSCTL neigh_sysctl_register(NULL, &arp_tbl.parms, NULL); #endif register_netdevice_notifier(&arp_netdev_notifier); }
4 4 3 4 1 1 2 1 1 2 11 1 10 1 7 6 1 3 1 3 1 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 // SPDX-License-Identifier: GPL-2.0 // Copyright (C) 2019 Arm Ltd. #include <linux/arm-smccc.h> #include <linux/kvm_host.h> #include <linux/sched/stat.h> #include <asm/kvm_mmu.h> #include <asm/pvclock-abi.h> #include <kvm/arm_hypercalls.h> void kvm_update_stolen_time(struct kvm_vcpu *vcpu) { struct kvm *kvm = vcpu->kvm; u64 base = vcpu->arch.steal.base; u64 last_steal = vcpu->arch.steal.last_steal; u64 offset = offsetof(struct pvclock_vcpu_stolen_time, stolen_time); u64 steal = 0; int idx; if (base == INVALID_GPA) return; idx = srcu_read_lock(&kvm->srcu); if (!kvm_get_guest(kvm, base + offset, steal)) { steal = le64_to_cpu(steal); vcpu->arch.steal.last_steal = READ_ONCE(current->sched_info.run_delay); steal += vcpu->arch.steal.last_steal - last_steal; kvm_put_guest(kvm, base + offset, cpu_to_le64(steal)); } srcu_read_unlock(&kvm->srcu, idx); } long kvm_hypercall_pv_features(struct kvm_vcpu *vcpu) { u32 feature = smccc_get_arg1(vcpu); long val = SMCCC_RET_NOT_SUPPORTED; switch (feature) { case ARM_SMCCC_HV_PV_TIME_FEATURES: case ARM_SMCCC_HV_PV_TIME_ST: if (vcpu->arch.steal.base != INVALID_GPA) val = SMCCC_RET_SUCCESS; break; } return val; } gpa_t kvm_init_stolen_time(struct kvm_vcpu *vcpu) { struct pvclock_vcpu_stolen_time init_values = {}; struct kvm *kvm = vcpu->kvm; u64 base = vcpu->arch.steal.base; if (base == INVALID_GPA) return base; /* * Start counting stolen time from the time the guest requests * the feature enabled. */ vcpu->arch.steal.last_steal = current->sched_info.run_delay; kvm_write_guest_lock(kvm, base, &init_values, sizeof(init_values)); return base; } bool kvm_arm_pvtime_supported(void) { return !!sched_info_on(); } int kvm_arm_pvtime_set_attr(struct kvm_vcpu *vcpu, struct kvm_device_attr *attr) { u64 __user *user = (u64 __user *)attr->addr; struct kvm *kvm = vcpu->kvm; u64 ipa; int ret = 0; int idx; if (!kvm_arm_pvtime_supported() || attr->attr != KVM_ARM_VCPU_PVTIME_IPA) return -ENXIO; if (get_user(ipa, user)) return -EFAULT; if (!IS_ALIGNED(ipa, 64)) return -EINVAL; if (vcpu->arch.steal.base != INVALID_GPA) return -EEXIST; /* Check the address is in a valid memslot */ idx = srcu_read_lock(&kvm->srcu); if (kvm_is_error_hva(gfn_to_hva(kvm, ipa >> PAGE_SHIFT))) ret = -EINVAL; srcu_read_unlock(&kvm->srcu, idx); if (!ret) vcpu->arch.steal.base = ipa; return ret; } int kvm_arm_pvtime_get_attr(struct kvm_vcpu *vcpu, struct kvm_device_attr *attr) { u64 __user *user = (u64 __user *)attr->addr; u64 ipa; if (!kvm_arm_pvtime_supported() || attr->attr != KVM_ARM_VCPU_PVTIME_IPA) return -ENXIO; ipa = vcpu->arch.steal.base; if (put_user(ipa, user)) return -EFAULT; return 0; } int kvm_arm_pvtime_has_attr(struct kvm_vcpu *vcpu, struct kvm_device_attr *attr) { switch (attr->attr) { case KVM_ARM_VCPU_PVTIME_IPA: if (kvm_arm_pvtime_supported()) return 0; } return -ENXIO; }
37 4 2 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 /* SPDX-License-Identifier: GPL-2.0 */ #ifndef _LINUX_USER_NAMESPACE_H #define _LINUX_USER_NAMESPACE_H #include <linux/kref.h> #include <linux/nsproxy.h> #include <linux/ns_common.h> #include <linux/rculist_nulls.h> #include <linux/sched.h> #include <linux/workqueue.h> #include <linux/rcuref.h> #include <linux/rwsem.h> #include <linux/sysctl.h> #include <linux/err.h> #define UID_GID_MAP_MAX_BASE_EXTENTS 5 #define UID_GID_MAP_MAX_EXTENTS 340 struct uid_gid_extent { u32 first; u32 lower_first; u32 count; }; struct uid_gid_map { /* 64 bytes -- 1 cache line */ union { struct { struct uid_gid_extent extent[UID_GID_MAP_MAX_BASE_EXTENTS]; u32 nr_extents; }; struct { struct uid_gid_extent *forward; struct uid_gid_extent *reverse; }; }; }; #define USERNS_SETGROUPS_ALLOWED 1UL #define USERNS_INIT_FLAGS USERNS_SETGROUPS_ALLOWED struct ucounts; enum ucount_type { UCOUNT_USER_NAMESPACES, UCOUNT_PID_NAMESPACES, UCOUNT_UTS_NAMESPACES, UCOUNT_IPC_NAMESPACES, UCOUNT_NET_NAMESPACES, UCOUNT_MNT_NAMESPACES, UCOUNT_CGROUP_NAMESPACES, UCOUNT_TIME_NAMESPACES, #ifdef CONFIG_INOTIFY_USER UCOUNT_INOTIFY_INSTANCES, UCOUNT_INOTIFY_WATCHES, #endif #ifdef CONFIG_FANOTIFY UCOUNT_FANOTIFY_GROUPS, UCOUNT_FANOTIFY_MARKS, #endif UCOUNT_COUNTS, }; enum rlimit_type { UCOUNT_RLIMIT_NPROC, UCOUNT_RLIMIT_MSGQUEUE, UCOUNT_RLIMIT_SIGPENDING, UCOUNT_RLIMIT_MEMLOCK, UCOUNT_RLIMIT_COUNTS, }; #if IS_ENABLED(CONFIG_BINFMT_MISC) struct binfmt_misc; #endif struct user_namespace { struct uid_gid_map uid_map; struct uid_gid_map gid_map; struct uid_gid_map projid_map; struct user_namespace *parent; int level; kuid_t owner; kgid_t group; struct ns_common ns; unsigned long flags; /* parent_could_setfcap: true if the creator if this ns had CAP_SETFCAP * in its effective capability set at the child ns creation time. */ bool parent_could_setfcap; #ifdef CONFIG_KEYS /* List of joinable keyrings in this namespace. Modification access of * these pointers is controlled by keyring_sem. Once * user_keyring_register is set, it won't be changed, so it can be * accessed directly with READ_ONCE(). */ struct list_head keyring_name_list; struct key *user_keyring_register; struct rw_semaphore keyring_sem; #endif /* Register of per-UID persistent keyrings for this namespace */ #ifdef CONFIG_PERSISTENT_KEYRINGS struct key *persistent_keyring_register; #endif struct work_struct work; #ifdef CONFIG_SYSCTL struct ctl_table_set set; struct ctl_table_header *sysctls; #endif struct ucounts *ucounts; long ucount_max[UCOUNT_COUNTS]; long rlimit_max[UCOUNT_RLIMIT_COUNTS]; #if IS_ENABLED(CONFIG_BINFMT_MISC) struct binfmt_misc *binfmt_misc; #endif } __randomize_layout; struct ucounts { struct hlist_nulls_node node; struct user_namespace *ns; kuid_t uid; struct rcu_head rcu; rcuref_t count; atomic_long_t ucount[UCOUNT_COUNTS]; atomic_long_t rlimit[UCOUNT_RLIMIT_COUNTS]; }; extern struct user_namespace init_user_ns; extern struct ucounts init_ucounts; bool setup_userns_sysctls(struct user_namespace *ns); void retire_userns_sysctls(struct user_namespace *ns); struct ucounts *inc_ucount(struct user_namespace *ns, kuid_t uid, enum ucount_type type); void dec_ucount(struct ucounts *ucounts, enum ucount_type type); struct ucounts *alloc_ucounts(struct user_namespace *ns, kuid_t uid); void put_ucounts(struct ucounts *ucounts); static inline struct ucounts * __must_check get_ucounts(struct ucounts *ucounts) { if (rcuref_get(&ucounts->count)) return ucounts; return NULL; } static inline long get_rlimit_value(struct ucounts *ucounts, enum rlimit_type type) { return atomic_long_read(&ucounts->rlimit[type]); } long inc_rlimit_ucounts(struct ucounts *ucounts, enum rlimit_type type, long v); bool dec_rlimit_ucounts(struct ucounts *ucounts, enum rlimit_type type, long v); long inc_rlimit_get_ucounts(struct ucounts *ucounts, enum rlimit_type type, bool override_rlimit); void dec_rlimit_put_ucounts(struct ucounts *ucounts, enum rlimit_type type); bool is_rlimit_overlimit(struct ucounts *ucounts, enum rlimit_type type, unsigned long max); static inline long get_userns_rlimit_max(struct user_namespace *ns, enum rlimit_type type) { return READ_ONCE(ns->rlimit_max[type]); } static inline void set_userns_rlimit_max(struct user_namespace *ns, enum rlimit_type type, unsigned long max) { ns->rlimit_max[type] = max <= LONG_MAX ? max : LONG_MAX; } #ifdef CONFIG_USER_NS static inline struct user_namespace *get_user_ns(struct user_namespace *ns) { if (ns) refcount_inc(&ns->ns.count); return ns; } extern int create_user_ns(struct cred *new); extern int unshare_userns(unsigned long unshare_flags, struct cred **new_cred); extern void __put_user_ns(struct user_namespace *ns); static inline void put_user_ns(struct user_namespace *ns) { if (ns && refcount_dec_and_test(&ns->ns.count)) __put_user_ns(ns); } struct seq_operations; extern const struct seq_operations proc_uid_seq_operations; extern const struct seq_operations proc_gid_seq_operations; extern const struct seq_operations proc_projid_seq_operations; extern ssize_t proc_uid_map_write(struct file *, const char __user *, size_t, loff_t *); extern ssize_t proc_gid_map_write(struct file *, const char __user *, size_t, loff_t *); extern ssize_t proc_projid_map_write(struct file *, const char __user *, size_t, loff_t *); extern ssize_t proc_setgroups_write(struct file *, const char __user *, size_t, loff_t *); extern int proc_setgroups_show(struct seq_file *m, void *v); extern bool userns_may_setgroups(const struct user_namespace *ns); extern bool in_userns(const struct user_namespace *ancestor, const struct user_namespace *child); extern bool current_in_userns(const struct user_namespace *target_ns); struct ns_common *ns_get_owner(struct ns_common *ns); #else static inline struct user_namespace *get_user_ns(struct user_namespace *ns) { return &init_user_ns; } static inline int create_user_ns(struct cred *new) { return -EINVAL; } static inline int unshare_userns(unsigned long unshare_flags, struct cred **new_cred) { if (unshare_flags & CLONE_NEWUSER) return -EINVAL; return 0; } static inline void put_user_ns(struct user_namespace *ns) { } static inline bool userns_may_setgroups(const struct user_namespace *ns) { return true; } static inline bool in_userns(const struct user_namespace *ancestor, const struct user_namespace *child) { return true; } static inline bool current_in_userns(const struct user_namespace *target_ns) { return true; } static inline struct ns_common *ns_get_owner(struct ns_common *ns) { return ERR_PTR(-EPERM); } #endif #endif /* _LINUX_USER_H */
1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 1517 1518 1519 1520 1521 1522 1523 1524 1525 1526 1527 1528 1529 1530 1531 1532 1533 1534 1535 1536 1537 1538 1539 1540 1541 1542 1543 1544 1545 1546 1547 1548 1549 1550 1551 1552 1553 1554 1555 1556 1557 1558 1559 1560 1561 1562 1563 1564 1565 1566 1567 1568 1569 1570 1571 1572 1573 1574 1575 1576 1577 1578 1579 1580 1581 1582 1583 1584 1585 1586 1587 1588 1589 1590 1591 1592 1593 1594 1595 1596 1597 1598 1599 1600 1601 1602 1603 1604 1605 1606 1607 1608 1609 1610 1611 1612 1613 1614 1615 1616 1617 1618 1619 1620 1621 1622 1623 1624 1625 1626 1627 1628 1629 1630 1631 1632 1633 1634 1635 1636 1637 1638 1639 1640 1641 1642 1643 1644 1645 1646 1647 1648 1649 1650 1651 1652 1653 1654 1655 1656 1657 1658 1659 1660 1661 1662 1663 1664 1665 1666 1667 1668 1669 1670 1671 1672 1673 1674 1675 1676 1677 1678 1679 1680 1681 1682 1683 1684 1685 1686 1687 1688 1689 1690 1691 1692 1693 1694 1695 1696 1697 1698 1699 1700 1701 1702 1703 1704 1705 1706 1707 1708 1709 1710 1711 1712 1713 1714 1715 1716 1717 1718 1719 1720 1721 1722 1723 1724 1725 1726 1727 1728 1729 1730 1731 1732 1733 1734 1735 1736 1737 1738 1739 1740 1741 1742 1743 1744 1745 1746 1747 1748 1749 1750 1751 1752 1753 1754 1755 1756 1757 1758 1759 1760 1761 1762 1763 1764 1765 1766 1767 1768 1769 1770 1771 1772 1773 1774 1775 1776 1777 1778 1779 1780 1781 1782 1783 1784 1785 1786 1787 1788 1789 1790 1791 1792 1793 1794 1795 1796 1797 1798 1799 1800 1801 1802 1803 1804 1805 1806 1807 1808 1809 1810 1811 1812 1813 1814 1815 1816 1817 1818 1819 1820 1821 1822 1823 1824 1825 1826 1827 1828 1829 1830 1831 1832 1833 1834 1835 1836 1837 1838 1839 1840 1841 1842 1843 1844 1845 1846 1847 1848 1849 1850 1851 1852 1853 1854 1855 1856 1857 1858 1859 1860 1861 1862 1863 1864 1865 1866 1867 1868 1869 1870 1871 1872 1873 1874 1875 1876 1877 1878 1879 1880 1881 1882 1883 1884 1885 1886 1887 1888 1889 1890 1891 1892 1893 1894 1895 1896 1897 1898 1899 1900 1901 1902 1903 1904 1905 1906 1907 1908 1909 1910 1911 1912 1913 1914 1915 1916 1917 1918 1919 1920 1921 1922 1923 1924 1925 1926 1927 1928 1929 1930 1931 1932 1933 1934 1935 1936 1937 1938 1939 1940 1941 1942 1943 1944 1945 1946 1947 1948 1949 1950 1951 1952 1953 1954 1955 1956 1957 1958 1959 1960 1961 1962 1963 1964 1965 1966 1967 1968 1969 1970 1971 1972 1973 1974 1975 1976 1977 1978 1979 1980 1981 1982 1983 1984 1985 1986 1987 1988 1989 1990 1991 1992 1993 1994 1995 1996 1997 1998 1999 2000 2001 2002 2003 2004 2005 2006 2007 2008 2009 2010 2011 2012 2013 2014 2015 2016 2017 2018 2019 2020 2021 2022 2023 2024 2025 2026 2027 2028 2029 2030 2031 2032 2033 2034 2035 2036 2037 2038 2039 2040 2041 2042 2043 2044 2045 2046 2047 2048 2049 2050 2051 2052 2053 2054 2055 2056 2057 2058 2059 2060 2061 2062 2063 2064 2065 2066 2067 2068 2069 2070 2071 2072 2073 2074 2075 2076 2077 2078 2079 2080 2081 2082 2083 2084 2085 2086 2087 2088 2089 2090 2091 2092 2093 2094 2095 2096 2097 2098 2099 2100 2101 2102 2103 2104 2105 2106 2107 2108 2109 2110 2111 2112 2113 2114 2115 2116 2117 2118 2119 2120 2121 2122 2123 2124 2125 2126 2127 2128 2129 2130 2131 2132 2133 2134 2135 2136 2137 2138 2139 2140 2141 2142 2143 2144 2145 2146 2147 2148 2149 2150 2151 2152 2153 2154 2155 2156 2157 2158 2159 2160 2161 2162 2163 2164 2165 2166 2167 2168 2169 2170 2171 2172 2173 2174 2175 2176 2177 2178 2179 2180 2181 2182 2183 2184 2185 2186 2187 2188 2189 2190 2191 2192 2193 2194 2195 2196 2197 2198 2199 2200 2201 2202 2203 2204 2205 2206 2207 2208 2209 2210 2211 2212 2213 2214 2215 2216 2217 2218 2219 2220 2221 2222 2223 2224 2225 2226 2227 2228 2229 2230 2231 2232 2233 2234 2235 2236 2237 2238 2239 2240 2241 2242 2243 2244 2245 2246 2247 2248 2249 2250 2251 2252 2253 2254 2255 2256 2257 2258 2259 2260 2261 2262 2263 2264 2265 2266 2267 2268 2269 2270 2271 2272 2273 2274 2275 2276 2277 2278 2279 2280 2281 2282 2283 2284 2285 2286 2287 2288 2289 2290 2291 2292 2293 2294 2295 2296 2297 2298 2299 2300 2301 2302 2303 2304 2305 2306 2307 2308 2309 2310 2311 2312 2313 2314 2315 2316 2317 2318 2319 2320 2321 2322 2323 2324 2325 2326 2327 2328 2329 2330 2331 2332 2333 2334 2335 2336 2337 2338 2339 2340 2341 2342 2343 2344 2345 2346 2347 2348 2349 2350 2351 2352 2353 2354 2355 2356 2357 2358 2359 2360 2361 2362 2363 2364 2365 2366 2367 2368 2369 2370 2371 2372 2373 2374 2375 2376 2377 2378 2379 2380 2381 2382 2383 2384 2385 2386 2387 2388 2389 2390 2391 2392 2393 2394 2395 2396 2397 2398 2399 2400 2401 2402 2403 2404 2405 2406 2407 2408 2409 2410 2411 2412 2413 2414 2415 2416 2417 2418 2419 2420 2421 2422 2423 2424 2425 2426 2427 2428 2429 2430 2431 2432 2433 2434 2435 2436 2437 2438 2439 2440 2441 2442 2443 2444 2445 2446 2447 2448 2449 2450 2451 2452 2453 2454 2455 2456 2457 2458 2459 2460 2461 2462 2463 2464 2465 2466 2467 2468 2469 2470 2471 2472 2473 2474 2475 2476 2477 2478 2479 2480 2481 2482 2483 2484 2485 2486 2487 2488 2489 2490 2491 2492 2493 2494 2495 2496 2497 2498 2499 2500 2501 2502 2503 2504 2505 2506 2507 2508 2509 2510 2511 2512 2513 2514 2515 2516 2517 2518 2519 2520 2521 2522 2523 2524 2525 2526 2527 2528 2529 2530 2531 2532 2533 2534 2535 2536 2537 2538 2539 2540 2541 2542 2543 2544 2545 2546 2547 2548 2549 2550 2551 2552 2553 2554 2555 2556 2557 2558 2559 2560 2561 2562 2563 2564 2565 2566 2567 2568 2569 2570 2571 2572 2573 2574 2575 2576 2577 2578 2579 2580 2581 2582 2583 2584 2585 2586 2587 2588 2589 2590 2591 2592 2593 2594 2595 2596 2597 2598 2599 2600 2601 2602 2603 2604 2605 2606 2607 2608 2609 2610 2611 2612 2613 2614 2615 2616 2617 2618 2619 2620 2621 2622 2623 2624 2625 2626 2627 2628 2629 2630 2631 2632 2633 2634 2635 2636 2637 2638 2639 2640 2641 2642 2643 2644 2645 2646 2647 2648 2649 2650 2651 2652 2653 2654 2655 2656 2657 2658 2659 2660 2661 2662 2663 2664 2665 2666 2667 2668 2669 2670 2671 2672 2673 2674 2675 2676 2677 2678 2679 2680 2681 2682 2683 2684 2685 2686 2687 2688 2689 2690 2691 2692 2693 2694 2695 2696 2697 2698 2699 2700 2701 2702 2703 2704 2705 2706 2707 2708 2709 2710 2711 2712 2713 2714 2715 2716 2717 2718 2719 2720 2721 2722 2723 2724 2725 2726 2727 2728 2729 2730 2731 2732 2733 2734 2735 2736 2737 2738 2739 2740 2741 2742 2743 2744 2745 2746 2747 2748 2749 2750 2751 2752 2753 2754 2755 2756 2757 2758 2759 2760 2761 2762 2763 2764 2765 2766 2767 2768 2769 2770 2771 2772 2773 2774 2775 2776 2777 2778 2779 2780 2781 2782 2783 2784 2785 2786 2787 2788 2789 2790 2791 2792 2793 2794 2795 2796 2797 2798 2799 2800 2801 2802 2803 2804 2805 2806 2807 2808 2809 2810 2811 2812 2813 2814 2815 2816 2817 2818 2819 2820 2821 2822 2823 2824 2825 2826 2827 2828 2829 2830 2831 2832 2833 2834 2835 2836 2837 2838 2839 2840 2841 2842 2843 2844 2845 2846 2847 2848 2849 2850 2851 2852 2853 2854 2855 2856 2857 2858 2859 2860 2861 2862 2863 2864 2865 2866 2867 2868 2869 2870 2871 2872 2873 2874 2875 2876 2877 2878 2879 2880 2881 2882 2883 2884 2885 2886 2887 2888 2889 2890 2891 2892 2893 2894 2895 2896 2897 2898 2899 2900 2901 2902 2903 2904 2905 2906 2907 2908 2909 2910 2911 2912 2913 2914 2915 2916 2917 2918 2919 2920 2921 2922 2923 2924 2925 2926 2927 2928 2929 2930 2931 2932 2933 2934 2935 2936 2937 2938 2939 2940 2941 2942 2943 2944 2945 2946 2947 2948 2949 2950 2951 2952 2953 2954 2955 2956 2957 2958 2959 2960 2961 2962 2963 2964 2965 2966 2967 2968 2969 2970 2971 2972 2973 2974 2975 2976 2977 2978 2979 2980 2981 2982 2983 2984 2985 2986 2987 2988 2989 2990 2991 2992 2993 2994 2995 2996 2997 2998 2999 3000 3001 3002 3003 3004 3005 3006 3007 3008 3009 3010 3011 3012 3013 3014 3015 3016 3017 3018 3019 3020 3021 3022 3023 3024 3025 3026 3027 3028 3029 3030 3031 3032 3033 3034 3035 3036 3037 3038 3039 3040 3041 3042 3043 3044 3045 3046 3047 3048 3049 3050 3051 3052 3053 3054 3055 3056 3057 3058 3059 3060 3061 3062 3063 3064 3065 3066 3067 3068 3069 3070 3071 3072 3073 3074 3075 3076 3077 3078 3079 3080 3081 3082 3083 3084 3085 3086 3087 3088 3089 3090 3091 3092 3093 3094 3095 3096 3097 3098 3099 3100 3101 3102 3103 3104 3105 3106 3107 3108 3109 3110 3111 3112 3113 3114 3115 3116 3117 3118 3119 3120 3121 3122 3123 3124 3125 3126 3127 3128 3129 3130 3131 3132 3133 3134 3135 3136 3137 3138 3139 3140 3141 3142 3143 3144 3145 3146 3147 3148 3149 3150 3151 3152 3153 3154 3155 3156 3157 3158 3159 3160 3161 3162 3163 3164 3165 3166 3167 3168 3169 3170 3171 3172 3173 3174 3175 3176 3177 3178 3179 3180 3181 3182 3183 3184 3185 3186 3187 3188 3189 3190 3191 3192 3193 3194 3195 3196 3197 3198 3199 3200 3201 3202 3203 3204 3205 3206 3207 3208 3209 3210 3211 3212 3213 3214 3215 3216 3217 3218 3219 3220 3221 3222 3223 3224 3225 3226 3227 3228 3229 3230 3231 3232 3233 3234 3235 3236 3237 3238 3239 3240 3241 3242 3243 3244 3245 3246 3247 3248 3249 3250 3251 3252 3253 3254 3255 3256 3257 3258 3259 3260 3261 3262 3263 3264 3265 3266 3267 3268 3269 3270 3271 3272 3273 3274 3275 3276 3277 3278 3279 3280 3281 3282 3283 3284 3285 3286 3287 3288 3289 3290 3291 3292 3293 3294 3295 3296 3297 3298 3299 3300 3301 3302 3303 3304 3305 3306 3307 3308 3309 3310 3311 3312 3313 3314 3315 3316 3317 3318 3319 3320 3321 3322 3323 3324 3325 3326 3327 3328 3329 3330 3331 3332 3333 3334 3335 3336 3337 3338 3339 3340 3341 3342 3343 3344 3345 3346 3347 3348 3349 3350 3351 3352 3353 3354 3355 3356 3357 3358 3359 3360 3361 3362 3363 3364 3365 3366 3367 3368 3369 3370 3371 3372 3373 3374 3375 3376 3377 3378 3379 3380 3381 3382 3383 3384 3385 3386 3387 3388 3389 3390 3391 3392 3393 3394 3395 3396 3397 3398 3399 3400 3401 3402 3403 3404 3405 3406 3407 3408 3409 3410 3411 3412 3413 3414 3415 3416 3417 3418 3419 3420 3421 3422 3423 3424 3425 3426 3427 3428 3429 3430 3431 3432 3433 3434 3435 3436 3437 3438 3439 3440 3441 3442 3443 3444 3445 3446 3447 3448 3449 3450 3451 3452 3453 3454 3455 3456 3457 3458 3459 3460 3461 3462 3463 3464 3465 3466 3467 3468 3469 3470 3471 3472 3473 3474 3475 3476 3477 3478 3479 3480 3481 3482 3483 3484 3485 3486 3487 3488 3489 3490 3491 3492 3493 3494 3495 3496 3497 3498 3499 3500 3501 3502 3503 3504 3505 3506 3507 3508 3509 3510 3511 3512 3513 3514 3515 3516 3517 3518 3519 3520 3521 3522 3523 3524 3525 3526 3527 3528 3529 3530 3531 3532 3533 3534 3535 3536 3537 3538 3539 3540 3541 3542 3543 3544 3545 3546 3547 3548 3549 3550 3551 3552 3553 3554 3555 3556 3557 3558 3559 3560 3561 3562 3563 3564 3565 3566 3567 3568 3569 3570 3571 3572 3573 3574 3575 3576 3577 3578 3579 3580 3581 3582 3583 3584 3585 3586 3587 3588 3589 3590 3591 3592 3593 3594 3595 3596 3597 3598 3599 3600 3601 3602 3603 3604 3605 3606 3607 3608 3609 3610 3611 3612 3613 3614 3615 3616 3617 3618 3619 3620 3621 3622 3623 3624 3625 3626 3627 3628 3629 3630 3631 3632 3633 3634 3635 3636 3637 3638 3639 3640 3641 3642 3643 3644 3645 3646 3647 3648 3649 3650 3651 3652 3653 3654 3655 3656 3657 3658 3659 3660 3661 3662 3663 3664 3665 3666 3667 3668 3669 3670 3671 3672 3673 3674 3675 3676 3677 3678 3679 3680 3681 3682 3683 3684 3685 3686 3687 3688 3689 3690 3691 3692 3693 3694 3695 3696 3697 3698 3699 3700 3701 3702 3703 3704 3705 3706 3707 3708 3709 3710 3711 3712 3713 3714 3715 3716 3717 3718 3719 3720 3721 3722 3723 3724 3725 3726 3727 3728 3729 3730 3731 3732 3733 3734 3735 3736 3737 3738 3739 3740 3741 3742 3743 3744 3745 3746 3747 3748 3749 3750 3751 3752 3753 3754 3755 3756 3757 3758 3759 3760 3761 3762 3763 3764 3765 3766 3767 3768 3769 3770 3771 3772 3773 3774 3775 3776 3777 3778 3779 3780 3781 3782 3783 3784 3785 3786 3787 3788 3789 3790 3791 3792 3793 3794 3795 3796 3797 3798 3799 3800 3801 3802 3803 3804 3805 3806 3807 3808 3809 3810 3811 3812 3813 3814 3815 3816 3817 3818 3819 3820 3821 3822 3823 3824 3825 3826 3827 3828 3829 3830 3831 3832 3833 3834 3835 3836 3837 // SPDX-License-Identifier: GPL-2.0-or-later /* * Generic address resolution entity * * Authors: * Pedro Roque <roque@di.fc.ul.pt> * Alexey Kuznetsov <kuznet@ms2.inr.ac.ru> * * Fixes: * Vitaly E. Lavrov releasing NULL neighbor in neigh_add. * Harald Welte Add neighbour cache statistics like rtstat */ #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt #include <linux/slab.h> #include <linux/types.h> #include <linux/kernel.h> #include <linux/module.h> #include <linux/socket.h> #include <linux/netdevice.h> #include <linux/proc_fs.h> #ifdef CONFIG_SYSCTL #include <linux/sysctl.h> #endif #include <linux/times.h> #include <net/net_namespace.h> #include <net/neighbour.h> #include <net/arp.h> #include <net/dst.h> #include <net/sock.h> #include <net/netevent.h> #include <net/netlink.h> #include <linux/rtnetlink.h> #include <linux/random.h> #include <linux/string.h> #include <linux/log2.h> #include <linux/inetdevice.h> #include <net/addrconf.h> #include <trace/events/neigh.h> #define NEIGH_DEBUG 1 #define neigh_dbg(level, fmt, ...) \ do { \ if (level <= NEIGH_DEBUG) \ pr_debug(fmt, ##__VA_ARGS__); \ } while (0) #define PNEIGH_HASHMASK 0xF static void neigh_timer_handler(struct timer_list *t); static void __neigh_notify(struct neighbour *n, int type, int flags, u32 pid); static void neigh_update_notify(struct neighbour *neigh, u32 nlmsg_pid); static int pneigh_ifdown_and_unlock(struct neigh_table *tbl, struct net_device *dev); #ifdef CONFIG_PROC_FS static const struct seq_operations neigh_stat_seq_ops; #endif static struct hlist_head *neigh_get_dev_table(struct net_device *dev, int family) { int i; switch (family) { default: DEBUG_NET_WARN_ON_ONCE(1); fallthrough; /* to avoid panic by null-ptr-deref */ case AF_INET: i = NEIGH_ARP_TABLE; break; case AF_INET6: i = NEIGH_ND_TABLE; break; } return &dev->neighbours[i]; } /* Neighbour hash table buckets are protected with rwlock tbl->lock. - All the scans/updates to hash buckets MUST be made under this lock. - NOTHING clever should be made under this lock: no callbacks to protocol backends, no attempts to send something to network. It will result in deadlocks, if backend/driver wants to use neighbour cache. - If the entry requires some non-trivial actions, increase its reference count and release table lock. Neighbour entries are protected: - with reference count. - with rwlock neigh->lock Reference count prevents destruction. neigh->lock mainly serializes ll address data and its validity state. However, the same lock is used to protect another entry fields: - timer - resolution queue Again, nothing clever shall be made under neigh->lock, the most complicated procedure, which we allow is dev->hard_header. It is supposed, that dev->hard_header is simplistic and does not make callbacks to neighbour tables. */ static int neigh_blackhole(struct neighbour *neigh, struct sk_buff *skb) { kfree_skb(skb); return -ENETDOWN; } static void neigh_cleanup_and_release(struct neighbour *neigh) { trace_neigh_cleanup_and_release(neigh, 0); __neigh_notify(neigh, RTM_DELNEIGH, 0, 0); call_netevent_notifiers(NETEVENT_NEIGH_UPDATE, neigh); neigh_release(neigh); } /* * It is random distribution in the interval (1/2)*base...(3/2)*base. * It corresponds to default IPv6 settings and is not overridable, * because it is really reasonable choice. */ unsigned long neigh_rand_reach_time(unsigned long base) { return base ? get_random_u32_below(base) + (base >> 1) : 0; } EXPORT_SYMBOL(neigh_rand_reach_time); static void neigh_mark_dead(struct neighbour *n) { n->dead = 1; if (!list_empty(&n->gc_list)) { list_del_init(&n->gc_list); atomic_dec(&n->tbl->gc_entries); } if (!list_empty(&n->managed_list)) list_del_init(&n->managed_list); } static void neigh_update_gc_list(struct neighbour *n) { bool on_gc_list, exempt_from_gc; write_lock_bh(&n->tbl->lock); write_lock(&n->lock); if (n->dead) goto out; /* remove from the gc list if new state is permanent or if neighbor * is externally learned; otherwise entry should be on the gc list */ exempt_from_gc = n->nud_state & NUD_PERMANENT || n->flags & NTF_EXT_LEARNED; on_gc_list = !list_empty(&n->gc_list); if (exempt_from_gc && on_gc_list) { list_del_init(&n->gc_list); atomic_dec(&n->tbl->gc_entries); } else if (!exempt_from_gc && !on_gc_list) { /* add entries to the tail; cleaning removes from the front */ list_add_tail(&n->gc_list, &n->tbl->gc_list); atomic_inc(&n->tbl->gc_entries); } out: write_unlock(&n->lock); write_unlock_bh(&n->tbl->lock); } static void neigh_update_managed_list(struct neighbour *n) { bool on_managed_list, add_to_managed; write_lock_bh(&n->tbl->lock); write_lock(&n->lock); if (n->dead) goto out; add_to_managed = n->flags & NTF_MANAGED; on_managed_list = !list_empty(&n->managed_list); if (!add_to_managed && on_managed_list) list_del_init(&n->managed_list); else if (add_to_managed && !on_managed_list) list_add_tail(&n->managed_list, &n->tbl->managed_list); out: write_unlock(&n->lock); write_unlock_bh(&n->tbl->lock); } static void neigh_update_flags(struct neighbour *neigh, u32 flags, int *notify, bool *gc_update, bool *managed_update) { u32 ndm_flags, old_flags = neigh->flags; if (!(flags & NEIGH_UPDATE_F_ADMIN)) return; ndm_flags = (flags & NEIGH_UPDATE_F_EXT_LEARNED) ? NTF_EXT_LEARNED : 0; ndm_flags |= (flags & NEIGH_UPDATE_F_MANAGED) ? NTF_MANAGED : 0; if ((old_flags ^ ndm_flags) & NTF_EXT_LEARNED) { if (ndm_flags & NTF_EXT_LEARNED) neigh->flags |= NTF_EXT_LEARNED; else neigh->flags &= ~NTF_EXT_LEARNED; *notify = 1; *gc_update = true; } if ((old_flags ^ ndm_flags) & NTF_MANAGED) { if (ndm_flags & NTF_MANAGED) neigh->flags |= NTF_MANAGED; else neigh->flags &= ~NTF_MANAGED; *notify = 1; *managed_update = true; } } bool neigh_remove_one(struct neighbour *n) { bool retval = false; write_lock(&n->lock); if (refcount_read(&n->refcnt) == 1) { hlist_del_rcu(&n->hash); hlist_del_rcu(&n->dev_list); neigh_mark_dead(n); retval = true; } write_unlock(&n->lock); if (retval) neigh_cleanup_and_release(n); return retval; } static int neigh_forced_gc(struct neigh_table *tbl) { int max_clean = atomic_read(&tbl->gc_entries) - READ_ONCE(tbl->gc_thresh2); u64 tmax = ktime_get_ns() + NSEC_PER_MSEC; unsigned long tref = jiffies - 5 * HZ; struct neighbour *n, *tmp; int shrunk = 0; int loop = 0; NEIGH_CACHE_STAT_INC(tbl, forced_gc_runs); write_lock_bh(&tbl->lock); list_for_each_entry_safe(n, tmp, &tbl->gc_list, gc_list) { if (refcount_read(&n->refcnt) == 1) { bool remove = false; write_lock(&n->lock); if ((n->nud_state == NUD_FAILED) || (n->nud_state == NUD_NOARP) || (tbl->is_multicast && tbl->is_multicast(n->primary_key)) || !time_in_range(n->updated, tref, jiffies)) remove = true; write_unlock(&n->lock); if (remove && neigh_remove_one(n)) shrunk++; if (shrunk >= max_clean) break; if (++loop == 16) { if (ktime_get_ns() > tmax) goto unlock; loop = 0; } } } WRITE_ONCE(tbl->last_flush, jiffies); unlock: write_unlock_bh(&tbl->lock); return shrunk; } static void neigh_add_timer(struct neighbour *n, unsigned long when) { /* Use safe distance from the jiffies - LONG_MAX point while timer * is running in DELAY/PROBE state but still show to user space * large times in the past. */ unsigned long mint = jiffies - (LONG_MAX - 86400 * HZ); neigh_hold(n); if (!time_in_range(n->confirmed, mint, jiffies)) n->confirmed = mint; if (time_before(n->used, n->confirmed)) n->used = n->confirmed; if (unlikely(mod_timer(&n->timer, when))) { printk("NEIGH: BUG, double timer add, state is %x\n", n->nud_state); dump_stack(); } } static int neigh_del_timer(struct neighbour *n) { if ((n->nud_state & NUD_IN_TIMER) && timer_delete(&n->timer)) { neigh_release(n); return 1; } return 0; } static struct neigh_parms *neigh_get_dev_parms_rcu(struct net_device *dev, int family) { switch (family) { case AF_INET: return __in_dev_arp_parms_get_rcu(dev); case AF_INET6: return __in6_dev_nd_parms_get_rcu(dev); } return NULL; } static void neigh_parms_qlen_dec(struct net_device *dev, int family) { struct neigh_parms *p; rcu_read_lock(); p = neigh_get_dev_parms_rcu(dev, family); if (p) p->qlen--; rcu_read_unlock(); } static void pneigh_queue_purge(struct sk_buff_head *list, struct net *net, int family) { struct sk_buff_head tmp; unsigned long flags; struct sk_buff *skb; skb_queue_head_init(&tmp); spin_lock_irqsave(&list->lock, flags); skb = skb_peek(list); while (skb != NULL) { struct sk_buff *skb_next = skb_peek_next(skb, list); struct net_device *dev = skb->dev; if (net == NULL || net_eq(dev_net(dev), net)) { neigh_parms_qlen_dec(dev, family); __skb_unlink(skb, list); __skb_queue_tail(&tmp, skb); } skb = skb_next; } spin_unlock_irqrestore(&list->lock, flags); while ((skb = __skb_dequeue(&tmp))) { dev_put(skb->dev); kfree_skb(skb); } } static void neigh_flush_dev(struct neigh_table *tbl, struct net_device *dev, bool skip_perm) { struct hlist_head *dev_head; struct hlist_node *tmp; struct neighbour *n; dev_head = neigh_get_dev_table(dev, tbl->family); hlist_for_each_entry_safe(n, tmp, dev_head, dev_list) { if (skip_perm && n->nud_state & NUD_PERMANENT) continue; hlist_del_rcu(&n->hash); hlist_del_rcu(&n->dev_list); write_lock(&n->lock); neigh_del_timer(n); neigh_mark_dead(n); if (refcount_read(&n->refcnt) != 1) { /* The most unpleasant situation. * We must destroy neighbour entry, * but someone still uses it. * * The destroy will be delayed until * the last user releases us, but * we must kill timers etc. and move * it to safe state. */ __skb_queue_purge(&n->arp_queue); n->arp_queue_len_bytes = 0; WRITE_ONCE(n->output, neigh_blackhole); if (n->nud_state & NUD_VALID) n->nud_state = NUD_NOARP; else n->nud_state = NUD_NONE; neigh_dbg(2, "neigh %p is stray\n", n); } write_unlock(&n->lock); neigh_cleanup_and_release(n); } } void neigh_changeaddr(struct neigh_table *tbl, struct net_device *dev) { write_lock_bh(&tbl->lock); neigh_flush_dev(tbl, dev, false); write_unlock_bh(&tbl->lock); } EXPORT_SYMBOL(neigh_changeaddr); static int __neigh_ifdown(struct neigh_table *tbl, struct net_device *dev, bool skip_perm) { write_lock_bh(&tbl->lock); neigh_flush_dev(tbl, dev, skip_perm); pneigh_ifdown_and_unlock(tbl, dev); pneigh_queue_purge(&tbl->proxy_queue, dev ? dev_net(dev) : NULL, tbl->family); if (skb_queue_empty_lockless(&tbl->proxy_queue)) timer_delete_sync(&tbl->proxy_timer); return 0; } int neigh_carrier_down(struct neigh_table *tbl, struct net_device *dev) { __neigh_ifdown(tbl, dev, true); return 0; } EXPORT_SYMBOL(neigh_carrier_down); int neigh_ifdown(struct neigh_table *tbl, struct net_device *dev) { __neigh_ifdown(tbl, dev, false); return 0; } EXPORT_SYMBOL(neigh_ifdown); static struct neighbour *neigh_alloc(struct neigh_table *tbl, struct net_device *dev, u32 flags, bool exempt_from_gc) { struct neighbour *n = NULL; unsigned long now = jiffies; int entries, gc_thresh3; if (exempt_from_gc) goto do_alloc; entries = atomic_inc_return(&tbl->gc_entries) - 1; gc_thresh3 = READ_ONCE(tbl->gc_thresh3); if (entries >= gc_thresh3 || (entries >= READ_ONCE(tbl->gc_thresh2) && time_after(now, READ_ONCE(tbl->last_flush) + 5 * HZ))) { if (!neigh_forced_gc(tbl) && entries >= gc_thresh3) { net_info_ratelimited("%s: neighbor table overflow!\n", tbl->id); NEIGH_CACHE_STAT_INC(tbl, table_fulls); goto out_entries; } } do_alloc: n = kzalloc(tbl->entry_size + dev->neigh_priv_len, GFP_ATOMIC); if (!n) goto out_entries; __skb_queue_head_init(&n->arp_queue); rwlock_init(&n->lock); seqlock_init(&n->ha_lock); n->updated = n->used = now; n->nud_state = NUD_NONE; n->output = neigh_blackhole; n->flags = flags; seqlock_init(&n->hh.hh_lock); n->parms = neigh_parms_clone(&tbl->parms); timer_setup(&n->timer, neigh_timer_handler, 0); NEIGH_CACHE_STAT_INC(tbl, allocs); n->tbl = tbl; refcount_set(&n->refcnt, 1); n->dead = 1; INIT_LIST_HEAD(&n->gc_list); INIT_LIST_HEAD(&n->managed_list); atomic_inc(&tbl->entries); out: return n; out_entries: if (!exempt_from_gc) atomic_dec(&tbl->gc_entries); goto out; } static void neigh_get_hash_rnd(u32 *x) { *x = get_random_u32() | 1; } static struct neigh_hash_table *neigh_hash_alloc(unsigned int shift) { size_t size = (1 << shift) * sizeof(struct hlist_head); struct hlist_head *hash_heads; struct neigh_hash_table *ret; int i; ret = kmalloc(sizeof(*ret), GFP_ATOMIC); if (!ret) return NULL; hash_heads = kzalloc(size, GFP_ATOMIC); if (!hash_heads) { kfree(ret); return NULL; } ret->hash_heads = hash_heads; ret->hash_shift = shift; for (i = 0; i < NEIGH_NUM_HASH_RND; i++) neigh_get_hash_rnd(&ret->hash_rnd[i]); return ret; } static void neigh_hash_free_rcu(struct rcu_head *head) { struct neigh_hash_table *nht = container_of(head, struct neigh_hash_table, rcu); kfree(nht->hash_heads); kfree(nht); } static struct neigh_hash_table *neigh_hash_grow(struct neigh_table *tbl, unsigned long new_shift) { unsigned int i, hash; struct neigh_hash_table *new_nht, *old_nht; NEIGH_CACHE_STAT_INC(tbl, hash_grows); old_nht = rcu_dereference_protected(tbl->nht, lockdep_is_held(&tbl->lock)); new_nht = neigh_hash_alloc(new_shift); if (!new_nht) return old_nht; for (i = 0; i < (1 << old_nht->hash_shift); i++) { struct hlist_node *tmp; struct neighbour *n; neigh_for_each_in_bucket_safe(n, tmp, &old_nht->hash_heads[i]) { hash = tbl->hash(n->primary_key, n->dev, new_nht->hash_rnd); hash >>= (32 - new_nht->hash_shift); hlist_del_rcu(&n->hash); hlist_add_head_rcu(&n->hash, &new_nht->hash_heads[hash]); } } rcu_assign_pointer(tbl->nht, new_nht); call_rcu(&old_nht->rcu, neigh_hash_free_rcu); return new_nht; } struct neighbour *neigh_lookup(struct neigh_table *tbl, const void *pkey, struct net_device *dev) { struct neighbour *n; NEIGH_CACHE_STAT_INC(tbl, lookups); rcu_read_lock(); n = __neigh_lookup_noref(tbl, pkey, dev); if (n) { if (!refcount_inc_not_zero(&n->refcnt)) n = NULL; NEIGH_CACHE_STAT_INC(tbl, hits); } rcu_read_unlock(); return n; } EXPORT_SYMBOL(neigh_lookup); static struct neighbour * ___neigh_create(struct neigh_table *tbl, const void *pkey, struct net_device *dev, u32 flags, bool exempt_from_gc, bool want_ref) { u32 hash_val, key_len = tbl->key_len; struct neighbour *n1, *rc, *n; struct neigh_hash_table *nht; int error; n = neigh_alloc(tbl, dev, flags, exempt_from_gc); trace_neigh_create(tbl, dev, pkey, n, exempt_from_gc); if (!n) { rc = ERR_PTR(-ENOBUFS); goto out; } memcpy(n->primary_key, pkey, key_len); n->dev = dev; netdev_hold(dev, &n->dev_tracker, GFP_ATOMIC); /* Protocol specific setup. */ if (tbl->constructor && (error = tbl->constructor(n)) < 0) { rc = ERR_PTR(error); goto out_neigh_release; } if (dev->netdev_ops->ndo_neigh_construct) { error = dev->netdev_ops->ndo_neigh_construct(dev, n); if (error < 0) { rc = ERR_PTR(error); goto out_neigh_release; } } /* Device specific setup. */ if (n->parms->neigh_setup && (error = n->parms->neigh_setup(n)) < 0) { rc = ERR_PTR(error); goto out_neigh_release; } n->confirmed = jiffies - (NEIGH_VAR(n->parms, BASE_REACHABLE_TIME) << 1); write_lock_bh(&tbl->lock); nht = rcu_dereference_protected(tbl->nht, lockdep_is_held(&tbl->lock)); if (atomic_read(&tbl->entries) > (1 << nht->hash_shift)) nht = neigh_hash_grow(tbl, nht->hash_shift + 1); hash_val = tbl->hash(n->primary_key, dev, nht->hash_rnd) >> (32 - nht->hash_shift); if (n->parms->dead) { rc = ERR_PTR(-EINVAL); goto out_tbl_unlock; } neigh_for_each_in_bucket(n1, &nht->hash_heads[hash_val]) { if (dev == n1->dev && !memcmp(n1->primary_key, n->primary_key, key_len)) { if (want_ref) neigh_hold(n1); rc = n1; goto out_tbl_unlock; } } n->dead = 0; if (!exempt_from_gc) list_add_tail(&n->gc_list, &n->tbl->gc_list); if (n->flags & NTF_MANAGED) list_add_tail(&n->managed_list, &n->tbl->managed_list); if (want_ref) neigh_hold(n); hlist_add_head_rcu(&n->hash, &nht->hash_heads[hash_val]); hlist_add_head_rcu(&n->dev_list, neigh_get_dev_table(dev, tbl->family)); write_unlock_bh(&tbl->lock); neigh_dbg(2, "neigh %p is created\n", n); rc = n; out: return rc; out_tbl_unlock: write_unlock_bh(&tbl->lock); out_neigh_release: if (!exempt_from_gc) atomic_dec(&tbl->gc_entries); neigh_release(n); goto out; } struct neighbour *__neigh_create(struct neigh_table *tbl, const void *pkey, struct net_device *dev, bool want_ref) { bool exempt_from_gc = !!(dev->flags & IFF_LOOPBACK); return ___neigh_create(tbl, pkey, dev, 0, exempt_from_gc, want_ref); } EXPORT_SYMBOL(__neigh_create); static u32 pneigh_hash(const void *pkey, unsigned int key_len) { u32 hash_val = *(u32 *)(pkey + key_len - 4); hash_val ^= (hash_val >> 16); hash_val ^= hash_val >> 8; hash_val ^= hash_val >> 4; hash_val &= PNEIGH_HASHMASK; return hash_val; } static struct pneigh_entry *__pneigh_lookup_1(struct pneigh_entry *n, struct net *net, const void *pkey, unsigned int key_len, struct net_device *dev) { while (n) { if (!memcmp(n->key, pkey, key_len) && net_eq(pneigh_net(n), net) && (n->dev == dev || !n->dev)) return n; n = n->next; } return NULL; } struct pneigh_entry *__pneigh_lookup(struct neigh_table *tbl, struct net *net, const void *pkey, struct net_device *dev) { unsigned int key_len = tbl->key_len; u32 hash_val = pneigh_hash(pkey, key_len); return __pneigh_lookup_1(tbl->phash_buckets[hash_val], net, pkey, key_len, dev); } EXPORT_SYMBOL_GPL(__pneigh_lookup); struct pneigh_entry * pneigh_lookup(struct neigh_table *tbl, struct net *net, const void *pkey, struct net_device *dev, int creat) { struct pneigh_entry *n; unsigned int key_len = tbl->key_len; u32 hash_val = pneigh_hash(pkey, key_len); read_lock_bh(&tbl->lock); n = __pneigh_lookup_1(tbl->phash_buckets[hash_val], net, pkey, key_len, dev); read_unlock_bh(&tbl->lock); if (n || !creat) goto out; ASSERT_RTNL(); n = kzalloc(sizeof(*n) + key_len, GFP_KERNEL); if (!n) goto out; write_pnet(&n->net, net); memcpy(n->key, pkey, key_len); n->dev = dev; netdev_hold(dev, &n->dev_tracker, GFP_KERNEL); if (tbl->pconstructor && tbl->pconstructor(n)) { netdev_put(dev, &n->dev_tracker); kfree(n); n = NULL; goto out; } write_lock_bh(&tbl->lock); n->next = tbl->phash_buckets[hash_val]; tbl->phash_buckets[hash_val] = n; write_unlock_bh(&tbl->lock); out: return n; } EXPORT_SYMBOL(pneigh_lookup); int pneigh_delete(struct neigh_table *tbl, struct net *net, const void *pkey, struct net_device *dev) { struct pneigh_entry *n, **np; unsigned int key_len = tbl->key_len; u32 hash_val = pneigh_hash(pkey, key_len); write_lock_bh(&tbl->lock); for (np = &tbl->phash_buckets[hash_val]; (n = *np) != NULL; np = &n->next) { if (!memcmp(n->key, pkey, key_len) && n->dev == dev && net_eq(pneigh_net(n), net)) { *np = n->next; write_unlock_bh(&tbl->lock); if (tbl->pdestructor) tbl->pdestructor(n); netdev_put(n->dev, &n->dev_tracker); kfree(n); return 0; } } write_unlock_bh(&tbl->lock); return -ENOENT; } static int pneigh_ifdown_and_unlock(struct neigh_table *tbl, struct net_device *dev) { struct pneigh_entry *n, **np, *freelist = NULL; u32 h; for (h = 0; h <= PNEIGH_HASHMASK; h++) { np = &tbl->phash_buckets[h]; while ((n = *np) != NULL) { if (!dev || n->dev == dev) { *np = n->next; n->next = freelist; freelist = n; continue; } np = &n->next; } } write_unlock_bh(&tbl->lock); while ((n = freelist)) { freelist = n->next; n->next = NULL; if (tbl->pdestructor) tbl->pdestructor(n); netdev_put(n->dev, &n->dev_tracker); kfree(n); } return -ENOENT; } static inline void neigh_parms_put(struct neigh_parms *parms) { if (refcount_dec_and_test(&parms->refcnt)) kfree(parms); } /* * neighbour must already be out of the table; * */ void neigh_destroy(struct neighbour *neigh) { struct net_device *dev = neigh->dev; NEIGH_CACHE_STAT_INC(neigh->tbl, destroys); if (!neigh->dead) { pr_warn("Destroying alive neighbour %p\n", neigh); dump_stack(); return; } if (neigh_del_timer(neigh)) pr_warn("Impossible event\n"); write_lock_bh(&neigh->lock); __skb_queue_purge(&neigh->arp_queue); write_unlock_bh(&neigh->lock); neigh->arp_queue_len_bytes = 0; if (dev->netdev_ops->ndo_neigh_destroy) dev->netdev_ops->ndo_neigh_destroy(dev, neigh); netdev_put(dev, &neigh->dev_tracker); neigh_parms_put(neigh->parms); neigh_dbg(2, "neigh %p is destroyed\n", neigh); atomic_dec(&neigh->tbl->entries); kfree_rcu(neigh, rcu); } EXPORT_SYMBOL(neigh_destroy); /* Neighbour state is suspicious; disable fast path. Called with write_locked neigh. */ static void neigh_suspect(struct neighbour *neigh) { neigh_dbg(2, "neigh %p is suspected\n", neigh); WRITE_ONCE(neigh->output, neigh->ops->output); } /* Neighbour state is OK; enable fast path. Called with write_locked neigh. */ static void neigh_connect(struct neighbour *neigh) { neigh_dbg(2, "neigh %p is connected\n", neigh); WRITE_ONCE(neigh->output, neigh->ops->connected_output); } static void neigh_periodic_work(struct work_struct *work) { struct neigh_table *tbl = container_of(work, struct neigh_table, gc_work.work); struct neigh_hash_table *nht; struct hlist_node *tmp; struct neighbour *n; unsigned int i; NEIGH_CACHE_STAT_INC(tbl, periodic_gc_runs); write_lock_bh(&tbl->lock); nht = rcu_dereference_protected(tbl->nht, lockdep_is_held(&tbl->lock)); /* * periodically recompute ReachableTime from random function */ if (time_after(jiffies, tbl->last_rand + 300 * HZ)) { struct neigh_parms *p; WRITE_ONCE(tbl->last_rand, jiffies); list_for_each_entry(p, &tbl->parms_list, list) p->reachable_time = neigh_rand_reach_time(NEIGH_VAR(p, BASE_REACHABLE_TIME)); } if (atomic_read(&tbl->entries) < READ_ONCE(tbl->gc_thresh1)) goto out; for (i = 0 ; i < (1 << nht->hash_shift); i++) { neigh_for_each_in_bucket_safe(n, tmp, &nht->hash_heads[i]) { unsigned int state; write_lock(&n->lock); state = n->nud_state; if ((state & (NUD_PERMANENT | NUD_IN_TIMER)) || (n->flags & NTF_EXT_LEARNED)) { write_unlock(&n->lock); continue; } if (time_before(n->used, n->confirmed) && time_is_before_eq_jiffies(n->confirmed)) n->used = n->confirmed; if (refcount_read(&n->refcnt) == 1 && (state == NUD_FAILED || !time_in_range_open(jiffies, n->used, n->used + NEIGH_VAR(n->parms, GC_STALETIME)))) { hlist_del_rcu(&n->hash); hlist_del_rcu(&n->dev_list); neigh_mark_dead(n); write_unlock(&n->lock); neigh_cleanup_and_release(n); continue; } write_unlock(&n->lock); } /* * It's fine to release lock here, even if hash table * grows while we are preempted. */ write_unlock_bh(&tbl->lock); cond_resched(); write_lock_bh(&tbl->lock); nht = rcu_dereference_protected(tbl->nht, lockdep_is_held(&tbl->lock)); } out: /* Cycle through all hash buckets every BASE_REACHABLE_TIME/2 ticks. * ARP entry timeouts range from 1/2 BASE_REACHABLE_TIME to 3/2 * BASE_REACHABLE_TIME. */ queue_delayed_work(system_power_efficient_wq, &tbl->gc_work, NEIGH_VAR(&tbl->parms, BASE_REACHABLE_TIME) >> 1); write_unlock_bh(&tbl->lock); } static __inline__ int neigh_max_probes(struct neighbour *n) { struct neigh_parms *p = n->parms; return NEIGH_VAR(p, UCAST_PROBES) + NEIGH_VAR(p, APP_PROBES) + (n->nud_state & NUD_PROBE ? NEIGH_VAR(p, MCAST_REPROBES) : NEIGH_VAR(p, MCAST_PROBES)); } static void neigh_invalidate(struct neighbour *neigh) __releases(neigh->lock) __acquires(neigh->lock) { struct sk_buff *skb; NEIGH_CACHE_STAT_INC(neigh->tbl, res_failed); neigh_dbg(2, "neigh %p is failed\n", neigh); neigh->updated = jiffies; /* It is very thin place. report_unreachable is very complicated routine. Particularly, it can hit the same neighbour entry! So that, we try to be accurate and avoid dead loop. --ANK */ while (neigh->nud_state == NUD_FAILED && (skb = __skb_dequeue(&neigh->arp_queue)) != NULL) { write_unlock(&neigh->lock); neigh->ops->error_report(neigh, skb); write_lock(&neigh->lock); } __skb_queue_purge(&neigh->arp_queue); neigh->arp_queue_len_bytes = 0; } static void neigh_probe(struct neighbour *neigh) __releases(neigh->lock) { struct sk_buff *skb = skb_peek_tail(&neigh->arp_queue); /* keep skb alive even if arp_queue overflows */ if (skb) skb = skb_clone(skb, GFP_ATOMIC); write_unlock(&neigh->lock); if (neigh->ops->solicit) neigh->ops->solicit(neigh, skb); atomic_inc(&neigh->probes); consume_skb(skb); } /* Called when a timer expires for a neighbour entry. */ static void neigh_timer_handler(struct timer_list *t) { unsigned long now, next; struct neighbour *neigh = timer_container_of(neigh, t, timer); unsigned int state; int notify = 0; write_lock(&neigh->lock); state = neigh->nud_state; now = jiffies; next = now + HZ; if (!(state & NUD_IN_TIMER)) goto out; if (state & NUD_REACHABLE) { if (time_before_eq(now, neigh->confirmed + neigh->parms->reachable_time)) { neigh_dbg(2, "neigh %p is still alive\n", neigh); next = neigh->confirmed + neigh->parms->reachable_time; } else if (time_before_eq(now, neigh->used + NEIGH_VAR(neigh->parms, DELAY_PROBE_TIME))) { neigh_dbg(2, "neigh %p is delayed\n", neigh); WRITE_ONCE(neigh->nud_state, NUD_DELAY); neigh->updated = jiffies; neigh_suspect(neigh); next = now + NEIGH_VAR(neigh->parms, DELAY_PROBE_TIME); } else { neigh_dbg(2, "neigh %p is suspected\n", neigh); WRITE_ONCE(neigh->nud_state, NUD_STALE); neigh->updated = jiffies; neigh_suspect(neigh); notify = 1; } } else if (state & NUD_DELAY) { if (time_before_eq(now, neigh->confirmed + NEIGH_VAR(neigh->parms, DELAY_PROBE_TIME))) { neigh_dbg(2, "neigh %p is now reachable\n", neigh); WRITE_ONCE(neigh->nud_state, NUD_REACHABLE); neigh->updated = jiffies; neigh_connect(neigh); notify = 1; next = neigh->confirmed + neigh->parms->reachable_time; } else { neigh_dbg(2, "neigh %p is probed\n", neigh); WRITE_ONCE(neigh->nud_state, NUD_PROBE); neigh->updated = jiffies; atomic_set(&neigh->probes, 0); notify = 1; next = now + max(NEIGH_VAR(neigh->parms, RETRANS_TIME), HZ/100); } } else { /* NUD_PROBE|NUD_INCOMPLETE */ next = now + max(NEIGH_VAR(neigh->parms, RETRANS_TIME), HZ/100); } if ((neigh->nud_state & (NUD_INCOMPLETE | NUD_PROBE)) && atomic_read(&neigh->probes) >= neigh_max_probes(neigh)) { WRITE_ONCE(neigh->nud_state, NUD_FAILED); notify = 1; neigh_invalidate(neigh); goto out; } if (neigh->nud_state & NUD_IN_TIMER) { if (time_before(next, jiffies + HZ/100)) next = jiffies + HZ/100; if (!mod_timer(&neigh->timer, next)) neigh_hold(neigh); } if (neigh->nud_state & (NUD_INCOMPLETE | NUD_PROBE)) { neigh_probe(neigh); } else { out: write_unlock(&neigh->lock); } if (notify) neigh_update_notify(neigh, 0); trace_neigh_timer_handler(neigh, 0); neigh_release(neigh); } int __neigh_event_send(struct neighbour *neigh, struct sk_buff *skb, const bool immediate_ok) { int rc; bool immediate_probe = false; write_lock_bh(&neigh->lock); rc = 0; if (neigh->nud_state & (NUD_CONNECTED | NUD_DELAY | NUD_PROBE)) goto out_unlock_bh; if (neigh->dead) goto out_dead; if (!(neigh->nud_state & (NUD_STALE | NUD_INCOMPLETE))) { if (NEIGH_VAR(neigh->parms, MCAST_PROBES) + NEIGH_VAR(neigh->parms, APP_PROBES)) { unsigned long next, now = jiffies; atomic_set(&neigh->probes, NEIGH_VAR(neigh->parms, UCAST_PROBES)); neigh_del_timer(neigh); WRITE_ONCE(neigh->nud_state, NUD_INCOMPLETE); neigh->updated = now; if (!immediate_ok) { next = now + 1; } else { immediate_probe = true; next = now + max(NEIGH_VAR(neigh->parms, RETRANS_TIME), HZ / 100); } neigh_add_timer(neigh, next); } else { WRITE_ONCE(neigh->nud_state, NUD_FAILED); neigh->updated = jiffies; write_unlock_bh(&neigh->lock); kfree_skb_reason(skb, SKB_DROP_REASON_NEIGH_FAILED); return 1; } } else if (neigh->nud_state & NUD_STALE) { neigh_dbg(2, "neigh %p is delayed\n", neigh); neigh_del_timer(neigh); WRITE_ONCE(neigh->nud_state, NUD_DELAY); neigh->updated = jiffies; neigh_add_timer(neigh, jiffies + NEIGH_VAR(neigh->parms, DELAY_PROBE_TIME)); } if (neigh->nud_state == NUD_INCOMPLETE) { if (skb) { while (neigh->arp_queue_len_bytes + skb->truesize > NEIGH_VAR(neigh->parms, QUEUE_LEN_BYTES)) { struct sk_buff *buff; buff = __skb_dequeue(&neigh->arp_queue); if (!buff) break; neigh->arp_queue_len_bytes -= buff->truesize; kfree_skb_reason(buff, SKB_DROP_REASON_NEIGH_QUEUEFULL); NEIGH_CACHE_STAT_INC(neigh->tbl, unres_discards); } skb_dst_force(skb); __skb_queue_tail(&neigh->arp_queue, skb); neigh->arp_queue_len_bytes += skb->truesize; } rc = 1; } out_unlock_bh: if (immediate_probe) neigh_probe(neigh); else write_unlock(&neigh->lock); local_bh_enable(); trace_neigh_event_send_done(neigh, rc); return rc; out_dead: if (neigh->nud_state & NUD_STALE) goto out_unlock_bh; write_unlock_bh(&neigh->lock); kfree_skb_reason(skb, SKB_DROP_REASON_NEIGH_DEAD); trace_neigh_event_send_dead(neigh, 1); return 1; } EXPORT_SYMBOL(__neigh_event_send); static void neigh_update_hhs(struct neighbour *neigh) { struct hh_cache *hh; void (*update)(struct hh_cache*, const struct net_device*, const unsigned char *) = NULL; if (neigh->dev->header_ops) update = neigh->dev->header_ops->cache_update; if (update) { hh = &neigh->hh; if (READ_ONCE(hh->hh_len)) { write_seqlock_bh(&hh->hh_lock); update(hh, neigh->dev, neigh->ha); write_sequnlock_bh(&hh->hh_lock); } } } /* Generic update routine. -- lladdr is new lladdr or NULL, if it is not supplied. -- new is new state. -- flags NEIGH_UPDATE_F_OVERRIDE allows to override existing lladdr, if it is different. NEIGH_UPDATE_F_WEAK_OVERRIDE will suspect existing "connected" lladdr instead of overriding it if it is different. NEIGH_UPDATE_F_ADMIN means that the change is administrative. NEIGH_UPDATE_F_USE means that the entry is user triggered. NEIGH_UPDATE_F_MANAGED means that the entry will be auto-refreshed. NEIGH_UPDATE_F_OVERRIDE_ISROUTER allows to override existing NTF_ROUTER flag. NEIGH_UPDATE_F_ISROUTER indicates if the neighbour is known as a router. Caller MUST hold reference count on the entry. */ static int __neigh_update(struct neighbour *neigh, const u8 *lladdr, u8 new, u32 flags, u32 nlmsg_pid, struct netlink_ext_ack *extack) { bool gc_update = false, managed_update = false; int update_isrouter = 0; struct net_device *dev; int err, notify = 0; u8 old; trace_neigh_update(neigh, lladdr, new, flags, nlmsg_pid); write_lock_bh(&neigh->lock); dev = neigh->dev; old = neigh->nud_state; err = -EPERM; if (neigh->dead) { NL_SET_ERR_MSG(extack, "Neighbor entry is now dead"); new = old; goto out; } if (!(flags & NEIGH_UPDATE_F_ADMIN) && (old & (NUD_NOARP | NUD_PERMANENT))) goto out; neigh_update_flags(neigh, flags, &notify, &gc_update, &managed_update); if (flags & (NEIGH_UPDATE_F_USE | NEIGH_UPDATE_F_MANAGED)) { new = old & ~NUD_PERMANENT; WRITE_ONCE(neigh->nud_state, new); err = 0; goto out; } if (!(new & NUD_VALID)) { neigh_del_timer(neigh); if (old & NUD_CONNECTED) neigh_suspect(neigh); WRITE_ONCE(neigh->nud_state, new); err = 0; notify = old & NUD_VALID; if ((old & (NUD_INCOMPLETE | NUD_PROBE)) && (new & NUD_FAILED)) { neigh_invalidate(neigh); notify = 1; } goto out; } /* Compare new lladdr with cached one */ if (!dev->addr_len) { /* First case: device needs no address. */ lladdr = neigh->ha; } else if (lladdr) { /* The second case: if something is already cached and a new address is proposed: - compare new & old - if they are different, check override flag */ if ((old & NUD_VALID) && !memcmp(lladdr, neigh->ha, dev->addr_len)) lladdr = neigh->ha; } else { /* No address is supplied; if we know something, use it, otherwise discard the request. */ err = -EINVAL; if (!(old & NUD_VALID)) { NL_SET_ERR_MSG(extack, "No link layer address given"); goto out; } lladdr = neigh->ha; } /* Update confirmed timestamp for neighbour entry after we * received ARP packet even if it doesn't change IP to MAC binding. */ if (new & NUD_CONNECTED) neigh->confirmed = jiffies; /* If entry was valid and address is not changed, do not change entry state, if new one is STALE. */ err = 0; update_isrouter = flags & NEIGH_UPDATE_F_OVERRIDE_ISROUTER; if (old & NUD_VALID) { if (lladdr != neigh->ha && !(flags & NEIGH_UPDATE_F_OVERRIDE)) { update_isrouter = 0; if ((flags & NEIGH_UPDATE_F_WEAK_OVERRIDE) && (old & NUD_CONNECTED)) { lladdr = neigh->ha; new = NUD_STALE; } else goto out; } else { if (lladdr == neigh->ha && new == NUD_STALE && !(flags & NEIGH_UPDATE_F_ADMIN)) new = old; } } /* Update timestamp only once we know we will make a change to the * neighbour entry. Otherwise we risk to move the locktime window with * noop updates and ignore relevant ARP updates. */ if (new != old || lladdr != neigh->ha) neigh->updated = jiffies; if (new != old) { neigh_del_timer(neigh); if (new & NUD_PROBE) atomic_set(&neigh->probes, 0); if (new & NUD_IN_TIMER) neigh_add_timer(neigh, (jiffies + ((new & NUD_REACHABLE) ? neigh->parms->reachable_time : 0))); WRITE_ONCE(neigh->nud_state, new); notify = 1; } if (lladdr != neigh->ha) { write_seqlock(&neigh->ha_lock); memcpy(&neigh->ha, lladdr, dev->addr_len); write_sequnlock(&neigh->ha_lock); neigh_update_hhs(neigh); if (!(new & NUD_CONNECTED)) neigh->confirmed = jiffies - (NEIGH_VAR(neigh->parms, BASE_REACHABLE_TIME) << 1); notify = 1; } if (new == old) goto out; if (new & NUD_CONNECTED) neigh_connect(neigh); else neigh_suspect(neigh); if (!(old & NUD_VALID)) { struct sk_buff *skb; /* Again: avoid dead loop if something went wrong */ while (neigh->nud_state & NUD_VALID && (skb = __skb_dequeue(&neigh->arp_queue)) != NULL) { struct dst_entry *dst = skb_dst(skb); struct neighbour *n2, *n1 = neigh; write_unlock_bh(&neigh->lock); rcu_read_lock(); /* Why not just use 'neigh' as-is? The problem is that * things such as shaper, eql, and sch_teql can end up * using alternative, different, neigh objects to output * the packet in the output path. So what we need to do * here is re-lookup the top-level neigh in the path so * we can reinject the packet there. */ n2 = NULL; if (dst && dst->obsolete != DST_OBSOLETE_DEAD) { n2 = dst_neigh_lookup_skb(dst, skb); if (n2) n1 = n2; } READ_ONCE(n1->output)(n1, skb); if (n2) neigh_release(n2); rcu_read_unlock(); write_lock_bh(&neigh->lock); } __skb_queue_purge(&neigh->arp_queue); neigh->arp_queue_len_bytes = 0; } out: if (update_isrouter) neigh_update_is_router(neigh, flags, &notify); write_unlock_bh(&neigh->lock); if (((new ^ old) & NUD_PERMANENT) || gc_update) neigh_update_gc_list(neigh); if (managed_update) neigh_update_managed_list(neigh); if (notify) neigh_update_notify(neigh, nlmsg_pid); trace_neigh_update_done(neigh, err); return err; } int neigh_update(struct neighbour *neigh, const u8 *lladdr, u8 new, u32 flags, u32 nlmsg_pid) { return __neigh_update(neigh, lladdr, new, flags, nlmsg_pid, NULL); } EXPORT_SYMBOL(neigh_update); /* Update the neigh to listen temporarily for probe responses, even if it is * in a NUD_FAILED state. The caller has to hold neigh->lock for writing. */ void __neigh_set_probe_once(struct neighbour *neigh) { if (neigh->dead) return; neigh->updated = jiffies; if (!(neigh->nud_state & NUD_FAILED)) return; WRITE_ONCE(neigh->nud_state, NUD_INCOMPLETE); atomic_set(&neigh->probes, neigh_max_probes(neigh)); neigh_add_timer(neigh, jiffies + max(NEIGH_VAR(neigh->parms, RETRANS_TIME), HZ/100)); } EXPORT_SYMBOL(__neigh_set_probe_once); struct neighbour *neigh_event_ns(struct neigh_table *tbl, u8 *lladdr, void *saddr, struct net_device *dev) { struct neighbour *neigh = __neigh_lookup(tbl, saddr, dev, lladdr || !dev->addr_len); if (neigh) neigh_update(neigh, lladdr, NUD_STALE, NEIGH_UPDATE_F_OVERRIDE, 0); return neigh; } EXPORT_SYMBOL(neigh_event_ns); /* called with read_lock_bh(&n->lock); */ static void neigh_hh_init(struct neighbour *n) { struct net_device *dev = n->dev; __be16 prot = n->tbl->protocol; struct hh_cache *hh = &n->hh; write_lock_bh(&n->lock); /* Only one thread can come in here and initialize the * hh_cache entry. */ if (!hh->hh_len) dev->header_ops->cache(n, hh, prot); write_unlock_bh(&n->lock); } /* Slow and careful. */ int neigh_resolve_output(struct neighbour *neigh, struct sk_buff *skb) { int rc = 0; if (!neigh_event_send(neigh, skb)) { int err; struct net_device *dev = neigh->dev; unsigned int seq; if (dev->header_ops->cache && !READ_ONCE(neigh->hh.hh_len)) neigh_hh_init(neigh); do { __skb_pull(skb, skb_network_offset(skb)); seq = read_seqbegin(&neigh->ha_lock); err = dev_hard_header(skb, dev, ntohs(skb->protocol), neigh->ha, NULL, skb->len); } while (read_seqretry(&neigh->ha_lock, seq)); if (err >= 0) rc = dev_queue_xmit(skb); else goto out_kfree_skb; } out: return rc; out_kfree_skb: rc = -EINVAL; kfree_skb_reason(skb, SKB_DROP_REASON_NEIGH_HH_FILLFAIL); goto out; } EXPORT_SYMBOL(neigh_resolve_output); /* As fast as possible without hh cache */ int neigh_connected_output(struct neighbour *neigh, struct sk_buff *skb) { struct net_device *dev = neigh->dev; unsigned int seq; int err; do { __skb_pull(skb, skb_network_offset(skb)); seq = read_seqbegin(&neigh->ha_lock); err = dev_hard_header(skb, dev, ntohs(skb->protocol), neigh->ha, NULL, skb->len); } while (read_seqretry(&neigh->ha_lock, seq)); if (err >= 0) err = dev_queue_xmit(skb); else { err = -EINVAL; kfree_skb_reason(skb, SKB_DROP_REASON_NEIGH_HH_FILLFAIL); } return err; } EXPORT_SYMBOL(neigh_connected_output); int neigh_direct_output(struct neighbour *neigh, struct sk_buff *skb) { return dev_queue_xmit(skb); } EXPORT_SYMBOL(neigh_direct_output); static void neigh_managed_work(struct work_struct *work) { struct neigh_table *tbl = container_of(work, struct neigh_table, managed_work.work); struct neighbour *neigh; write_lock_bh(&tbl->lock); list_for_each_entry(neigh, &tbl->managed_list, managed_list) neigh_event_send_probe(neigh, NULL, false); queue_delayed_work(system_power_efficient_wq, &tbl->managed_work, NEIGH_VAR(&tbl->parms, INTERVAL_PROBE_TIME_MS)); write_unlock_bh(&tbl->lock); } static void neigh_proxy_process(struct timer_list *t) { struct neigh_table *tbl = timer_container_of(tbl, t, proxy_timer); long sched_next = 0; unsigned long now = jiffies; struct sk_buff *skb, *n; spin_lock(&tbl->proxy_queue.lock); skb_queue_walk_safe(&tbl->proxy_queue, skb, n) { long tdif = NEIGH_CB(skb)->sched_next - now; if (tdif <= 0) { struct net_device *dev = skb->dev; neigh_parms_qlen_dec(dev, tbl->family); __skb_unlink(skb, &tbl->proxy_queue); if (tbl->proxy_redo && netif_running(dev)) { rcu_read_lock(); tbl->proxy_redo(skb); rcu_read_unlock(); } else { kfree_skb(skb); } dev_put(dev); } else if (!sched_next || tdif < sched_next) sched_next = tdif; } timer_delete(&tbl->proxy_timer); if (sched_next) mod_timer(&tbl->proxy_timer, jiffies + sched_next); spin_unlock(&tbl->proxy_queue.lock); } static unsigned long neigh_proxy_delay(struct neigh_parms *p) { /* If proxy_delay is zero, do not call get_random_u32_below() * as it is undefined behavior. */ unsigned long proxy_delay = NEIGH_VAR(p, PROXY_DELAY); return proxy_delay ? jiffies + get_random_u32_below(proxy_delay) : jiffies; } void pneigh_enqueue(struct neigh_table *tbl, struct neigh_parms *p, struct sk_buff *skb) { unsigned long sched_next = neigh_proxy_delay(p); if (p->qlen > NEIGH_VAR(p, PROXY_QLEN)) { kfree_skb(skb); return; } NEIGH_CB(skb)->sched_next = sched_next; NEIGH_CB(skb)->flags |= LOCALLY_ENQUEUED; spin_lock(&tbl->proxy_queue.lock); if (timer_delete(&tbl->proxy_timer)) { if (time_before(tbl->proxy_timer.expires, sched_next)) sched_next = tbl->proxy_timer.expires; } skb_dst_drop(skb); dev_hold(skb->dev); __skb_queue_tail(&tbl->proxy_queue, skb); p->qlen++; mod_timer(&tbl->proxy_timer, sched_next); spin_unlock(&tbl->proxy_queue.lock); } EXPORT_SYMBOL(pneigh_enqueue); static inline struct neigh_parms *lookup_neigh_parms(struct neigh_table *tbl, struct net *net, int ifindex) { struct neigh_parms *p; list_for_each_entry(p, &tbl->parms_list, list) { if ((p->dev && p->dev->ifindex == ifindex && net_eq(neigh_parms_net(p), net)) || (!p->dev && !ifindex && net_eq(net, &init_net))) return p; } return NULL; } struct neigh_parms *neigh_parms_alloc(struct net_device *dev, struct neigh_table *tbl) { struct neigh_parms *p; struct net *net = dev_net(dev); const struct net_device_ops *ops = dev->netdev_ops; p = kmemdup(&tbl->parms, sizeof(*p), GFP_KERNEL); if (p) { p->tbl = tbl; refcount_set(&p->refcnt, 1); p->reachable_time = neigh_rand_reach_time(NEIGH_VAR(p, BASE_REACHABLE_TIME)); p->qlen = 0; netdev_hold(dev, &p->dev_tracker, GFP_KERNEL); p->dev = dev; write_pnet(&p->net, net); p->sysctl_table = NULL; if (ops->ndo_neigh_setup && ops->ndo_neigh_setup(dev, p)) { netdev_put(dev, &p->dev_tracker); kfree(p); return NULL; } write_lock_bh(&tbl->lock); list_add(&p->list, &tbl->parms.list); write_unlock_bh(&tbl->lock); neigh_parms_data_state_cleanall(p); } return p; } EXPORT_SYMBOL(neigh_parms_alloc); static void neigh_rcu_free_parms(struct rcu_head *head) { struct neigh_parms *parms = container_of(head, struct neigh_parms, rcu_head); neigh_parms_put(parms); } void neigh_parms_release(struct neigh_table *tbl, struct neigh_parms *parms) { if (!parms || parms == &tbl->parms) return; write_lock_bh(&tbl->lock); list_del(&parms->list); parms->dead = 1; write_unlock_bh(&tbl->lock); netdev_put(parms->dev, &parms->dev_tracker); call_rcu(&parms->rcu_head, neigh_rcu_free_parms); } EXPORT_SYMBOL(neigh_parms_release); static struct lock_class_key neigh_table_proxy_queue_class; static struct neigh_table __rcu *neigh_tables[NEIGH_NR_TABLES] __read_mostly; void neigh_table_init(int index, struct neigh_table *tbl) { unsigned long now = jiffies; unsigned long phsize; INIT_LIST_HEAD(&tbl->parms_list); INIT_LIST_HEAD(&tbl->gc_list); INIT_LIST_HEAD(&tbl->managed_list); list_add(&tbl->parms.list, &tbl->parms_list); write_pnet(&tbl->parms.net, &init_net); refcount_set(&tbl->parms.refcnt, 1); tbl->parms.reachable_time = neigh_rand_reach_time(NEIGH_VAR(&tbl->parms, BASE_REACHABLE_TIME)); tbl->parms.qlen = 0; tbl->stats = alloc_percpu(struct neigh_statistics); if (!tbl->stats) panic("cannot create neighbour cache statistics"); #ifdef CONFIG_PROC_FS if (!proc_create_seq_data(tbl->id, 0, init_net.proc_net_stat, &neigh_stat_seq_ops, tbl)) panic("cannot create neighbour proc dir entry"); #endif RCU_INIT_POINTER(tbl->nht, neigh_hash_alloc(3)); phsize = (PNEIGH_HASHMASK + 1) * sizeof(struct pneigh_entry *); tbl->phash_buckets = kzalloc(phsize, GFP_KERNEL); if (!tbl->nht || !tbl->phash_buckets) panic("cannot allocate neighbour cache hashes"); if (!tbl->entry_size) tbl->entry_size = ALIGN(offsetof(struct neighbour, primary_key) + tbl->key_len, NEIGH_PRIV_ALIGN); else WARN_ON(tbl->entry_size % NEIGH_PRIV_ALIGN); rwlock_init(&tbl->lock); INIT_DEFERRABLE_WORK(&tbl->gc_work, neigh_periodic_work); queue_delayed_work(system_power_efficient_wq, &tbl->gc_work, tbl->parms.reachable_time); INIT_DEFERRABLE_WORK(&tbl->managed_work, neigh_managed_work); queue_delayed_work(system_power_efficient_wq, &tbl->managed_work, 0); timer_setup(&tbl->proxy_timer, neigh_proxy_process, 0); skb_queue_head_init_class(&tbl->proxy_queue, &neigh_table_proxy_queue_class); tbl->last_flush = now; tbl->last_rand = now + tbl->parms.reachable_time * 20; rcu_assign_pointer(neigh_tables[index], tbl); } EXPORT_SYMBOL(neigh_table_init); /* * Only called from ndisc_cleanup(), which means this is dead code * because we no longer can unload IPv6 module. */ int neigh_table_clear(int index, struct neigh_table *tbl) { RCU_INIT_POINTER(neigh_tables[index], NULL); synchronize_rcu(); /* It is not clean... Fix it to unload IPv6 module safely */ cancel_delayed_work_sync(&tbl->managed_work); cancel_delayed_work_sync(&tbl->gc_work); timer_delete_sync(&tbl->proxy_timer); pneigh_queue_purge(&tbl->proxy_queue, NULL, tbl->family); neigh_ifdown(tbl, NULL); if (atomic_read(&tbl->entries)) pr_crit("neighbour leakage\n"); call_rcu(&rcu_dereference_protected(tbl->nht, 1)->rcu, neigh_hash_free_rcu); tbl->nht = NULL; kfree(tbl->phash_buckets); tbl->phash_buckets = NULL; remove_proc_entry(tbl->id, init_net.proc_net_stat); free_percpu(tbl->stats); tbl->stats = NULL; return 0; } EXPORT_SYMBOL(neigh_table_clear); static struct neigh_table *neigh_find_table(int family) { struct neigh_table *tbl = NULL; switch (family) { case AF_INET: tbl = rcu_dereference_rtnl(neigh_tables[NEIGH_ARP_TABLE]); break; case AF_INET6: tbl = rcu_dereference_rtnl(neigh_tables[NEIGH_ND_TABLE]); break; } return tbl; } const struct nla_policy nda_policy[NDA_MAX+1] = { [NDA_UNSPEC] = { .strict_start_type = NDA_NH_ID }, [NDA_DST] = { .type = NLA_BINARY, .len = MAX_ADDR_LEN }, [NDA_LLADDR] = { .type = NLA_BINARY, .len = MAX_ADDR_LEN }, [NDA_CACHEINFO] = { .len = sizeof(struct nda_cacheinfo) }, [NDA_PROBES] = { .type = NLA_U32 }, [NDA_VLAN] = { .type = NLA_U16 }, [NDA_PORT] = { .type = NLA_U16 }, [NDA_VNI] = { .type = NLA_U32 }, [NDA_IFINDEX] = { .type = NLA_U32 }, [NDA_MASTER] = { .type = NLA_U32 }, [NDA_PROTOCOL] = { .type = NLA_U8 }, [NDA_NH_ID] = { .type = NLA_U32 }, [NDA_FLAGS_EXT] = NLA_POLICY_MASK(NLA_U32, NTF_EXT_MASK), [NDA_FDB_EXT_ATTRS] = { .type = NLA_NESTED }, }; static int neigh_delete(struct sk_buff *skb, struct nlmsghdr *nlh, struct netlink_ext_ack *extack) { struct net *net = sock_net(skb->sk); struct ndmsg *ndm; struct nlattr *dst_attr; struct neigh_table *tbl; struct neighbour *neigh; struct net_device *dev = NULL; int err = -EINVAL; ASSERT_RTNL(); if (nlmsg_len(nlh) < sizeof(*ndm)) goto out; dst_attr = nlmsg_find_attr(nlh, sizeof(*ndm), NDA_DST); if (!dst_attr) { NL_SET_ERR_MSG(extack, "Network address not specified"); goto out; } ndm = nlmsg_data(nlh); if (ndm->ndm_ifindex) { dev = __dev_get_by_index(net, ndm->ndm_ifindex); if (dev == NULL) { err = -ENODEV; goto out; } } tbl = neigh_find_table(ndm->ndm_family); if (tbl == NULL) return -EAFNOSUPPORT; if (nla_len(dst_attr) < (int)tbl->key_len) { NL_SET_ERR_MSG(extack, "Invalid network address"); goto out; } if (ndm->ndm_flags & NTF_PROXY) { err = pneigh_delete(tbl, net, nla_data(dst_attr), dev); goto out; } if (dev == NULL) goto out; neigh = neigh_lookup(tbl, nla_data(dst_attr), dev); if (neigh == NULL) { err = -ENOENT; goto out; } err = __neigh_update(neigh, NULL, NUD_FAILED, NEIGH_UPDATE_F_OVERRIDE | NEIGH_UPDATE_F_ADMIN, NETLINK_CB(skb).portid, extack); write_lock_bh(&tbl->lock); neigh_release(neigh); neigh_remove_one(neigh); write_unlock_bh(&tbl->lock); out: return err; } static int neigh_add(struct sk_buff *skb, struct nlmsghdr *nlh, struct netlink_ext_ack *extack) { int flags = NEIGH_UPDATE_F_ADMIN | NEIGH_UPDATE_F_OVERRIDE | NEIGH_UPDATE_F_OVERRIDE_ISROUTER; struct net *net = sock_net(skb->sk); struct ndmsg *ndm; struct nlattr *tb[NDA_MAX+1]; struct neigh_table *tbl; struct net_device *dev = NULL; struct neighbour *neigh; void *dst, *lladdr; u8 protocol = 0; u32 ndm_flags; int err; ASSERT_RTNL(); err = nlmsg_parse_deprecated(nlh, sizeof(*ndm), tb, NDA_MAX, nda_policy, extack); if (err < 0) goto out; err = -EINVAL; if (!tb[NDA_DST]) { NL_SET_ERR_MSG(extack, "Network address not specified"); goto out; } ndm = nlmsg_data(nlh); ndm_flags = ndm->ndm_flags; if (tb[NDA_FLAGS_EXT]) { u32 ext = nla_get_u32(tb[NDA_FLAGS_EXT]); BUILD_BUG_ON(sizeof(neigh->flags) * BITS_PER_BYTE < (sizeof(ndm->ndm_flags) * BITS_PER_BYTE + hweight32(NTF_EXT_MASK))); ndm_flags |= (ext << NTF_EXT_SHIFT); } if (ndm->ndm_ifindex) { dev = __dev_get_by_index(net, ndm->ndm_ifindex); if (dev == NULL) { err = -ENODEV; goto out; } if (tb[NDA_LLADDR] && nla_len(tb[NDA_LLADDR]) < dev->addr_len) { NL_SET_ERR_MSG(extack, "Invalid link address"); goto out; } } tbl = neigh_find_table(ndm->ndm_family); if (tbl == NULL) return -EAFNOSUPPORT; if (nla_len(tb[NDA_DST]) < (int)tbl->key_len) { NL_SET_ERR_MSG(extack, "Invalid network address"); goto out; } dst = nla_data(tb[NDA_DST]); lladdr = tb[NDA_LLADDR] ? nla_data(tb[NDA_LLADDR]) : NULL; if (tb[NDA_PROTOCOL]) protocol = nla_get_u8(tb[NDA_PROTOCOL]); if (ndm_flags & NTF_PROXY) { struct pneigh_entry *pn; if (ndm_flags & NTF_MANAGED) { NL_SET_ERR_MSG(extack, "Invalid NTF_* flag combination"); goto out; } err = -ENOBUFS; pn = pneigh_lookup(tbl, net, dst, dev, 1); if (pn) { pn->flags = ndm_flags; if (protocol) pn->protocol = protocol; err = 0; } goto out; } if (!dev) { NL_SET_ERR_MSG(extack, "Device not specified"); goto out; } if (tbl->allow_add && !tbl->allow_add(dev, extack)) { err = -EINVAL; goto out; } neigh = neigh_lookup(tbl, dst, dev); if (neigh == NULL) { bool ndm_permanent = ndm->ndm_state & NUD_PERMANENT; bool exempt_from_gc = ndm_permanent || ndm_flags & NTF_EXT_LEARNED; if (!(nlh->nlmsg_flags & NLM_F_CREATE)) { err = -ENOENT; goto out; } if (ndm_permanent && (ndm_flags & NTF_MANAGED)) { NL_SET_ERR_MSG(extack, "Invalid NTF_* flag for permanent entry"); err = -EINVAL; goto out; } neigh = ___neigh_create(tbl, dst, dev, ndm_flags & (NTF_EXT_LEARNED | NTF_MANAGED), exempt_from_gc, true); if (IS_ERR(neigh)) { err = PTR_ERR(neigh); goto out; } } else { if (nlh->nlmsg_flags & NLM_F_EXCL) { err = -EEXIST; neigh_release(neigh); goto out; } if (!(nlh->nlmsg_flags & NLM_F_REPLACE)) flags &= ~(NEIGH_UPDATE_F_OVERRIDE | NEIGH_UPDATE_F_OVERRIDE_ISROUTER); } if (protocol) neigh->protocol = protocol; if (ndm_flags & NTF_EXT_LEARNED) flags |= NEIGH_UPDATE_F_EXT_LEARNED; if (ndm_flags & NTF_ROUTER) flags |= NEIGH_UPDATE_F_ISROUTER; if (ndm_flags & NTF_MANAGED) flags |= NEIGH_UPDATE_F_MANAGED; if (ndm_flags & NTF_USE) flags |= NEIGH_UPDATE_F_USE; err = __neigh_update(neigh, lladdr, ndm->ndm_state, flags, NETLINK_CB(skb).portid, extack); if (!err && ndm_flags & (NTF_USE | NTF_MANAGED)) { neigh_event_send(neigh, NULL); err = 0; } neigh_release(neigh); out: return err; } static int neightbl_fill_parms(struct sk_buff *skb, struct neigh_parms *parms) { struct nlattr *nest; nest = nla_nest_start_noflag(skb, NDTA_PARMS); if (nest == NULL) return -ENOBUFS; if ((parms->dev && nla_put_u32(skb, NDTPA_IFINDEX, parms->dev->ifindex)) || nla_put_u32(skb, NDTPA_REFCNT, refcount_read(&parms->refcnt)) || nla_put_u32(skb, NDTPA_QUEUE_LENBYTES, NEIGH_VAR(parms, QUEUE_LEN_BYTES)) || /* approximative value for deprecated QUEUE_LEN (in packets) */ nla_put_u32(skb, NDTPA_QUEUE_LEN, NEIGH_VAR(parms, QUEUE_LEN_BYTES) / SKB_TRUESIZE(ETH_FRAME_LEN)) || nla_put_u32(skb, NDTPA_PROXY_QLEN, NEIGH_VAR(parms, PROXY_QLEN)) || nla_put_u32(skb, NDTPA_APP_PROBES, NEIGH_VAR(parms, APP_PROBES)) || nla_put_u32(skb, NDTPA_UCAST_PROBES, NEIGH_VAR(parms, UCAST_PROBES)) || nla_put_u32(skb, NDTPA_MCAST_PROBES, NEIGH_VAR(parms, MCAST_PROBES)) || nla_put_u32(skb, NDTPA_MCAST_REPROBES, NEIGH_VAR(parms, MCAST_REPROBES)) || nla_put_msecs(skb, NDTPA_REACHABLE_TIME, parms->reachable_time, NDTPA_PAD) || nla_put_msecs(skb, NDTPA_BASE_REACHABLE_TIME, NEIGH_VAR(parms, BASE_REACHABLE_TIME), NDTPA_PAD) || nla_put_msecs(skb, NDTPA_GC_STALETIME, NEIGH_VAR(parms, GC_STALETIME), NDTPA_PAD) || nla_put_msecs(skb, NDTPA_DELAY_PROBE_TIME, NEIGH_VAR(parms, DELAY_PROBE_TIME), NDTPA_PAD) || nla_put_msecs(skb, NDTPA_RETRANS_TIME, NEIGH_VAR(parms, RETRANS_TIME), NDTPA_PAD) || nla_put_msecs(skb, NDTPA_ANYCAST_DELAY, NEIGH_VAR(parms, ANYCAST_DELAY), NDTPA_PAD) || nla_put_msecs(skb, NDTPA_PROXY_DELAY, NEIGH_VAR(parms, PROXY_DELAY), NDTPA_PAD) || nla_put_msecs(skb, NDTPA_LOCKTIME, NEIGH_VAR(parms, LOCKTIME), NDTPA_PAD) || nla_put_msecs(skb, NDTPA_INTERVAL_PROBE_TIME_MS, NEIGH_VAR(parms, INTERVAL_PROBE_TIME_MS), NDTPA_PAD)) goto nla_put_failure; return nla_nest_end(skb, nest); nla_put_failure: nla_nest_cancel(skb, nest); return -EMSGSIZE; } static int neightbl_fill_info(struct sk_buff *skb, struct neigh_table *tbl, u32 pid, u32 seq, int type, int flags) { struct nlmsghdr *nlh; struct ndtmsg *ndtmsg; nlh = nlmsg_put(skb, pid, seq, type, sizeof(*ndtmsg), flags); if (nlh == NULL) return -EMSGSIZE; ndtmsg = nlmsg_data(nlh); read_lock_bh(&tbl->lock); ndtmsg->ndtm_family = tbl->family; ndtmsg->ndtm_pad1 = 0; ndtmsg->ndtm_pad2 = 0; if (nla_put_string(skb, NDTA_NAME, tbl->id) || nla_put_msecs(skb, NDTA_GC_INTERVAL, READ_ONCE(tbl->gc_interval), NDTA_PAD) || nla_put_u32(skb, NDTA_THRESH1, READ_ONCE(tbl->gc_thresh1)) || nla_put_u32(skb, NDTA_THRESH2, READ_ONCE(tbl->gc_thresh2)) || nla_put_u32(skb, NDTA_THRESH3, READ_ONCE(tbl->gc_thresh3))) goto nla_put_failure; { unsigned long now = jiffies; long flush_delta = now - READ_ONCE(tbl->last_flush); long rand_delta = now - READ_ONCE(tbl->last_rand); struct neigh_hash_table *nht; struct ndt_config ndc = { .ndtc_key_len = tbl->key_len, .ndtc_entry_size = tbl->entry_size, .ndtc_entries = atomic_read(&tbl->entries), .ndtc_last_flush = jiffies_to_msecs(flush_delta), .ndtc_last_rand = jiffies_to_msecs(rand_delta), .ndtc_proxy_qlen = READ_ONCE(tbl->proxy_queue.qlen), }; rcu_read_lock(); nht = rcu_dereference(tbl->nht); ndc.ndtc_hash_rnd = nht->hash_rnd[0]; ndc.ndtc_hash_mask = ((1 << nht->hash_shift) - 1); rcu_read_unlock(); if (nla_put(skb, NDTA_CONFIG, sizeof(ndc), &ndc)) goto nla_put_failure; } { int cpu; struct ndt_stats ndst; memset(&ndst, 0, sizeof(ndst)); for_each_possible_cpu(cpu) { struct neigh_statistics *st; st = per_cpu_ptr(tbl->stats, cpu); ndst.ndts_allocs += READ_ONCE(st->allocs); ndst.ndts_destroys += READ_ONCE(st->destroys); ndst.ndts_hash_grows += READ_ONCE(st->hash_grows); ndst.ndts_res_failed += READ_ONCE(st->res_failed); ndst.ndts_lookups += READ_ONCE(st->lookups); ndst.ndts_hits += READ_ONCE(st->hits); ndst.ndts_rcv_probes_mcast += READ_ONCE(st->rcv_probes_mcast); ndst.ndts_rcv_probes_ucast += READ_ONCE(st->rcv_probes_ucast); ndst.ndts_periodic_gc_runs += READ_ONCE(st->periodic_gc_runs); ndst.ndts_forced_gc_runs += READ_ONCE(st->forced_gc_runs); ndst.ndts_table_fulls += READ_ONCE(st->table_fulls); } if (nla_put_64bit(skb, NDTA_STATS, sizeof(ndst), &ndst, NDTA_PAD)) goto nla_put_failure; } BUG_ON(tbl->parms.dev); if (neightbl_fill_parms(skb, &tbl->parms) < 0) goto nla_put_failure; read_unlock_bh(&tbl->lock); nlmsg_end(skb, nlh); return 0; nla_put_failure: read_unlock_bh(&tbl->lock); nlmsg_cancel(skb, nlh); return -EMSGSIZE; } static int neightbl_fill_param_info(struct sk_buff *skb, struct neigh_table *tbl, struct neigh_parms *parms, u32 pid, u32 seq, int type, unsigned int flags) { struct ndtmsg *ndtmsg; struct nlmsghdr *nlh; nlh = nlmsg_put(skb, pid, seq, type, sizeof(*ndtmsg), flags); if (nlh == NULL) return -EMSGSIZE; ndtmsg = nlmsg_data(nlh); read_lock_bh(&tbl->lock); ndtmsg->ndtm_family = tbl->family; ndtmsg->ndtm_pad1 = 0; ndtmsg->ndtm_pad2 = 0; if (nla_put_string(skb, NDTA_NAME, tbl->id) < 0 || neightbl_fill_parms(skb, parms) < 0) goto errout; read_unlock_bh(&tbl->lock); nlmsg_end(skb, nlh); return 0; errout: read_unlock_bh(&tbl->lock); nlmsg_cancel(skb, nlh); return -EMSGSIZE; } static const struct nla_policy nl_neightbl_policy[NDTA_MAX+1] = { [NDTA_NAME] = { .type = NLA_STRING }, [NDTA_THRESH1] = { .type = NLA_U32 }, [NDTA_THRESH2] = { .type = NLA_U32 }, [NDTA_THRESH3] = { .type = NLA_U32 }, [NDTA_GC_INTERVAL] = { .type = NLA_U64 }, [NDTA_PARMS] = { .type = NLA_NESTED }, }; static const struct nla_policy nl_ntbl_parm_policy[NDTPA_MAX+1] = { [NDTPA_IFINDEX] = { .type = NLA_U32 }, [NDTPA_QUEUE_LEN] = { .type = NLA_U32 }, [NDTPA_QUEUE_LENBYTES] = { .type = NLA_U32 }, [NDTPA_PROXY_QLEN] = { .type = NLA_U32 }, [NDTPA_APP_PROBES] = { .type = NLA_U32 }, [NDTPA_UCAST_PROBES] = { .type = NLA_U32 }, [NDTPA_MCAST_PROBES] = { .type = NLA_U32 }, [NDTPA_MCAST_REPROBES] = { .type = NLA_U32 }, [NDTPA_BASE_REACHABLE_TIME] = { .type = NLA_U64 }, [NDTPA_GC_STALETIME] = { .type = NLA_U64 }, [NDTPA_DELAY_PROBE_TIME] = { .type = NLA_U64 }, [NDTPA_RETRANS_TIME] = { .type = NLA_U64 }, [NDTPA_ANYCAST_DELAY] = { .type = NLA_U64 }, [NDTPA_PROXY_DELAY] = { .type = NLA_U64 }, [NDTPA_LOCKTIME] = { .type = NLA_U64 }, [NDTPA_INTERVAL_PROBE_TIME_MS] = { .type = NLA_U64, .min = 1 }, }; static int neightbl_set(struct sk_buff *skb, struct nlmsghdr *nlh, struct netlink_ext_ack *extack) { struct net *net = sock_net(skb->sk); struct neigh_table *tbl; struct ndtmsg *ndtmsg; struct nlattr *tb[NDTA_MAX+1]; bool found = false; int err, tidx; err = nlmsg_parse_deprecated(nlh, sizeof(*ndtmsg), tb, NDTA_MAX, nl_neightbl_policy, extack); if (err < 0) goto errout; if (tb[NDTA_NAME] == NULL) { err = -EINVAL; goto errout; } ndtmsg = nlmsg_data(nlh); for (tidx = 0; tidx < NEIGH_NR_TABLES; tidx++) { tbl = rcu_dereference_rtnl(neigh_tables[tidx]); if (!tbl) continue; if (ndtmsg->ndtm_family && tbl->family != ndtmsg->ndtm_family) continue; if (nla_strcmp(tb[NDTA_NAME], tbl->id) == 0) { found = true; break; } } if (!found) return -ENOENT; /* * We acquire tbl->lock to be nice to the periodic timers and * make sure they always see a consistent set of values. */ write_lock_bh(&tbl->lock); if (tb[NDTA_PARMS]) { struct nlattr *tbp[NDTPA_MAX+1]; struct neigh_parms *p; int i, ifindex = 0; err = nla_parse_nested_deprecated(tbp, NDTPA_MAX, tb[NDTA_PARMS], nl_ntbl_parm_policy, extack); if (err < 0) goto errout_tbl_lock; if (tbp[NDTPA_IFINDEX]) ifindex = nla_get_u32(tbp[NDTPA_IFINDEX]); p = lookup_neigh_parms(tbl, net, ifindex); if (p == NULL) { err = -ENOENT; goto errout_tbl_lock; } for (i = 1; i <= NDTPA_MAX; i++) { if (tbp[i] == NULL) continue; switch (i) { case NDTPA_QUEUE_LEN: NEIGH_VAR_SET(p, QUEUE_LEN_BYTES, nla_get_u32(tbp[i]) * SKB_TRUESIZE(ETH_FRAME_LEN)); break; case NDTPA_QUEUE_LENBYTES: NEIGH_VAR_SET(p, QUEUE_LEN_BYTES, nla_get_u32(tbp[i])); break; case NDTPA_PROXY_QLEN: NEIGH_VAR_SET(p, PROXY_QLEN, nla_get_u32(tbp[i])); break; case NDTPA_APP_PROBES: NEIGH_VAR_SET(p, APP_PROBES, nla_get_u32(tbp[i])); break; case NDTPA_UCAST_PROBES: NEIGH_VAR_SET(p, UCAST_PROBES, nla_get_u32(tbp[i])); break; case NDTPA_MCAST_PROBES: NEIGH_VAR_SET(p, MCAST_PROBES, nla_get_u32(tbp[i])); break; case NDTPA_MCAST_REPROBES: NEIGH_VAR_SET(p, MCAST_REPROBES, nla_get_u32(tbp[i])); break; case NDTPA_BASE_REACHABLE_TIME: NEIGH_VAR_SET(p, BASE_REACHABLE_TIME, nla_get_msecs(tbp[i])); /* update reachable_time as well, otherwise, the change will * only be effective after the next time neigh_periodic_work * decides to recompute it (can be multiple minutes) */ p->reachable_time = neigh_rand_reach_time(NEIGH_VAR(p, BASE_REACHABLE_TIME)); break; case NDTPA_GC_STALETIME: NEIGH_VAR_SET(p, GC_STALETIME, nla_get_msecs(tbp[i])); break; case NDTPA_DELAY_PROBE_TIME: NEIGH_VAR_SET(p, DELAY_PROBE_TIME, nla_get_msecs(tbp[i])); call_netevent_notifiers(NETEVENT_DELAY_PROBE_TIME_UPDATE, p); break; case NDTPA_INTERVAL_PROBE_TIME_MS: NEIGH_VAR_SET(p, INTERVAL_PROBE_TIME_MS, nla_get_msecs(tbp[i])); break; case NDTPA_RETRANS_TIME: NEIGH_VAR_SET(p, RETRANS_TIME, nla_get_msecs(tbp[i])); break; case NDTPA_ANYCAST_DELAY: NEIGH_VAR_SET(p, ANYCAST_DELAY, nla_get_msecs(tbp[i])); break; case NDTPA_PROXY_DELAY: NEIGH_VAR_SET(p, PROXY_DELAY, nla_get_msecs(tbp[i])); break; case NDTPA_LOCKTIME: NEIGH_VAR_SET(p, LOCKTIME, nla_get_msecs(tbp[i])); break; } } } err = -ENOENT; if ((tb[NDTA_THRESH1] || tb[NDTA_THRESH2] || tb[NDTA_THRESH3] || tb[NDTA_GC_INTERVAL]) && !net_eq(net, &init_net)) goto errout_tbl_lock; if (tb[NDTA_THRESH1]) WRITE_ONCE(tbl->gc_thresh1, nla_get_u32(tb[NDTA_THRESH1])); if (tb[NDTA_THRESH2]) WRITE_ONCE(tbl->gc_thresh2, nla_get_u32(tb[NDTA_THRESH2])); if (tb[NDTA_THRESH3]) WRITE_ONCE(tbl->gc_thresh3, nla_get_u32(tb[NDTA_THRESH3])); if (tb[NDTA_GC_INTERVAL]) WRITE_ONCE(tbl->gc_interval, nla_get_msecs(tb[NDTA_GC_INTERVAL])); err = 0; errout_tbl_lock: write_unlock_bh(&tbl->lock); errout: return err; } static int neightbl_valid_dump_info(const struct nlmsghdr *nlh, struct netlink_ext_ack *extack) { struct ndtmsg *ndtm; ndtm = nlmsg_payload(nlh, sizeof(*ndtm)); if (!ndtm) { NL_SET_ERR_MSG(extack, "Invalid header for neighbor table dump request"); return -EINVAL; } if (ndtm->ndtm_pad1 || ndtm->ndtm_pad2) { NL_SET_ERR_MSG(extack, "Invalid values in header for neighbor table dump request"); return -EINVAL; } if (nlmsg_attrlen(nlh, sizeof(*ndtm))) { NL_SET_ERR_MSG(extack, "Invalid data after header in neighbor table dump request"); return -EINVAL; } return 0; } static int neightbl_dump_info(struct sk_buff *skb, struct netlink_callback *cb) { const struct nlmsghdr *nlh = cb->nlh; struct net *net = sock_net(skb->sk); int family, tidx, nidx = 0; int tbl_skip = cb->args[0]; int neigh_skip = cb->args[1]; struct neigh_table *tbl; if (cb->strict_check) { int err = neightbl_valid_dump_info(nlh, cb->extack); if (err < 0) return err; } family = ((struct rtgenmsg *)nlmsg_data(nlh))->rtgen_family; for (tidx = 0; tidx < NEIGH_NR_TABLES; tidx++) { struct neigh_parms *p; tbl = rcu_dereference_rtnl(neigh_tables[tidx]); if (!tbl) continue; if (tidx < tbl_skip || (family && tbl->family != family)) continue; if (neightbl_fill_info(skb, tbl, NETLINK_CB(cb->skb).portid, nlh->nlmsg_seq, RTM_NEWNEIGHTBL, NLM_F_MULTI) < 0) break; nidx = 0; p = list_next_entry(&tbl->parms, list); list_for_each_entry_from(p, &tbl->parms_list, list) { if (!net_eq(neigh_parms_net(p), net)) continue; if (nidx < neigh_skip) goto next; if (neightbl_fill_param_info(skb, tbl, p, NETLINK_CB(cb->skb).portid, nlh->nlmsg_seq, RTM_NEWNEIGHTBL, NLM_F_MULTI) < 0) goto out; next: nidx++; } neigh_skip = 0; } out: cb->args[0] = tidx; cb->args[1] = nidx; return skb->len; } static int neigh_fill_info(struct sk_buff *skb, struct neighbour *neigh, u32 pid, u32 seq, int type, unsigned int flags) { u32 neigh_flags, neigh_flags_ext; unsigned long now = jiffies; struct nda_cacheinfo ci; struct nlmsghdr *nlh; struct ndmsg *ndm; nlh = nlmsg_put(skb, pid, seq, type, sizeof(*ndm), flags); if (nlh == NULL) return -EMSGSIZE; neigh_flags_ext = neigh->flags >> NTF_EXT_SHIFT; neigh_flags = neigh->flags & NTF_OLD_MASK; ndm = nlmsg_data(nlh); ndm->ndm_family = neigh->ops->family; ndm->ndm_pad1 = 0; ndm->ndm_pad2 = 0; ndm->ndm_flags = neigh_flags; ndm->ndm_type = neigh->type; ndm->ndm_ifindex = neigh->dev->ifindex; if (nla_put(skb, NDA_DST, neigh->tbl->key_len, neigh->primary_key)) goto nla_put_failure; read_lock_bh(&neigh->lock); ndm->ndm_state = neigh->nud_state; if (neigh->nud_state & NUD_VALID) { char haddr[MAX_ADDR_LEN]; neigh_ha_snapshot(haddr, neigh, neigh->dev); if (nla_put(skb, NDA_LLADDR, neigh->dev->addr_len, haddr) < 0) { read_unlock_bh(&neigh->lock); goto nla_put_failure; } } ci.ndm_used = jiffies_to_clock_t(now - neigh->used); ci.ndm_confirmed = jiffies_to_clock_t(now - neigh->confirmed); ci.ndm_updated = jiffies_to_clock_t(now - neigh->updated); ci.ndm_refcnt = refcount_read(&neigh->refcnt) - 1; read_unlock_bh(&neigh->lock); if (nla_put_u32(skb, NDA_PROBES, atomic_read(&neigh->probes)) || nla_put(skb, NDA_CACHEINFO, sizeof(ci), &ci)) goto nla_put_failure; if (neigh->protocol && nla_put_u8(skb, NDA_PROTOCOL, neigh->protocol)) goto nla_put_failure; if (neigh_flags_ext && nla_put_u32(skb, NDA_FLAGS_EXT, neigh_flags_ext)) goto nla_put_failure; nlmsg_end(skb, nlh); return 0; nla_put_failure: nlmsg_cancel(skb, nlh); return -EMSGSIZE; } static int pneigh_fill_info(struct sk_buff *skb, struct pneigh_entry *pn, u32 pid, u32 seq, int type, unsigned int flags, struct neigh_table *tbl) { u32 neigh_flags, neigh_flags_ext; struct nlmsghdr *nlh; struct ndmsg *ndm; nlh = nlmsg_put(skb, pid, seq, type, sizeof(*ndm), flags); if (nlh == NULL) return -EMSGSIZE; neigh_flags_ext = pn->flags >> NTF_EXT_SHIFT; neigh_flags = pn->flags & NTF_OLD_MASK; ndm = nlmsg_data(nlh); ndm->ndm_family = tbl->family; ndm->ndm_pad1 = 0; ndm->ndm_pad2 = 0; ndm->ndm_flags = neigh_flags | NTF_PROXY; ndm->ndm_type = RTN_UNICAST; ndm->ndm_ifindex = pn->dev ? pn->dev->ifindex : 0; ndm->ndm_state = NUD_NONE; if (nla_put(skb, NDA_DST, tbl->key_len, pn->key)) goto nla_put_failure; if (pn->protocol && nla_put_u8(skb, NDA_PROTOCOL, pn->protocol)) goto nla_put_failure; if (neigh_flags_ext && nla_put_u32(skb, NDA_FLAGS_EXT, neigh_flags_ext)) goto nla_put_failure; nlmsg_end(skb, nlh); return 0; nla_put_failure: nlmsg_cancel(skb, nlh); return -EMSGSIZE; } static void neigh_update_notify(struct neighbour *neigh, u32 nlmsg_pid) { call_netevent_notifiers(NETEVENT_NEIGH_UPDATE, neigh); __neigh_notify(neigh, RTM_NEWNEIGH, 0, nlmsg_pid); } static bool neigh_master_filtered(struct net_device *dev, int master_idx) { struct net_device *master; if (!master_idx) return false; master = dev ? netdev_master_upper_dev_get_rcu(dev) : NULL; /* 0 is already used to denote NDA_MASTER wasn't passed, therefore need another * invalid value for ifindex to denote "no master". */ if (master_idx == -1) return !!master; if (!master || master->ifindex != master_idx) return true; return false; } static bool neigh_ifindex_filtered(struct net_device *dev, int filter_idx) { if (filter_idx && (!dev || dev->ifindex != filter_idx)) return true; return false; } struct neigh_dump_filter { int master_idx; int dev_idx; }; static int neigh_dump_table(struct neigh_table *tbl, struct sk_buff *skb, struct netlink_callback *cb, struct neigh_dump_filter *filter) { struct net *net = sock_net(skb->sk); struct neighbour *n; int err = 0, h, s_h = cb->args[1]; int idx, s_idx = idx = cb->args[2]; struct neigh_hash_table *nht; unsigned int flags = NLM_F_MULTI; if (filter->dev_idx || filter->master_idx) flags |= NLM_F_DUMP_FILTERED; nht = rcu_dereference(tbl->nht); for (h = s_h; h < (1 << nht->hash_shift); h++) { if (h > s_h) s_idx = 0; idx = 0; neigh_for_each_in_bucket_rcu(n, &nht->hash_heads[h]) { if (idx < s_idx || !net_eq(dev_net(n->dev), net)) goto next; if (neigh_ifindex_filtered(n->dev, filter->dev_idx) || neigh_master_filtered(n->dev, filter->master_idx)) goto next; err = neigh_fill_info(skb, n, NETLINK_CB(cb->skb).portid, cb->nlh->nlmsg_seq, RTM_NEWNEIGH, flags); if (err < 0) goto out; next: idx++; } } out: cb->args[1] = h; cb->args[2] = idx; return err; } static int pneigh_dump_table(struct neigh_table *tbl, struct sk_buff *skb, struct netlink_callback *cb, struct neigh_dump_filter *filter) { struct pneigh_entry *n; struct net *net = sock_net(skb->sk); int err = 0, h, s_h = cb->args[3]; int idx, s_idx = idx = cb->args[4]; unsigned int flags = NLM_F_MULTI; if (filter->dev_idx || filter->master_idx) flags |= NLM_F_DUMP_FILTERED; read_lock_bh(&tbl->lock); for (h = s_h; h <= PNEIGH_HASHMASK; h++) { if (h > s_h) s_idx = 0; for (n = tbl->phash_buckets[h], idx = 0; n; n = n->next) { if (idx < s_idx || pneigh_net(n) != net) goto next; if (neigh_ifindex_filtered(n->dev, filter->dev_idx) || neigh_master_filtered(n->dev, filter->master_idx)) goto next; err = pneigh_fill_info(skb, n, NETLINK_CB(cb->skb).portid, cb->nlh->nlmsg_seq, RTM_NEWNEIGH, flags, tbl); if (err < 0) { read_unlock_bh(&tbl->lock); goto out; } next: idx++; } } read_unlock_bh(&tbl->lock); out: cb->args[3] = h; cb->args[4] = idx; return err; } static int neigh_valid_dump_req(const struct nlmsghdr *nlh, bool strict_check, struct neigh_dump_filter *filter, struct netlink_ext_ack *extack) { struct nlattr *tb[NDA_MAX + 1]; int err, i; if (strict_check) { struct ndmsg *ndm; ndm = nlmsg_payload(nlh, sizeof(*ndm)); if (!ndm) { NL_SET_ERR_MSG(extack, "Invalid header for neighbor dump request"); return -EINVAL; } if (ndm->ndm_pad1 || ndm->ndm_pad2 || ndm->ndm_ifindex || ndm->ndm_state || ndm->ndm_type) { NL_SET_ERR_MSG(extack, "Invalid values in header for neighbor dump request"); return -EINVAL; } if (ndm->ndm_flags & ~NTF_PROXY) { NL_SET_ERR_MSG(extack, "Invalid flags in header for neighbor dump request"); return -EINVAL; } err = nlmsg_parse_deprecated_strict(nlh, sizeof(struct ndmsg), tb, NDA_MAX, nda_policy, extack); } else { err = nlmsg_parse_deprecated(nlh, sizeof(struct ndmsg), tb, NDA_MAX, nda_policy, extack); } if (err < 0) return err; for (i = 0; i <= NDA_MAX; ++i) { if (!tb[i]) continue; /* all new attributes should require strict_check */ switch (i) { case NDA_IFINDEX: filter->dev_idx = nla_get_u32(tb[i]); break; case NDA_MASTER: filter->master_idx = nla_get_u32(tb[i]); break; default: if (strict_check) { NL_SET_ERR_MSG(extack, "Unsupported attribute in neighbor dump request"); return -EINVAL; } } } return 0; } static int neigh_dump_info(struct sk_buff *skb, struct netlink_callback *cb) { const struct nlmsghdr *nlh = cb->nlh; struct neigh_dump_filter filter = {}; struct neigh_table *tbl; int t, family, s_t; int proxy = 0; int err; family = ((struct rtgenmsg *)nlmsg_data(nlh))->rtgen_family; /* check for full ndmsg structure presence, family member is * the same for both structures */ if (nlmsg_len(nlh) >= sizeof(struct ndmsg) && ((struct ndmsg *)nlmsg_data(nlh))->ndm_flags == NTF_PROXY) proxy = 1; err = neigh_valid_dump_req(nlh, cb->strict_check, &filter, cb->extack); if (err < 0 && cb->strict_check) return err; err = 0; s_t = cb->args[0]; rcu_read_lock(); for (t = 0; t < NEIGH_NR_TABLES; t++) { tbl = rcu_dereference(neigh_tables[t]); if (!tbl) continue; if (t < s_t || (family && tbl->family != family)) continue; if (t > s_t) memset(&cb->args[1], 0, sizeof(cb->args) - sizeof(cb->args[0])); if (proxy) err = pneigh_dump_table(tbl, skb, cb, &filter); else err = neigh_dump_table(tbl, skb, cb, &filter); if (err < 0) break; } rcu_read_unlock(); cb->args[0] = t; return err; } static int neigh_valid_get_req(const struct nlmsghdr *nlh, struct neigh_table **tbl, void **dst, int *dev_idx, u8 *ndm_flags, struct netlink_ext_ack *extack) { struct nlattr *tb[NDA_MAX + 1]; struct ndmsg *ndm; int err, i; ndm = nlmsg_payload(nlh, sizeof(*ndm)); if (!ndm) { NL_SET_ERR_MSG(extack, "Invalid header for neighbor get request"); return -EINVAL; } if (ndm->ndm_pad1 || ndm->ndm_pad2 || ndm->ndm_state || ndm->ndm_type) { NL_SET_ERR_MSG(extack, "Invalid values in header for neighbor get request"); return -EINVAL; } if (ndm->ndm_flags & ~NTF_PROXY) { NL_SET_ERR_MSG(extack, "Invalid flags in header for neighbor get request"); return -EINVAL; } err = nlmsg_parse_deprecated_strict(nlh, sizeof(struct ndmsg), tb, NDA_MAX, nda_policy, extack); if (err < 0) return err; *ndm_flags = ndm->ndm_flags; *dev_idx = ndm->ndm_ifindex; *tbl = neigh_find_table(ndm->ndm_family); if (*tbl == NULL) { NL_SET_ERR_MSG(extack, "Unsupported family in header for neighbor get request"); return -EAFNOSUPPORT; } for (i = 0; i <= NDA_MAX; ++i) { if (!tb[i]) continue; switch (i) { case NDA_DST: if (nla_len(tb[i]) != (int)(*tbl)->key_len) { NL_SET_ERR_MSG(extack, "Invalid network address in neighbor get request"); return -EINVAL; } *dst = nla_data(tb[i]); break; default: NL_SET_ERR_MSG(extack, "Unsupported attribute in neighbor get request"); return -EINVAL; } } return 0; } static inline size_t neigh_nlmsg_size(void) { return NLMSG_ALIGN(sizeof(struct ndmsg)) + nla_total_size(MAX_ADDR_LEN) /* NDA_DST */ + nla_total_size(MAX_ADDR_LEN) /* NDA_LLADDR */ + nla_total_size(sizeof(struct nda_cacheinfo)) + nla_total_size(4) /* NDA_PROBES */ + nla_total_size(4) /* NDA_FLAGS_EXT */ + nla_total_size(1); /* NDA_PROTOCOL */ } static int neigh_get_reply(struct net *net, struct neighbour *neigh, u32 pid, u32 seq) { struct sk_buff *skb; int err = 0; skb = nlmsg_new(neigh_nlmsg_size(), GFP_KERNEL); if (!skb) return -ENOBUFS; err = neigh_fill_info(skb, neigh, pid, seq, RTM_NEWNEIGH, 0); if (err) { kfree_skb(skb); goto errout; } err = rtnl_unicast(skb, net, pid); errout: return err; } static inline size_t pneigh_nlmsg_size(void) { return NLMSG_ALIGN(sizeof(struct ndmsg)) + nla_total_size(MAX_ADDR_LEN) /* NDA_DST */ + nla_total_size(4) /* NDA_FLAGS_EXT */ + nla_total_size(1); /* NDA_PROTOCOL */ } static int pneigh_get_reply(struct net *net, struct pneigh_entry *neigh, u32 pid, u32 seq, struct neigh_table *tbl) { struct sk_buff *skb; int err = 0; skb = nlmsg_new(pneigh_nlmsg_size(), GFP_KERNEL); if (!skb) return -ENOBUFS; err = pneigh_fill_info(skb, neigh, pid, seq, RTM_NEWNEIGH, 0, tbl); if (err) { kfree_skb(skb); goto errout; } err = rtnl_unicast(skb, net, pid); errout: return err; } static int neigh_get(struct sk_buff *in_skb, struct nlmsghdr *nlh, struct netlink_ext_ack *extack) { struct net *net = sock_net(in_skb->sk); struct net_device *dev = NULL; struct neigh_table *tbl = NULL; struct neighbour *neigh; void *dst = NULL; u8 ndm_flags = 0; int dev_idx = 0; int err; err = neigh_valid_get_req(nlh, &tbl, &dst, &dev_idx, &ndm_flags, extack); if (err < 0) return err; if (dev_idx) { dev = __dev_get_by_index(net, dev_idx); if (!dev) { NL_SET_ERR_MSG(extack, "Unknown device ifindex"); return -ENODEV; } } if (!dst) { NL_SET_ERR_MSG(extack, "Network address not specified"); return -EINVAL; } if (ndm_flags & NTF_PROXY) { struct pneigh_entry *pn; pn = pneigh_lookup(tbl, net, dst, dev, 0); if (!pn) { NL_SET_ERR_MSG(extack, "Proxy neighbour entry not found"); return -ENOENT; } return pneigh_get_reply(net, pn, NETLINK_CB(in_skb).portid, nlh->nlmsg_seq, tbl); } if (!dev) { NL_SET_ERR_MSG(extack, "No device specified"); return -EINVAL; } neigh = neigh_lookup(tbl, dst, dev); if (!neigh) { NL_SET_ERR_MSG(extack, "Neighbour entry not found"); return -ENOENT; } err = neigh_get_reply(net, neigh, NETLINK_CB(in_skb).portid, nlh->nlmsg_seq); neigh_release(neigh); return err; } void neigh_for_each(struct neigh_table *tbl, void (*cb)(struct neighbour *, void *), void *cookie) { int chain; struct neigh_hash_table *nht; rcu_read_lock(); nht = rcu_dereference(tbl->nht); read_lock_bh(&tbl->lock); /* avoid resizes */ for (chain = 0; chain < (1 << nht->hash_shift); chain++) { struct neighbour *n; neigh_for_each_in_bucket(n, &nht->hash_heads[chain]) cb(n, cookie); } read_unlock_bh(&tbl->lock); rcu_read_unlock(); } EXPORT_SYMBOL(neigh_for_each); /* The tbl->lock must be held as a writer and BH disabled. */ void __neigh_for_each_release(struct neigh_table *tbl, int (*cb)(struct neighbour *)) { struct neigh_hash_table *nht; int chain; nht = rcu_dereference_protected(tbl->nht, lockdep_is_held(&tbl->lock)); for (chain = 0; chain < (1 << nht->hash_shift); chain++) { struct hlist_node *tmp; struct neighbour *n; neigh_for_each_in_bucket_safe(n, tmp, &nht->hash_heads[chain]) { int release; write_lock(&n->lock); release = cb(n); if (release) { hlist_del_rcu(&n->hash); hlist_del_rcu(&n->dev_list); neigh_mark_dead(n); } write_unlock(&n->lock); if (release) neigh_cleanup_and_release(n); } } } EXPORT_SYMBOL(__neigh_for_each_release); int neigh_xmit(int index, struct net_device *dev, const void *addr, struct sk_buff *skb) { int err = -EAFNOSUPPORT; if (likely(index < NEIGH_NR_TABLES)) { struct neigh_table *tbl; struct neighbour *neigh; rcu_read_lock(); tbl = rcu_dereference(neigh_tables[index]); if (!tbl) goto out_unlock; if (index == NEIGH_ARP_TABLE) { u32 key = *((u32 *)addr); neigh = __ipv4_neigh_lookup_noref(dev, key); } else { neigh = __neigh_lookup_noref(tbl, addr, dev); } if (!neigh) neigh = __neigh_create(tbl, addr, dev, false); err = PTR_ERR(neigh); if (IS_ERR(neigh)) { rcu_read_unlock(); goto out_kfree_skb; } err = READ_ONCE(neigh->output)(neigh, skb); out_unlock: rcu_read_unlock(); } else if (index == NEIGH_LINK_TABLE) { err = dev_hard_header(skb, dev, ntohs(skb->protocol), addr, NULL, skb->len); if (err < 0) goto out_kfree_skb; err = dev_queue_xmit(skb); } out: return err; out_kfree_skb: kfree_skb(skb); goto out; } EXPORT_SYMBOL(neigh_xmit); #ifdef CONFIG_PROC_FS static struct neighbour *neigh_get_valid(struct seq_file *seq, struct neighbour *n, loff_t *pos) { struct neigh_seq_state *state = seq->private; struct net *net = seq_file_net(seq); if (!net_eq(dev_net(n->dev), net)) return NULL; if (state->neigh_sub_iter) { loff_t fakep = 0; void *v; v = state->neigh_sub_iter(state, n, pos ? pos : &fakep); if (!v) return NULL; if (pos) return v; } if (!(state->flags & NEIGH_SEQ_SKIP_NOARP)) return n; if (READ_ONCE(n->nud_state) & ~NUD_NOARP) return n; return NULL; } static struct neighbour *neigh_get_first(struct seq_file *seq) { struct neigh_seq_state *state = seq->private; struct neigh_hash_table *nht = state->nht; struct neighbour *n, *tmp; state->flags &= ~NEIGH_SEQ_IS_PNEIGH; while (++state->bucket < (1 << nht->hash_shift)) { neigh_for_each_in_bucket(n, &nht->hash_heads[state->bucket]) { tmp = neigh_get_valid(seq, n, NULL); if (tmp) return tmp; } } return NULL; } static struct neighbour *neigh_get_next(struct seq_file *seq, struct neighbour *n, loff_t *pos) { struct neigh_seq_state *state = seq->private; struct neighbour *tmp; if (state->neigh_sub_iter) { void *v = state->neigh_sub_iter(state, n, pos); if (v) return n; } hlist_for_each_entry_continue(n, hash) { tmp = neigh_get_valid(seq, n, pos); if (tmp) { n = tmp; goto out; } } n = neigh_get_first(seq); out: if (n && pos) --(*pos); return n; } static struct neighbour *neigh_get_idx(struct seq_file *seq, loff_t *pos) { struct neighbour *n = neigh_get_first(seq); if (n) { --(*pos); while (*pos) { n = neigh_get_next(seq, n, pos); if (!n) break; } } return *pos ? NULL : n; } static struct pneigh_entry *pneigh_get_first(struct seq_file *seq) { struct neigh_seq_state *state = seq->private; struct net *net = seq_file_net(seq); struct neigh_table *tbl = state->tbl; struct pneigh_entry *pn = NULL; int bucket; state->flags |= NEIGH_SEQ_IS_PNEIGH; for (bucket = 0; bucket <= PNEIGH_HASHMASK; bucket++) { pn = tbl->phash_buckets[bucket]; while (pn && !net_eq(pneigh_net(pn), net)) pn = pn->next; if (pn) break; } state->bucket = bucket; return pn; } static struct pneigh_entry *pneigh_get_next(struct seq_file *seq, struct pneigh_entry *pn, loff_t *pos) { struct neigh_seq_state *state = seq->private; struct net *net = seq_file_net(seq); struct neigh_table *tbl = state->tbl; do { pn = pn->next; } while (pn && !net_eq(pneigh_net(pn), net)); while (!pn) { if (++state->bucket > PNEIGH_HASHMASK) break; pn = tbl->phash_buckets[state->bucket]; while (pn && !net_eq(pneigh_net(pn), net)) pn = pn->next; if (pn) break; } if (pn && pos) --(*pos); return pn; } static struct pneigh_entry *pneigh_get_idx(struct seq_file *seq, loff_t *pos) { struct pneigh_entry *pn = pneigh_get_first(seq); if (pn) { --(*pos); while (*pos) { pn = pneigh_get_next(seq, pn, pos); if (!pn) break; } } return *pos ? NULL : pn; } static void *neigh_get_idx_any(struct seq_file *seq, loff_t *pos) { struct neigh_seq_state *state = seq->private; void *rc; loff_t idxpos = *pos; rc = neigh_get_idx(seq, &idxpos); if (!rc && !(state->flags & NEIGH_SEQ_NEIGH_ONLY)) rc = pneigh_get_idx(seq, &idxpos); return rc; } void *neigh_seq_start(struct seq_file *seq, loff_t *pos, struct neigh_table *tbl, unsigned int neigh_seq_flags) __acquires(tbl->lock) __acquires(rcu) { struct neigh_seq_state *state = seq->private; state->tbl = tbl; state->bucket = -1; state->flags = (neigh_seq_flags & ~NEIGH_SEQ_IS_PNEIGH); rcu_read_lock(); state->nht = rcu_dereference(tbl->nht); read_lock_bh(&tbl->lock); return *pos ? neigh_get_idx_any(seq, pos) : SEQ_START_TOKEN; } EXPORT_SYMBOL(neigh_seq_start); void *neigh_seq_next(struct seq_file *seq, void *v, loff_t *pos) { struct neigh_seq_state *state; void *rc; if (v == SEQ_START_TOKEN) { rc = neigh_get_first(seq); goto out; } state = seq->private; if (!(state->flags & NEIGH_SEQ_IS_PNEIGH)) { rc = neigh_get_next(seq, v, NULL); if (rc) goto out; if (!(state->flags & NEIGH_SEQ_NEIGH_ONLY)) rc = pneigh_get_first(seq); } else { BUG_ON(state->flags & NEIGH_SEQ_NEIGH_ONLY); rc = pneigh_get_next(seq, v, NULL); } out: ++(*pos); return rc; } EXPORT_SYMBOL(neigh_seq_next); void neigh_seq_stop(struct seq_file *seq, void *v) __releases(tbl->lock) __releases(rcu) { struct neigh_seq_state *state = seq->private; struct neigh_table *tbl = state->tbl; read_unlock_bh(&tbl->lock); rcu_read_unlock(); } EXPORT_SYMBOL(neigh_seq_stop); /* statistics via seq_file */ static void *neigh_stat_seq_start(struct seq_file *seq, loff_t *pos) { struct neigh_table *tbl = pde_data(file_inode(seq->file)); int cpu; if (*pos == 0) return SEQ_START_TOKEN; for (cpu = *pos-1; cpu < nr_cpu_ids; ++cpu) { if (!cpu_possible(cpu)) continue; *pos = cpu+1; return per_cpu_ptr(tbl->stats, cpu); } return NULL; } static void *neigh_stat_seq_next(struct seq_file *seq, void *v, loff_t *pos) { struct neigh_table *tbl = pde_data(file_inode(seq->file)); int cpu; for (cpu = *pos; cpu < nr_cpu_ids; ++cpu) { if (!cpu_possible(cpu)) continue; *pos = cpu+1; return per_cpu_ptr(tbl->stats, cpu); } (*pos)++; return NULL; } static void neigh_stat_seq_stop(struct seq_file *seq, void *v) { } static int neigh_stat_seq_show(struct seq_file *seq, void *v) { struct neigh_table *tbl = pde_data(file_inode(seq->file)); struct neigh_statistics *st = v; if (v == SEQ_START_TOKEN) { seq_puts(seq, "entries allocs destroys hash_grows lookups hits res_failed rcv_probes_mcast rcv_probes_ucast periodic_gc_runs forced_gc_runs unresolved_discards table_fulls\n"); return 0; } seq_printf(seq, "%08x %08lx %08lx %08lx %08lx %08lx %08lx " "%08lx %08lx %08lx " "%08lx %08lx %08lx\n", atomic_read(&tbl->entries), st->allocs, st->destroys, st->hash_grows, st->lookups, st->hits, st->res_failed, st->rcv_probes_mcast, st->rcv_probes_ucast, st->periodic_gc_runs, st->forced_gc_runs, st->unres_discards, st->table_fulls ); return 0; } static const struct seq_operations neigh_stat_seq_ops = { .start = neigh_stat_seq_start, .next = neigh_stat_seq_next, .stop = neigh_stat_seq_stop, .show = neigh_stat_seq_show, }; #endif /* CONFIG_PROC_FS */ static void __neigh_notify(struct neighbour *n, int type, int flags, u32 pid) { struct sk_buff *skb; int err = -ENOBUFS; struct net *net; rcu_read_lock(); net = dev_net_rcu(n->dev); skb = nlmsg_new(neigh_nlmsg_size(), GFP_ATOMIC); if (skb == NULL) goto errout; err = neigh_fill_info(skb, n, pid, 0, type, flags); if (err < 0) { /* -EMSGSIZE implies BUG in neigh_nlmsg_size() */ WARN_ON(err == -EMSGSIZE); kfree_skb(skb); goto errout; } rtnl_notify(skb, net, 0, RTNLGRP_NEIGH, NULL, GFP_ATOMIC); goto out; errout: rtnl_set_sk_err(net, RTNLGRP_NEIGH, err); out: rcu_read_unlock(); } void neigh_app_ns(struct neighbour *n) { __neigh_notify(n, RTM_GETNEIGH, NLM_F_REQUEST, 0); } EXPORT_SYMBOL(neigh_app_ns); #ifdef CONFIG_SYSCTL static int unres_qlen_max = INT_MAX / SKB_TRUESIZE(ETH_FRAME_LEN); static int proc_unres_qlen(const struct ctl_table *ctl, int write, void *buffer, size_t *lenp, loff_t *ppos) { int size, ret; struct ctl_table tmp = *ctl; tmp.extra1 = SYSCTL_ZERO; tmp.extra2 = &unres_qlen_max; tmp.data = &size; size = *(int *)ctl->data / SKB_TRUESIZE(ETH_FRAME_LEN); ret = proc_dointvec_minmax(&tmp, write, buffer, lenp, ppos); if (write && !ret) *(int *)ctl->data = size * SKB_TRUESIZE(ETH_FRAME_LEN); return ret; } static void neigh_copy_dflt_parms(struct net *net, struct neigh_parms *p, int index) { struct net_device *dev; int family = neigh_parms_family(p); rcu_read_lock(); for_each_netdev_rcu(net, dev) { struct neigh_parms *dst_p = neigh_get_dev_parms_rcu(dev, family); if (dst_p && !test_bit(index, dst_p->data_state)) dst_p->data[index] = p->data[index]; } rcu_read_unlock(); } static void neigh_proc_update(const struct ctl_table *ctl, int write) { struct net_device *dev = ctl->extra1; struct neigh_parms *p = ctl->extra2; struct net *net = neigh_parms_net(p); int index = (int *) ctl->data - p->data; if (!write) return; set_bit(index, p->data_state); if (index == NEIGH_VAR_DELAY_PROBE_TIME) call_netevent_notifiers(NETEVENT_DELAY_PROBE_TIME_UPDATE, p); if (!dev) /* NULL dev means this is default value */ neigh_copy_dflt_parms(net, p, index); } static int neigh_proc_dointvec_zero_intmax(const struct ctl_table *ctl, int write, void *buffer, size_t *lenp, loff_t *ppos) { struct ctl_table tmp = *ctl; int ret; tmp.extra1 = SYSCTL_ZERO; tmp.extra2 = SYSCTL_INT_MAX; ret = proc_dointvec_minmax(&tmp, write, buffer, lenp, ppos); neigh_proc_update(ctl, write); return ret; } static int neigh_proc_dointvec_ms_jiffies_positive(const struct ctl_table *ctl, int write, void *buffer, size_t *lenp, loff_t *ppos) { struct ctl_table tmp = *ctl; int ret; int min = msecs_to_jiffies(1); tmp.extra1 = &min; tmp.extra2 = NULL; ret = proc_dointvec_ms_jiffies_minmax(&tmp, write, buffer, lenp, ppos); neigh_proc_update(ctl, write); return ret; } int neigh_proc_dointvec(const struct ctl_table *ctl, int write, void *buffer, size_t *lenp, loff_t *ppos) { int ret = proc_dointvec(ctl, write, buffer, lenp, ppos); neigh_proc_update(ctl, write); return ret; } EXPORT_SYMBOL(neigh_proc_dointvec); int neigh_proc_dointvec_jiffies(const struct ctl_table *ctl, int write, void *buffer, size_t *lenp, loff_t *ppos) { int ret = proc_dointvec_jiffies(ctl, write, buffer, lenp, ppos); neigh_proc_update(ctl, write); return ret; } EXPORT_SYMBOL(neigh_proc_dointvec_jiffies); static int neigh_proc_dointvec_userhz_jiffies(const struct ctl_table *ctl, int write, void *buffer, size_t *lenp, loff_t *ppos) { int ret = proc_dointvec_userhz_jiffies(ctl, write, buffer, lenp, ppos); neigh_proc_update(ctl, write); return ret; } int neigh_proc_dointvec_ms_jiffies(const struct ctl_table *ctl, int write, void *buffer, size_t *lenp, loff_t *ppos) { int ret = proc_dointvec_ms_jiffies(ctl, write, buffer, lenp, ppos); neigh_proc_update(ctl, write); return ret; } EXPORT_SYMBOL(neigh_proc_dointvec_ms_jiffies); static int neigh_proc_dointvec_unres_qlen(const struct ctl_table *ctl, int write, void *buffer, size_t *lenp, loff_t *ppos) { int ret = proc_unres_qlen(ctl, write, buffer, lenp, ppos); neigh_proc_update(ctl, write); return ret; } static int neigh_proc_base_reachable_time(const struct ctl_table *ctl, int write, void *buffer, size_t *lenp, loff_t *ppos) { struct neigh_parms *p = ctl->extra2; int ret; if (strcmp(ctl->procname, "base_reachable_time") == 0) ret = neigh_proc_dointvec_jiffies(ctl, write, buffer, lenp, ppos); else if (strcmp(ctl->procname, "base_reachable_time_ms") == 0) ret = neigh_proc_dointvec_ms_jiffies(ctl, write, buffer, lenp, ppos); else ret = -1; if (write && ret == 0) { /* update reachable_time as well, otherwise, the change will * only be effective after the next time neigh_periodic_work * decides to recompute it */ p->reachable_time = neigh_rand_reach_time(NEIGH_VAR(p, BASE_REACHABLE_TIME)); } return ret; } #define NEIGH_PARMS_DATA_OFFSET(index) \ (&((struct neigh_parms *) 0)->data[index]) #define NEIGH_SYSCTL_ENTRY(attr, data_attr, name, mval, proc) \ [NEIGH_VAR_ ## attr] = { \ .procname = name, \ .data = NEIGH_PARMS_DATA_OFFSET(NEIGH_VAR_ ## data_attr), \ .maxlen = sizeof(int), \ .mode = mval, \ .proc_handler = proc, \ } #define NEIGH_SYSCTL_ZERO_INTMAX_ENTRY(attr, name) \ NEIGH_SYSCTL_ENTRY(attr, attr, name, 0644, neigh_proc_dointvec_zero_intmax) #define NEIGH_SYSCTL_JIFFIES_ENTRY(attr, name) \ NEIGH_SYSCTL_ENTRY(attr, attr, name, 0644, neigh_proc_dointvec_jiffies) #define NEIGH_SYSCTL_USERHZ_JIFFIES_ENTRY(attr, name) \ NEIGH_SYSCTL_ENTRY(attr, attr, name, 0644, neigh_proc_dointvec_userhz_jiffies) #define NEIGH_SYSCTL_MS_JIFFIES_POSITIVE_ENTRY(attr, name) \ NEIGH_SYSCTL_ENTRY(attr, attr, name, 0644, neigh_proc_dointvec_ms_jiffies_positive) #define NEIGH_SYSCTL_MS_JIFFIES_REUSED_ENTRY(attr, data_attr, name) \ NEIGH_SYSCTL_ENTRY(attr, data_attr, name, 0644, neigh_proc_dointvec_ms_jiffies) #define NEIGH_SYSCTL_UNRES_QLEN_REUSED_ENTRY(attr, data_attr, name) \ NEIGH_SYSCTL_ENTRY(attr, data_attr, name, 0644, neigh_proc_dointvec_unres_qlen) static struct neigh_sysctl_table { struct ctl_table_header *sysctl_header; struct ctl_table neigh_vars[NEIGH_VAR_MAX]; } neigh_sysctl_template __read_mostly = { .neigh_vars = { NEIGH_SYSCTL_ZERO_INTMAX_ENTRY(MCAST_PROBES, "mcast_solicit"), NEIGH_SYSCTL_ZERO_INTMAX_ENTRY(UCAST_PROBES, "ucast_solicit"), NEIGH_SYSCTL_ZERO_INTMAX_ENTRY(APP_PROBES, "app_solicit"), NEIGH_SYSCTL_ZERO_INTMAX_ENTRY(MCAST_REPROBES, "mcast_resolicit"), NEIGH_SYSCTL_USERHZ_JIFFIES_ENTRY(RETRANS_TIME, "retrans_time"), NEIGH_SYSCTL_JIFFIES_ENTRY(BASE_REACHABLE_TIME, "base_reachable_time"), NEIGH_SYSCTL_JIFFIES_ENTRY(DELAY_PROBE_TIME, "delay_first_probe_time"), NEIGH_SYSCTL_MS_JIFFIES_POSITIVE_ENTRY(INTERVAL_PROBE_TIME_MS, "interval_probe_time_ms"), NEIGH_SYSCTL_JIFFIES_ENTRY(GC_STALETIME, "gc_stale_time"), NEIGH_SYSCTL_ZERO_INTMAX_ENTRY(QUEUE_LEN_BYTES, "unres_qlen_bytes"), NEIGH_SYSCTL_ZERO_INTMAX_ENTRY(PROXY_QLEN, "proxy_qlen"), NEIGH_SYSCTL_USERHZ_JIFFIES_ENTRY(ANYCAST_DELAY, "anycast_delay"), NEIGH_SYSCTL_USERHZ_JIFFIES_ENTRY(PROXY_DELAY, "proxy_delay"), NEIGH_SYSCTL_USERHZ_JIFFIES_ENTRY(LOCKTIME, "locktime"), NEIGH_SYSCTL_UNRES_QLEN_REUSED_ENTRY(QUEUE_LEN, QUEUE_LEN_BYTES, "unres_qlen"), NEIGH_SYSCTL_MS_JIFFIES_REUSED_ENTRY(RETRANS_TIME_MS, RETRANS_TIME, "retrans_time_ms"), NEIGH_SYSCTL_MS_JIFFIES_REUSED_ENTRY(BASE_REACHABLE_TIME_MS, BASE_REACHABLE_TIME, "base_reachable_time_ms"), [NEIGH_VAR_GC_INTERVAL] = { .procname = "gc_interval", .maxlen = sizeof(int), .mode = 0644, .proc_handler = proc_dointvec_jiffies, }, [NEIGH_VAR_GC_THRESH1] = { .procname = "gc_thresh1", .maxlen = sizeof(int), .mode = 0644, .extra1 = SYSCTL_ZERO, .extra2 = SYSCTL_INT_MAX, .proc_handler = proc_dointvec_minmax, }, [NEIGH_VAR_GC_THRESH2] = { .procname = "gc_thresh2", .maxlen = sizeof(int), .mode = 0644, .extra1 = SYSCTL_ZERO, .extra2 = SYSCTL_INT_MAX, .proc_handler = proc_dointvec_minmax, }, [NEIGH_VAR_GC_THRESH3] = { .procname = "gc_thresh3", .maxlen = sizeof(int), .mode = 0644, .extra1 = SYSCTL_ZERO, .extra2 = SYSCTL_INT_MAX, .proc_handler = proc_dointvec_minmax, }, }, }; int neigh_sysctl_register(struct net_device *dev, struct neigh_parms *p, proc_handler *handler) { int i; struct neigh_sysctl_table *t; const char *dev_name_source; char neigh_path[ sizeof("net//neigh/") + IFNAMSIZ + IFNAMSIZ ]; char *p_name; size_t neigh_vars_size; t = kmemdup(&neigh_sysctl_template, sizeof(*t), GFP_KERNEL_ACCOUNT); if (!t) goto err; for (i = 0; i < NEIGH_VAR_GC_INTERVAL; i++) { t->neigh_vars[i].data += (long) p; t->neigh_vars[i].extra1 = dev; t->neigh_vars[i].extra2 = p; } neigh_vars_size = ARRAY_SIZE(t->neigh_vars); if (dev) { dev_name_source = dev->name; /* Terminate the table early */ neigh_vars_size = NEIGH_VAR_BASE_REACHABLE_TIME_MS + 1; } else { struct neigh_table *tbl = p->tbl; dev_name_source = "default"; t->neigh_vars[NEIGH_VAR_GC_INTERVAL].data = &tbl->gc_interval; t->neigh_vars[NEIGH_VAR_GC_THRESH1].data = &tbl->gc_thresh1; t->neigh_vars[NEIGH_VAR_GC_THRESH2].data = &tbl->gc_thresh2; t->neigh_vars[NEIGH_VAR_GC_THRESH3].data = &tbl->gc_thresh3; } if (handler) { /* RetransTime */ t->neigh_vars[NEIGH_VAR_RETRANS_TIME].proc_handler = handler; /* ReachableTime */ t->neigh_vars[NEIGH_VAR_BASE_REACHABLE_TIME].proc_handler = handler; /* RetransTime (in milliseconds)*/ t->neigh_vars[NEIGH_VAR_RETRANS_TIME_MS].proc_handler = handler; /* ReachableTime (in milliseconds) */ t->neigh_vars[NEIGH_VAR_BASE_REACHABLE_TIME_MS].proc_handler = handler; } else { /* Those handlers will update p->reachable_time after * base_reachable_time(_ms) is set to ensure the new timer starts being * applied after the next neighbour update instead of waiting for * neigh_periodic_work to update its value (can be multiple minutes) * So any handler that replaces them should do this as well */ /* ReachableTime */ t->neigh_vars[NEIGH_VAR_BASE_REACHABLE_TIME].proc_handler = neigh_proc_base_reachable_time; /* ReachableTime (in milliseconds) */ t->neigh_vars[NEIGH_VAR_BASE_REACHABLE_TIME_MS].proc_handler = neigh_proc_base_reachable_time; } switch (neigh_parms_family(p)) { case AF_INET: p_name = "ipv4"; break; case AF_INET6: p_name = "ipv6"; break; default: BUG(); } snprintf(neigh_path, sizeof(neigh_path), "net/%s/neigh/%s", p_name, dev_name_source); t->sysctl_header = register_net_sysctl_sz(neigh_parms_net(p), neigh_path, t->neigh_vars, neigh_vars_size); if (!t->sysctl_header) goto free; p->sysctl_table = t; return 0; free: kfree(t); err: return -ENOBUFS; } EXPORT_SYMBOL(neigh_sysctl_register); void neigh_sysctl_unregister(struct neigh_parms *p) { if (p->sysctl_table) { struct neigh_sysctl_table *t = p->sysctl_table; p->sysctl_table = NULL; unregister_net_sysctl_table(t->sysctl_header); kfree(t); } } EXPORT_SYMBOL(neigh_sysctl_unregister); #endif /* CONFIG_SYSCTL */ static const struct rtnl_msg_handler neigh_rtnl_msg_handlers[] __initconst = { {.msgtype = RTM_NEWNEIGH, .doit = neigh_add}, {.msgtype = RTM_DELNEIGH, .doit = neigh_delete}, {.msgtype = RTM_GETNEIGH, .doit = neigh_get, .dumpit = neigh_dump_info, .flags = RTNL_FLAG_DUMP_UNLOCKED}, {.msgtype = RTM_GETNEIGHTBL, .dumpit = neightbl_dump_info}, {.msgtype = RTM_SETNEIGHTBL, .doit = neightbl_set}, }; static int __init neigh_init(void) { rtnl_register_many(neigh_rtnl_msg_handlers); return 0; } subsys_initcall(neigh_init);
65 1 3 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 /* SPDX-License-Identifier: GPL-2.0 */ #ifndef _LINUX_TIME64_H #define _LINUX_TIME64_H #include <linux/math64.h> #include <vdso/time64.h> typedef __s64 time64_t; typedef __u64 timeu64_t; #include <uapi/linux/time.h> struct timespec64 { time64_t tv_sec; /* seconds */ long tv_nsec; /* nanoseconds */ }; struct itimerspec64 { struct timespec64 it_interval; struct timespec64 it_value; }; /* Parameters used to convert the timespec values: */ #define PSEC_PER_NSEC 1000L /* Located here for timespec[64]_valid_strict */ #define TIME64_MAX ((s64)~((u64)1 << 63)) #define TIME64_MIN (-TIME64_MAX - 1) #define KTIME_MAX ((s64)~((u64)1 << 63)) #define KTIME_MIN (-KTIME_MAX - 1) #define KTIME_SEC_MAX (KTIME_MAX / NSEC_PER_SEC) #define KTIME_SEC_MIN (KTIME_MIN / NSEC_PER_SEC) /* * Limits for settimeofday(): * * To prevent setting the time close to the wraparound point time setting * is limited so a reasonable uptime can be accomodated. Uptime of 30 years * should be really sufficient, which means the cutoff is 2232. At that * point the cutoff is just a small part of the larger problem. */ #define TIME_UPTIME_SEC_MAX (30LL * 365 * 24 *3600) #define TIME_SETTOD_SEC_MAX (KTIME_SEC_MAX - TIME_UPTIME_SEC_MAX) static inline int timespec64_equal(const struct timespec64 *a, const struct timespec64 *b) { return (a->tv_sec == b->tv_sec) && (a->tv_nsec == b->tv_nsec); } static inline bool timespec64_is_epoch(const struct timespec64 *ts) { return ts->tv_sec == 0 && ts->tv_nsec == 0; } /* * lhs < rhs: return <0 * lhs == rhs: return 0 * lhs > rhs: return >0 */ static inline int timespec64_compare(const struct timespec64 *lhs, const struct timespec64 *rhs) { if (lhs->tv_sec < rhs->tv_sec) return -1; if (lhs->tv_sec > rhs->tv_sec) return 1; return lhs->tv_nsec - rhs->tv_nsec; } extern void set_normalized_timespec64(struct timespec64 *ts, time64_t sec, s64 nsec); static inline struct timespec64 timespec64_add(struct timespec64 lhs, struct timespec64 rhs) { struct timespec64 ts_delta; set_normalized_timespec64(&ts_delta, lhs.tv_sec + rhs.tv_sec, lhs.tv_nsec + rhs.tv_nsec); return ts_delta; } /* * sub = lhs - rhs, in normalized form */ static inline struct timespec64 timespec64_sub(struct timespec64 lhs, struct timespec64 rhs) { struct timespec64 ts_delta; set_normalized_timespec64(&ts_delta, lhs.tv_sec - rhs.tv_sec, lhs.tv_nsec - rhs.tv_nsec); return ts_delta; } /* * Returns true if the timespec64 is norm, false if denorm: */ static inline bool timespec64_valid(const struct timespec64 *ts) { /* Dates before 1970 are bogus */ if (ts->tv_sec < 0) return false; /* Can't have more nanoseconds then a second */ if ((unsigned long)ts->tv_nsec >= NSEC_PER_SEC) return false; return true; } static inline bool timespec64_valid_strict(const struct timespec64 *ts) { if (!timespec64_valid(ts)) return false; /* Disallow values that could overflow ktime_t */ if ((unsigned long long)ts->tv_sec >= KTIME_SEC_MAX) return false; return true; } static inline bool timespec64_valid_settod(const struct timespec64 *ts) { if (!timespec64_valid(ts)) return false; /* Disallow values which cause overflow issues vs. CLOCK_REALTIME */ if ((unsigned long long)ts->tv_sec >= TIME_SETTOD_SEC_MAX) return false; return true; } /** * timespec64_to_ns - Convert timespec64 to nanoseconds * @ts: pointer to the timespec64 variable to be converted * * Returns the scalar nanosecond representation of the timespec64 * parameter. */ static inline s64 timespec64_to_ns(const struct timespec64 *ts) { /* Prevent multiplication overflow / underflow */ if (ts->tv_sec >= KTIME_SEC_MAX) return KTIME_MAX; if (ts->tv_sec <= KTIME_SEC_MIN) return KTIME_MIN; return ((s64) ts->tv_sec * NSEC_PER_SEC) + ts->tv_nsec; } /** * ns_to_timespec64 - Convert nanoseconds to timespec64 * @nsec: the nanoseconds value to be converted * * Returns the timespec64 representation of the nsec parameter. */ extern struct timespec64 ns_to_timespec64(s64 nsec); /** * timespec64_add_ns - Adds nanoseconds to a timespec64 * @a: pointer to timespec64 to be incremented * @ns: unsigned nanoseconds value to be added * * This must always be inlined because its used from the x86-64 vdso, * which cannot call other kernel functions. */ static __always_inline void timespec64_add_ns(struct timespec64 *a, u64 ns) { a->tv_sec += __iter_div_u64_rem(a->tv_nsec + ns, NSEC_PER_SEC, &ns); a->tv_nsec = ns; } /* * timespec64_add_safe assumes both values are positive and checks for * overflow. It will return TIME64_MAX in case of overflow. */ extern struct timespec64 timespec64_add_safe(const struct timespec64 lhs, const struct timespec64 rhs); #endif /* _LINUX_TIME64_H */
307 307 311 311 310 300 300 301 302 302 249 248 250 250 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 // SPDX-License-Identifier: GPL-2.0 /* * arch/arm64/kvm/fpsimd.c: Guest/host FPSIMD context coordination helpers * * Copyright 2018 Arm Limited * Author: Dave Martin <Dave.Martin@arm.com> */ #include <linux/irqflags.h> #include <linux/sched.h> #include <linux/kvm_host.h> #include <asm/fpsimd.h> #include <asm/kvm_asm.h> #include <asm/kvm_hyp.h> #include <asm/kvm_mmu.h> #include <asm/sysreg.h> /* * Called on entry to KVM_RUN unless this vcpu previously ran at least * once and the most recent prior KVM_RUN for this vcpu was called from * the same task as current (highly likely). * * This is guaranteed to execute before kvm_arch_vcpu_load_fp(vcpu), * such that on entering hyp the relevant parts of current are already * mapped. */ int kvm_arch_vcpu_run_map_fp(struct kvm_vcpu *vcpu) { struct user_fpsimd_state *fpsimd = &current->thread.uw.fpsimd_state; int ret; /* pKVM has its own tracking of the host fpsimd state. */ if (is_protected_kvm_enabled()) return 0; /* Make sure the host task fpsimd state is visible to hyp: */ ret = kvm_share_hyp(fpsimd, fpsimd + 1); if (ret) return ret; return 0; } /* * Prepare vcpu for saving the host's FPSIMD state and loading the guest's. * The actual loading is done by the FPSIMD access trap taken to hyp. * * Here, we just set the correct metadata to indicate that the FPSIMD * state in the cpu regs (if any) belongs to current on the host. */ void kvm_arch_vcpu_load_fp(struct kvm_vcpu *vcpu) { BUG_ON(!current->mm); if (!system_supports_fpsimd()) return; /* * Ensure that any host FPSIMD/SVE/SME state is saved and unbound such * that the host kernel is responsible for restoring this state upon * return to userspace, and the hyp code doesn't need to save anything. * * When the host may use SME, fpsimd_save_and_flush_cpu_state() ensures * that PSTATE.{SM,ZA} == {0,0}. */ fpsimd_save_and_flush_cpu_state(); *host_data_ptr(fp_owner) = FP_STATE_FREE; WARN_ON_ONCE(system_supports_sme() && read_sysreg_s(SYS_SVCR)); } /* * Called just before entering the guest once we are no longer preemptible * and interrupts are disabled. If we have managed to run anything using * FP while we were preemptible (such as off the back of an interrupt), * then neither the host nor the guest own the FP hardware (and it was the * responsibility of the code that used FP to save the existing state). */ void kvm_arch_vcpu_ctxflush_fp(struct kvm_vcpu *vcpu) { if (test_thread_flag(TIF_FOREIGN_FPSTATE)) *host_data_ptr(fp_owner) = FP_STATE_FREE; } /* * Called just after exiting the guest. If the guest FPSIMD state * was loaded, update the host's context tracking data mark the CPU * FPSIMD regs as dirty and belonging to vcpu so that they will be * written back if the kernel clobbers them due to kernel-mode NEON * before re-entry into the guest. */ void kvm_arch_vcpu_ctxsync_fp(struct kvm_vcpu *vcpu) { struct cpu_fp_state fp_state; WARN_ON_ONCE(!irqs_disabled()); if (guest_owns_fp_regs()) { /* * Currently we do not support SME guests so SVCR is * always 0 and we just need a variable to point to. */ fp_state.st = &vcpu->arch.ctxt.fp_regs; fp_state.sve_state = vcpu->arch.sve_state; fp_state.sve_vl = vcpu->arch.sve_max_vl; fp_state.sme_state = NULL; fp_state.svcr = __ctxt_sys_reg(&vcpu->arch.ctxt, SVCR); fp_state.fpmr = __ctxt_sys_reg(&vcpu->arch.ctxt, FPMR); fp_state.fp_type = &vcpu->arch.fp_type; if (vcpu_has_sve(vcpu)) fp_state.to_save = FP_STATE_SVE; else fp_state.to_save = FP_STATE_FPSIMD; fpsimd_bind_state_to_cpu(&fp_state); clear_thread_flag(TIF_FOREIGN_FPSTATE); } } /* * Write back the vcpu FPSIMD regs if they are dirty, and invalidate the * cpu FPSIMD regs so that they can't be spuriously reused if this vcpu * disappears and another task or vcpu appears that recycles the same * struct fpsimd_state. */ void kvm_arch_vcpu_put_fp(struct kvm_vcpu *vcpu) { unsigned long flags; local_irq_save(flags); if (guest_owns_fp_regs()) { /* * Flush (save and invalidate) the fpsimd/sve state so that if * the host tries to use fpsimd/sve, it's not using stale data * from the guest. * * Flushing the state sets the TIF_FOREIGN_FPSTATE bit for the * context unconditionally, in both nVHE and VHE. This allows * the kernel to restore the fpsimd/sve state, including ZCR_EL1 * when needed. */ fpsimd_save_and_flush_cpu_state(); } local_irq_restore(flags); }
6 6 6 6 6 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 // SPDX-License-Identifier: GPL-2.0-or-later /* * INET 802.1Q VLAN * Ethernet-type device handling. * * Authors: Ben Greear <greearb@candelatech.com> * Please send support related email to: netdev@vger.kernel.org * VLAN Home Page: http://www.candelatech.com/~greear/vlan.html * * Fixes: * Fix for packet capture - Nick Eggleston <nick@dccinc.com>; * Add HW acceleration hooks - David S. Miller <davem@redhat.com>; * Correct all the locking - David S. Miller <davem@redhat.com>; * Use hash table for VLAN groups - David S. Miller <davem@redhat.com> */ #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt #include <linux/capability.h> #include <linux/module.h> #include <linux/netdevice.h> #include <linux/skbuff.h> #include <linux/slab.h> #include <linux/init.h> #include <linux/rculist.h> #include <net/arp.h> #include <linux/rtnetlink.h> #include <linux/notifier.h> #include <net/rtnetlink.h> #include <net/net_namespace.h> #include <net/netns/generic.h> #include <linux/uaccess.h> #include <linux/if_vlan.h> #include "vlan.h" #include "vlanproc.h" #define DRV_VERSION "1.8" /* Global VLAN variables */ unsigned int vlan_net_id __read_mostly; const char vlan_fullname[] = "802.1Q VLAN Support"; const char vlan_version[] = DRV_VERSION; /* End of global variables definitions. */ static int vlan_group_prealloc_vid(struct vlan_group *vg, __be16 vlan_proto, u16 vlan_id) { struct net_device **array; unsigned int vidx; unsigned int size; int pidx; ASSERT_RTNL(); pidx = vlan_proto_idx(vlan_proto); if (pidx < 0) return -EINVAL; vidx = vlan_id / VLAN_GROUP_ARRAY_PART_LEN; array = vg->vlan_devices_arrays[pidx][vidx]; if (array != NULL) return 0; size = sizeof(struct net_device *) * VLAN_GROUP_ARRAY_PART_LEN; array = kzalloc(size, GFP_KERNEL_ACCOUNT); if (array == NULL) return -ENOBUFS; /* paired with smp_rmb() in __vlan_group_get_device() */ smp_wmb(); vg->vlan_devices_arrays[pidx][vidx] = array; return 0; } static void vlan_stacked_transfer_operstate(const struct net_device *rootdev, struct net_device *dev, struct vlan_dev_priv *vlan) { if (!(vlan->flags & VLAN_FLAG_BRIDGE_BINDING)) netif_stacked_transfer_operstate(rootdev, dev); } void unregister_vlan_dev(struct net_device *dev, struct list_head *head) { struct vlan_dev_priv *vlan = vlan_dev_priv(dev); struct net_device *real_dev = vlan->real_dev; struct vlan_info *vlan_info; struct vlan_group *grp; u16 vlan_id = vlan->vlan_id; ASSERT_RTNL(); vlan_info = rtnl_dereference(real_dev->vlan_info); BUG_ON(!vlan_info); grp = &vlan_info->grp; grp->nr_vlan_devs--; if (vlan->flags & VLAN_FLAG_MVRP) vlan_mvrp_request_leave(dev); if (vlan->flags & VLAN_FLAG_GVRP) vlan_gvrp_request_leave(dev); vlan_group_set_device(grp, vlan->vlan_proto, vlan_id, NULL); netdev_upper_dev_unlink(real_dev, dev); /* Because unregister_netdevice_queue() makes sure at least one rcu * grace period is respected before device freeing, * we dont need to call synchronize_net() here. */ unregister_netdevice_queue(dev, head); if (grp->nr_vlan_devs == 0) { vlan_mvrp_uninit_applicant(real_dev); vlan_gvrp_uninit_applicant(real_dev); } vlan_vid_del(real_dev, vlan->vlan_proto, vlan_id); } int vlan_check_real_dev(struct net_device *real_dev, __be16 protocol, u16 vlan_id, struct netlink_ext_ack *extack) { const char *name = real_dev->name; if (real_dev->features & NETIF_F_VLAN_CHALLENGED || real_dev->type != ARPHRD_ETHER) { pr_info("VLANs not supported on %s\n", name); NL_SET_ERR_MSG_MOD(extack, "VLANs not supported on device"); return -EOPNOTSUPP; } if (vlan_find_dev(real_dev, protocol, vlan_id) != NULL) { NL_SET_ERR_MSG_MOD(extack, "VLAN device already exists"); return -EEXIST; } return 0; } int register_vlan_dev(struct net_device *dev, struct netlink_ext_ack *extack) { struct vlan_dev_priv *vlan = vlan_dev_priv(dev); struct net_device *real_dev = vlan->real_dev; u16 vlan_id = vlan->vlan_id; struct vlan_info *vlan_info; struct vlan_group *grp; int err; err = vlan_vid_add(real_dev, vlan->vlan_proto, vlan_id); if (err) return err; vlan_info = rtnl_dereference(real_dev->vlan_info); /* vlan_info should be there now. vlan_vid_add took care of it */ BUG_ON(!vlan_info); grp = &vlan_info->grp; if (grp->nr_vlan_devs == 0) { err = vlan_gvrp_init_applicant(real_dev); if (err < 0) goto out_vid_del; err = vlan_mvrp_init_applicant(real_dev); if (err < 0) goto out_uninit_gvrp; } err = vlan_group_prealloc_vid(grp, vlan->vlan_proto, vlan_id); if (err < 0) goto out_uninit_mvrp; err = register_netdevice(dev); if (err < 0) goto out_uninit_mvrp; err = netdev_upper_dev_link(real_dev, dev, extack); if (err) goto out_unregister_netdev; vlan_stacked_transfer_operstate(real_dev, dev, vlan); linkwatch_fire_event(dev); /* _MUST_ call rfc2863_policy() */ /* So, got the sucker initialized, now lets place * it into our local structure. */ vlan_group_set_device(grp, vlan->vlan_proto, vlan_id, dev); grp->nr_vlan_devs++; return 0; out_unregister_netdev: unregister_netdevice(dev); out_uninit_mvrp: if (grp->nr_vlan_devs == 0) vlan_mvrp_uninit_applicant(real_dev); out_uninit_gvrp: if (grp->nr_vlan_devs == 0) vlan_gvrp_uninit_applicant(real_dev); out_vid_del: vlan_vid_del(real_dev, vlan->vlan_proto, vlan_id); return err; } /* Attach a VLAN device to a mac address (ie Ethernet Card). * Returns 0 if the device was created or a negative error code otherwise. */ static int register_vlan_device(struct net_device *real_dev, u16 vlan_id) { struct net_device *new_dev; struct vlan_dev_priv *vlan; struct net *net = dev_net(real_dev); struct vlan_net *vn = net_generic(net, vlan_net_id); char name[IFNAMSIZ]; int err; if (vlan_id >= VLAN_VID_MASK) return -ERANGE; err = vlan_check_real_dev(real_dev, htons(ETH_P_8021Q), vlan_id, NULL); if (err < 0) return err; /* Gotta set up the fields for the device. */ switch (vn->name_type) { case VLAN_NAME_TYPE_RAW_PLUS_VID: /* name will look like: eth1.0005 */ snprintf(name, IFNAMSIZ, "%s.%.4i", real_dev->name, vlan_id); break; case VLAN_NAME_TYPE_PLUS_VID_NO_PAD: /* Put our vlan.VID in the name. * Name will look like: vlan5 */ snprintf(name, IFNAMSIZ, "vlan%i", vlan_id); break; case VLAN_NAME_TYPE_RAW_PLUS_VID_NO_PAD: /* Put our vlan.VID in the name. * Name will look like: eth0.5 */ snprintf(name, IFNAMSIZ, "%s.%i", real_dev->name, vlan_id); break; case VLAN_NAME_TYPE_PLUS_VID: /* Put our vlan.VID in the name. * Name will look like: vlan0005 */ default: snprintf(name, IFNAMSIZ, "vlan%.4i", vlan_id); } new_dev = alloc_netdev(sizeof(struct vlan_dev_priv), name, NET_NAME_UNKNOWN, vlan_setup); if (new_dev == NULL) return -ENOBUFS; dev_net_set(new_dev, net); /* need 4 bytes for extra VLAN header info, * hope the underlying device can handle it. */ new_dev->mtu = real_dev->mtu; vlan = vlan_dev_priv(new_dev); vlan->vlan_proto = htons(ETH_P_8021Q); vlan->vlan_id = vlan_id; vlan->real_dev = real_dev; vlan->dent = NULL; vlan->flags = VLAN_FLAG_REORDER_HDR; new_dev->rtnl_link_ops = &vlan_link_ops; err = register_vlan_dev(new_dev, NULL); if (err < 0) goto out_free_newdev; return 0; out_free_newdev: free_netdev(new_dev); return err; } static void vlan_sync_address(struct net_device *dev, struct net_device *vlandev) { struct vlan_dev_priv *vlan = vlan_dev_priv(vlandev); /* May be called without an actual change */ if (ether_addr_equal(vlan->real_dev_addr, dev->dev_addr)) return; /* vlan continues to inherit address of lower device */ if (vlan_dev_inherit_address(vlandev, dev)) goto out; /* vlan address was different from the old address and is equal to * the new address */ if (!ether_addr_equal(vlandev->dev_addr, vlan->real_dev_addr) && ether_addr_equal(vlandev->dev_addr, dev->dev_addr)) dev_uc_del(dev, vlandev->dev_addr); /* vlan address was equal to the old address and is different from * the new address */ if (ether_addr_equal(vlandev->dev_addr, vlan->real_dev_addr) && !ether_addr_equal(vlandev->dev_addr, dev->dev_addr)) dev_uc_add(dev, vlandev->dev_addr); out: ether_addr_copy(vlan->real_dev_addr, dev->dev_addr); } static void vlan_transfer_features(struct net_device *dev, struct net_device *vlandev) { struct vlan_dev_priv *vlan = vlan_dev_priv(vlandev); netif_inherit_tso_max(vlandev, dev); if (vlan_hw_offload_capable(dev->features, vlan->vlan_proto)) vlandev->hard_header_len = dev->hard_header_len; else vlandev->hard_header_len = dev->hard_header_len + VLAN_HLEN; #if IS_ENABLED(CONFIG_FCOE) vlandev->fcoe_ddp_xid = dev->fcoe_ddp_xid; #endif vlandev->priv_flags &= ~IFF_XMIT_DST_RELEASE; vlandev->priv_flags |= (vlan->real_dev->priv_flags & IFF_XMIT_DST_RELEASE); vlandev->hw_enc_features = vlan_tnl_features(vlan->real_dev); netdev_update_features(vlandev); } static int __vlan_device_event(struct net_device *dev, unsigned long event) { int err = 0; switch (event) { case NETDEV_CHANGENAME: vlan_proc_rem_dev(dev); err = vlan_proc_add_dev(dev); break; case NETDEV_REGISTER: err = vlan_proc_add_dev(dev); break; case NETDEV_UNREGISTER: vlan_proc_rem_dev(dev); break; } return err; } static int vlan_device_event(struct notifier_block *unused, unsigned long event, void *ptr) { struct netlink_ext_ack *extack = netdev_notifier_info_to_extack(ptr); struct net_device *dev = netdev_notifier_info_to_dev(ptr); struct vlan_group *grp; struct vlan_info *vlan_info; int i, flgs; struct net_device *vlandev; struct vlan_dev_priv *vlan; bool last = false; LIST_HEAD(list); int err; if (is_vlan_dev(dev)) { int err = __vlan_device_event(dev, event); if (err) return notifier_from_errno(err); } if ((event == NETDEV_UP) && (dev->features & NETIF_F_HW_VLAN_CTAG_FILTER)) { pr_info("adding VLAN 0 to HW filter on device %s\n", dev->name); vlan_vid_add(dev, htons(ETH_P_8021Q), 0); } if (event == NETDEV_DOWN && (dev->features & NETIF_F_HW_VLAN_CTAG_FILTER)) vlan_vid_del(dev, htons(ETH_P_8021Q), 0); vlan_info = rtnl_dereference(dev->vlan_info); if (!vlan_info) goto out; grp = &vlan_info->grp; /* It is OK that we do not hold the group lock right now, * as we run under the RTNL lock. */ switch (event) { case NETDEV_CHANGE: /* Propagate real device state to vlan devices */ vlan_group_for_each_dev(grp, i, vlandev) vlan_stacked_transfer_operstate(dev, vlandev, vlan_dev_priv(vlandev)); break; case NETDEV_CHANGEADDR: /* Adjust unicast filters on underlying device */ vlan_group_for_each_dev(grp, i, vlandev) { flgs = vlandev->flags; if (!(flgs & IFF_UP)) continue; vlan_sync_address(dev, vlandev); } break; case NETDEV_CHANGEMTU: vlan_group_for_each_dev(grp, i, vlandev) { if (vlandev->mtu <= dev->mtu) continue; dev_set_mtu(vlandev, dev->mtu); } break; case NETDEV_FEAT_CHANGE: /* Propagate device features to underlying device */ vlan_group_for_each_dev(grp, i, vlandev) vlan_transfer_features(dev, vlandev); break; case NETDEV_DOWN: { struct net_device *tmp; LIST_HEAD(close_list); /* Put all VLANs for this dev in the down state too. */ vlan_group_for_each_dev(grp, i, vlandev) { flgs = vlandev->flags; if (!(flgs & IFF_UP)) continue; vlan = vlan_dev_priv(vlandev); if (!(vlan->flags & VLAN_FLAG_LOOSE_BINDING)) list_add(&vlandev->close_list, &close_list); } dev_close_many(&close_list, false); list_for_each_entry_safe(vlandev, tmp, &close_list, close_list) { vlan_stacked_transfer_operstate(dev, vlandev, vlan_dev_priv(vlandev)); list_del_init(&vlandev->close_list); } list_del(&close_list); break; } case NETDEV_UP: /* Put all VLANs for this dev in the up state too. */ vlan_group_for_each_dev(grp, i, vlandev) { flgs = dev_get_flags(vlandev); if (flgs & IFF_UP) continue; vlan = vlan_dev_priv(vlandev); if (!(vlan->flags & VLAN_FLAG_LOOSE_BINDING)) dev_change_flags(vlandev, flgs | IFF_UP, extack); vlan_stacked_transfer_operstate(dev, vlandev, vlan); } break; case NETDEV_UNREGISTER: /* twiddle thumbs on netns device moves */ if (dev->reg_state != NETREG_UNREGISTERING) break; vlan_group_for_each_dev(grp, i, vlandev) { /* removal of last vid destroys vlan_info, abort * afterwards */ if (vlan_info->nr_vids == 1) last = true; unregister_vlan_dev(vlandev, &list); if (last) break; } unregister_netdevice_many(&list); break; case NETDEV_PRE_TYPE_CHANGE: /* Forbid underlaying device to change its type. */ if (vlan_uses_dev(dev)) return NOTIFY_BAD; break; case NETDEV_NOTIFY_PEERS: case NETDEV_BONDING_FAILOVER: case NETDEV_RESEND_IGMP: /* Propagate to vlan devices */ vlan_group_for_each_dev(grp, i, vlandev) call_netdevice_notifiers(event, vlandev); break; case NETDEV_CVLAN_FILTER_PUSH_INFO: err = vlan_filter_push_vids(vlan_info, htons(ETH_P_8021Q)); if (err) return notifier_from_errno(err); break; case NETDEV_CVLAN_FILTER_DROP_INFO: vlan_filter_drop_vids(vlan_info, htons(ETH_P_8021Q)); break; case NETDEV_SVLAN_FILTER_PUSH_INFO: err = vlan_filter_push_vids(vlan_info, htons(ETH_P_8021AD)); if (err) return notifier_from_errno(err); break; case NETDEV_SVLAN_FILTER_DROP_INFO: vlan_filter_drop_vids(vlan_info, htons(ETH_P_8021AD)); break; } out: return NOTIFY_DONE; } static struct notifier_block vlan_notifier_block __read_mostly = { .notifier_call = vlan_device_event, }; /* * VLAN IOCTL handler. * o execute requested action or pass command to the device driver * arg is really a struct vlan_ioctl_args __user *. */ static int vlan_ioctl_handler(struct net *net, void __user *arg) { int err; struct vlan_ioctl_args args; struct net_device *dev = NULL; if (copy_from_user(&args, arg, sizeof(struct vlan_ioctl_args))) return -EFAULT; /* Null terminate this sucker, just in case. */ args.device1[sizeof(args.device1) - 1] = 0; args.u.device2[sizeof(args.u.device2) - 1] = 0; rtnl_lock(); switch (args.cmd) { case SET_VLAN_INGRESS_PRIORITY_CMD: case SET_VLAN_EGRESS_PRIORITY_CMD: case SET_VLAN_FLAG_CMD: case ADD_VLAN_CMD: case DEL_VLAN_CMD: case GET_VLAN_REALDEV_NAME_CMD: case GET_VLAN_VID_CMD: err = -ENODEV; dev = __dev_get_by_name(net, args.device1); if (!dev) goto out; err = -EINVAL; if (args.cmd != ADD_VLAN_CMD && !is_vlan_dev(dev)) goto out; } switch (args.cmd) { case SET_VLAN_INGRESS_PRIORITY_CMD: err = -EPERM; if (!ns_capable(net->user_ns, CAP_NET_ADMIN)) break; vlan_dev_set_ingress_priority(dev, args.u.skb_priority, args.vlan_qos); err = 0; break; case SET_VLAN_EGRESS_PRIORITY_CMD: err = -EPERM; if (!ns_capable(net->user_ns, CAP_NET_ADMIN)) break; err = vlan_dev_set_egress_priority(dev, args.u.skb_priority, args.vlan_qos); break; case SET_VLAN_FLAG_CMD: err = -EPERM; if (!ns_capable(net->user_ns, CAP_NET_ADMIN)) break; err = vlan_dev_change_flags(dev, args.vlan_qos ? args.u.flag : 0, args.u.flag); break; case SET_VLAN_NAME_TYPE_CMD: err = -EPERM; if (!ns_capable(net->user_ns, CAP_NET_ADMIN)) break; if (args.u.name_type < VLAN_NAME_TYPE_HIGHEST) { struct vlan_net *vn; vn = net_generic(net, vlan_net_id); vn->name_type = args.u.name_type; err = 0; } else { err = -EINVAL; } break; case ADD_VLAN_CMD: err = -EPERM; if (!ns_capable(net->user_ns, CAP_NET_ADMIN)) break; err = register_vlan_device(dev, args.u.VID); break; case DEL_VLAN_CMD: err = -EPERM; if (!ns_capable(net->user_ns, CAP_NET_ADMIN)) break; unregister_vlan_dev(dev, NULL); err = 0; break; case GET_VLAN_REALDEV_NAME_CMD: err = 0; vlan_dev_get_realdev_name(dev, args.u.device2, sizeof(args.u.device2)); if (copy_to_user(arg, &args, sizeof(struct vlan_ioctl_args))) err = -EFAULT; break; case GET_VLAN_VID_CMD: err = 0; args.u.VID = vlan_dev_vlan_id(dev); if (copy_to_user(arg, &args, sizeof(struct vlan_ioctl_args))) err = -EFAULT; break; default: err = -EOPNOTSUPP; break; } out: rtnl_unlock(); return err; } static int __net_init vlan_init_net(struct net *net) { struct vlan_net *vn = net_generic(net, vlan_net_id); int err; vn->name_type = VLAN_NAME_TYPE_RAW_PLUS_VID_NO_PAD; err = vlan_proc_init(net); return err; } static void __net_exit vlan_exit_net(struct net *net) { vlan_proc_cleanup(net); } static struct pernet_operations vlan_net_ops = { .init = vlan_init_net, .exit = vlan_exit_net, .id = &vlan_net_id, .size = sizeof(struct vlan_net), }; static int __init vlan_proto_init(void) { int err; pr_info("%s v%s\n", vlan_fullname, vlan_version); err = register_pernet_subsys(&vlan_net_ops); if (err < 0) goto err0; err = register_netdevice_notifier(&vlan_notifier_block); if (err < 0) goto err2; err = vlan_gvrp_init(); if (err < 0) goto err3; err = vlan_mvrp_init(); if (err < 0) goto err4; err = vlan_netlink_init(); if (err < 0) goto err5; vlan_ioctl_set(vlan_ioctl_handler); return 0; err5: vlan_mvrp_uninit(); err4: vlan_gvrp_uninit(); err3: unregister_netdevice_notifier(&vlan_notifier_block); err2: unregister_pernet_subsys(&vlan_net_ops); err0: return err; } static void __exit vlan_cleanup_module(void) { vlan_ioctl_set(NULL); vlan_netlink_fini(); unregister_netdevice_notifier(&vlan_notifier_block); unregister_pernet_subsys(&vlan_net_ops); rcu_barrier(); /* Wait for completion of call_rcu()'s */ vlan_mvrp_uninit(); vlan_gvrp_uninit(); } module_init(vlan_proto_init); module_exit(vlan_cleanup_module); MODULE_DESCRIPTION("802.1Q/802.1ad VLAN Protocol"); MODULE_LICENSE("GPL"); MODULE_VERSION(DRV_VERSION);
37 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 /* SPDX-License-Identifier: GPL-2.0 */ /* * include/linux/signalfd.h * * Copyright (C) 2007 Davide Libenzi <davidel@xmailserver.org> * */ #ifndef _LINUX_SIGNALFD_H #define _LINUX_SIGNALFD_H #include <uapi/linux/signalfd.h> #include <linux/sched/signal.h> #ifdef CONFIG_SIGNALFD /* * Deliver the signal to listening signalfd. */ static inline void signalfd_notify(struct task_struct *tsk, int sig) { if (unlikely(waitqueue_active(&tsk->sighand->signalfd_wqh))) wake_up(&tsk->sighand->signalfd_wqh); } extern void signalfd_cleanup(struct sighand_struct *sighand); #else /* CONFIG_SIGNALFD */ static inline void signalfd_notify(struct task_struct *tsk, int sig) { } static inline void signalfd_cleanup(struct sighand_struct *sighand) { } #endif /* CONFIG_SIGNALFD */ #endif /* _LINUX_SIGNALFD_H */
311 311 309 309 4 4 4 309 309 309 310 4 25 25 25 25 25 25 25 25 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 1517 1518 1519 1520 1521 1522 1523 1524 1525 1526 1527 1528 1529 1530 1531 1532 1533 1534 1535 1536 1537 1538 1539 1540 1541 1542 1543 1544 1545 1546 1547 1548 1549 1550 1551 1552 1553 1554 1555 1556 1557 1558 1559 1560 1561 1562 1563 1564 1565 1566 1567 1568 1569 1570 1571 1572 1573 1574 1575 1576 1577 1578 1579 1580 1581 1582 1583 1584 1585 1586 1587 1588 1589 1590 1591 1592 1593 1594 1595 1596 1597 1598 1599 1600 1601 1602 1603 1604 1605 1606 1607 1608 1609 1610 1611 1612 1613 1614 1615 1616 1617 1618 1619 1620 1621 1622 1623 1624 1625 1626 1627 1628 1629 1630 1631 1632 1633 1634 1635 1636 1637 1638 1639 1640 1641 1642 1643 1644 1645 1646 1647 1648 1649 1650 1651 1652 1653 1654 1655 1656 1657 1658 1659 1660 1661 1662 1663 1664 1665 1666 1667 1668 1669 1670 1671 1672 1673 1674 1675 1676 1677 1678 1679 1680 1681 1682 1683 1684 1685 1686 1687 1688 1689 1690 1691 1692 1693 1694 1695 1696 1697 1698 1699 1700 1701 1702 1703 1704 1705 1706 1707 1708 1709 1710 1711 1712 1713 1714 1715 1716 1717 1718 1719 1720 1721 1722 1723 1724 1725 1726 1727 1728 1729 1730 1731 1732 1733 1734 1735 1736 1737 1738 1739 1740 1741 1742 1743 1744 1745 1746 1747 1748 1749 1750 1751 1752 1753 1754 1755 1756 1757 1758 1759 1760 1761 1762 1763 1764 1765 1766 1767 1768 1769 1770 1771 1772 1773 1774 1775 1776 1777 1778 1779 1780 1781 1782 1783 1784 1785 1786 1787 1788 1789 1790 1791 1792 1793 1794 1795 1796 1797 1798 1799 1800 1801 1802 1803 1804 1805 1806 1807 1808 1809 1810 1811 1812 1813 1814 1815 1816 1817 1818 1819 1820 1821 1822 1823 1824 1825 1826 1827 1828 1829 1830 1831 1832 1833 1834 1835 1836 1837 1838 1839 1840 1841 1842 1843 1844 1845 1846 1847 1848 1849 1850 1851 1852 1853 1854 1855 1856 1857 1858 1859 1860 1861 1862 1863 1864 1865 1866 1867 1868 1869 1870 1871 1872 1873 1874 1875 1876 1877 1878 1879 1880 1881 1882 1883 1884 1885 1886 1887 1888 1889 1890 1891 1892 1893 1894 1895 1896 1897 1898 1899 1900 1901 1902 1903 1904 1905 1906 1907 1908 1909 1910 1911 1912 1913 1914 1915 1916 1917 1918 1919 1920 1921 1922 1923 1924 1925 1926 1927 1928 1929 1930 1931 1932 1933 1934 1935 1936 1937 1938 1939 1940 1941 1942 1943 1944 1945 1946 1947 1948 1949 1950 1951 1952 1953 1954 1955 1956 1957 1958 1959 1960 1961 1962 1963 1964 1965 1966 1967 1968 1969 1970 1971 1972 1973 1974 1975 1976 1977 1978 1979 1980 1981 1982 1983 1984 1985 1986 1987 1988 1989 1990 1991 1992 1993 1994 1995 1996 1997 1998 1999 2000 2001 2002 2003 2004 2005 2006 2007 2008 2009 2010 2011 2012 2013 2014 2015 2016 2017 2018 2019 2020 2021 2022 2023 2024 2025 2026 2027 2028 2029 2030 2031 2032 2033 2034 2035 2036 2037 2038 2039 2040 2041 2042 2043 2044 2045 2046 2047 2048 2049 2050 2051 2052 2053 2054 2055 2056 2057 2058 2059 2060 2061 2062 2063 2064 2065 2066 2067 2068 2069 2070 2071 2072 2073 2074 2075 2076 2077 2078 2079 2080 2081 2082 2083 2084 2085 2086 2087 2088 2089 2090 2091 2092 2093 2094 2095 2096 2097 2098 2099 2100 2101 2102 2103 2104 2105 2106 2107 2108 2109 2110 2111 2112 2113 2114 2115 2116 2117 2118 2119 2120 2121 2122 2123 2124 2125 2126 2127 2128 2129 2130 2131 2132 2133 2134 2135 2136 2137 2138 2139 2140 2141 2142 2143 2144 2145 2146 2147 2148 2149 2150 2151 2152 2153 2154 2155 2156 2157 2158 2159 2160 2161 2162 2163 2164 2165 2166 2167 2168 2169 2170 2171 2172 2173 2174 2175 2176 2177 2178 2179 2180 2181 2182 2183 2184 2185 2186 2187 2188 2189 2190 2191 2192 2193 2194 2195 2196 2197 2198 2199 2200 2201 2202 2203 2204 2205 2206 2207 2208 2209 2210 2211 2212 2213 2214 2215 2216 2217 2218 2219 2220 2221 2222 2223 2224 2225 2226 2227 2228 2229 2230 2231 2232 2233 2234 2235 2236 2237 2238 2239 2240 2241 2242 2243 2244 2245 2246 2247 2248 2249 2250 2251 2252 2253 2254 2255 2256 2257 2258 2259 2260 2261 2262 2263 2264 2265 2266 2267 2268 2269 2270 2271 2272 2273 2274 2275 2276 2277 2278 2279 2280 2281 2282 2283 2284 2285 2286 2287 2288 2289 2290 2291 2292 2293 2294 2295 2296 2297 2298 2299 2300 2301 2302 2303 2304 2305 2306 2307 2308 2309 2310 2311 2312 2313 2314 2315 2316 2317 2318 2319 2320 2321 2322 2323 2324 2325 2326 2327 2328 2329 2330 2331 2332 2333 2334 2335 2336 2337 2338 2339 2340 2341 2342 2343 2344 2345 2346 2347 2348 2349 2350 2351 2352 2353 2354 2355 2356 2357 2358 2359 2360 2361 2362 2363 2364 2365 2366 2367 2368 2369 2370 2371 2372 2373 2374 2375 2376 2377 2378 2379 2380 2381 2382 2383 2384 2385 2386 2387 2388 2389 2390 2391 2392 2393 2394 2395 2396 2397 2398 2399 2400 2401 2402 2403 2404 2405 2406 2407 2408 2409 2410 2411 2412 2413 2414 2415 2416 2417 2418 2419 2420 2421 2422 2423 2424 2425 2426 2427 2428 2429 2430 2431 2432 2433 2434 2435 2436 2437 2438 2439 2440 2441 2442 2443 2444 2445 2446 2447 2448 2449 2450 2451 2452 2453 2454 2455 2456 2457 2458 2459 2460 2461 2462 2463 2464 2465 2466 2467 2468 2469 2470 2471 2472 2473 2474 2475 2476 2477 2478 2479 2480 2481 2482 2483 2484 2485 2486 2487 2488 2489 2490 2491 2492 2493 2494 2495 2496 2497 2498 2499 2500 2501 2502 2503 2504 2505 2506 2507 2508 2509 2510 2511 2512 2513 2514 2515 2516 2517 2518 2519 2520 2521 2522 2523 2524 2525 2526 2527 2528 2529 2530 2531 2532 2533 2534 2535 2536 2537 2538 2539 2540 2541 2542 2543 2544 2545 2546 2547 2548 2549 2550 2551 2552 2553 2554 2555 2556 2557 2558 2559 2560 2561 2562 2563 2564 2565 2566 2567 2568 2569 2570 2571 2572 2573 2574 2575 2576 2577 2578 2579 2580 2581 2582 2583 2584 2585 2586 2587 2588 2589 2590 2591 2592 2593 2594 2595 2596 2597 2598 2599 2600 2601 2602 2603 2604 2605 2606 2607 2608 2609 2610 2611 2612 2613 2614 2615 2616 2617 2618 2619 2620 2621 2622 2623 2624 2625 2626 2627 2628 2629 2630 2631 2632 2633 2634 2635 2636 2637 2638 2639 2640 2641 2642 2643 2644 2645 2646 2647 2648 2649 2650 2651 2652 2653 2654 2655 2656 2657 2658 2659 2660 2661 2662 2663 2664 2665 2666 2667 2668 2669 2670 2671 2672 2673 2674 2675 2676 2677 2678 2679 2680 2681 2682 2683 2684 2685 2686 2687 2688 2689 2690 2691 2692 2693 2694 2695 2696 2697 2698 2699 2700 2701 2702 2703 2704 2705 2706 2707 2708 2709 2710 2711 2712 2713 2714 2715 2716 2717 2718 2719 2720 2721 2722 2723 2724 // SPDX-License-Identifier: GPL-2.0-only /* * Copyright (C) 2013-2017 ARM Limited, All Rights Reserved. * Author: Marc Zyngier <marc.zyngier@arm.com> */ #define pr_fmt(fmt) "GICv3: " fmt #include <linux/acpi.h> #include <linux/cpu.h> #include <linux/cpu_pm.h> #include <linux/delay.h> #include <linux/interrupt.h> #include <linux/irqdomain.h> #include <linux/kernel.h> #include <linux/kstrtox.h> #include <linux/of.h> #include <linux/of_address.h> #include <linux/of_irq.h> #include <linux/percpu.h> #include <linux/refcount.h> #include <linux/slab.h> #include <linux/iopoll.h> #include <linux/irqchip.h> #include <linux/irqchip/arm-gic-common.h> #include <linux/irqchip/arm-gic-v3.h> #include <linux/irqchip/arm-gic-v3-prio.h> #include <linux/irqchip/irq-partition-percpu.h> #include <linux/bitfield.h> #include <linux/bits.h> #include <linux/arm-smccc.h> #include <asm/cputype.h> #include <asm/exception.h> #include <asm/smp_plat.h> #include <asm/virt.h> #include "irq-gic-common.h" static u8 dist_prio_irq __ro_after_init = GICV3_PRIO_IRQ; static u8 dist_prio_nmi __ro_after_init = GICV3_PRIO_NMI; #define FLAGS_WORKAROUND_GICR_WAKER_MSM8996 (1ULL << 0) #define FLAGS_WORKAROUND_CAVIUM_ERRATUM_38539 (1ULL << 1) #define FLAGS_WORKAROUND_ASR_ERRATUM_8601001 (1ULL << 2) #define FLAGS_WORKAROUND_INSECURE (1ULL << 3) #define GIC_IRQ_TYPE_PARTITION (GIC_IRQ_TYPE_LPI + 1) static struct cpumask broken_rdists __read_mostly __maybe_unused; struct redist_region { void __iomem *redist_base; phys_addr_t phys_base; bool single_redist; }; struct gic_chip_data { struct fwnode_handle *fwnode; phys_addr_t dist_phys_base; void __iomem *dist_base; struct redist_region *redist_regions; struct rdists rdists; struct irq_domain *domain; u64 redist_stride; u32 nr_redist_regions; u64 flags; bool has_rss; unsigned int ppi_nr; struct partition_desc **ppi_descs; }; #define T241_CHIPS_MAX 4 static void __iomem *t241_dist_base_alias[T241_CHIPS_MAX] __read_mostly; static DEFINE_STATIC_KEY_FALSE(gic_nvidia_t241_erratum); static DEFINE_STATIC_KEY_FALSE(gic_arm64_2941627_erratum); static struct gic_chip_data gic_data __read_mostly; static DEFINE_STATIC_KEY_TRUE(supports_deactivate_key); #define GIC_ID_NR (1U << GICD_TYPER_ID_BITS(gic_data.rdists.gicd_typer)) #define GIC_LINE_NR min(GICD_TYPER_SPIS(gic_data.rdists.gicd_typer), 1020U) #define GIC_ESPI_NR GICD_TYPER_ESPIS(gic_data.rdists.gicd_typer) static bool nmi_support_forbidden; /* * There are 16 SGIs, though we only actually use 8 in Linux. The other 8 SGIs * are potentially stolen by the secure side. Some code, especially code dealing * with hwirq IDs, is simplified by accounting for all 16. */ #define SGI_NR 16 /* * The behaviours of RPR and PMR registers differ depending on the value of * SCR_EL3.FIQ, and the behaviour of non-secure priority registers of the * distributor and redistributors depends on whether security is enabled in the * GIC. * * When security is enabled, non-secure priority values from the (re)distributor * are presented to the GIC CPUIF as follow: * (GIC_(R)DIST_PRI[irq] >> 1) | 0x80; * * If SCR_EL3.FIQ == 1, the values written to/read from PMR and RPR at non-secure * EL1 are subject to a similar operation thus matching the priorities presented * from the (re)distributor when security is enabled. When SCR_EL3.FIQ == 0, * these values are unchanged by the GIC. * * see GICv3/GICv4 Architecture Specification (IHI0069D): * - section 4.8.1 Non-secure accesses to register fields for Secure interrupt * priorities. * - Figure 4-7 Secure read of the priority field for a Non-secure Group 1 * interrupt. */ static DEFINE_STATIC_KEY_FALSE(supports_pseudo_nmis); static u32 gic_get_pribits(void) { u32 pribits; pribits = gic_read_ctlr(); pribits &= ICC_CTLR_EL1_PRI_BITS_MASK; pribits >>= ICC_CTLR_EL1_PRI_BITS_SHIFT; pribits++; return pribits; } static bool gic_has_group0(void) { u32 val; u32 old_pmr; old_pmr = gic_read_pmr(); /* * Let's find out if Group0 is under control of EL3 or not by * setting the highest possible, non-zero priority in PMR. * * If SCR_EL3.FIQ is set, the priority gets shifted down in * order for the CPU interface to set bit 7, and keep the * actual priority in the non-secure range. In the process, it * looses the least significant bit and the actual priority * becomes 0x80. Reading it back returns 0, indicating that * we're don't have access to Group0. */ gic_write_pmr(BIT(8 - gic_get_pribits())); val = gic_read_pmr(); gic_write_pmr(old_pmr); return val != 0; } static inline bool gic_dist_security_disabled(void) { return readl_relaxed(gic_data.dist_base + GICD_CTLR) & GICD_CTLR_DS; } static bool cpus_have_security_disabled __ro_after_init; static bool cpus_have_group0 __ro_after_init; static void __init gic_prio_init(void) { bool ds; cpus_have_group0 = gic_has_group0(); ds = gic_dist_security_disabled(); if ((gic_data.flags & FLAGS_WORKAROUND_INSECURE) && !ds) { if (cpus_have_group0) { u32 val; val = readl_relaxed(gic_data.dist_base + GICD_CTLR); val |= GICD_CTLR_DS; writel_relaxed(val, gic_data.dist_base + GICD_CTLR); ds = gic_dist_security_disabled(); if (ds) pr_warn("Broken GIC integration, security disabled\n"); } else { pr_warn("Broken GIC integration, pNMI forbidden\n"); nmi_support_forbidden = true; } } cpus_have_security_disabled = ds; /* * How priority values are used by the GIC depends on two things: * the security state of the GIC (controlled by the GICD_CTRL.DS bit) * and if Group 0 interrupts can be delivered to Linux in the non-secure * world as FIQs (controlled by the SCR_EL3.FIQ bit). These affect the * way priorities are presented in ICC_PMR_EL1 and in the distributor: * * GICD_CTRL.DS | SCR_EL3.FIQ | ICC_PMR_EL1 | Distributor * ------------------------------------------------------- * 1 | - | unchanged | unchanged * ------------------------------------------------------- * 0 | 1 | non-secure | non-secure * ------------------------------------------------------- * 0 | 0 | unchanged | non-secure * * In the non-secure view reads and writes are modified: * * - A value written is right-shifted by one and the MSB is set, * forcing the priority into the non-secure range. * * - A value read is left-shifted by one. * * In the first two cases, where ICC_PMR_EL1 and the interrupt priority * are both either modified or unchanged, we can use the same set of * priorities. * * In the last case, where only the interrupt priorities are modified to * be in the non-secure range, we program the non-secure values into * the distributor to match the PMR values we want. */ if (cpus_have_group0 && !cpus_have_security_disabled) { dist_prio_irq = __gicv3_prio_to_ns(dist_prio_irq); dist_prio_nmi = __gicv3_prio_to_ns(dist_prio_nmi); } pr_info("GICD_CTRL.DS=%d, SCR_EL3.FIQ=%d\n", cpus_have_security_disabled, !cpus_have_group0); } /* rdist_nmi_refs[n] == number of cpus having the rdist interrupt n set as NMI */ static refcount_t *rdist_nmi_refs; static struct gic_kvm_info gic_v3_kvm_info __initdata; static DEFINE_PER_CPU(bool, has_rss); #define MPIDR_RS(mpidr) (((mpidr) & 0xF0UL) >> 4) #define gic_data_rdist() (this_cpu_ptr(gic_data.rdists.rdist)) #define gic_data_rdist_rd_base() (gic_data_rdist()->rd_base) #define gic_data_rdist_sgi_base() (gic_data_rdist_rd_base() + SZ_64K) /* Our default, arbitrary priority value. Linux only uses one anyway. */ #define DEFAULT_PMR_VALUE 0xf0 enum gic_intid_range { SGI_RANGE, PPI_RANGE, SPI_RANGE, EPPI_RANGE, ESPI_RANGE, LPI_RANGE, __INVALID_RANGE__ }; static enum gic_intid_range __get_intid_range(irq_hw_number_t hwirq) { switch (hwirq) { case 0 ... 15: return SGI_RANGE; case 16 ... 31: return PPI_RANGE; case 32 ... 1019: return SPI_RANGE; case EPPI_BASE_INTID ... (EPPI_BASE_INTID + 63): return EPPI_RANGE; case ESPI_BASE_INTID ... (ESPI_BASE_INTID + 1023): return ESPI_RANGE; case 8192 ... GENMASK(23, 0): return LPI_RANGE; default: return __INVALID_RANGE__; } } static enum gic_intid_range get_intid_range(struct irq_data *d) { return __get_intid_range(d->hwirq); } static inline bool gic_irq_in_rdist(struct irq_data *d) { switch (get_intid_range(d)) { case SGI_RANGE: case PPI_RANGE: case EPPI_RANGE: return true; default: return false; } } static inline void __iomem *gic_dist_base_alias(struct irq_data *d) { if (static_branch_unlikely(&gic_nvidia_t241_erratum)) { irq_hw_number_t hwirq = irqd_to_hwirq(d); u32 chip; /* * For the erratum T241-FABRIC-4, read accesses to GICD_In{E} * registers are directed to the chip that owns the SPI. The * the alias region can also be used for writes to the * GICD_In{E} except GICD_ICENABLERn. Each chip has support * for 320 {E}SPIs. Mappings for all 4 chips: * Chip0 = 32-351 * Chip1 = 352-671 * Chip2 = 672-991 * Chip3 = 4096-4415 */ switch (__get_intid_range(hwirq)) { case SPI_RANGE: chip = (hwirq - 32) / 320; break; case ESPI_RANGE: chip = 3; break; default: unreachable(); } return t241_dist_base_alias[chip]; } return gic_data.dist_base; } static inline void __iomem *gic_dist_base(struct irq_data *d) { switch (get_intid_range(d)) { case SGI_RANGE: case PPI_RANGE: case EPPI_RANGE: /* SGI+PPI -> SGI_base for this CPU */ return gic_data_rdist_sgi_base(); case SPI_RANGE: case ESPI_RANGE: /* SPI -> dist_base */ return gic_data.dist_base; default: return NULL; } } static void gic_do_wait_for_rwp(void __iomem *base, u32 bit) { u32 val; int ret; ret = readl_relaxed_poll_timeout_atomic(base + GICD_CTLR, val, !(val & bit), 1, USEC_PER_SEC); if (ret == -ETIMEDOUT) pr_err_ratelimited("RWP timeout, gone fishing\n"); } /* Wait for completion of a distributor change */ static void gic_dist_wait_for_rwp(void) { gic_do_wait_for_rwp(gic_data.dist_base, GICD_CTLR_RWP); } /* Wait for completion of a redistributor change */ static void gic_redist_wait_for_rwp(void) { gic_do_wait_for_rwp(gic_data_rdist_rd_base(), GICR_CTLR_RWP); } static void gic_enable_redist(bool enable) { void __iomem *rbase; u32 val; int ret; if (gic_data.flags & FLAGS_WORKAROUND_GICR_WAKER_MSM8996) return; rbase = gic_data_rdist_rd_base(); val = readl_relaxed(rbase + GICR_WAKER); if (enable) /* Wake up this CPU redistributor */ val &= ~GICR_WAKER_ProcessorSleep; else val |= GICR_WAKER_ProcessorSleep; writel_relaxed(val, rbase + GICR_WAKER); if (!enable) { /* Check that GICR_WAKER is writeable */ val = readl_relaxed(rbase + GICR_WAKER); if (!(val & GICR_WAKER_ProcessorSleep)) return; /* No PM support in this redistributor */ } ret = readl_relaxed_poll_timeout_atomic(rbase + GICR_WAKER, val, enable ^ (bool)(val & GICR_WAKER_ChildrenAsleep), 1, USEC_PER_SEC); if (ret == -ETIMEDOUT) { pr_err_ratelimited("redistributor failed to %s...\n", enable ? "wakeup" : "sleep"); } } /* * Routines to disable, enable, EOI and route interrupts */ static u32 convert_offset_index(struct irq_data *d, u32 offset, u32 *index) { switch (get_intid_range(d)) { case SGI_RANGE: case PPI_RANGE: case SPI_RANGE: *index = d->hwirq; return offset; case EPPI_RANGE: /* * Contrary to the ESPI range, the EPPI range is contiguous * to the PPI range in the registers, so let's adjust the * displacement accordingly. Consistency is overrated. */ *index = d->hwirq - EPPI_BASE_INTID + 32; return offset; case ESPI_RANGE: *index = d->hwirq - ESPI_BASE_INTID; switch (offset) { case GICD_ISENABLER: return GICD_ISENABLERnE; case GICD_ICENABLER: return GICD_ICENABLERnE; case GICD_ISPENDR: return GICD_ISPENDRnE; case GICD_ICPENDR: return GICD_ICPENDRnE; case GICD_ISACTIVER: return GICD_ISACTIVERnE; case GICD_ICACTIVER: return GICD_ICACTIVERnE; case GICD_IPRIORITYR: return GICD_IPRIORITYRnE; case GICD_ICFGR: return GICD_ICFGRnE; case GICD_IROUTER: return GICD_IROUTERnE; default: break; } break; default: break; } WARN_ON(1); *index = d->hwirq; return offset; } static int gic_peek_irq(struct irq_data *d, u32 offset) { void __iomem *base; u32 index, mask; offset = convert_offset_index(d, offset, &index); mask = 1 << (index % 32); if (gic_irq_in_rdist(d)) base = gic_data_rdist_sgi_base(); else base = gic_dist_base_alias(d); return !!(readl_relaxed(base + offset + (index / 32) * 4) & mask); } static void gic_poke_irq(struct irq_data *d, u32 offset) { void __iomem *base; u32 index, mask; offset = convert_offset_index(d, offset, &index); mask = 1 << (index % 32); if (gic_irq_in_rdist(d)) base = gic_data_rdist_sgi_base(); else base = gic_data.dist_base; writel_relaxed(mask, base + offset + (index / 32) * 4); } static void gic_mask_irq(struct irq_data *d) { gic_poke_irq(d, GICD_ICENABLER); if (gic_irq_in_rdist(d)) gic_redist_wait_for_rwp(); else gic_dist_wait_for_rwp(); } static void gic_eoimode1_mask_irq(struct irq_data *d) { gic_mask_irq(d); /* * When masking a forwarded interrupt, make sure it is * deactivated as well. * * This ensures that an interrupt that is getting * disabled/masked will not get "stuck", because there is * noone to deactivate it (guest is being terminated). */ if (irqd_is_forwarded_to_vcpu(d)) gic_poke_irq(d, GICD_ICACTIVER); } static void gic_unmask_irq(struct irq_data *d) { gic_poke_irq(d, GICD_ISENABLER); } static inline bool gic_supports_nmi(void) { return IS_ENABLED(CONFIG_ARM64_PSEUDO_NMI) && static_branch_likely(&supports_pseudo_nmis); } static int gic_irq_set_irqchip_state(struct irq_data *d, enum irqchip_irq_state which, bool val) { u32 reg; if (d->hwirq >= 8192) /* SGI/PPI/SPI only */ return -EINVAL; switch (which) { case IRQCHIP_STATE_PENDING: reg = val ? GICD_ISPENDR : GICD_ICPENDR; break; case IRQCHIP_STATE_ACTIVE: reg = val ? GICD_ISACTIVER : GICD_ICACTIVER; break; case IRQCHIP_STATE_MASKED: if (val) { gic_mask_irq(d); return 0; } reg = GICD_ISENABLER; break; default: return -EINVAL; } gic_poke_irq(d, reg); /* * Force read-back to guarantee that the active state has taken * effect, and won't race with a guest-driven deactivation. */ if (reg == GICD_ISACTIVER) gic_peek_irq(d, reg); return 0; } static int gic_irq_get_irqchip_state(struct irq_data *d, enum irqchip_irq_state which, bool *val) { if (d->hwirq >= 8192) /* PPI/SPI only */ return -EINVAL; switch (which) { case IRQCHIP_STATE_PENDING: *val = gic_peek_irq(d, GICD_ISPENDR); break; case IRQCHIP_STATE_ACTIVE: *val = gic_peek_irq(d, GICD_ISACTIVER); break; case IRQCHIP_STATE_MASKED: *val = !gic_peek_irq(d, GICD_ISENABLER); break; default: return -EINVAL; } return 0; } static void gic_irq_set_prio(struct irq_data *d, u8 prio) { void __iomem *base = gic_dist_base(d); u32 offset, index; offset = convert_offset_index(d, GICD_IPRIORITYR, &index); writeb_relaxed(prio, base + offset + index); } static u32 __gic_get_ppi_index(irq_hw_number_t hwirq) { switch (__get_intid_range(hwirq)) { case PPI_RANGE: return hwirq - 16; case EPPI_RANGE: return hwirq - EPPI_BASE_INTID + 16; default: unreachable(); } } static u32 __gic_get_rdist_index(irq_hw_number_t hwirq) { switch (__get_intid_range(hwirq)) { case SGI_RANGE: case PPI_RANGE: return hwirq; case EPPI_RANGE: return hwirq - EPPI_BASE_INTID + 32; default: unreachable(); } } static u32 gic_get_rdist_index(struct irq_data *d) { return __gic_get_rdist_index(d->hwirq); } static int gic_irq_nmi_setup(struct irq_data *d) { struct irq_desc *desc = irq_to_desc(d->irq); if (!gic_supports_nmi()) return -EINVAL; if (gic_peek_irq(d, GICD_ISENABLER)) { pr_err("Cannot set NMI property of enabled IRQ %u\n", d->irq); return -EINVAL; } /* * A secondary irq_chip should be in charge of LPI request, * it should not be possible to get there */ if (WARN_ON(irqd_to_hwirq(d) >= 8192)) return -EINVAL; /* desc lock should already be held */ if (gic_irq_in_rdist(d)) { u32 idx = gic_get_rdist_index(d); /* * Setting up a percpu interrupt as NMI, only switch handler * for first NMI */ if (!refcount_inc_not_zero(&rdist_nmi_refs[idx])) { refcount_set(&rdist_nmi_refs[idx], 1); desc->handle_irq = handle_percpu_devid_fasteoi_nmi; } } else { desc->handle_irq = handle_fasteoi_nmi; } gic_irq_set_prio(d, dist_prio_nmi); return 0; } static void gic_irq_nmi_teardown(struct irq_data *d) { struct irq_desc *desc = irq_to_desc(d->irq); if (WARN_ON(!gic_supports_nmi())) return; if (gic_peek_irq(d, GICD_ISENABLER)) { pr_err("Cannot set NMI property of enabled IRQ %u\n", d->irq); return; } /* * A secondary irq_chip should be in charge of LPI request, * it should not be possible to get there */ if (WARN_ON(irqd_to_hwirq(d) >= 8192)) return; /* desc lock should already be held */ if (gic_irq_in_rdist(d)) { u32 idx = gic_get_rdist_index(d); /* Tearing down NMI, only switch handler for last NMI */ if (refcount_dec_and_test(&rdist_nmi_refs[idx])) desc->handle_irq = handle_percpu_devid_irq; } else { desc->handle_irq = handle_fasteoi_irq; } gic_irq_set_prio(d, dist_prio_irq); } static bool gic_arm64_erratum_2941627_needed(struct irq_data *d) { enum gic_intid_range range; if (!static_branch_unlikely(&gic_arm64_2941627_erratum)) return false; range = get_intid_range(d); /* * The workaround is needed if the IRQ is an SPI and * the target cpu is different from the one we are * executing on. */ return (range == SPI_RANGE || range == ESPI_RANGE) && !cpumask_test_cpu(raw_smp_processor_id(), irq_data_get_effective_affinity_mask(d)); } static void gic_eoi_irq(struct irq_data *d) { write_gicreg(irqd_to_hwirq(d), ICC_EOIR1_EL1); isb(); if (gic_arm64_erratum_2941627_needed(d)) { /* * Make sure the GIC stream deactivate packet * issued by ICC_EOIR1_EL1 has completed before * deactivating through GICD_IACTIVER. */ dsb(sy); gic_poke_irq(d, GICD_ICACTIVER); } } static void gic_eoimode1_eoi_irq(struct irq_data *d) { /* * No need to deactivate an LPI, or an interrupt that * is is getting forwarded to a vcpu. */ if (irqd_to_hwirq(d) >= 8192 || irqd_is_forwarded_to_vcpu(d)) return; if (!gic_arm64_erratum_2941627_needed(d)) gic_write_dir(irqd_to_hwirq(d)); else gic_poke_irq(d, GICD_ICACTIVER); } static int gic_set_type(struct irq_data *d, unsigned int type) { irq_hw_number_t irq = irqd_to_hwirq(d); enum gic_intid_range range; void __iomem *base; u32 offset, index; int ret; range = get_intid_range(d); /* Interrupt configuration for SGIs can't be changed */ if (range == SGI_RANGE) return type != IRQ_TYPE_EDGE_RISING ? -EINVAL : 0; /* SPIs have restrictions on the supported types */ if ((range == SPI_RANGE || range == ESPI_RANGE) && type != IRQ_TYPE_LEVEL_HIGH && type != IRQ_TYPE_EDGE_RISING) return -EINVAL; if (gic_irq_in_rdist(d)) base = gic_data_rdist_sgi_base(); else base = gic_dist_base_alias(d); offset = convert_offset_index(d, GICD_ICFGR, &index); ret = gic_configure_irq(index, type, base + offset); if (ret && (range == PPI_RANGE || range == EPPI_RANGE)) { /* Misconfigured PPIs are usually not fatal */ pr_warn("GIC: PPI INTID%ld is secure or misconfigured\n", irq); ret = 0; } return ret; } static int gic_irq_set_vcpu_affinity(struct irq_data *d, void *vcpu) { if (get_intid_range(d) == SGI_RANGE) return -EINVAL; if (vcpu) irqd_set_forwarded_to_vcpu(d); else irqd_clr_forwarded_to_vcpu(d); return 0; } static u64 gic_cpu_to_affinity(int cpu) { u64 mpidr = cpu_logical_map(cpu); u64 aff; /* ASR8601 needs to have its affinities shifted down... */ if (unlikely(gic_data.flags & FLAGS_WORKAROUND_ASR_ERRATUM_8601001)) mpidr = (MPIDR_AFFINITY_LEVEL(mpidr, 1) | (MPIDR_AFFINITY_LEVEL(mpidr, 2) << 8)); aff = ((u64)MPIDR_AFFINITY_LEVEL(mpidr, 3) << 32 | MPIDR_AFFINITY_LEVEL(mpidr, 2) << 16 | MPIDR_AFFINITY_LEVEL(mpidr, 1) << 8 | MPIDR_AFFINITY_LEVEL(mpidr, 0)); return aff; } static void gic_deactivate_unhandled(u32 irqnr) { if (static_branch_likely(&supports_deactivate_key)) { if (irqnr < 8192) gic_write_dir(irqnr); } else { write_gicreg(irqnr, ICC_EOIR1_EL1); isb(); } } /* * Follow a read of the IAR with any HW maintenance that needs to happen prior * to invoking the relevant IRQ handler. We must do two things: * * (1) Ensure instruction ordering between a read of IAR and subsequent * instructions in the IRQ handler using an ISB. * * It is possible for the IAR to report an IRQ which was signalled *after* * the CPU took an IRQ exception as multiple interrupts can race to be * recognized by the GIC, earlier interrupts could be withdrawn, and/or * later interrupts could be prioritized by the GIC. * * For devices which are tightly coupled to the CPU, such as PMUs, a * context synchronization event is necessary to ensure that system * register state is not stale, as these may have been indirectly written * *after* exception entry. * * (2) Execute an interrupt priority drop when EOI mode 1 is in use. */ static inline void gic_complete_ack(u32 irqnr) { if (static_branch_likely(&supports_deactivate_key)) write_gicreg(irqnr, ICC_EOIR1_EL1); isb(); } static bool gic_rpr_is_nmi_prio(void) { if (!gic_supports_nmi()) return false; return unlikely(gic_read_rpr() == GICV3_PRIO_NMI); } static bool gic_irqnr_is_special(u32 irqnr) { return irqnr >= 1020 && irqnr <= 1023; } static void __gic_handle_irq(u32 irqnr, struct pt_regs *regs) { if (gic_irqnr_is_special(irqnr)) return; gic_complete_ack(irqnr); if (generic_handle_domain_irq(gic_data.domain, irqnr)) { WARN_ONCE(true, "Unexpected interrupt (irqnr %u)\n", irqnr); gic_deactivate_unhandled(irqnr); } } static void __gic_handle_nmi(u32 irqnr, struct pt_regs *regs) { if (gic_irqnr_is_special(irqnr)) return; gic_complete_ack(irqnr); if (generic_handle_domain_nmi(gic_data.domain, irqnr)) { WARN_ONCE(true, "Unexpected pseudo-NMI (irqnr %u)\n", irqnr); gic_deactivate_unhandled(irqnr); } } /* * An exception has been taken from a context with IRQs enabled, and this could * be an IRQ or an NMI. * * The entry code called us with DAIF.IF set to keep NMIs masked. We must clear * DAIF.IF (and update ICC_PMR_EL1 to mask regular IRQs) prior to returning, * after handling any NMI but before handling any IRQ. * * The entry code has performed IRQ entry, and if an NMI is detected we must * perform NMI entry/exit around invoking the handler. */ static void __gic_handle_irq_from_irqson(struct pt_regs *regs) { bool is_nmi; u32 irqnr; irqnr = gic_read_iar(); is_nmi = gic_rpr_is_nmi_prio(); if (is_nmi) { nmi_enter(); __gic_handle_nmi(irqnr, regs); nmi_exit(); } if (gic_prio_masking_enabled()) { gic_pmr_mask_irqs(); gic_arch_enable_irqs(); } if (!is_nmi) __gic_handle_irq(irqnr, regs); } /* * An exception has been taken from a context with IRQs disabled, which can only * be an NMI. * * The entry code called us with DAIF.IF set to keep NMIs masked. We must leave * DAIF.IF (and ICC_PMR_EL1) unchanged. * * The entry code has performed NMI entry. */ static void __gic_handle_irq_from_irqsoff(struct pt_regs *regs) { u64 pmr; u32 irqnr; /* * We were in a context with IRQs disabled. However, the * entry code has set PMR to a value that allows any * interrupt to be acknowledged, and not just NMIs. This can * lead to surprising effects if the NMI has been retired in * the meantime, and that there is an IRQ pending. The IRQ * would then be taken in NMI context, something that nobody * wants to debug twice. * * Until we sort this, drop PMR again to a level that will * actually only allow NMIs before reading IAR, and then * restore it to what it was. */ pmr = gic_read_pmr(); gic_pmr_mask_irqs(); isb(); irqnr = gic_read_iar(); gic_write_pmr(pmr); __gic_handle_nmi(irqnr, regs); } static void __exception_irq_entry gic_handle_irq(struct pt_regs *regs) { if (unlikely(gic_supports_nmi() && !interrupts_enabled(regs))) __gic_handle_irq_from_irqsoff(regs); else __gic_handle_irq_from_irqson(regs); } static void __init gic_dist_init(void) { unsigned int i; u64 affinity; void __iomem *base = gic_data.dist_base; u32 val; /* Disable the distributor */ writel_relaxed(0, base + GICD_CTLR); gic_dist_wait_for_rwp(); /* * Configure SPIs as non-secure Group-1. This will only matter * if the GIC only has a single security state. This will not * do the right thing if the kernel is running in secure mode, * but that's not the intended use case anyway. */ for (i = 32; i < GIC_LINE_NR; i += 32) writel_relaxed(~0, base + GICD_IGROUPR + i / 8); /* Extended SPI range, not handled by the GICv2/GICv3 common code */ for (i = 0; i < GIC_ESPI_NR; i += 32) { writel_relaxed(~0U, base + GICD_ICENABLERnE + i / 8); writel_relaxed(~0U, base + GICD_ICACTIVERnE + i / 8); } for (i = 0; i < GIC_ESPI_NR; i += 32) writel_relaxed(~0U, base + GICD_IGROUPRnE + i / 8); for (i = 0; i < GIC_ESPI_NR; i += 16) writel_relaxed(0, base + GICD_ICFGRnE + i / 4); for (i = 0; i < GIC_ESPI_NR; i += 4) writel_relaxed(REPEAT_BYTE_U32(dist_prio_irq), base + GICD_IPRIORITYRnE + i); /* Now do the common stuff */ gic_dist_config(base, GIC_LINE_NR, dist_prio_irq); val = GICD_CTLR_ARE_NS | GICD_CTLR_ENABLE_G1A | GICD_CTLR_ENABLE_G1; if (gic_data.rdists.gicd_typer2 & GICD_TYPER2_nASSGIcap) { pr_info("Enabling SGIs without active state\n"); val |= GICD_CTLR_nASSGIreq; } /* Enable distributor with ARE, Group1, and wait for it to drain */ writel_relaxed(val, base + GICD_CTLR); gic_dist_wait_for_rwp(); /* * Set all global interrupts to the boot CPU only. ARE must be * enabled. */ affinity = gic_cpu_to_affinity(smp_processor_id()); for (i = 32; i < GIC_LINE_NR; i++) gic_write_irouter(affinity, base + GICD_IROUTER + i * 8); for (i = 0; i < GIC_ESPI_NR; i++) gic_write_irouter(affinity, base + GICD_IROUTERnE + i * 8); } static int gic_iterate_rdists(int (*fn)(struct redist_region *, void __iomem *)) { int ret = -ENODEV; int i; for (i = 0; i < gic_data.nr_redist_regions; i++) { void __iomem *ptr = gic_data.redist_regions[i].redist_base; u64 typer; u32 reg; reg = readl_relaxed(ptr + GICR_PIDR2) & GIC_PIDR2_ARCH_MASK; if (reg != GIC_PIDR2_ARCH_GICv3 && reg != GIC_PIDR2_ARCH_GICv4) { /* We're in trouble... */ pr_warn("No redistributor present @%p\n", ptr); break; } do { typer = gic_read_typer(ptr + GICR_TYPER); ret = fn(gic_data.redist_regions + i, ptr); if (!ret) return 0; if (gic_data.redist_regions[i].single_redist) break; if (gic_data.redist_stride) { ptr += gic_data.redist_stride; } else { ptr += SZ_64K * 2; /* Skip RD_base + SGI_base */ if (typer & GICR_TYPER_VLPIS) ptr += SZ_64K * 2; /* Skip VLPI_base + reserved page */ } } while (!(typer & GICR_TYPER_LAST)); } return ret ? -ENODEV : 0; } static int __gic_populate_rdist(struct redist_region *region, void __iomem *ptr) { unsigned long mpidr; u64 typer; u32 aff; /* * Convert affinity to a 32bit value that can be matched to * GICR_TYPER bits [63:32]. */ mpidr = gic_cpu_to_affinity(smp_processor_id()); aff = (MPIDR_AFFINITY_LEVEL(mpidr, 3) << 24 | MPIDR_AFFINITY_LEVEL(mpidr, 2) << 16 | MPIDR_AFFINITY_LEVEL(mpidr, 1) << 8 | MPIDR_AFFINITY_LEVEL(mpidr, 0)); typer = gic_read_typer(ptr + GICR_TYPER); if ((typer >> 32) == aff) { u64 offset = ptr - region->redist_base; raw_spin_lock_init(&gic_data_rdist()->rd_lock); gic_data_rdist_rd_base() = ptr; gic_data_rdist()->phys_base = region->phys_base + offset; pr_info("CPU%d: found redistributor %lx region %d:%pa\n", smp_processor_id(), mpidr, (int)(region - gic_data.redist_regions), &gic_data_rdist()->phys_base); return 0; } /* Try next one */ return 1; } static int gic_populate_rdist(void) { if (gic_iterate_rdists(__gic_populate_rdist) == 0) return 0; /* We couldn't even deal with ourselves... */ WARN(true, "CPU%d: mpidr %lx has no re-distributor!\n", smp_processor_id(), (unsigned long)cpu_logical_map(smp_processor_id())); return -ENODEV; } static int __gic_update_rdist_properties(struct redist_region *region, void __iomem *ptr) { u64 typer = gic_read_typer(ptr + GICR_TYPER); u32 ctlr = readl_relaxed(ptr + GICR_CTLR); /* Boot-time cleanup */ if ((typer & GICR_TYPER_VLPIS) && (typer & GICR_TYPER_RVPEID)) { u64 val; /* Deactivate any present vPE */ val = gicr_read_vpendbaser(ptr + SZ_128K + GICR_VPENDBASER); if (val & GICR_VPENDBASER_Valid) gicr_write_vpendbaser(GICR_VPENDBASER_PendingLast, ptr + SZ_128K + GICR_VPENDBASER); /* Mark the VPE table as invalid */ val = gicr_read_vpropbaser(ptr + SZ_128K + GICR_VPROPBASER); val &= ~GICR_VPROPBASER_4_1_VALID; gicr_write_vpropbaser(val, ptr + SZ_128K + GICR_VPROPBASER); } gic_data.rdists.has_vlpis &= !!(typer & GICR_TYPER_VLPIS); /* * TYPER.RVPEID implies some form of DirectLPI, no matter what the * doc says... :-/ And CTLR.IR implies another subset of DirectLPI * that the ITS driver can make use of for LPIs (and not VLPIs). * * These are 3 different ways to express the same thing, depending * on the revision of the architecture and its relaxations over * time. Just group them under the 'direct_lpi' banner. */ gic_data.rdists.has_rvpeid &= !!(typer & GICR_TYPER_RVPEID); gic_data.rdists.has_direct_lpi &= (!!(typer & GICR_TYPER_DirectLPIS) | !!(ctlr & GICR_CTLR_IR) | gic_data.rdists.has_rvpeid); gic_data.rdists.has_vpend_valid_dirty &= !!(typer & GICR_TYPER_DIRTY); /* Detect non-sensical configurations */ if (WARN_ON_ONCE(gic_data.rdists.has_rvpeid && !gic_data.rdists.has_vlpis)) { gic_data.rdists.has_direct_lpi = false; gic_data.rdists.has_vlpis = false; gic_data.rdists.has_rvpeid = false; } gic_data.ppi_nr = min(GICR_TYPER_NR_PPIS(typer), gic_data.ppi_nr); return 1; } static void gic_update_rdist_properties(void) { gic_data.ppi_nr = UINT_MAX; gic_iterate_rdists(__gic_update_rdist_properties); if (WARN_ON(gic_data.ppi_nr == UINT_MAX)) gic_data.ppi_nr = 0; pr_info("GICv3 features: %d PPIs%s%s\n", gic_data.ppi_nr, gic_data.has_rss ? ", RSS" : "", gic_data.rdists.has_direct_lpi ? ", DirectLPI" : ""); if (gic_data.rdists.has_vlpis) pr_info("GICv4 features: %s%s%s\n", gic_data.rdists.has_direct_lpi ? "DirectLPI " : "", gic_data.rdists.has_rvpeid ? "RVPEID " : "", gic_data.rdists.has_vpend_valid_dirty ? "Valid+Dirty " : ""); } static void gic_cpu_sys_reg_enable(void) { /* * Need to check that the SRE bit has actually been set. If * not, it means that SRE is disabled at EL2. We're going to * die painfully, and there is nothing we can do about it. * * Kindly inform the luser. */ if (!gic_enable_sre()) pr_err("GIC: unable to set SRE (disabled at EL2), panic ahead\n"); } static void gic_cpu_sys_reg_init(void) { int i, cpu = smp_processor_id(); u64 mpidr = gic_cpu_to_affinity(cpu); u64 need_rss = MPIDR_RS(mpidr); bool group0; u32 pribits; pribits = gic_get_pribits(); group0 = gic_has_group0(); /* Set priority mask register */ if (!gic_prio_masking_enabled()) { write_gicreg(DEFAULT_PMR_VALUE, ICC_PMR_EL1); } else if (gic_supports_nmi()) { /* * Check that all CPUs use the same priority space. * * If there's a mismatch with the boot CPU, the system is * likely to die as interrupt masking will not work properly on * all CPUs. */ WARN_ON(group0 != cpus_have_group0); WARN_ON(gic_dist_security_disabled() != cpus_have_security_disabled); } /* * Some firmwares hand over to the kernel with the BPR changed from * its reset value (and with a value large enough to prevent * any pre-emptive interrupts from working at all). Writing a zero * to BPR restores is reset value. */ gic_write_bpr1(0); if (static_branch_likely(&supports_deactivate_key)) { /* EOI drops priority only (mode 1) */ gic_write_ctlr(ICC_CTLR_EL1_EOImode_drop); } else { /* EOI deactivates interrupt too (mode 0) */ gic_write_ctlr(ICC_CTLR_EL1_EOImode_drop_dir); } /* Always whack Group0 before Group1 */ if (group0) { switch(pribits) { case 8: case 7: write_gicreg(0, ICC_AP0R3_EL1); write_gicreg(0, ICC_AP0R2_EL1); fallthrough; case 6: write_gicreg(0, ICC_AP0R1_EL1); fallthrough; case 5: case 4: write_gicreg(0, ICC_AP0R0_EL1); } isb(); } switch(pribits) { case 8: case 7: write_gicreg(0, ICC_AP1R3_EL1); write_gicreg(0, ICC_AP1R2_EL1); fallthrough; case 6: write_gicreg(0, ICC_AP1R1_EL1); fallthrough; case 5: case 4: write_gicreg(0, ICC_AP1R0_EL1); } isb(); /* ... and let's hit the road... */ gic_write_grpen1(1); /* Keep the RSS capability status in per_cpu variable */ per_cpu(has_rss, cpu) = !!(gic_read_ctlr() & ICC_CTLR_EL1_RSS); /* Check all the CPUs have capable of sending SGIs to other CPUs */ for_each_online_cpu(i) { bool have_rss = per_cpu(has_rss, i) && per_cpu(has_rss, cpu); need_rss |= MPIDR_RS(gic_cpu_to_affinity(i)); if (need_rss && (!have_rss)) pr_crit("CPU%d (%lx) can't SGI CPU%d (%lx), no RSS\n", cpu, (unsigned long)mpidr, i, (unsigned long)gic_cpu_to_affinity(i)); } /** * GIC spec says, when ICC_CTLR_EL1.RSS==1 and GICD_TYPER.RSS==0, * writing ICC_ASGI1R_EL1 register with RS != 0 is a CONSTRAINED * UNPREDICTABLE choice of : * - The write is ignored. * - The RS field is treated as 0. */ if (need_rss && (!gic_data.has_rss)) pr_crit_once("RSS is required but GICD doesn't support it\n"); } static bool gicv3_nolpi; static int __init gicv3_nolpi_cfg(char *buf) { return kstrtobool(buf, &gicv3_nolpi); } early_param("irqchip.gicv3_nolpi", gicv3_nolpi_cfg); static int gic_dist_supports_lpis(void) { return (IS_ENABLED(CONFIG_ARM_GIC_V3_ITS) && !!(readl_relaxed(gic_data.dist_base + GICD_TYPER) & GICD_TYPER_LPIS) && !gicv3_nolpi); } static void gic_cpu_init(void) { void __iomem *rbase; int i; /* Register ourselves with the rest of the world */ if (gic_populate_rdist()) return; gic_enable_redist(true); WARN((gic_data.ppi_nr > 16 || GIC_ESPI_NR != 0) && !(gic_read_ctlr() & ICC_CTLR_EL1_ExtRange), "Distributor has extended ranges, but CPU%d doesn't\n", smp_processor_id()); rbase = gic_data_rdist_sgi_base(); /* Configure SGIs/PPIs as non-secure Group-1 */ for (i = 0; i < gic_data.ppi_nr + SGI_NR; i += 32) writel_relaxed(~0, rbase + GICR_IGROUPR0 + i / 8); gic_cpu_config(rbase, gic_data.ppi_nr + SGI_NR, dist_prio_irq); gic_redist_wait_for_rwp(); /* initialise system registers */ gic_cpu_sys_reg_init(); } #ifdef CONFIG_SMP #define MPIDR_TO_SGI_RS(mpidr) (MPIDR_RS(mpidr) << ICC_SGI1R_RS_SHIFT) #define MPIDR_TO_SGI_CLUSTER_ID(mpidr) ((mpidr) & ~0xFUL) /* * gic_starting_cpu() is called after the last point where cpuhp is allowed * to fail. So pre check for problems earlier. */ static int gic_check_rdist(unsigned int cpu) { if (cpumask_test_cpu(cpu, &broken_rdists)) return -EINVAL; return 0; } static int gic_starting_cpu(unsigned int cpu) { gic_cpu_sys_reg_enable(); gic_cpu_init(); if (gic_dist_supports_lpis()) its_cpu_init(); return 0; } static u16 gic_compute_target_list(int *base_cpu, const struct cpumask *mask, unsigned long cluster_id) { int next_cpu, cpu = *base_cpu; unsigned long mpidr; u16 tlist = 0; mpidr = gic_cpu_to_affinity(cpu); while (cpu < nr_cpu_ids) { tlist |= 1 << (mpidr & 0xf); next_cpu = cpumask_next(cpu, mask); if (next_cpu >= nr_cpu_ids) goto out; cpu = next_cpu; mpidr = gic_cpu_to_affinity(cpu); if (cluster_id != MPIDR_TO_SGI_CLUSTER_ID(mpidr)) { cpu--; goto out; } } out: *base_cpu = cpu; return tlist; } #define MPIDR_TO_SGI_AFFINITY(cluster_id, level) \ (MPIDR_AFFINITY_LEVEL(cluster_id, level) \ << ICC_SGI1R_AFFINITY_## level ##_SHIFT) static void gic_send_sgi(u64 cluster_id, u16 tlist, unsigned int irq) { u64 val; val = (MPIDR_TO_SGI_AFFINITY(cluster_id, 3) | MPIDR_TO_SGI_AFFINITY(cluster_id, 2) | irq << ICC_SGI1R_SGI_ID_SHIFT | MPIDR_TO_SGI_AFFINITY(cluster_id, 1) | MPIDR_TO_SGI_RS(cluster_id) | tlist << ICC_SGI1R_TARGET_LIST_SHIFT); pr_devel("CPU%d: ICC_SGI1R_EL1 %llx\n", smp_processor_id(), val); gic_write_sgi1r(val); } static void gic_ipi_send_mask(struct irq_data *d, const struct cpumask *mask) { int cpu; if (WARN_ON(d->hwirq >= 16)) return; /* * Ensure that stores to Normal memory are visible to the * other CPUs before issuing the IPI. */ dsb(ishst); for_each_cpu(cpu, mask) { u64 cluster_id = MPIDR_TO_SGI_CLUSTER_ID(gic_cpu_to_affinity(cpu)); u16 tlist; tlist = gic_compute_target_list(&cpu, mask, cluster_id); gic_send_sgi(cluster_id, tlist, d->hwirq); } /* Force the above writes to ICC_SGI1R_EL1 to be executed */ isb(); } static void __init gic_smp_init(void) { struct irq_fwspec sgi_fwspec = { .fwnode = gic_data.fwnode, .param_count = 1, }; int base_sgi; cpuhp_setup_state_nocalls(CPUHP_BP_PREPARE_DYN, "irqchip/arm/gicv3:checkrdist", gic_check_rdist, NULL); cpuhp_setup_state_nocalls(CPUHP_AP_IRQ_GIC_STARTING, "irqchip/arm/gicv3:starting", gic_starting_cpu, NULL); /* Register all 8 non-secure SGIs */ base_sgi = irq_domain_alloc_irqs(gic_data.domain, 8, NUMA_NO_NODE, &sgi_fwspec); if (WARN_ON(base_sgi <= 0)) return; set_smp_ipi_range(base_sgi, 8); } static int gic_set_affinity(struct irq_data *d, const struct cpumask *mask_val, bool force) { unsigned int cpu; u32 offset, index; void __iomem *reg; int enabled; u64 val; if (force) cpu = cpumask_first(mask_val); else cpu = cpumask_any_and(mask_val, cpu_online_mask); if (cpu >= nr_cpu_ids) return -EINVAL; if (gic_irq_in_rdist(d)) return -EINVAL; /* If interrupt was enabled, disable it first */ enabled = gic_peek_irq(d, GICD_ISENABLER); if (enabled) gic_mask_irq(d); offset = convert_offset_index(d, GICD_IROUTER, &index); reg = gic_dist_base(d) + offset + (index * 8); val = gic_cpu_to_affinity(cpu); gic_write_irouter(val, reg); /* * If the interrupt was enabled, enabled it again. Otherwise, * just wait for the distributor to have digested our changes. */ if (enabled) gic_unmask_irq(d); irq_data_update_effective_affinity(d, cpumask_of(cpu)); return IRQ_SET_MASK_OK_DONE; } #else #define gic_set_affinity NULL #define gic_ipi_send_mask NULL #define gic_smp_init() do { } while(0) #endif static int gic_retrigger(struct irq_data *data) { return !gic_irq_set_irqchip_state(data, IRQCHIP_STATE_PENDING, true); } #ifdef CONFIG_CPU_PM static int gic_cpu_pm_notifier(struct notifier_block *self, unsigned long cmd, void *v) { if (cmd == CPU_PM_EXIT || cmd == CPU_PM_ENTER_FAILED) { if (gic_dist_security_disabled()) gic_enable_redist(true); gic_cpu_sys_reg_enable(); gic_cpu_sys_reg_init(); } else if (cmd == CPU_PM_ENTER && gic_dist_security_disabled()) { gic_write_grpen1(0); gic_enable_redist(false); } return NOTIFY_OK; } static struct notifier_block gic_cpu_pm_notifier_block = { .notifier_call = gic_cpu_pm_notifier, }; static void gic_cpu_pm_init(void) { cpu_pm_register_notifier(&gic_cpu_pm_notifier_block); } #else static inline void gic_cpu_pm_init(void) { } #endif /* CONFIG_CPU_PM */ static struct irq_chip gic_chip = { .name = "GICv3", .irq_mask = gic_mask_irq, .irq_unmask = gic_unmask_irq, .irq_eoi = gic_eoi_irq, .irq_set_type = gic_set_type, .irq_set_affinity = gic_set_affinity, .irq_retrigger = gic_retrigger, .irq_get_irqchip_state = gic_irq_get_irqchip_state, .irq_set_irqchip_state = gic_irq_set_irqchip_state, .irq_nmi_setup = gic_irq_nmi_setup, .irq_nmi_teardown = gic_irq_nmi_teardown, .ipi_send_mask = gic_ipi_send_mask, .flags = IRQCHIP_SET_TYPE_MASKED | IRQCHIP_SKIP_SET_WAKE | IRQCHIP_MASK_ON_SUSPEND, }; static struct irq_chip gic_eoimode1_chip = { .name = "GICv3", .irq_mask = gic_eoimode1_mask_irq, .irq_unmask = gic_unmask_irq, .irq_eoi = gic_eoimode1_eoi_irq, .irq_set_type = gic_set_type, .irq_set_affinity = gic_set_affinity, .irq_retrigger = gic_retrigger, .irq_get_irqchip_state = gic_irq_get_irqchip_state, .irq_set_irqchip_state = gic_irq_set_irqchip_state, .irq_set_vcpu_affinity = gic_irq_set_vcpu_affinity, .irq_nmi_setup = gic_irq_nmi_setup, .irq_nmi_teardown = gic_irq_nmi_teardown, .ipi_send_mask = gic_ipi_send_mask, .flags = IRQCHIP_SET_TYPE_MASKED | IRQCHIP_SKIP_SET_WAKE | IRQCHIP_MASK_ON_SUSPEND, }; static int gic_irq_domain_map(struct irq_domain *d, unsigned int irq, irq_hw_number_t hw) { struct irq_chip *chip = &gic_chip; struct irq_data *irqd = irq_desc_get_irq_data(irq_to_desc(irq)); if (static_branch_likely(&supports_deactivate_key)) chip = &gic_eoimode1_chip; switch (__get_intid_range(hw)) { case SGI_RANGE: case PPI_RANGE: case EPPI_RANGE: irq_set_percpu_devid(irq); irq_domain_set_info(d, irq, hw, chip, d->host_data, handle_percpu_devid_irq, NULL, NULL); break; case SPI_RANGE: case ESPI_RANGE: irq_domain_set_info(d, irq, hw, chip, d->host_data, handle_fasteoi_irq, NULL, NULL); irq_set_probe(irq); irqd_set_single_target(irqd); break; case LPI_RANGE: if (!gic_dist_supports_lpis()) return -EPERM; irq_domain_set_info(d, irq, hw, chip, d->host_data, handle_fasteoi_irq, NULL, NULL); break; default: return -EPERM; } /* Prevents SW retriggers which mess up the ACK/EOI ordering */ irqd_set_handle_enforce_irqctx(irqd); return 0; } static int gic_irq_domain_translate(struct irq_domain *d, struct irq_fwspec *fwspec, unsigned long *hwirq, unsigned int *type) { if (fwspec->param_count == 1 && fwspec->param[0] < 16) { *hwirq = fwspec->param[0]; *type = IRQ_TYPE_EDGE_RISING; return 0; } if (is_of_node(fwspec->fwnode)) { if (fwspec->param_count < 3) return -EINVAL; switch (fwspec->param[0]) { case 0: /* SPI */ *hwirq = fwspec->param[1] + 32; break; case 1: /* PPI */ *hwirq = fwspec->param[1] + 16; break; case 2: /* ESPI */ *hwirq = fwspec->param[1] + ESPI_BASE_INTID; break; case 3: /* EPPI */ *hwirq = fwspec->param[1] + EPPI_BASE_INTID; break; case GIC_IRQ_TYPE_LPI: /* LPI */ *hwirq = fwspec->param[1]; break; case GIC_IRQ_TYPE_PARTITION: *hwirq = fwspec->param[1]; if (fwspec->param[1] >= 16) *hwirq += EPPI_BASE_INTID - 16; else *hwirq += 16; break; default: return -EINVAL; } *type = fwspec->param[2] & IRQ_TYPE_SENSE_MASK; /* * Make it clear that broken DTs are... broken. * Partitioned PPIs are an unfortunate exception. */ WARN_ON(*type == IRQ_TYPE_NONE && fwspec->param[0] != GIC_IRQ_TYPE_PARTITION); return 0; } if (is_fwnode_irqchip(fwspec->fwnode)) { if(fwspec->param_count != 2) return -EINVAL; if (fwspec->param[0] < 16) { pr_err(FW_BUG "Illegal GSI%d translation request\n", fwspec->param[0]); return -EINVAL; } *hwirq = fwspec->param[0]; *type = fwspec->param[1]; WARN_ON(*type == IRQ_TYPE_NONE); return 0; } return -EINVAL; } static int gic_irq_domain_alloc(struct irq_domain *domain, unsigned int virq, unsigned int nr_irqs, void *arg) { int i, ret; irq_hw_number_t hwirq; unsigned int type = IRQ_TYPE_NONE; struct irq_fwspec *fwspec = arg; ret = gic_irq_domain_translate(domain, fwspec, &hwirq, &type); if (ret) return ret; for (i = 0; i < nr_irqs; i++) { ret = gic_irq_domain_map(domain, virq + i, hwirq + i); if (ret) return ret; } return 0; } static void gic_irq_domain_free(struct irq_domain *domain, unsigned int virq, unsigned int nr_irqs) { int i; for (i = 0; i < nr_irqs; i++) { struct irq_data *d = irq_domain_get_irq_data(domain, virq + i); irq_set_handler(virq + i, NULL); irq_domain_reset_irq_data(d); } } static bool fwspec_is_partitioned_ppi(struct irq_fwspec *fwspec, irq_hw_number_t hwirq) { enum gic_intid_range range; if (!gic_data.ppi_descs) return false; if (!is_of_node(fwspec->fwnode)) return false; if (fwspec->param_count < 4 || !fwspec->param[3]) return false; range = __get_intid_range(hwirq); if (range != PPI_RANGE && range != EPPI_RANGE) return false; return true; } static int gic_irq_domain_select(struct irq_domain *d, struct irq_fwspec *fwspec, enum irq_domain_bus_token bus_token) { unsigned int type, ret, ppi_idx; irq_hw_number_t hwirq; /* Not for us */ if (fwspec->fwnode != d->fwnode) return 0; /* Handle pure domain searches */ if (!fwspec->param_count) return d->bus_token == bus_token; /* If this is not DT, then we have a single domain */ if (!is_of_node(fwspec->fwnode)) return 1; ret = gic_irq_domain_translate(d, fwspec, &hwirq, &type); if (WARN_ON_ONCE(ret)) return 0; if (!fwspec_is_partitioned_ppi(fwspec, hwirq)) return d == gic_data.domain; /* * If this is a PPI and we have a 4th (non-null) parameter, * then we need to match the partition domain. */ ppi_idx = __gic_get_ppi_index(hwirq); return d == partition_get_domain(gic_data.ppi_descs[ppi_idx]); } static const struct irq_domain_ops gic_irq_domain_ops = { .translate = gic_irq_domain_translate, .alloc = gic_irq_domain_alloc, .free = gic_irq_domain_free, .select = gic_irq_domain_select, }; static int partition_domain_translate(struct irq_domain *d, struct irq_fwspec *fwspec, unsigned long *hwirq, unsigned int *type) { unsigned long ppi_intid; struct device_node *np; unsigned int ppi_idx; int ret; if (!gic_data.ppi_descs) return -ENOMEM; np = of_find_node_by_phandle(fwspec->param[3]); if (WARN_ON(!np)) return -EINVAL; ret = gic_irq_domain_translate(d, fwspec, &ppi_intid, type); if (WARN_ON_ONCE(ret)) return 0; ppi_idx = __gic_get_ppi_index(ppi_intid); ret = partition_translate_id(gic_data.ppi_descs[ppi_idx], of_fwnode_handle(np)); if (ret < 0) return ret; *hwirq = ret; *type = fwspec->param[2] & IRQ_TYPE_SENSE_MASK; return 0; } static const struct irq_domain_ops partition_domain_ops = { .translate = partition_domain_translate, .select = gic_irq_domain_select, }; static bool gic_enable_quirk_msm8996(void *data) { struct gic_chip_data *d = data; d->flags |= FLAGS_WORKAROUND_GICR_WAKER_MSM8996; return true; } static bool gic_enable_quirk_cavium_38539(void *data) { struct gic_chip_data *d = data; d->flags |= FLAGS_WORKAROUND_CAVIUM_ERRATUM_38539; return true; } static bool gic_enable_quirk_hip06_07(void *data) { struct gic_chip_data *d = data; /* * HIP06 GICD_IIDR clashes with GIC-600 product number (despite * not being an actual ARM implementation). The saving grace is * that GIC-600 doesn't have ESPI, so nothing to do in that case. * HIP07 doesn't even have a proper IIDR, and still pretends to * have ESPI. In both cases, put them right. */ if (d->rdists.gicd_typer & GICD_TYPER_ESPI) { /* Zero both ESPI and the RES0 field next to it... */ d->rdists.gicd_typer &= ~GENMASK(9, 8); return true; } return false; } #define T241_CHIPN_MASK GENMASK_ULL(45, 44) #define T241_CHIP_GICDA_OFFSET 0x1580000 #define SMCCC_SOC_ID_T241 0x036b0241 static bool gic_enable_quirk_nvidia_t241(void *data) { s32 soc_id = arm_smccc_get_soc_id_version(); unsigned long chip_bmask = 0; phys_addr_t phys; u32 i; /* Check JEP106 code for NVIDIA T241 chip (036b:0241) */ if ((soc_id < 0) || (soc_id != SMCCC_SOC_ID_T241)) return false; /* Find the chips based on GICR regions PHYS addr */ for (i = 0; i < gic_data.nr_redist_regions; i++) { chip_bmask |= BIT(FIELD_GET(T241_CHIPN_MASK, (u64)gic_data.redist_regions[i].phys_base)); } if (hweight32(chip_bmask) < 3) return false; /* Setup GICD alias regions */ for (i = 0; i < ARRAY_SIZE(t241_dist_base_alias); i++) { if (chip_bmask & BIT(i)) { phys = gic_data.dist_phys_base + T241_CHIP_GICDA_OFFSET; phys |= FIELD_PREP(T241_CHIPN_MASK, i); t241_dist_base_alias[i] = ioremap(phys, SZ_64K); WARN_ON_ONCE(!t241_dist_base_alias[i]); } } static_branch_enable(&gic_nvidia_t241_erratum); return true; } static bool gic_enable_quirk_asr8601(void *data) { struct gic_chip_data *d = data; d->flags |= FLAGS_WORKAROUND_ASR_ERRATUM_8601001; return true; } static bool gic_enable_quirk_arm64_2941627(void *data) { static_branch_enable(&gic_arm64_2941627_erratum); return true; } static bool gic_enable_quirk_rk3399(void *data) { struct gic_chip_data *d = data; if (of_machine_is_compatible("rockchip,rk3399")) { d->flags |= FLAGS_WORKAROUND_INSECURE; return true; } return false; } static bool rd_set_non_coherent(void *data) { struct gic_chip_data *d = data; d->rdists.flags |= RDIST_FLAGS_FORCE_NON_SHAREABLE; return true; } static const struct gic_quirk gic_quirks[] = { { .desc = "GICv3: Qualcomm MSM8996 broken firmware", .compatible = "qcom,msm8996-gic-v3", .init = gic_enable_quirk_msm8996, }, { .desc = "GICv3: ASR erratum 8601001", .compatible = "asr,asr8601-gic-v3", .init = gic_enable_quirk_asr8601, }, { .desc = "GICv3: HIP06 erratum 161010803", .iidr = 0x0204043b, .mask = 0xffffffff, .init = gic_enable_quirk_hip06_07, }, { .desc = "GICv3: HIP07 erratum 161010803", .iidr = 0x00000000, .mask = 0xffffffff, .init = gic_enable_quirk_hip06_07, }, { /* * Reserved register accesses generate a Synchronous * External Abort. This erratum applies to: * - ThunderX: CN88xx * - OCTEON TX: CN83xx, CN81xx * - OCTEON TX2: CN93xx, CN96xx, CN98xx, CNF95xx* */ .desc = "GICv3: Cavium erratum 38539", .iidr = 0xa000034c, .mask = 0xe8f00fff, .init = gic_enable_quirk_cavium_38539, }, { .desc = "GICv3: NVIDIA erratum T241-FABRIC-4", .iidr = 0x0402043b, .mask = 0xffffffff, .init = gic_enable_quirk_nvidia_t241, }, { /* * GIC-700: 2941627 workaround - IP variant [0,1] * */ .desc = "GICv3: ARM64 erratum 2941627", .iidr = 0x0400043b, .mask = 0xff0e0fff, .init = gic_enable_quirk_arm64_2941627, }, { /* * GIC-700: 2941627 workaround - IP variant [2] */ .desc = "GICv3: ARM64 erratum 2941627", .iidr = 0x0402043b, .mask = 0xff0f0fff, .init = gic_enable_quirk_arm64_2941627, }, { .desc = "GICv3: non-coherent attribute", .property = "dma-noncoherent", .init = rd_set_non_coherent, }, { .desc = "GICv3: Insecure RK3399 integration", .iidr = 0x0000043b, .mask = 0xff000fff, .init = gic_enable_quirk_rk3399, }, { } }; static void gic_enable_nmi_support(void) { int i; if (!gic_prio_masking_enabled() || nmi_support_forbidden) return; rdist_nmi_refs = kcalloc(gic_data.ppi_nr + SGI_NR, sizeof(*rdist_nmi_refs), GFP_KERNEL); if (!rdist_nmi_refs) return; for (i = 0; i < gic_data.ppi_nr + SGI_NR; i++) refcount_set(&rdist_nmi_refs[i], 0); pr_info("Pseudo-NMIs enabled using %s ICC_PMR_EL1 synchronisation\n", gic_has_relaxed_pmr_sync() ? "relaxed" : "forced"); static_branch_enable(&supports_pseudo_nmis); if (static_branch_likely(&supports_deactivate_key)) gic_eoimode1_chip.flags |= IRQCHIP_SUPPORTS_NMI; else gic_chip.flags |= IRQCHIP_SUPPORTS_NMI; } static int __init gic_init_bases(phys_addr_t dist_phys_base, void __iomem *dist_base, struct redist_region *rdist_regs, u32 nr_redist_regions, u64 redist_stride, struct fwnode_handle *handle) { u32 typer; int err; if (!is_hyp_mode_available()) static_branch_disable(&supports_deactivate_key); if (static_branch_likely(&supports_deactivate_key)) pr_info("GIC: Using split EOI/Deactivate mode\n"); gic_data.fwnode = handle; gic_data.dist_phys_base = dist_phys_base; gic_data.dist_base = dist_base; gic_data.redist_regions = rdist_regs; gic_data.nr_redist_regions = nr_redist_regions; gic_data.redist_stride = redist_stride; /* * Find out how many interrupts are supported. */ typer = readl_relaxed(gic_data.dist_base + GICD_TYPER); gic_data.rdists.gicd_typer = typer; gic_enable_quirks(readl_relaxed(gic_data.dist_base + GICD_IIDR), gic_quirks, &gic_data); pr_info("%d SPIs implemented\n", GIC_LINE_NR - 32); pr_info("%d Extended SPIs implemented\n", GIC_ESPI_NR); /* * ThunderX1 explodes on reading GICD_TYPER2, in violation of the * architecture spec (which says that reserved registers are RES0). */ if (!(gic_data.flags & FLAGS_WORKAROUND_CAVIUM_ERRATUM_38539)) gic_data.rdists.gicd_typer2 = readl_relaxed(gic_data.dist_base + GICD_TYPER2); gic_data.domain = irq_domain_create_tree(handle, &gic_irq_domain_ops, &gic_data); gic_data.rdists.rdist = alloc_percpu(typeof(*gic_data.rdists.rdist)); if (!static_branch_unlikely(&gic_nvidia_t241_erratum)) { /* Disable GICv4.x features for the erratum T241-FABRIC-4 */ gic_data.rdists.has_rvpeid = true; gic_data.rdists.has_vlpis = true; gic_data.rdists.has_direct_lpi = true; gic_data.rdists.has_vpend_valid_dirty = true; } if (WARN_ON(!gic_data.domain) || WARN_ON(!gic_data.rdists.rdist)) { err = -ENOMEM; goto out_free; } irq_domain_update_bus_token(gic_data.domain, DOMAIN_BUS_WIRED); gic_data.has_rss = !!(typer & GICD_TYPER_RSS); if (typer & GICD_TYPER_MBIS) { err = mbi_init(handle, gic_data.domain); if (err) pr_err("Failed to initialize MBIs\n"); } set_handle_irq(gic_handle_irq); gic_update_rdist_properties(); gic_cpu_sys_reg_enable(); gic_prio_init(); gic_dist_init(); gic_cpu_init(); gic_enable_nmi_support(); gic_smp_init(); gic_cpu_pm_init(); if (gic_dist_supports_lpis()) { its_init(handle, &gic_data.rdists, gic_data.domain, dist_prio_irq); its_cpu_init(); its_lpi_memreserve_init(); } else { if (IS_ENABLED(CONFIG_ARM_GIC_V2M)) gicv2m_init(handle, gic_data.domain); } return 0; out_free: if (gic_data.domain) irq_domain_remove(gic_data.domain); free_percpu(gic_data.rdists.rdist); return err; } static int __init gic_validate_dist_version(void __iomem *dist_base) { u32 reg = readl_relaxed(dist_base + GICD_PIDR2) & GIC_PIDR2_ARCH_MASK; if (reg != GIC_PIDR2_ARCH_GICv3 && reg != GIC_PIDR2_ARCH_GICv4) return -ENODEV; return 0; } /* Create all possible partitions at boot time */ static void __init gic_populate_ppi_partitions(struct device_node *gic_node) { struct device_node *parts_node, *child_part; int part_idx = 0, i; int nr_parts; struct partition_affinity *parts; parts_node = of_get_child_by_name(gic_node, "ppi-partitions"); if (!parts_node) return; gic_data.ppi_descs = kcalloc(gic_data.ppi_nr, sizeof(*gic_data.ppi_descs), GFP_KERNEL); if (!gic_data.ppi_descs) goto out_put_node; nr_parts = of_get_child_count(parts_node); if (!nr_parts) goto out_put_node; parts = kcalloc(nr_parts, sizeof(*parts), GFP_KERNEL); if (WARN_ON(!parts)) goto out_put_node; for_each_child_of_node(parts_node, child_part) { struct partition_affinity *part; int n; part = &parts[part_idx]; part->partition_id = of_fwnode_handle(child_part); pr_info("GIC: PPI partition %pOFn[%d] { ", child_part, part_idx); n = of_property_count_elems_of_size(child_part, "affinity", sizeof(u32)); WARN_ON(n <= 0); for (i = 0; i < n; i++) { int err, cpu; u32 cpu_phandle; struct device_node *cpu_node; err = of_property_read_u32_index(child_part, "affinity", i, &cpu_phandle); if (WARN_ON(err)) continue; cpu_node = of_find_node_by_phandle(cpu_phandle); if (WARN_ON(!cpu_node)) continue; cpu = of_cpu_node_to_id(cpu_node); if (WARN_ON(cpu < 0)) { of_node_put(cpu_node); continue; } pr_cont("%pOF[%d] ", cpu_node, cpu); cpumask_set_cpu(cpu, &part->mask); of_node_put(cpu_node); } pr_cont("}\n"); part_idx++; } for (i = 0; i < gic_data.ppi_nr; i++) { unsigned int irq; struct partition_desc *desc; struct irq_fwspec ppi_fwspec = { .fwnode = gic_data.fwnode, .param_count = 3, .param = { [0] = GIC_IRQ_TYPE_PARTITION, [1] = i, [2] = IRQ_TYPE_NONE, }, }; irq = irq_create_fwspec_mapping(&ppi_fwspec); if (WARN_ON(!irq)) continue; desc = partition_create_desc(gic_data.fwnode, parts, nr_parts, irq, &partition_domain_ops); if (WARN_ON(!desc)) continue; gic_data.ppi_descs[i] = desc; } out_put_node: of_node_put(parts_node); } static void __init gic_of_setup_kvm_info(struct device_node *node, u32 nr_redist_regions) { int ret; struct resource r; gic_v3_kvm_info.type = GIC_V3; gic_v3_kvm_info.maint_irq = irq_of_parse_and_map(node, 0); if (!gic_v3_kvm_info.maint_irq) return; /* Also skip GICD, GICC, GICH */ ret = of_address_to_resource(node, nr_redist_regions + 3, &r); if (!ret) gic_v3_kvm_info.vcpu = r; gic_v3_kvm_info.has_v4 = gic_data.rdists.has_vlpis; gic_v3_kvm_info.has_v4_1 = gic_data.rdists.has_rvpeid; vgic_set_kvm_info(&gic_v3_kvm_info); } static void gic_request_region(resource_size_t base, resource_size_t size, const char *name) { if (!request_mem_region(base, size, name)) pr_warn_once(FW_BUG "%s region %pa has overlapping address\n", name, &base); } static void __iomem *gic_of_iomap(struct device_node *node, int idx, const char *name, struct resource *res) { void __iomem *base; int ret; ret = of_address_to_resource(node, idx, res); if (ret) return IOMEM_ERR_PTR(ret); gic_request_region(res->start, resource_size(res), name); base = of_iomap(node, idx); return base ?: IOMEM_ERR_PTR(-ENOMEM); } static int __init gic_of_init(struct device_node *node, struct device_node *parent) { phys_addr_t dist_phys_base; void __iomem *dist_base; struct redist_region *rdist_regs; struct resource res; u64 redist_stride; u32 nr_redist_regions; int err, i; dist_base = gic_of_iomap(node, 0, "GICD", &res); if (IS_ERR(dist_base)) { pr_err("%pOF: unable to map gic dist registers\n", node); return PTR_ERR(dist_base); } dist_phys_base = res.start; err = gic_validate_dist_version(dist_base); if (err) { pr_err("%pOF: no distributor detected, giving up\n", node); goto out_unmap_dist; } if (of_property_read_u32(node, "#redistributor-regions", &nr_redist_regions)) nr_redist_regions = 1; rdist_regs = kcalloc(nr_redist_regions, sizeof(*rdist_regs), GFP_KERNEL); if (!rdist_regs) { err = -ENOMEM; goto out_unmap_dist; } for (i = 0; i < nr_redist_regions; i++) { rdist_regs[i].redist_base = gic_of_iomap(node, 1 + i, "GICR", &res); if (IS_ERR(rdist_regs[i].redist_base)) { pr_err("%pOF: couldn't map region %d\n", node, i); err = -ENODEV; goto out_unmap_rdist; } rdist_regs[i].phys_base = res.start; } if (of_property_read_u64(node, "redistributor-stride", &redist_stride)) redist_stride = 0; gic_enable_of_quirks(node, gic_quirks, &gic_data); err = gic_init_bases(dist_phys_base, dist_base, rdist_regs, nr_redist_regions, redist_stride, &node->fwnode); if (err) goto out_unmap_rdist; gic_populate_ppi_partitions(node); if (static_branch_likely(&supports_deactivate_key)) gic_of_setup_kvm_info(node, nr_redist_regions); return 0; out_unmap_rdist: for (i = 0; i < nr_redist_regions; i++) if (rdist_regs[i].redist_base && !IS_ERR(rdist_regs[i].redist_base)) iounmap(rdist_regs[i].redist_base); kfree(rdist_regs); out_unmap_dist: iounmap(dist_base); return err; } IRQCHIP_DECLARE(gic_v3, "arm,gic-v3", gic_of_init); #ifdef CONFIG_ACPI static struct { void __iomem *dist_base; struct redist_region *redist_regs; u32 nr_redist_regions; bool single_redist; int enabled_rdists; u32 maint_irq; int maint_irq_mode; phys_addr_t vcpu_base; } acpi_data __initdata; static void __init gic_acpi_register_redist(phys_addr_t phys_base, void __iomem *redist_base) { static int count = 0; acpi_data.redist_regs[count].phys_base = phys_base; acpi_data.redist_regs[count].redist_base = redist_base; acpi_data.redist_regs[count].single_redist = acpi_data.single_redist; count++; } static int __init gic_acpi_parse_madt_redist(union acpi_subtable_headers *header, const unsigned long end) { struct acpi_madt_generic_redistributor *redist = (struct acpi_madt_generic_redistributor *)header; void __iomem *redist_base; redist_base = ioremap(redist->base_address, redist->length); if (!redist_base) { pr_err("Couldn't map GICR region @%llx\n", redist->base_address); return -ENOMEM; } if (acpi_get_madt_revision() >= 7 && (redist->flags & ACPI_MADT_GICR_NON_COHERENT)) gic_data.rdists.flags |= RDIST_FLAGS_FORCE_NON_SHAREABLE; gic_request_region(redist->base_address, redist->length, "GICR"); gic_acpi_register_redist(redist->base_address, redist_base); return 0; } static int __init gic_acpi_parse_madt_gicc(union acpi_subtable_headers *header, const unsigned long end) { struct acpi_madt_generic_interrupt *gicc = (struct acpi_madt_generic_interrupt *)header; u32 reg = readl_relaxed(acpi_data.dist_base + GICD_PIDR2) & GIC_PIDR2_ARCH_MASK; u32 size = reg == GIC_PIDR2_ARCH_GICv4 ? SZ_64K * 4 : SZ_64K * 2; void __iomem *redist_base; /* Neither enabled or online capable means it doesn't exist, skip it */ if (!(gicc->flags & (ACPI_MADT_ENABLED | ACPI_MADT_GICC_ONLINE_CAPABLE))) return 0; /* * Capable but disabled CPUs can be brought online later. What about * the redistributor? ACPI doesn't want to say! * Virtual hotplug systems can use the MADT's "always-on" GICR entries. * Otherwise, prevent such CPUs from being brought online. */ if (!(gicc->flags & ACPI_MADT_ENABLED)) { int cpu = get_cpu_for_acpi_id(gicc->uid); pr_warn("CPU %u's redistributor is inaccessible: this CPU can't be brought online\n", cpu); if (cpu >= 0) cpumask_set_cpu(cpu, &broken_rdists); return 0; } redist_base = ioremap(gicc->gicr_base_address, size); if (!redist_base) return -ENOMEM; gic_request_region(gicc->gicr_base_address, size, "GICR"); if (acpi_get_madt_revision() >= 7 && (gicc->flags & ACPI_MADT_GICC_NON_COHERENT)) gic_data.rdists.flags |= RDIST_FLAGS_FORCE_NON_SHAREABLE; gic_acpi_register_redist(gicc->gicr_base_address, redist_base); return 0; } static int __init gic_acpi_collect_gicr_base(void) { acpi_tbl_entry_handler redist_parser; enum acpi_madt_type type; if (acpi_data.single_redist) { type = ACPI_MADT_TYPE_GENERIC_INTERRUPT; redist_parser = gic_acpi_parse_madt_gicc; } else { type = ACPI_MADT_TYPE_GENERIC_REDISTRIBUTOR; redist_parser = gic_acpi_parse_madt_redist; } /* Collect redistributor base addresses in GICR entries */ if (acpi_table_parse_madt(type, redist_parser, 0) > 0) return 0; pr_info("No valid GICR entries exist\n"); return -ENODEV; } static int __init gic_acpi_match_gicr(union acpi_subtable_headers *header, const unsigned long end) { /* Subtable presence means that redist exists, that's it */ return 0; } static int __init gic_acpi_match_gicc(union acpi_subtable_headers *header, const unsigned long end) { struct acpi_madt_generic_interrupt *gicc = (struct acpi_madt_generic_interrupt *)header; /* * If GICC is enabled and has valid gicr base address, then it means * GICR base is presented via GICC. The redistributor is only known to * be accessible if the GICC is marked as enabled. If this bit is not * set, we'd need to add the redistributor at runtime, which isn't * supported. */ if (gicc->flags & ACPI_MADT_ENABLED && gicc->gicr_base_address) acpi_data.enabled_rdists++; return 0; } static int __init gic_acpi_count_gicr_regions(void) { int count; /* * Count how many redistributor regions we have. It is not allowed * to mix redistributor description, GICR and GICC subtables have to be * mutually exclusive. */ count = acpi_table_parse_madt(ACPI_MADT_TYPE_GENERIC_REDISTRIBUTOR, gic_acpi_match_gicr, 0); if (count > 0) { acpi_data.single_redist = false; return count; } count = acpi_table_parse_madt(ACPI_MADT_TYPE_GENERIC_INTERRUPT, gic_acpi_match_gicc, 0); if (count > 0) { acpi_data.single_redist = true; count = acpi_data.enabled_rdists; } return count; } static bool __init acpi_validate_gic_table(struct acpi_subtable_header *header, struct acpi_probe_entry *ape) { struct acpi_madt_generic_distributor *dist; int count; dist = (struct acpi_madt_generic_distributor *)header; if (dist->version != ape->driver_data) return false; /* We need to do that exercise anyway, the sooner the better */ count = gic_acpi_count_gicr_regions(); if (count <= 0) return false; acpi_data.nr_redist_regions = count; return true; } static int __init gic_acpi_parse_virt_madt_gicc(union acpi_subtable_headers *header, const unsigned long end) { struct acpi_madt_generic_interrupt *gicc = (struct acpi_madt_generic_interrupt *)header; int maint_irq_mode; static int first_madt = true; if (!(gicc->flags & (ACPI_MADT_ENABLED | ACPI_MADT_GICC_ONLINE_CAPABLE))) return 0; maint_irq_mode = (gicc->flags & ACPI_MADT_VGIC_IRQ_MODE) ? ACPI_EDGE_SENSITIVE : ACPI_LEVEL_SENSITIVE; if (first_madt) { first_madt = false; acpi_data.maint_irq = gicc->vgic_interrupt; acpi_data.maint_irq_mode = maint_irq_mode; acpi_data.vcpu_base = gicc->gicv_base_address; return 0; } /* * The maintenance interrupt and GICV should be the same for every CPU */ if ((acpi_data.maint_irq != gicc->vgic_interrupt) || (acpi_data.maint_irq_mode != maint_irq_mode) || (acpi_data.vcpu_base != gicc->gicv_base_address)) return -EINVAL; return 0; } static bool __init gic_acpi_collect_virt_info(void) { int count; count = acpi_table_parse_madt(ACPI_MADT_TYPE_GENERIC_INTERRUPT, gic_acpi_parse_virt_madt_gicc, 0); return (count > 0); } #define ACPI_GICV3_DIST_MEM_SIZE (SZ_64K) #define ACPI_GICV2_VCTRL_MEM_SIZE (SZ_4K) #define ACPI_GICV2_VCPU_MEM_SIZE (SZ_8K) static void __init gic_acpi_setup_kvm_info(void) { int irq; if (!gic_acpi_collect_virt_info()) { pr_warn("Unable to get hardware information used for virtualization\n"); return; } gic_v3_kvm_info.type = GIC_V3; irq = acpi_register_gsi(NULL, acpi_data.maint_irq, acpi_data.maint_irq_mode, ACPI_ACTIVE_HIGH); if (irq <= 0) return; gic_v3_kvm_info.maint_irq = irq; if (acpi_data.vcpu_base) { struct resource *vcpu = &gic_v3_kvm_info.vcpu; vcpu->flags = IORESOURCE_MEM; vcpu->start = acpi_data.vcpu_base; vcpu->end = vcpu->start + ACPI_GICV2_VCPU_MEM_SIZE - 1; } gic_v3_kvm_info.has_v4 = gic_data.rdists.has_vlpis; gic_v3_kvm_info.has_v4_1 = gic_data.rdists.has_rvpeid; vgic_set_kvm_info(&gic_v3_kvm_info); } static struct fwnode_handle *gsi_domain_handle; static struct fwnode_handle *gic_v3_get_gsi_domain_id(u32 gsi) { return gsi_domain_handle; } static int __init gic_acpi_init(union acpi_subtable_headers *header, const unsigned long end) { struct acpi_madt_generic_distributor *dist; size_t size; int i, err; /* Get distributor base address */ dist = (struct acpi_madt_generic_distributor *)header; acpi_data.dist_base = ioremap(dist->base_address, ACPI_GICV3_DIST_MEM_SIZE); if (!acpi_data.dist_base) { pr_err("Unable to map GICD registers\n"); return -ENOMEM; } gic_request_region(dist->base_address, ACPI_GICV3_DIST_MEM_SIZE, "GICD"); err = gic_validate_dist_version(acpi_data.dist_base); if (err) { pr_err("No distributor detected at @%p, giving up\n", acpi_data.dist_base); goto out_dist_unmap; } size = sizeof(*acpi_data.redist_regs) * acpi_data.nr_redist_regions; acpi_data.redist_regs = kzalloc(size, GFP_KERNEL); if (!acpi_data.redist_regs) { err = -ENOMEM; goto out_dist_unmap; } err = gic_acpi_collect_gicr_base(); if (err) goto out_redist_unmap; gsi_domain_handle = irq_domain_alloc_fwnode(&dist->base_address); if (!gsi_domain_handle) { err = -ENOMEM; goto out_redist_unmap; } err = gic_init_bases(dist->base_address, acpi_data.dist_base, acpi_data.redist_regs, acpi_data.nr_redist_regions, 0, gsi_domain_handle); if (err) goto out_fwhandle_free; acpi_set_irq_model(ACPI_IRQ_MODEL_GIC, gic_v3_get_gsi_domain_id); if (static_branch_likely(&supports_deactivate_key)) gic_acpi_setup_kvm_info(); return 0; out_fwhandle_free: irq_domain_free_fwnode(gsi_domain_handle); out_redist_unmap: for (i = 0; i < acpi_data.nr_redist_regions; i++) if (acpi_data.redist_regs[i].redist_base) iounmap(acpi_data.redist_regs[i].redist_base); kfree(acpi_data.redist_regs); out_dist_unmap: iounmap(acpi_data.dist_base); return err; } IRQCHIP_ACPI_DECLARE(gic_v3, ACPI_MADT_TYPE_GENERIC_DISTRIBUTOR, acpi_validate_gic_table, ACPI_MADT_GIC_VERSION_V3, gic_acpi_init); IRQCHIP_ACPI_DECLARE(gic_v4, ACPI_MADT_TYPE_GENERIC_DISTRIBUTOR, acpi_validate_gic_table, ACPI_MADT_GIC_VERSION_V4, gic_acpi_init); IRQCHIP_ACPI_DECLARE(gic_v3_or_v4, ACPI_MADT_TYPE_GENERIC_DISTRIBUTOR, acpi_validate_gic_table, ACPI_MADT_GIC_VERSION_NONE, gic_acpi_init); #endif
3 4 4 1 5 4 5 11 11 5 5 4 5 5 4 5 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 // SPDX-License-Identifier: GPL-2.0 #include <linux/compiler.h> #include <linux/export.h> #include <linux/list_sort.h> #include <linux/list.h> /* * Returns a list organized in an intermediate format suited * to chaining of merge() calls: null-terminated, no reserved or * sentinel head node, "prev" links not maintained. */ __attribute__((nonnull(2,3,4))) static struct list_head *merge(void *priv, list_cmp_func_t cmp, struct list_head *a, struct list_head *b) { struct list_head *head, **tail = &head; for (;;) { /* if equal, take 'a' -- important for sort stability */ if (cmp(priv, a, b) <= 0) { *tail = a; tail = &a->next; a = a->next; if (!a) { *tail = b; break; } } else { *tail = b; tail = &b->next; b = b->next; if (!b) { *tail = a; break; } } } return head; } /* * Combine final list merge with restoration of standard doubly-linked * list structure. This approach duplicates code from merge(), but * runs faster than the tidier alternatives of either a separate final * prev-link restoration pass, or maintaining the prev links * throughout. */ __attribute__((nonnull(2,3,4,5))) static void merge_final(void *priv, list_cmp_func_t cmp, struct list_head *head, struct list_head *a, struct list_head *b) { struct list_head *tail = head; u8 count = 0; for (;;) { /* if equal, take 'a' -- important for sort stability */ if (cmp(priv, a, b) <= 0) { tail->next = a; a->prev = tail; tail = a; a = a->next; if (!a) break; } else { tail->next = b; b->prev = tail; tail = b; b = b->next; if (!b) { b = a; break; } } } /* Finish linking remainder of list b on to tail */ tail->next = b; do { /* * If the merge is highly unbalanced (e.g. the input is * already sorted), this loop may run many iterations. * Continue callbacks to the client even though no * element comparison is needed, so the client's cmp() * routine can invoke cond_resched() periodically. */ if (unlikely(!++count)) cmp(priv, b, b); b->prev = tail; tail = b; b = b->next; } while (b); /* And the final links to make a circular doubly-linked list */ tail->next = head; head->prev = tail; } /** * list_sort - sort a list * @priv: private data, opaque to list_sort(), passed to @cmp * @head: the list to sort * @cmp: the elements comparison function * * The comparison function @cmp must return > 0 if @a should sort after * @b ("@a > @b" if you want an ascending sort), and <= 0 if @a should * sort before @b *or* their original order should be preserved. It is * always called with the element that came first in the input in @a, * and list_sort is a stable sort, so it is not necessary to distinguish * the @a < @b and @a == @b cases. * * The comparison function must adhere to specific mathematical properties * to ensure correct and stable sorting: * - Antisymmetry: cmp(@a, @b) must return the opposite sign of * cmp(@b, @a). * - Transitivity: if cmp(@a, @b) <= 0 and cmp(@b, @c) <= 0, then * cmp(@a, @c) <= 0. * * This is compatible with two styles of @cmp function: * - The traditional style which returns <0 / =0 / >0, or * - Returning a boolean 0/1. * The latter offers a chance to save a few cycles in the comparison * (which is used by e.g. plug_ctx_cmp() in block/blk-mq.c). * * A good way to write a multi-word comparison is:: * * if (a->high != b->high) * return a->high > b->high; * if (a->middle != b->middle) * return a->middle > b->middle; * return a->low > b->low; * * * This mergesort is as eager as possible while always performing at least * 2:1 balanced merges. Given two pending sublists of size 2^k, they are * merged to a size-2^(k+1) list as soon as we have 2^k following elements. * * Thus, it will avoid cache thrashing as long as 3*2^k elements can * fit into the cache. Not quite as good as a fully-eager bottom-up * mergesort, but it does use 0.2*n fewer comparisons, so is faster in * the common case that everything fits into L1. * * * The merging is controlled by "count", the number of elements in the * pending lists. This is beautifully simple code, but rather subtle. * * Each time we increment "count", we set one bit (bit k) and clear * bits k-1 .. 0. Each time this happens (except the very first time * for each bit, when count increments to 2^k), we merge two lists of * size 2^k into one list of size 2^(k+1). * * This merge happens exactly when the count reaches an odd multiple of * 2^k, which is when we have 2^k elements pending in smaller lists, * so it's safe to merge away two lists of size 2^k. * * After this happens twice, we have created two lists of size 2^(k+1), * which will be merged into a list of size 2^(k+2) before we create * a third list of size 2^(k+1), so there are never more than two pending. * * The number of pending lists of size 2^k is determined by the * state of bit k of "count" plus two extra pieces of information: * * - The state of bit k-1 (when k == 0, consider bit -1 always set), and * - Whether the higher-order bits are zero or non-zero (i.e. * is count >= 2^(k+1)). * * There are six states we distinguish. "x" represents some arbitrary * bits, and "y" represents some arbitrary non-zero bits: * 0: 00x: 0 pending of size 2^k; x pending of sizes < 2^k * 1: 01x: 0 pending of size 2^k; 2^(k-1) + x pending of sizes < 2^k * 2: x10x: 0 pending of size 2^k; 2^k + x pending of sizes < 2^k * 3: x11x: 1 pending of size 2^k; 2^(k-1) + x pending of sizes < 2^k * 4: y00x: 1 pending of size 2^k; 2^k + x pending of sizes < 2^k * 5: y01x: 2 pending of size 2^k; 2^(k-1) + x pending of sizes < 2^k * (merge and loop back to state 2) * * We gain lists of size 2^k in the 2->3 and 4->5 transitions (because * bit k-1 is set while the more significant bits are non-zero) and * merge them away in the 5->2 transition. Note in particular that just * before the 5->2 transition, all lower-order bits are 11 (state 3), * so there is one list of each smaller size. * * When we reach the end of the input, we merge all the pending * lists, from smallest to largest. If you work through cases 2 to * 5 above, you can see that the number of elements we merge with a list * of size 2^k varies from 2^(k-1) (cases 3 and 5 when x == 0) to * 2^(k+1) - 1 (second merge of case 5 when x == 2^(k-1) - 1). */ __attribute__((nonnull(2,3))) void list_sort(void *priv, struct list_head *head, list_cmp_func_t cmp) { struct list_head *list = head->next, *pending = NULL; size_t count = 0; /* Count of pending */ if (list == head->prev) /* Zero or one elements */ return; /* Convert to a null-terminated singly-linked list. */ head->prev->next = NULL; /* * Data structure invariants: * - All lists are singly linked and null-terminated; prev * pointers are not maintained. * - pending is a prev-linked "list of lists" of sorted * sublists awaiting further merging. * - Each of the sorted sublists is power-of-two in size. * - Sublists are sorted by size and age, smallest & newest at front. * - There are zero to two sublists of each size. * - A pair of pending sublists are merged as soon as the number * of following pending elements equals their size (i.e. * each time count reaches an odd multiple of that size). * That ensures each later final merge will be at worst 2:1. * - Each round consists of: * - Merging the two sublists selected by the highest bit * which flips when count is incremented, and * - Adding an element from the input as a size-1 sublist. */ do { size_t bits; struct list_head **tail = &pending; /* Find the least-significant clear bit in count */ for (bits = count; bits & 1; bits >>= 1) tail = &(*tail)->prev; /* Do the indicated merge */ if (likely(bits)) { struct list_head *a = *tail, *b = a->prev; a = merge(priv, cmp, b, a); /* Install the merged result in place of the inputs */ a->prev = b->prev; *tail = a; } /* Move one element from input list to pending */ list->prev = pending; pending = list; list = list->next; pending->next = NULL; count++; } while (list); /* End of input; merge together all the pending lists. */ list = pending; pending = pending->prev; for (;;) { struct list_head *next = pending->prev; if (!next) break; list = merge(priv, cmp, pending, list); pending = next; } /* The final merge, rebuilding prev links */ merge_final(priv, cmp, head, pending, list); } EXPORT_SYMBOL(list_sort);
305 305 333 336 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 1517 1518 1519 1520 1521 1522 1523 1524 1525 1526 1527 1528 1529 1530 1531 1532 1533 1534 1535 1536 1537 1538 1539 1540 1541 1542 1543 1544 1545 1546 1547 1548 1549 1550 1551 1552 1553 1554 1555 1556 1557 1558 1559 1560 1561 1562 1563 1564 1565 1566 1567 1568 1569 1570 1571 1572 1573 1574 1575 1576 1577 1578 1579 1580 1581 1582 1583 1584 1585 1586 1587 1588 1589 1590 1591 1592 1593 1594 1595 1596 1597 1598 1599 1600 1601 1602 1603 1604 1605 1606 1607 1608 1609 1610 1611 1612 1613 1614 1615 1616 1617 1618 1619 1620 1621 1622 1623 1624 1625 1626 1627 1628 1629 1630 1631 1632 1633 1634 1635 1636 1637 1638 1639 1640 1641 1642 1643 1644 1645 1646 1647 1648 1649 1650 1651 1652 1653 1654 1655 1656 1657 1658 1659 1660 1661 1662 1663 1664 1665 1666 1667 1668 1669 1670 1671 1672 1673 1674 1675 1676 1677 1678 1679 1680 1681 1682 1683 1684 1685 1686 1687 1688 1689 1690 1691 1692 1693 1694 1695 1696 1697 1698 1699 1700 1701 1702 1703 1704 1705 1706 1707 1708 1709 1710 1711 1712 1713 1714 1715 1716 1717 1718 1719 1720 1721 1722 1723 1724 1725 1726 1727 1728 1729 1730 1731 1732 1733 1734 1735 1736 1737 1738 1739 1740 1741 1742 1743 1744 1745 1746 1747 1748 1749 1750 1751 1752 1753 1754 1755 1756 1757 1758 1759 1760 1761 1762 1763 1764 1765 1766 1767 1768 1769 1770 1771 1772 1773 1774 1775 1776 1777 1778 1779 1780 1781 1782 1783 1784 1785 1786 1787 1788 1789 1790 1791 1792 1793 1794 1795 1796 1797 1798 1799 1800 1801 1802 1803 1804 1805 1806 1807 1808 1809 1810 1811 1812 1813 1814 1815 1816 1817 1818 1819 1820 1821 1822 1823 1824 1825 1826 1827 1828 1829 1830 1831 1832 1833 1834 1835 1836 1837 1838 1839 1840 1841 1842 1843 1844 1845 1846 1847 1848 1849 1850 1851 1852 1853 1854 1855 1856 1857 1858 1859 1860 1861 1862 1863 1864 1865 1866 1867 1868 1869 1870 1871 1872 1873 1874 1875 1876 1877 1878 1879 1880 1881 1882 1883 1884 1885 1886 1887 1888 1889 1890 1891 1892 1893 1894 1895 1896 1897 1898 1899 1900 1901 1902 1903 1904 1905 1906 1907 1908 1909 1910 1911 1912 1913 1914 1915 1916 1917 1918 1919 1920 1921 1922 1923 1924 1925 1926 1927 1928 1929 1930 1931 1932 1933 1934 1935 1936 1937 1938 1939 1940 1941 1942 1943 1944 1945 1946 1947 1948 1949 1950 1951 1952 1953 1954 1955 1956 1957 1958 1959 1960 1961 1962 1963 1964 1965 1966 1967 1968 1969 1970 1971 1972 1973 1974 1975 1976 1977 1978 1979 1980 1981 1982 1983 1984 1985 1986 1987 1988 1989 1990 1991 1992 1993 1994 1995 1996 1997 1998 1999 2000 2001 2002 2003 2004 2005 2006 2007 2008 2009 2010 2011 2012 2013 2014 2015 2016 2017 2018 2019 2020 2021 2022 2023 2024 2025 2026 2027 2028 2029 2030 2031 2032 2033 2034 2035 2036 2037 2038 2039 2040 2041 2042 2043 2044 2045 2046 2047 2048 2049 2050 2051 2052 2053 2054 2055 2056 2057 2058 2059 2060 2061 2062 2063 2064 2065 2066 2067 2068 2069 2070 2071 2072 2073 2074 2075 2076 2077 2078 2079 2080 2081 2082 2083 2084 2085 2086 2087 2088 2089 2090 2091 2092 2093 2094 2095 2096 2097 2098 2099 2100 2101 2102 2103 2104 2105 2106 2107 2108 2109 2110 2111 2112 2113 2114 2115 2116 2117 2118 2119 2120 2121 2122 2123 2124 2125 2126 2127 2128 2129 2130 2131 2132 2133 2134 2135 2136 2137 2138 2139 2140 2141 2142 2143 2144 2145 2146 2147 2148 2149 2150 2151 2152 2153 2154 2155 2156 2157 2158 2159 2160 2161 2162 2163 2164 2165 2166 2167 2168 2169 2170 2171 2172 2173 2174 2175 2176 2177 2178 2179 2180 2181 2182 2183 2184 2185 2186 2187 2188 2189 2190 2191 2192 2193 2194 2195 2196 2197 2198 2199 2200 2201 2202 2203 2204 2205 2206 2207 2208 2209 2210 2211 2212 2213 2214 2215 2216 2217 // SPDX-License-Identifier: GPL-2.0-only /* * fs/userfaultfd.c * * Copyright (C) 2007 Davide Libenzi <davidel@xmailserver.org> * Copyright (C) 2008-2009 Red Hat, Inc. * Copyright (C) 2015 Red Hat, Inc. * * Some part derived from fs/eventfd.c (anon inode setup) and * mm/ksm.c (mm hashing). */ #include <linux/list.h> #include <linux/hashtable.h> #include <linux/sched/signal.h> #include <linux/sched/mm.h> #include <linux/mm.h> #include <linux/mm_inline.h> #include <linux/mmu_notifier.h> #include <linux/poll.h> #include <linux/slab.h> #include <linux/seq_file.h> #include <linux/file.h> #include <linux/bug.h> #include <linux/anon_inodes.h> #include <linux/syscalls.h> #include <linux/userfaultfd_k.h> #include <linux/mempolicy.h> #include <linux/ioctl.h> #include <linux/security.h> #include <linux/hugetlb.h> #include <linux/swapops.h> #include <linux/miscdevice.h> #include <linux/uio.h> static int sysctl_unprivileged_userfaultfd __read_mostly; #ifdef CONFIG_SYSCTL static const struct ctl_table vm_userfaultfd_table[] = { { .procname = "unprivileged_userfaultfd", .data = &sysctl_unprivileged_userfaultfd, .maxlen = sizeof(sysctl_unprivileged_userfaultfd), .mode = 0644, .proc_handler = proc_dointvec_minmax, .extra1 = SYSCTL_ZERO, .extra2 = SYSCTL_ONE, }, }; #endif static struct kmem_cache *userfaultfd_ctx_cachep __ro_after_init; struct userfaultfd_fork_ctx { struct userfaultfd_ctx *orig; struct userfaultfd_ctx *new; struct list_head list; }; struct userfaultfd_unmap_ctx { struct userfaultfd_ctx *ctx; unsigned long start; unsigned long end; struct list_head list; }; struct userfaultfd_wait_queue { struct uffd_msg msg; wait_queue_entry_t wq; struct userfaultfd_ctx *ctx; bool waken; }; struct userfaultfd_wake_range { unsigned long start; unsigned long len; }; /* internal indication that UFFD_API ioctl was successfully executed */ #define UFFD_FEATURE_INITIALIZED (1u << 31) static bool userfaultfd_is_initialized(struct userfaultfd_ctx *ctx) { return ctx->features & UFFD_FEATURE_INITIALIZED; } static bool userfaultfd_wp_async_ctx(struct userfaultfd_ctx *ctx) { return ctx && (ctx->features & UFFD_FEATURE_WP_ASYNC); } /* * Whether WP_UNPOPULATED is enabled on the uffd context. It is only * meaningful when userfaultfd_wp()==true on the vma and when it's * anonymous. */ bool userfaultfd_wp_unpopulated(struct vm_area_struct *vma) { struct userfaultfd_ctx *ctx = vma->vm_userfaultfd_ctx.ctx; if (!ctx) return false; return ctx->features & UFFD_FEATURE_WP_UNPOPULATED; } static int userfaultfd_wake_function(wait_queue_entry_t *wq, unsigned mode, int wake_flags, void *key) { struct userfaultfd_wake_range *range = key; int ret; struct userfaultfd_wait_queue *uwq; unsigned long start, len; uwq = container_of(wq, struct userfaultfd_wait_queue, wq); ret = 0; /* len == 0 means wake all */ start = range->start; len = range->len; if (len && (start > uwq->msg.arg.pagefault.address || start + len <= uwq->msg.arg.pagefault.address)) goto out; WRITE_ONCE(uwq->waken, true); /* * The Program-Order guarantees provided by the scheduler * ensure uwq->waken is visible before the task is woken. */ ret = wake_up_state(wq->private, mode); if (ret) { /* * Wake only once, autoremove behavior. * * After the effect of list_del_init is visible to the other * CPUs, the waitqueue may disappear from under us, see the * !list_empty_careful() in handle_userfault(). * * try_to_wake_up() has an implicit smp_mb(), and the * wq->private is read before calling the extern function * "wake_up_state" (which in turns calls try_to_wake_up). */ list_del_init(&wq->entry); } out: return ret; } /** * userfaultfd_ctx_get - Acquires a reference to the internal userfaultfd * context. * @ctx: [in] Pointer to the userfaultfd context. */ static void userfaultfd_ctx_get(struct userfaultfd_ctx *ctx) { refcount_inc(&ctx->refcount); } /** * userfaultfd_ctx_put - Releases a reference to the internal userfaultfd * context. * @ctx: [in] Pointer to userfaultfd context. * * The userfaultfd context reference must have been previously acquired either * with userfaultfd_ctx_get() or userfaultfd_ctx_fdget(). */ static void userfaultfd_ctx_put(struct userfaultfd_ctx *ctx) { if (refcount_dec_and_test(&ctx->refcount)) { VM_BUG_ON(spin_is_locked(&ctx->fault_pending_wqh.lock)); VM_BUG_ON(waitqueue_active(&ctx->fault_pending_wqh)); VM_BUG_ON(spin_is_locked(&ctx->fault_wqh.lock)); VM_BUG_ON(waitqueue_active(&ctx->fault_wqh)); VM_BUG_ON(spin_is_locked(&ctx->event_wqh.lock)); VM_BUG_ON(waitqueue_active(&ctx->event_wqh)); VM_BUG_ON(spin_is_locked(&ctx->fd_wqh.lock)); VM_BUG_ON(waitqueue_active(&ctx->fd_wqh)); mmdrop(ctx->mm); kmem_cache_free(userfaultfd_ctx_cachep, ctx); } } static inline void msg_init(struct uffd_msg *msg) { BUILD_BUG_ON(sizeof(struct uffd_msg) != 32); /* * Must use memset to zero out the paddings or kernel data is * leaked to userland. */ memset(msg, 0, sizeof(struct uffd_msg)); } static inline struct uffd_msg userfault_msg(unsigned long address, unsigned long real_address, unsigned int flags, unsigned long reason, unsigned int features) { struct uffd_msg msg; msg_init(&msg); msg.event = UFFD_EVENT_PAGEFAULT; msg.arg.pagefault.address = (features & UFFD_FEATURE_EXACT_ADDRESS) ? real_address : address; /* * These flags indicate why the userfault occurred: * - UFFD_PAGEFAULT_FLAG_WP indicates a write protect fault. * - UFFD_PAGEFAULT_FLAG_MINOR indicates a minor fault. * - Neither of these flags being set indicates a MISSING fault. * * Separately, UFFD_PAGEFAULT_FLAG_WRITE indicates it was a write * fault. Otherwise, it was a read fault. */ if (flags & FAULT_FLAG_WRITE) msg.arg.pagefault.flags |= UFFD_PAGEFAULT_FLAG_WRITE; if (reason & VM_UFFD_WP) msg.arg.pagefault.flags |= UFFD_PAGEFAULT_FLAG_WP; if (reason & VM_UFFD_MINOR) msg.arg.pagefault.flags |= UFFD_PAGEFAULT_FLAG_MINOR; if (features & UFFD_FEATURE_THREAD_ID) msg.arg.pagefault.feat.ptid = task_pid_vnr(current); return msg; } #ifdef CONFIG_HUGETLB_PAGE /* * Same functionality as userfaultfd_must_wait below with modifications for * hugepmd ranges. */ static inline bool userfaultfd_huge_must_wait(struct userfaultfd_ctx *ctx, struct vm_fault *vmf, unsigned long reason) { struct vm_area_struct *vma = vmf->vma; pte_t *ptep, pte; bool ret = true; assert_fault_locked(vmf); ptep = hugetlb_walk(vma, vmf->address, vma_mmu_pagesize(vma)); if (!ptep) goto out; ret = false; pte = huge_ptep_get(vma->vm_mm, vmf->address, ptep); /* * Lockless access: we're in a wait_event so it's ok if it * changes under us. PTE markers should be handled the same as none * ptes here. */ if (huge_pte_none_mostly(pte)) ret = true; if (!huge_pte_write(pte) && (reason & VM_UFFD_WP)) ret = true; out: return ret; } #else static inline bool userfaultfd_huge_must_wait(struct userfaultfd_ctx *ctx, struct vm_fault *vmf, unsigned long reason) { return false; /* should never get here */ } #endif /* CONFIG_HUGETLB_PAGE */ /* * Verify the pagetables are still not ok after having reigstered into * the fault_pending_wqh to avoid userland having to UFFDIO_WAKE any * userfault that has already been resolved, if userfaultfd_read_iter and * UFFDIO_COPY|ZEROPAGE are being run simultaneously on two different * threads. */ static inline bool userfaultfd_must_wait(struct userfaultfd_ctx *ctx, struct vm_fault *vmf, unsigned long reason) { struct mm_struct *mm = ctx->mm; unsigned long address = vmf->address; pgd_t *pgd; p4d_t *p4d; pud_t *pud; pmd_t *pmd, _pmd; pte_t *pte; pte_t ptent; bool ret = true; assert_fault_locked(vmf); pgd = pgd_offset(mm, address); if (!pgd_present(*pgd)) goto out; p4d = p4d_offset(pgd, address); if (!p4d_present(*p4d)) goto out; pud = pud_offset(p4d, address); if (!pud_present(*pud)) goto out; pmd = pmd_offset(pud, address); again: _pmd = pmdp_get_lockless(pmd); if (pmd_none(_pmd)) goto out; ret = false; if (!pmd_present(_pmd) || pmd_devmap(_pmd)) goto out; if (pmd_trans_huge(_pmd)) { if (!pmd_write(_pmd) && (reason & VM_UFFD_WP)) ret = true; goto out; } pte = pte_offset_map(pmd, address); if (!pte) { ret = true; goto again; } /* * Lockless access: we're in a wait_event so it's ok if it * changes under us. PTE markers should be handled the same as none * ptes here. */ ptent = ptep_get(pte); if (pte_none_mostly(ptent)) ret = true; if (!pte_write(ptent) && (reason & VM_UFFD_WP)) ret = true; pte_unmap(pte); out: return ret; } static inline unsigned int userfaultfd_get_blocking_state(unsigned int flags) { if (flags & FAULT_FLAG_INTERRUPTIBLE) return TASK_INTERRUPTIBLE; if (flags & FAULT_FLAG_KILLABLE) return TASK_KILLABLE; return TASK_UNINTERRUPTIBLE; } /* * The locking rules involved in returning VM_FAULT_RETRY depending on * FAULT_FLAG_ALLOW_RETRY, FAULT_FLAG_RETRY_NOWAIT and * FAULT_FLAG_KILLABLE are not straightforward. The "Caution" * recommendation in __lock_page_or_retry is not an understatement. * * If FAULT_FLAG_ALLOW_RETRY is set, the mmap_lock must be released * before returning VM_FAULT_RETRY only if FAULT_FLAG_RETRY_NOWAIT is * not set. * * If FAULT_FLAG_ALLOW_RETRY is set but FAULT_FLAG_KILLABLE is not * set, VM_FAULT_RETRY can still be returned if and only if there are * fatal_signal_pending()s, and the mmap_lock must be released before * returning it. */ vm_fault_t handle_userfault(struct vm_fault *vmf, unsigned long reason) { struct vm_area_struct *vma = vmf->vma; struct mm_struct *mm = vma->vm_mm; struct userfaultfd_ctx *ctx; struct userfaultfd_wait_queue uwq; vm_fault_t ret = VM_FAULT_SIGBUS; bool must_wait; unsigned int blocking_state; /* * We don't do userfault handling for the final child pid update * and when coredumping (faults triggered by get_dump_page()). */ if (current->flags & (PF_EXITING|PF_DUMPCORE)) goto out; assert_fault_locked(vmf); ctx = vma->vm_userfaultfd_ctx.ctx; if (!ctx) goto out; BUG_ON(ctx->mm != mm); /* Any unrecognized flag is a bug. */ VM_BUG_ON(reason & ~__VM_UFFD_FLAGS); /* 0 or > 1 flags set is a bug; we expect exactly 1. */ VM_BUG_ON(!reason || (reason & (reason - 1))); if (ctx->features & UFFD_FEATURE_SIGBUS) goto out; if (!(vmf->flags & FAULT_FLAG_USER) && (ctx->flags & UFFD_USER_MODE_ONLY)) goto out; /* * Check that we can return VM_FAULT_RETRY. * * NOTE: it should become possible to return VM_FAULT_RETRY * even if FAULT_FLAG_TRIED is set without leading to gup() * -EBUSY failures, if the userfaultfd is to be extended for * VM_UFFD_WP tracking and we intend to arm the userfault * without first stopping userland access to the memory. For * VM_UFFD_MISSING userfaults this is enough for now. */ if (unlikely(!(vmf->flags & FAULT_FLAG_ALLOW_RETRY))) { /* * Validate the invariant that nowait must allow retry * to be sure not to return SIGBUS erroneously on * nowait invocations. */ BUG_ON(vmf->flags & FAULT_FLAG_RETRY_NOWAIT); #ifdef CONFIG_DEBUG_VM if (printk_ratelimit()) { printk(KERN_WARNING "FAULT_FLAG_ALLOW_RETRY missing %x\n", vmf->flags); dump_stack(); } #endif goto out; } /* * Handle nowait, not much to do other than tell it to retry * and wait. */ ret = VM_FAULT_RETRY; if (vmf->flags & FAULT_FLAG_RETRY_NOWAIT) goto out; if (unlikely(READ_ONCE(ctx->released))) { /* * If a concurrent release is detected, do not return * VM_FAULT_SIGBUS or VM_FAULT_NOPAGE, but instead always * return VM_FAULT_RETRY with lock released proactively. * * If we were to return VM_FAULT_SIGBUS here, the non * cooperative manager would be instead forced to * always call UFFDIO_UNREGISTER before it can safely * close the uffd, to avoid involuntary SIGBUS triggered. * * If we were to return VM_FAULT_NOPAGE, it would work for * the fault path, in which the lock will be released * later. However for GUP, faultin_page() does nothing * special on NOPAGE, so GUP would spin retrying without * releasing the mmap read lock, causing possible livelock. * * Here only VM_FAULT_RETRY would make sure the mmap lock * be released immediately, so that the thread concurrently * releasing the userfault would always make progress. */ release_fault_lock(vmf); goto out; } /* take the reference before dropping the mmap_lock */ userfaultfd_ctx_get(ctx); init_waitqueue_func_entry(&uwq.wq, userfaultfd_wake_function); uwq.wq.private = current; uwq.msg = userfault_msg(vmf->address, vmf->real_address, vmf->flags, reason, ctx->features); uwq.ctx = ctx; uwq.waken = false; blocking_state = userfaultfd_get_blocking_state(vmf->flags); /* * Take the vma lock now, in order to safely call * userfaultfd_huge_must_wait() later. Since acquiring the * (sleepable) vma lock can modify the current task state, that * must be before explicitly calling set_current_state(). */ if (is_vm_hugetlb_page(vma)) hugetlb_vma_lock_read(vma); spin_lock_irq(&ctx->fault_pending_wqh.lock); /* * After the __add_wait_queue the uwq is visible to userland * through poll/read(). */ __add_wait_queue(&ctx->fault_pending_wqh, &uwq.wq); /* * The smp_mb() after __set_current_state prevents the reads * following the spin_unlock to happen before the list_add in * __add_wait_queue. */ set_current_state(blocking_state); spin_unlock_irq(&ctx->fault_pending_wqh.lock); if (!is_vm_hugetlb_page(vma)) must_wait = userfaultfd_must_wait(ctx, vmf, reason); else must_wait = userfaultfd_huge_must_wait(ctx, vmf, reason); if (is_vm_hugetlb_page(vma)) hugetlb_vma_unlock_read(vma); release_fault_lock(vmf); if (likely(must_wait && !READ_ONCE(ctx->released))) { wake_up_poll(&ctx->fd_wqh, EPOLLIN); schedule(); } __set_current_state(TASK_RUNNING); /* * Here we race with the list_del; list_add in * userfaultfd_ctx_read(), however because we don't ever run * list_del_init() to refile across the two lists, the prev * and next pointers will never point to self. list_add also * would never let any of the two pointers to point to * self. So list_empty_careful won't risk to see both pointers * pointing to self at any time during the list refile. The * only case where list_del_init() is called is the full * removal in the wake function and there we don't re-list_add * and it's fine not to block on the spinlock. The uwq on this * kernel stack can be released after the list_del_init. */ if (!list_empty_careful(&uwq.wq.entry)) { spin_lock_irq(&ctx->fault_pending_wqh.lock); /* * No need of list_del_init(), the uwq on the stack * will be freed shortly anyway. */ list_del(&uwq.wq.entry); spin_unlock_irq(&ctx->fault_pending_wqh.lock); } /* * ctx may go away after this if the userfault pseudo fd is * already released. */ userfaultfd_ctx_put(ctx); out: return ret; } static void userfaultfd_event_wait_completion(struct userfaultfd_ctx *ctx, struct userfaultfd_wait_queue *ewq) { struct userfaultfd_ctx *release_new_ctx; if (WARN_ON_ONCE(current->flags & PF_EXITING)) goto out; ewq->ctx = ctx; init_waitqueue_entry(&ewq->wq, current); release_new_ctx = NULL; spin_lock_irq(&ctx->event_wqh.lock); /* * After the __add_wait_queue the uwq is visible to userland * through poll/read(). */ __add_wait_queue(&ctx->event_wqh, &ewq->wq); for (;;) { set_current_state(TASK_KILLABLE); if (ewq->msg.event == 0) break; if (READ_ONCE(ctx->released) || fatal_signal_pending(current)) { /* * &ewq->wq may be queued in fork_event, but * __remove_wait_queue ignores the head * parameter. It would be a problem if it * didn't. */ __remove_wait_queue(&ctx->event_wqh, &ewq->wq); if (ewq->msg.event == UFFD_EVENT_FORK) { struct userfaultfd_ctx *new; new = (struct userfaultfd_ctx *) (unsigned long) ewq->msg.arg.reserved.reserved1; release_new_ctx = new; } break; } spin_unlock_irq(&ctx->event_wqh.lock); wake_up_poll(&ctx->fd_wqh, EPOLLIN); schedule(); spin_lock_irq(&ctx->event_wqh.lock); } __set_current_state(TASK_RUNNING); spin_unlock_irq(&ctx->event_wqh.lock); if (release_new_ctx) { userfaultfd_release_new(release_new_ctx); userfaultfd_ctx_put(release_new_ctx); } /* * ctx may go away after this if the userfault pseudo fd is * already released. */ out: atomic_dec(&ctx->mmap_changing); VM_BUG_ON(atomic_read(&ctx->mmap_changing) < 0); userfaultfd_ctx_put(ctx); } static void userfaultfd_event_complete(struct userfaultfd_ctx *ctx, struct userfaultfd_wait_queue *ewq) { ewq->msg.event = 0; wake_up_locked(&ctx->event_wqh); __remove_wait_queue(&ctx->event_wqh, &ewq->wq); } int dup_userfaultfd(struct vm_area_struct *vma, struct list_head *fcs) { struct userfaultfd_ctx *ctx = NULL, *octx; struct userfaultfd_fork_ctx *fctx; octx = vma->vm_userfaultfd_ctx.ctx; if (!octx) return 0; if (!(octx->features & UFFD_FEATURE_EVENT_FORK)) { userfaultfd_reset_ctx(vma); return 0; } list_for_each_entry(fctx, fcs, list) if (fctx->orig == octx) { ctx = fctx->new; break; } if (!ctx) { fctx = kmalloc(sizeof(*fctx), GFP_KERNEL); if (!fctx) return -ENOMEM; ctx = kmem_cache_alloc(userfaultfd_ctx_cachep, GFP_KERNEL); if (!ctx) { kfree(fctx); return -ENOMEM; } refcount_set(&ctx->refcount, 1); ctx->flags = octx->flags; ctx->features = octx->features; ctx->released = false; init_rwsem(&ctx->map_changing_lock); atomic_set(&ctx->mmap_changing, 0); ctx->mm = vma->vm_mm; mmgrab(ctx->mm); userfaultfd_ctx_get(octx); down_write(&octx->map_changing_lock); atomic_inc(&octx->mmap_changing); up_write(&octx->map_changing_lock); fctx->orig = octx; fctx->new = ctx; list_add_tail(&fctx->list, fcs); } vma->vm_userfaultfd_ctx.ctx = ctx; return 0; } static void dup_fctx(struct userfaultfd_fork_ctx *fctx) { struct userfaultfd_ctx *ctx = fctx->orig; struct userfaultfd_wait_queue ewq; msg_init(&ewq.msg); ewq.msg.event = UFFD_EVENT_FORK; ewq.msg.arg.reserved.reserved1 = (unsigned long)fctx->new; userfaultfd_event_wait_completion(ctx, &ewq); } void dup_userfaultfd_complete(struct list_head *fcs) { struct userfaultfd_fork_ctx *fctx, *n; list_for_each_entry_safe(fctx, n, fcs, list) { dup_fctx(fctx); list_del(&fctx->list); kfree(fctx); } } void dup_userfaultfd_fail(struct list_head *fcs) { struct userfaultfd_fork_ctx *fctx, *n; /* * An error has occurred on fork, we will tear memory down, but have * allocated memory for fctx's and raised reference counts for both the * original and child contexts (and on the mm for each as a result). * * These would ordinarily be taken care of by a user handling the event, * but we are no longer doing so, so manually clean up here. * * mm tear down will take care of cleaning up VMA contexts. */ list_for_each_entry_safe(fctx, n, fcs, list) { struct userfaultfd_ctx *octx = fctx->orig; struct userfaultfd_ctx *ctx = fctx->new; atomic_dec(&octx->mmap_changing); VM_BUG_ON(atomic_read(&octx->mmap_changing) < 0); userfaultfd_ctx_put(octx); userfaultfd_ctx_put(ctx); list_del(&fctx->list); kfree(fctx); } } void mremap_userfaultfd_prep(struct vm_area_struct *vma, struct vm_userfaultfd_ctx *vm_ctx) { struct userfaultfd_ctx *ctx; ctx = vma->vm_userfaultfd_ctx.ctx; if (!ctx) return; if (ctx->features & UFFD_FEATURE_EVENT_REMAP) { vm_ctx->ctx = ctx; userfaultfd_ctx_get(ctx); down_write(&ctx->map_changing_lock); atomic_inc(&ctx->mmap_changing); up_write(&ctx->map_changing_lock); } else { /* Drop uffd context if remap feature not enabled */ userfaultfd_reset_ctx(vma); } } void mremap_userfaultfd_complete(struct vm_userfaultfd_ctx *vm_ctx, unsigned long from, unsigned long to, unsigned long len) { struct userfaultfd_ctx *ctx = vm_ctx->ctx; struct userfaultfd_wait_queue ewq; if (!ctx) return; if (to & ~PAGE_MASK) { userfaultfd_ctx_put(ctx); return; } msg_init(&ewq.msg); ewq.msg.event = UFFD_EVENT_REMAP; ewq.msg.arg.remap.from = from; ewq.msg.arg.remap.to = to; ewq.msg.arg.remap.len = len; userfaultfd_event_wait_completion(ctx, &ewq); } bool userfaultfd_remove(struct vm_area_struct *vma, unsigned long start, unsigned long end) { struct mm_struct *mm = vma->vm_mm; struct userfaultfd_ctx *ctx; struct userfaultfd_wait_queue ewq; ctx = vma->vm_userfaultfd_ctx.ctx; if (!ctx || !(ctx->features & UFFD_FEATURE_EVENT_REMOVE)) return true; userfaultfd_ctx_get(ctx); down_write(&ctx->map_changing_lock); atomic_inc(&ctx->mmap_changing); up_write(&ctx->map_changing_lock); mmap_read_unlock(mm); msg_init(&ewq.msg); ewq.msg.event = UFFD_EVENT_REMOVE; ewq.msg.arg.remove.start = start; ewq.msg.arg.remove.end = end; userfaultfd_event_wait_completion(ctx, &ewq); return false; } static bool has_unmap_ctx(struct userfaultfd_ctx *ctx, struct list_head *unmaps, unsigned long start, unsigned long end) { struct userfaultfd_unmap_ctx *unmap_ctx; list_for_each_entry(unmap_ctx, unmaps, list) if (unmap_ctx->ctx == ctx && unmap_ctx->start == start && unmap_ctx->end == end) return true; return false; } int userfaultfd_unmap_prep(struct vm_area_struct *vma, unsigned long start, unsigned long end, struct list_head *unmaps) { struct userfaultfd_unmap_ctx *unmap_ctx; struct userfaultfd_ctx *ctx = vma->vm_userfaultfd_ctx.ctx; if (!ctx || !(ctx->features & UFFD_FEATURE_EVENT_UNMAP) || has_unmap_ctx(ctx, unmaps, start, end)) return 0; unmap_ctx = kzalloc(sizeof(*unmap_ctx), GFP_KERNEL); if (!unmap_ctx) return -ENOMEM; userfaultfd_ctx_get(ctx); down_write(&ctx->map_changing_lock); atomic_inc(&ctx->mmap_changing); up_write(&ctx->map_changing_lock); unmap_ctx->ctx = ctx; unmap_ctx->start = start; unmap_ctx->end = end; list_add_tail(&unmap_ctx->list, unmaps); return 0; } void userfaultfd_unmap_complete(struct mm_struct *mm, struct list_head *uf) { struct userfaultfd_unmap_ctx *ctx, *n; struct userfaultfd_wait_queue ewq; list_for_each_entry_safe(ctx, n, uf, list) { msg_init(&ewq.msg); ewq.msg.event = UFFD_EVENT_UNMAP; ewq.msg.arg.remove.start = ctx->start; ewq.msg.arg.remove.end = ctx->end; userfaultfd_event_wait_completion(ctx->ctx, &ewq); list_del(&ctx->list); kfree(ctx); } } static int userfaultfd_release(struct inode *inode, struct file *file) { struct userfaultfd_ctx *ctx = file->private_data; struct mm_struct *mm = ctx->mm; /* len == 0 means wake all */ struct userfaultfd_wake_range range = { .len = 0, }; WRITE_ONCE(ctx->released, true); userfaultfd_release_all(mm, ctx); /* * After no new page faults can wait on this fault_*wqh, flush * the last page faults that may have been already waiting on * the fault_*wqh. */ spin_lock_irq(&ctx->fault_pending_wqh.lock); __wake_up_locked_key(&ctx->fault_pending_wqh, TASK_NORMAL, &range); __wake_up(&ctx->fault_wqh, TASK_NORMAL, 1, &range); spin_unlock_irq(&ctx->fault_pending_wqh.lock); /* Flush pending events that may still wait on event_wqh */ wake_up_all(&ctx->event_wqh); wake_up_poll(&ctx->fd_wqh, EPOLLHUP); userfaultfd_ctx_put(ctx); return 0; } /* fault_pending_wqh.lock must be hold by the caller */ static inline struct userfaultfd_wait_queue *find_userfault_in( wait_queue_head_t *wqh) { wait_queue_entry_t *wq; struct userfaultfd_wait_queue *uwq; lockdep_assert_held(&wqh->lock); uwq = NULL; if (!waitqueue_active(wqh)) goto out; /* walk in reverse to provide FIFO behavior to read userfaults */ wq = list_last_entry(&wqh->head, typeof(*wq), entry); uwq = container_of(wq, struct userfaultfd_wait_queue, wq); out: return uwq; } static inline struct userfaultfd_wait_queue *find_userfault( struct userfaultfd_ctx *ctx) { return find_userfault_in(&ctx->fault_pending_wqh); } static inline struct userfaultfd_wait_queue *find_userfault_evt( struct userfaultfd_ctx *ctx) { return find_userfault_in(&ctx->event_wqh); } static __poll_t userfaultfd_poll(struct file *file, poll_table *wait) { struct userfaultfd_ctx *ctx = file->private_data; __poll_t ret; poll_wait(file, &ctx->fd_wqh, wait); if (!userfaultfd_is_initialized(ctx)) return EPOLLERR; /* * poll() never guarantees that read won't block. * userfaults can be waken before they're read(). */ if (unlikely(!(file->f_flags & O_NONBLOCK))) return EPOLLERR; /* * lockless access to see if there are pending faults * __pollwait last action is the add_wait_queue but * the spin_unlock would allow the waitqueue_active to * pass above the actual list_add inside * add_wait_queue critical section. So use a full * memory barrier to serialize the list_add write of * add_wait_queue() with the waitqueue_active read * below. */ ret = 0; smp_mb(); if (waitqueue_active(&ctx->fault_pending_wqh)) ret = EPOLLIN; else if (waitqueue_active(&ctx->event_wqh)) ret = EPOLLIN; return ret; } static const struct file_operations userfaultfd_fops; static int resolve_userfault_fork(struct userfaultfd_ctx *new, struct inode *inode, struct uffd_msg *msg) { int fd; fd = anon_inode_create_getfd("[userfaultfd]", &userfaultfd_fops, new, O_RDONLY | (new->flags & UFFD_SHARED_FCNTL_FLAGS), inode); if (fd < 0) return fd; msg->arg.reserved.reserved1 = 0; msg->arg.fork.ufd = fd; return 0; } static ssize_t userfaultfd_ctx_read(struct userfaultfd_ctx *ctx, int no_wait, struct uffd_msg *msg, struct inode *inode) { ssize_t ret; DECLARE_WAITQUEUE(wait, current); struct userfaultfd_wait_queue *uwq; /* * Handling fork event requires sleeping operations, so * we drop the event_wqh lock, then do these ops, then * lock it back and wake up the waiter. While the lock is * dropped the ewq may go away so we keep track of it * carefully. */ LIST_HEAD(fork_event); struct userfaultfd_ctx *fork_nctx = NULL; /* always take the fd_wqh lock before the fault_pending_wqh lock */ spin_lock_irq(&ctx->fd_wqh.lock); __add_wait_queue(&ctx->fd_wqh, &wait); for (;;) { set_current_state(TASK_INTERRUPTIBLE); spin_lock(&ctx->fault_pending_wqh.lock); uwq = find_userfault(ctx); if (uwq) { /* * Use a seqcount to repeat the lockless check * in wake_userfault() to avoid missing * wakeups because during the refile both * waitqueue could become empty if this is the * only userfault. */ write_seqcount_begin(&ctx->refile_seq); /* * The fault_pending_wqh.lock prevents the uwq * to disappear from under us. * * Refile this userfault from * fault_pending_wqh to fault_wqh, it's not * pending anymore after we read it. * * Use list_del() by hand (as * userfaultfd_wake_function also uses * list_del_init() by hand) to be sure nobody * changes __remove_wait_queue() to use * list_del_init() in turn breaking the * !list_empty_careful() check in * handle_userfault(). The uwq->wq.head list * must never be empty at any time during the * refile, or the waitqueue could disappear * from under us. The "wait_queue_head_t" * parameter of __remove_wait_queue() is unused * anyway. */ list_del(&uwq->wq.entry); add_wait_queue(&ctx->fault_wqh, &uwq->wq); write_seqcount_end(&ctx->refile_seq); /* careful to always initialize msg if ret == 0 */ *msg = uwq->msg; spin_unlock(&ctx->fault_pending_wqh.lock); ret = 0; break; } spin_unlock(&ctx->fault_pending_wqh.lock); spin_lock(&ctx->event_wqh.lock); uwq = find_userfault_evt(ctx); if (uwq) { *msg = uwq->msg; if (uwq->msg.event == UFFD_EVENT_FORK) { fork_nctx = (struct userfaultfd_ctx *) (unsigned long) uwq->msg.arg.reserved.reserved1; list_move(&uwq->wq.entry, &fork_event); /* * fork_nctx can be freed as soon as * we drop the lock, unless we take a * reference on it. */ userfaultfd_ctx_get(fork_nctx); spin_unlock(&ctx->event_wqh.lock); ret = 0; break; } userfaultfd_event_complete(ctx, uwq); spin_unlock(&ctx->event_wqh.lock); ret = 0; break; } spin_unlock(&ctx->event_wqh.lock); if (signal_pending(current)) { ret = -ERESTARTSYS; break; } if (no_wait) { ret = -EAGAIN; break; } spin_unlock_irq(&ctx->fd_wqh.lock); schedule(); spin_lock_irq(&ctx->fd_wqh.lock); } __remove_wait_queue(&ctx->fd_wqh, &wait); __set_current_state(TASK_RUNNING); spin_unlock_irq(&ctx->fd_wqh.lock); if (!ret && msg->event == UFFD_EVENT_FORK) { ret = resolve_userfault_fork(fork_nctx, inode, msg); spin_lock_irq(&ctx->event_wqh.lock); if (!list_empty(&fork_event)) { /* * The fork thread didn't abort, so we can * drop the temporary refcount. */ userfaultfd_ctx_put(fork_nctx); uwq = list_first_entry(&fork_event, typeof(*uwq), wq.entry); /* * If fork_event list wasn't empty and in turn * the event wasn't already released by fork * (the event is allocated on fork kernel * stack), put the event back to its place in * the event_wq. fork_event head will be freed * as soon as we return so the event cannot * stay queued there no matter the current * "ret" value. */ list_del(&uwq->wq.entry); __add_wait_queue(&ctx->event_wqh, &uwq->wq); /* * Leave the event in the waitqueue and report * error to userland if we failed to resolve * the userfault fork. */ if (likely(!ret)) userfaultfd_event_complete(ctx, uwq); } else { /* * Here the fork thread aborted and the * refcount from the fork thread on fork_nctx * has already been released. We still hold * the reference we took before releasing the * lock above. If resolve_userfault_fork * failed we've to drop it because the * fork_nctx has to be freed in such case. If * it succeeded we'll hold it because the new * uffd references it. */ if (ret) userfaultfd_ctx_put(fork_nctx); } spin_unlock_irq(&ctx->event_wqh.lock); } return ret; } static ssize_t userfaultfd_read_iter(struct kiocb *iocb, struct iov_iter *to) { struct file *file = iocb->ki_filp; struct userfaultfd_ctx *ctx = file->private_data; ssize_t _ret, ret = 0; struct uffd_msg msg; struct inode *inode = file_inode(file); bool no_wait; if (!userfaultfd_is_initialized(ctx)) return -EINVAL; no_wait = file->f_flags & O_NONBLOCK || iocb->ki_flags & IOCB_NOWAIT; for (;;) { if (iov_iter_count(to) < sizeof(msg)) return ret ? ret : -EINVAL; _ret = userfaultfd_ctx_read(ctx, no_wait, &msg, inode); if (_ret < 0) return ret ? ret : _ret; _ret = !copy_to_iter_full(&msg, sizeof(msg), to); if (_ret) return ret ? ret : -EFAULT; ret += sizeof(msg); /* * Allow to read more than one fault at time but only * block if waiting for the very first one. */ no_wait = true; } } static void __wake_userfault(struct userfaultfd_ctx *ctx, struct userfaultfd_wake_range *range) { spin_lock_irq(&ctx->fault_pending_wqh.lock); /* wake all in the range and autoremove */ if (waitqueue_active(&ctx->fault_pending_wqh)) __wake_up_locked_key(&ctx->fault_pending_wqh, TASK_NORMAL, range); if (waitqueue_active(&ctx->fault_wqh)) __wake_up(&ctx->fault_wqh, TASK_NORMAL, 1, range); spin_unlock_irq(&ctx->fault_pending_wqh.lock); } static __always_inline void wake_userfault(struct userfaultfd_ctx *ctx, struct userfaultfd_wake_range *range) { unsigned seq; bool need_wakeup; /* * To be sure waitqueue_active() is not reordered by the CPU * before the pagetable update, use an explicit SMP memory * barrier here. PT lock release or mmap_read_unlock(mm) still * have release semantics that can allow the * waitqueue_active() to be reordered before the pte update. */ smp_mb(); /* * Use waitqueue_active because it's very frequent to * change the address space atomically even if there are no * userfaults yet. So we take the spinlock only when we're * sure we've userfaults to wake. */ do { seq = read_seqcount_begin(&ctx->refile_seq); need_wakeup = waitqueue_active(&ctx->fault_pending_wqh) || waitqueue_active(&ctx->fault_wqh); cond_resched(); } while (read_seqcount_retry(&ctx->refile_seq, seq)); if (need_wakeup) __wake_userfault(ctx, range); } static __always_inline int validate_unaligned_range( struct mm_struct *mm, __u64 start, __u64 len) { __u64 task_size = mm->task_size; if (len & ~PAGE_MASK) return -EINVAL; if (!len) return -EINVAL; if (start < mmap_min_addr) return -EINVAL; if (start >= task_size) return -EINVAL; if (len > task_size - start) return -EINVAL; if (start + len <= start) return -EINVAL; return 0; } static __always_inline int validate_range(struct mm_struct *mm, __u64 start, __u64 len) { if (start & ~PAGE_MASK) return -EINVAL; return validate_unaligned_range(mm, start, len); } static int userfaultfd_register(struct userfaultfd_ctx *ctx, unsigned long arg) { struct mm_struct *mm = ctx->mm; struct vm_area_struct *vma, *cur; int ret; struct uffdio_register uffdio_register; struct uffdio_register __user *user_uffdio_register; unsigned long vm_flags; bool found; bool basic_ioctls; unsigned long start, end; struct vma_iterator vmi; bool wp_async = userfaultfd_wp_async_ctx(ctx); user_uffdio_register = (struct uffdio_register __user *) arg; ret = -EFAULT; if (copy_from_user(&uffdio_register, user_uffdio_register, sizeof(uffdio_register)-sizeof(__u64))) goto out; ret = -EINVAL; if (!uffdio_register.mode) goto out; if (uffdio_register.mode & ~UFFD_API_REGISTER_MODES) goto out; vm_flags = 0; if (uffdio_register.mode & UFFDIO_REGISTER_MODE_MISSING) vm_flags |= VM_UFFD_MISSING; if (uffdio_register.mode & UFFDIO_REGISTER_MODE_WP) { #ifndef CONFIG_HAVE_ARCH_USERFAULTFD_WP goto out; #endif vm_flags |= VM_UFFD_WP; } if (uffdio_register.mode & UFFDIO_REGISTER_MODE_MINOR) { #ifndef CONFIG_HAVE_ARCH_USERFAULTFD_MINOR goto out; #endif vm_flags |= VM_UFFD_MINOR; } ret = validate_range(mm, uffdio_register.range.start, uffdio_register.range.len); if (ret) goto out; start = uffdio_register.range.start; end = start + uffdio_register.range.len; ret = -ENOMEM; if (!mmget_not_zero(mm)) goto out; ret = -EINVAL; mmap_write_lock(mm); vma_iter_init(&vmi, mm, start); vma = vma_find(&vmi, end); if (!vma) goto out_unlock; /* * If the first vma contains huge pages, make sure start address * is aligned to huge page size. */ if (is_vm_hugetlb_page(vma)) { unsigned long vma_hpagesize = vma_kernel_pagesize(vma); if (start & (vma_hpagesize - 1)) goto out_unlock; } /* * Search for not compatible vmas. */ found = false; basic_ioctls = false; cur = vma; do { cond_resched(); BUG_ON(!!cur->vm_userfaultfd_ctx.ctx ^ !!(cur->vm_flags & __VM_UFFD_FLAGS)); /* check not compatible vmas */ ret = -EINVAL; if (!vma_can_userfault(cur, vm_flags, wp_async)) goto out_unlock; /* * UFFDIO_COPY will fill file holes even without * PROT_WRITE. This check enforces that if this is a * MAP_SHARED, the process has write permission to the backing * file. If VM_MAYWRITE is set it also enforces that on a * MAP_SHARED vma: there is no F_WRITE_SEAL and no further * F_WRITE_SEAL can be taken until the vma is destroyed. */ ret = -EPERM; if (unlikely(!(cur->vm_flags & VM_MAYWRITE))) goto out_unlock; /* * If this vma contains ending address, and huge pages * check alignment. */ if (is_vm_hugetlb_page(cur) && end <= cur->vm_end && end > cur->vm_start) { unsigned long vma_hpagesize = vma_kernel_pagesize(cur); ret = -EINVAL; if (end & (vma_hpagesize - 1)) goto out_unlock; } if ((vm_flags & VM_UFFD_WP) && !(cur->vm_flags & VM_MAYWRITE)) goto out_unlock; /* * Check that this vma isn't already owned by a * different userfaultfd. We can't allow more than one * userfaultfd to own a single vma simultaneously or we * wouldn't know which one to deliver the userfaults to. */ ret = -EBUSY; if (cur->vm_userfaultfd_ctx.ctx && cur->vm_userfaultfd_ctx.ctx != ctx) goto out_unlock; /* * Note vmas containing huge pages */ if (is_vm_hugetlb_page(cur)) basic_ioctls = true; found = true; } for_each_vma_range(vmi, cur, end); BUG_ON(!found); ret = userfaultfd_register_range(ctx, vma, vm_flags, start, end, wp_async); out_unlock: mmap_write_unlock(mm); mmput(mm); if (!ret) { __u64 ioctls_out; ioctls_out = basic_ioctls ? UFFD_API_RANGE_IOCTLS_BASIC : UFFD_API_RANGE_IOCTLS; /* * Declare the WP ioctl only if the WP mode is * specified and all checks passed with the range */ if (!(uffdio_register.mode & UFFDIO_REGISTER_MODE_WP)) ioctls_out &= ~((__u64)1 << _UFFDIO_WRITEPROTECT); /* CONTINUE ioctl is only supported for MINOR ranges. */ if (!(uffdio_register.mode & UFFDIO_REGISTER_MODE_MINOR)) ioctls_out &= ~((__u64)1 << _UFFDIO_CONTINUE); /* * Now that we scanned all vmas we can already tell * userland which ioctls methods are guaranteed to * succeed on this range. */ if (put_user(ioctls_out, &user_uffdio_register->ioctls)) ret = -EFAULT; } out: return ret; } static int userfaultfd_unregister(struct userfaultfd_ctx *ctx, unsigned long arg) { struct mm_struct *mm = ctx->mm; struct vm_area_struct *vma, *prev, *cur; int ret; struct uffdio_range uffdio_unregister; bool found; unsigned long start, end, vma_end; const void __user *buf = (void __user *)arg; struct vma_iterator vmi; bool wp_async = userfaultfd_wp_async_ctx(ctx); ret = -EFAULT; if (copy_from_user(&uffdio_unregister, buf, sizeof(uffdio_unregister))) goto out; ret = validate_range(mm, uffdio_unregister.start, uffdio_unregister.len); if (ret) goto out; start = uffdio_unregister.start; end = start + uffdio_unregister.len; ret = -ENOMEM; if (!mmget_not_zero(mm)) goto out; mmap_write_lock(mm); ret = -EINVAL; vma_iter_init(&vmi, mm, start); vma = vma_find(&vmi, end); if (!vma) goto out_unlock; /* * If the first vma contains huge pages, make sure start address * is aligned to huge page size. */ if (is_vm_hugetlb_page(vma)) { unsigned long vma_hpagesize = vma_kernel_pagesize(vma); if (start & (vma_hpagesize - 1)) goto out_unlock; } /* * Search for not compatible vmas. */ found = false; cur = vma; do { cond_resched(); BUG_ON(!!cur->vm_userfaultfd_ctx.ctx ^ !!(cur->vm_flags & __VM_UFFD_FLAGS)); /* * Check not compatible vmas, not strictly required * here as not compatible vmas cannot have an * userfaultfd_ctx registered on them, but this * provides for more strict behavior to notice * unregistration errors. */ if (!vma_can_userfault(cur, cur->vm_flags, wp_async)) goto out_unlock; found = true; } for_each_vma_range(vmi, cur, end); BUG_ON(!found); vma_iter_set(&vmi, start); prev = vma_prev(&vmi); if (vma->vm_start < start) prev = vma; ret = 0; for_each_vma_range(vmi, vma, end) { cond_resched(); BUG_ON(!vma_can_userfault(vma, vma->vm_flags, wp_async)); /* * Nothing to do: this vma is already registered into this * userfaultfd and with the right tracking mode too. */ if (!vma->vm_userfaultfd_ctx.ctx) goto skip; WARN_ON(!(vma->vm_flags & VM_MAYWRITE)); if (vma->vm_start > start) start = vma->vm_start; vma_end = min(end, vma->vm_end); if (userfaultfd_missing(vma)) { /* * Wake any concurrent pending userfault while * we unregister, so they will not hang * permanently and it avoids userland to call * UFFDIO_WAKE explicitly. */ struct userfaultfd_wake_range range; range.start = start; range.len = vma_end - start; wake_userfault(vma->vm_userfaultfd_ctx.ctx, &range); } vma = userfaultfd_clear_vma(&vmi, prev, vma, start, vma_end); if (IS_ERR(vma)) { ret = PTR_ERR(vma); break; } skip: prev = vma; start = vma->vm_end; } out_unlock: mmap_write_unlock(mm); mmput(mm); out: return ret; } /* * userfaultfd_wake may be used in combination with the * UFFDIO_*_MODE_DONTWAKE to wakeup userfaults in batches. */ static int userfaultfd_wake(struct userfaultfd_ctx *ctx, unsigned long arg) { int ret; struct uffdio_range uffdio_wake; struct userfaultfd_wake_range range; const void __user *buf = (void __user *)arg; ret = -EFAULT; if (copy_from_user(&uffdio_wake, buf, sizeof(uffdio_wake))) goto out; ret = validate_range(ctx->mm, uffdio_wake.start, uffdio_wake.len); if (ret) goto out; range.start = uffdio_wake.start; range.len = uffdio_wake.len; /* * len == 0 means wake all and we don't want to wake all here, * so check it again to be sure. */ VM_BUG_ON(!range.len); wake_userfault(ctx, &range); ret = 0; out: return ret; } static int userfaultfd_copy(struct userfaultfd_ctx *ctx, unsigned long arg) { __s64 ret; struct uffdio_copy uffdio_copy; struct uffdio_copy __user *user_uffdio_copy; struct userfaultfd_wake_range range; uffd_flags_t flags = 0; user_uffdio_copy = (struct uffdio_copy __user *) arg; ret = -EAGAIN; if (unlikely(atomic_read(&ctx->mmap_changing))) { if (unlikely(put_user(ret, &user_uffdio_copy->copy))) return -EFAULT; goto out; } ret = -EFAULT; if (copy_from_user(&uffdio_copy, user_uffdio_copy, /* don't copy "copy" last field */ sizeof(uffdio_copy)-sizeof(__s64))) goto out; ret = validate_unaligned_range(ctx->mm, uffdio_copy.src, uffdio_copy.len); if (ret) goto out; ret = validate_range(ctx->mm, uffdio_copy.dst, uffdio_copy.len); if (ret) goto out; ret = -EINVAL; if (uffdio_copy.mode & ~(UFFDIO_COPY_MODE_DONTWAKE|UFFDIO_COPY_MODE_WP)) goto out; if (uffdio_copy.mode & UFFDIO_COPY_MODE_WP) flags |= MFILL_ATOMIC_WP; if (mmget_not_zero(ctx->mm)) { ret = mfill_atomic_copy(ctx, uffdio_copy.dst, uffdio_copy.src, uffdio_copy.len, flags); mmput(ctx->mm); } else { return -ESRCH; } if (unlikely(put_user(ret, &user_uffdio_copy->copy))) return -EFAULT; if (ret < 0) goto out; BUG_ON(!ret); /* len == 0 would wake all */ range.len = ret; if (!(uffdio_copy.mode & UFFDIO_COPY_MODE_DONTWAKE)) { range.start = uffdio_copy.dst; wake_userfault(ctx, &range); } ret = range.len == uffdio_copy.len ? 0 : -EAGAIN; out: return ret; } static int userfaultfd_zeropage(struct userfaultfd_ctx *ctx, unsigned long arg) { __s64 ret; struct uffdio_zeropage uffdio_zeropage; struct uffdio_zeropage __user *user_uffdio_zeropage; struct userfaultfd_wake_range range; user_uffdio_zeropage = (struct uffdio_zeropage __user *) arg; ret = -EAGAIN; if (unlikely(atomic_read(&ctx->mmap_changing))) { if (unlikely(put_user(ret, &user_uffdio_zeropage->zeropage))) return -EFAULT; goto out; } ret = -EFAULT; if (copy_from_user(&uffdio_zeropage, user_uffdio_zeropage, /* don't copy "zeropage" last field */ sizeof(uffdio_zeropage)-sizeof(__s64))) goto out; ret = validate_range(ctx->mm, uffdio_zeropage.range.start, uffdio_zeropage.range.len); if (ret) goto out; ret = -EINVAL; if (uffdio_zeropage.mode & ~UFFDIO_ZEROPAGE_MODE_DONTWAKE) goto out; if (mmget_not_zero(ctx->mm)) { ret = mfill_atomic_zeropage(ctx, uffdio_zeropage.range.start, uffdio_zeropage.range.len); mmput(ctx->mm); } else { return -ESRCH; } if (unlikely(put_user(ret, &user_uffdio_zeropage->zeropage))) return -EFAULT; if (ret < 0) goto out; /* len == 0 would wake all */ BUG_ON(!ret); range.len = ret; if (!(uffdio_zeropage.mode & UFFDIO_ZEROPAGE_MODE_DONTWAKE)) { range.start = uffdio_zeropage.range.start; wake_userfault(ctx, &range); } ret = range.len == uffdio_zeropage.range.len ? 0 : -EAGAIN; out: return ret; } static int userfaultfd_writeprotect(struct userfaultfd_ctx *ctx, unsigned long arg) { int ret; struct uffdio_writeprotect uffdio_wp; struct uffdio_writeprotect __user *user_uffdio_wp; struct userfaultfd_wake_range range; bool mode_wp, mode_dontwake; if (atomic_read(&ctx->mmap_changing)) return -EAGAIN; user_uffdio_wp = (struct uffdio_writeprotect __user *) arg; if (copy_from_user(&uffdio_wp, user_uffdio_wp, sizeof(struct uffdio_writeprotect))) return -EFAULT; ret = validate_range(ctx->mm, uffdio_wp.range.start, uffdio_wp.range.len); if (ret) return ret; if (uffdio_wp.mode & ~(UFFDIO_WRITEPROTECT_MODE_DONTWAKE | UFFDIO_WRITEPROTECT_MODE_WP)) return -EINVAL; mode_wp = uffdio_wp.mode & UFFDIO_WRITEPROTECT_MODE_WP; mode_dontwake = uffdio_wp.mode & UFFDIO_WRITEPROTECT_MODE_DONTWAKE; if (mode_wp && mode_dontwake) return -EINVAL; if (mmget_not_zero(ctx->mm)) { ret = mwriteprotect_range(ctx, uffdio_wp.range.start, uffdio_wp.range.len, mode_wp); mmput(ctx->mm); } else { return -ESRCH; } if (ret) return ret; if (!mode_wp && !mode_dontwake) { range.start = uffdio_wp.range.start; range.len = uffdio_wp.range.len; wake_userfault(ctx, &range); } return ret; } static int userfaultfd_continue(struct userfaultfd_ctx *ctx, unsigned long arg) { __s64 ret; struct uffdio_continue uffdio_continue; struct uffdio_continue __user *user_uffdio_continue; struct userfaultfd_wake_range range; uffd_flags_t flags = 0; user_uffdio_continue = (struct uffdio_continue __user *)arg; ret = -EAGAIN; if (unlikely(atomic_read(&ctx->mmap_changing))) { if (unlikely(put_user(ret, &user_uffdio_continue->mapped))) return -EFAULT; goto out; } ret = -EFAULT; if (copy_from_user(&uffdio_continue, user_uffdio_continue, /* don't copy the output fields */ sizeof(uffdio_continue) - (sizeof(__s64)))) goto out; ret = validate_range(ctx->mm, uffdio_continue.range.start, uffdio_continue.range.len); if (ret) goto out; ret = -EINVAL; if (uffdio_continue.mode & ~(UFFDIO_CONTINUE_MODE_DONTWAKE | UFFDIO_CONTINUE_MODE_WP)) goto out; if (uffdio_continue.mode & UFFDIO_CONTINUE_MODE_WP) flags |= MFILL_ATOMIC_WP; if (mmget_not_zero(ctx->mm)) { ret = mfill_atomic_continue(ctx, uffdio_continue.range.start, uffdio_continue.range.len, flags); mmput(ctx->mm); } else { return -ESRCH; } if (unlikely(put_user(ret, &user_uffdio_continue->mapped))) return -EFAULT; if (ret < 0) goto out; /* len == 0 would wake all */ BUG_ON(!ret); range.len = ret; if (!(uffdio_continue.mode & UFFDIO_CONTINUE_MODE_DONTWAKE)) { range.start = uffdio_continue.range.start; wake_userfault(ctx, &range); } ret = range.len == uffdio_continue.range.len ? 0 : -EAGAIN; out: return ret; } static inline int userfaultfd_poison(struct userfaultfd_ctx *ctx, unsigned long arg) { __s64 ret; struct uffdio_poison uffdio_poison; struct uffdio_poison __user *user_uffdio_poison; struct userfaultfd_wake_range range; user_uffdio_poison = (struct uffdio_poison __user *)arg; ret = -EAGAIN; if (unlikely(atomic_read(&ctx->mmap_changing))) { if (unlikely(put_user(ret, &user_uffdio_poison->updated))) return -EFAULT; goto out; } ret = -EFAULT; if (copy_from_user(&uffdio_poison, user_uffdio_poison, /* don't copy the output fields */ sizeof(uffdio_poison) - (sizeof(__s64)))) goto out; ret = validate_range(ctx->mm, uffdio_poison.range.start, uffdio_poison.range.len); if (ret) goto out; ret = -EINVAL; if (uffdio_poison.mode & ~UFFDIO_POISON_MODE_DONTWAKE) goto out; if (mmget_not_zero(ctx->mm)) { ret = mfill_atomic_poison(ctx, uffdio_poison.range.start, uffdio_poison.range.len, 0); mmput(ctx->mm); } else { return -ESRCH; } if (unlikely(put_user(ret, &user_uffdio_poison->updated))) return -EFAULT; if (ret < 0) goto out; /* len == 0 would wake all */ BUG_ON(!ret); range.len = ret; if (!(uffdio_poison.mode & UFFDIO_POISON_MODE_DONTWAKE)) { range.start = uffdio_poison.range.start; wake_userfault(ctx, &range); } ret = range.len == uffdio_poison.range.len ? 0 : -EAGAIN; out: return ret; } bool userfaultfd_wp_async(struct vm_area_struct *vma) { return userfaultfd_wp_async_ctx(vma->vm_userfaultfd_ctx.ctx); } static inline unsigned int uffd_ctx_features(__u64 user_features) { /* * For the current set of features the bits just coincide. Set * UFFD_FEATURE_INITIALIZED to mark the features as enabled. */ return (unsigned int)user_features | UFFD_FEATURE_INITIALIZED; } static int userfaultfd_move(struct userfaultfd_ctx *ctx, unsigned long arg) { __s64 ret; struct uffdio_move uffdio_move; struct uffdio_move __user *user_uffdio_move; struct userfaultfd_wake_range range; struct mm_struct *mm = ctx->mm; user_uffdio_move = (struct uffdio_move __user *) arg; ret = -EAGAIN; if (unlikely(atomic_read(&ctx->mmap_changing))) { if (unlikely(put_user(ret, &user_uffdio_move->move))) return -EFAULT; goto out; } if (copy_from_user(&uffdio_move, user_uffdio_move, /* don't copy "move" last field */ sizeof(uffdio_move)-sizeof(__s64))) return -EFAULT; /* Do not allow cross-mm moves. */ if (mm != current->mm) return -EINVAL; ret = validate_range(mm, uffdio_move.dst, uffdio_move.len); if (ret) return ret; ret = validate_range(mm, uffdio_move.src, uffdio_move.len); if (ret) return ret; if (uffdio_move.mode & ~(UFFDIO_MOVE_MODE_ALLOW_SRC_HOLES| UFFDIO_MOVE_MODE_DONTWAKE)) return -EINVAL; if (mmget_not_zero(mm)) { ret = move_pages(ctx, uffdio_move.dst, uffdio_move.src, uffdio_move.len, uffdio_move.mode); mmput(mm); } else { return -ESRCH; } if (unlikely(put_user(ret, &user_uffdio_move->move))) return -EFAULT; if (ret < 0) goto out; /* len == 0 would wake all */ VM_WARN_ON(!ret); range.len = ret; if (!(uffdio_move.mode & UFFDIO_MOVE_MODE_DONTWAKE)) { range.start = uffdio_move.dst; wake_userfault(ctx, &range); } ret = range.len == uffdio_move.len ? 0 : -EAGAIN; out: return ret; } /* * userland asks for a certain API version and we return which bits * and ioctl commands are implemented in this kernel for such API * version or -EINVAL if unknown. */ static int userfaultfd_api(struct userfaultfd_ctx *ctx, un